Hung-yi Lee (李宏毅)'s Attention lecture
- The most distilled explanation I have seen so far
- Includes code from a Kaggle expert

My own summary
- Mostly carried over from Zhihu experts...
- First, read a popular introduction to Attention
- Then read the articles listed at the top of the 聊聊Transformer post above
- Finally, look at the hands-on Attention guide by the author of that Transformer post
### Attention Formula
Expanding the softmax, the attention output can be written as

$$\text{Attention}(\boldsymbol{q}_t,\boldsymbol{K},\boldsymbol{V}) = \sum_{s=1}^m \frac{1}{Z}\exp\left(\frac{\langle\boldsymbol{q}_t, \boldsymbol{k}_s\rangle}{\sqrt{d_k}}\right)\boldsymbol{v}_s$$

where $Z = \sum_{s=1}^m \exp\left(\frac{\langle\boldsymbol{q}_t, \boldsymbol{k}_s\rangle}{\sqrt{d_k}}\right)$ is the softmax normalization factor.
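As a sanity check, here is a minimal sketch that evaluates this formula directly in PyTorch for a single query; the shapes and random tensors are purely illustrative:

```python
import torch

d_k, m = 8, 4                              # illustrative dimensions
q_t = torch.randn(d_k)                     # a single query q_t
K = torch.randn(m, d_k)                    # keys k_1..k_m
V = torch.randn(m, d_k)                    # values v_1..v_m

scores = K @ q_t / d_k ** 0.5              # <q_t, k_s> / sqrt(d_k)
weights = torch.softmax(scores, dim=0)     # exp(.) / Z
output = weights @ V                       # sum_s weights_s * v_s
print(output.shape)                        # torch.Size([8])
```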
### Implementation in PyTorch

Source

##### Scaled Dot-Product Attention
```python
import numpy as np
import torch
import torch.nn as nn


class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention mechanism."""

    def __init__(self, attention_dropout=0.0):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """
        Forward pass.

        Args:
            q: queries tensor of shape [B, L_q, D_q]
            k: keys tensor of shape [B, L_k, D_k]
            v: values tensor of shape [B, L_v, D_v]; usually the same as k
            scale: scaling factor, a float scalar
            attn_mask: masking tensor of shape [B, L_q, L_k]
        Returns:
            the context tensor and the attention tensor
        """
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale is not None:
            attention = attention * scale
        if attn_mask is not None:
            # set masked positions to negative infinity
            attention = attention.masked_fill_(attn_mask, -np.inf)
        # normalize the scores with softmax
        attention = self.softmax(attention)
        # apply dropout to the attention weights
        attention = self.dropout(attention)
        # weighted sum over V
        context = torch.bmm(attention, v)
        return context, attention
```
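A quick usage sketch for the module above, used here as self-attention (q, k and v are the same random tensor; the batch size, sequence length and dimension are arbitrary):

```python
q = k = v = torch.randn(2, 5, 16)                    # batch of 2, length 5, model dim 16
attn = ScaledDotProductAttention(attention_dropout=0.1)
context, weights = attn(q, k, v, scale=16 ** -0.5)
print(context.shape, weights.shape)                  # [2, 5, 16] and [2, 5, 5]
```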
### Implementation in Keras
```python
# coding=utf8
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        # self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        input_shape = K.int_shape(x)
        features_dim = self.features_dim
        # step_dim = self.step_dim
        step_dim = input_shape[1]

        # scalar score per time step: e_t = x_t . W, reshaped back to (batch, steps)
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b[:input_shape[1]]

        eij = K.tanh(eij)
        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases, especially early in training, the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive epsilon to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # return input_shape[0], input_shape[-1]
        return input_shape[0], self.features_dim
```
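A minimal usage sketch for this layer, following the docstring's example but passing the required `step_dim` argument; the vocabulary size, embedding size and `maxlen` below are placeholders:

```python
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

maxlen = 100  # placeholder sequence length

model = Sequential()
model.add(Embedding(10000, 128, input_length=maxlen))  # placeholder vocab/embedding sizes
model.add(LSTM(64, return_sequences=True))             # must return the full sequence
model.add(Attention(maxlen))                           # collapses (steps, features) -> (features,)
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
```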
My own reading of the Keras version: K, Q, V roughly correspond to x, the learned weight W, and x again, with the $\sqrt{d_k}$ scaling replaced by a tanh activation to keep the scores stable...?
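Written out, this is what the layer above computes per sample (my reading of the code, with $\boldsymbol{x}_t$ the RNN output at step $t$, $\boldsymbol{W}$ and $b_t$ the layer's weights, and $\epsilon$ the small constant from `K.epsilon()`):

$$e_t = \tanh\left(\boldsymbol{x}_t^\top \boldsymbol{W} + b_t\right),\qquad a_t = \frac{\exp(e_t)}{\sum_{s}\exp(e_s) + \epsilon},\qquad \text{output} = \sum_{t} a_t\,\boldsymbol{x}_t$$

So each time step gets a scalar score from a tanh projection of its features, and the output is an attention-weighted sum over time rather than a full query-key-value product.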
##### Usage differences (rough take)

Just my own impression:
- The **PyTorch** Scaled Dot-Product Attention implementation is meant to sit inside Multi-head attention, **and ultimately inside the full Transformer architecture**
- The **Keras** implementation is meant to be used **as a single Layer**, simply applying Attention to its input/output
### Building Models with Attention (Keras)
##### Attention RNN Model
```python
# the Attention layer code is given above (imported here from JoinAttLayer)
import keras
from keras import Model
from keras.layers import *
from JoinAttLayer import Attention


class TextClassifier():

    def model(self, embeddings_matrix, maxlen, word_index, num_class):
        inp = Input(shape=(maxlen,))
        encode = Bidirectional(CuDNNGRU(128, return_sequences=True))
        encode2 = Bidirectional(CuDNNGRU(128, return_sequences=True))
        attention = Attention(maxlen)
        x_4 = Embedding(len(word_index) + 1,
                        embeddings_matrix.shape[1],
                        weights=[embeddings_matrix],
                        input_length=maxlen,
                        trainable=True)(inp)
        x_3 = SpatialDropout1D(0.2)(x_4)
        x_3 = encode(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = encode2(x_3)
        x_3 = Dropout(0.2)(x_3)
        avg_pool_3 = GlobalAveragePooling1D()(x_3)
        max_pool_3 = GlobalMaxPooling1D()(x_3)
        attention_3 = attention(x_3)
        x = keras.layers.concatenate([avg_pool_3, max_pool_3, attention_3], name="fc")
        x = Dense(num_class, activation="sigmoid")(x)
        adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, amsgrad=True)
        model = Model(inputs=inp, outputs=x)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=adam)
        return model
```
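A hypothetical call to build this model; the vocabulary, embedding matrix, `maxlen` and class count below are placeholders, and CuDNNGRU additionally requires a GPU:

```python
import numpy as np

word_index = {w: i + 1 for i, w in enumerate(["good", "bad", "food"])}  # placeholder vocabulary
embeddings_matrix = np.random.rand(len(word_index) + 1, 300)            # placeholder embeddings
maxlen, num_class = 100, 4                                              # placeholder settings

model = TextClassifier().model(embeddings_matrix, maxlen, word_index, num_class)
model.summary()
```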
##### Attention RCNN Model
```python
import keras
from keras import Model
from keras.layers import *
from JoinAttLayer import Attention


class TextClassifier():

    def model(self, embeddings_matrix, maxlen, word_index, num_class):
        inp = Input(shape=(maxlen,))
        encode = Bidirectional(GRU(1, return_sequences=True))
        encode2 = Bidirectional(GRU(1, return_sequences=True))
        attention = Attention(maxlen)
        x_4 = Embedding(len(word_index) + 1,
                        embeddings_matrix.shape[1],
                        weights=[embeddings_matrix],
                        input_length=maxlen,
                        trainable=True)(inp)
        x_3 = SpatialDropout1D(0.2)(x_4)
        x_3 = encode(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = encode2(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x_3)
        x_3 = Dropout(0.2)(x_3)
        avg_pool_3 = GlobalAveragePooling1D()(x_3)
        max_pool_3 = GlobalMaxPooling1D()(x_3)
        attention_3 = attention(x_3)
        x = keras.layers.concatenate([avg_pool_3, max_pool_3, attention_3])
        x = Dense(num_class, activation="sigmoid")(x)
        adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model = Model(inputs=inp, outputs=x)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=adam
        )
        return model
```
##### Capsule Model
### Training Tips
- Loss Function: instead of a seq2seq formulation, simply split the task into 20 separate models, each with its loss function set to categorical crossentropy
- Early Stop (a `model.fit` sketch covering this and Class Weight follows the snippet below)
- Class Weight
- EMA (exponential moving average)
- Learning Rate
- Max Length (the maximum sentence length used for padding)
- If you train all 20 models in one run, remember to use Python's gc and Keras's clear_session:
```python
del model1
del history
gc.collect()
K.clear_session()
```
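A sketch of how the Early Stop and Class Weight items above might be wired into `model.fit`; the monitored metric, patience, per-class weights and data variables are illustrative placeholders, not the actual competition settings:

```python
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2)  # stop once validation loss stalls

model.fit(
    x_train, y_train,                                # placeholders for the prepared data
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
    class_weight={0: 1.0, 1: 2.5, 2: 1.2, 3: 1.0},   # illustrative per-class weights
    callbacks=[early_stop],
)
```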