- Hung-yi Lee's (李宏毅) lecture on Attention
- The most distilled write-up I have seen so far
  - includes code from a Kaggle grandmaster
- My own takeaways
  - mostly carried over from Zhihu experts...
  - first read a popular-science introduction to Attention
  - then read the articles listed at the top of the "Let's Talk About Transformer" section above
  - finally look at the Transformer authors' hands-on Attention implementation
### The Attention Formula
The standard scaled dot-product attention is $\text{Attention}(\boldsymbol{Q},\boldsymbol{K},\boldsymbol{V}) = \text{softmax}\!\left(\frac{\boldsymbol{Q}\boldsymbol{K}^\top}{\sqrt{d_k}}\right)\boldsymbol{V}$. Expanding the softmax for a single query $\boldsymbol{q}_t$ gives

$$
Attention(\boldsymbol{q}_t,\boldsymbol{K},\boldsymbol{V}) = \sum_{s=1}^m \frac{1}{Z}\exp\left(\frac{\langle\boldsymbol{q}_t, \boldsymbol{k}_s\rangle}{\sqrt{d_k}}\right)\boldsymbol{v}_s
$$

where $Z = \sum_{s=1}^m \exp\left(\langle\boldsymbol{q}_t, \boldsymbol{k}_s\rangle / \sqrt{d_k}\right)$ is the softmax normalizing factor.
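As a quick numerical sanity check, here is a minimal NumPy sketch (the shapes `m = 5`, `d_k = 8` are arbitrary assumptions) showing that the expanded sum above is exactly a softmax-weighted average of the value vectors:

```python
import numpy as np

# Toy dimensions, chosen only for illustration: m = 5 keys/values, d_k = 8
m, d_k = 5, 8
rng = np.random.default_rng(0)
q_t = rng.normal(size=(d_k,))        # a single query q_t
K = rng.normal(size=(m, d_k))        # keys k_1..k_m
V = rng.normal(size=(m, d_k))        # values v_1..v_m

# Expanded form: sum_s (1/Z) * exp(<q_t, k_s> / sqrt(d_k)) * v_s
scores = K @ q_t / np.sqrt(d_k)      # <q_t, k_s> / sqrt(d_k) for each s
Z = np.exp(scores).sum()             # softmax normalizer
out_expanded = (np.exp(scores) / Z) @ V

# Compact form: softmax(scores) @ V
weights = np.exp(scores - scores.max())
weights /= weights.sum()
out_softmax = weights @ V

assert np.allclose(out_expanded, out_softmax)
```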
### Implementation in PyTorch
Source
##### Scaled Dot-Product Attention
```python
import numpy as np
import torch
import torch.nn as nn


class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention mechanism."""

    def __init__(self, attention_dropout=0.0):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """Forward pass.

        Args:
            q: query tensor of shape [B, L_q, D_q]
            k: key tensor of shape [B, L_k, D_k]
            v: value tensor of shape [B, L_v, D_v]; generally the same as k
            scale: scaling factor, a float scalar
            attn_mask: masking tensor of shape [B, L_q, L_k]

        Returns:
            The context tensor and the attention tensor.
        """
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            attention = attention * scale
        if attn_mask is not None:
            # Set the positions to be masked to negative infinity
            attention = attention.masked_fill_(attn_mask, -np.inf)
        # Compute the softmax
        attention = self.softmax(attention)
        # Apply dropout
        attention = self.dropout(attention)
        # Weighted sum with V
        context = torch.bmm(attention, v)
        return context, attention
```
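A minimal usage sketch of the class above (the batch size, sequence lengths, and padding mask are made-up values for illustration); `scale` is typically `1/sqrt(D_k)`:

```python
import torch

B, L_q, L_k, D = 2, 4, 6, 16          # assumed toy batch / sequence sizes
q = torch.randn(B, L_q, D)
k = torch.randn(B, L_k, D)
v = torch.randn(B, L_k, D)

# Example mask: pretend the last two key positions are padding
attn_mask = torch.zeros(B, L_q, L_k, dtype=torch.bool)
attn_mask[:, :, -2:] = True

attn = ScaledDotProductAttention(attention_dropout=0.1)
context, attention = attn(q, k, v, scale=1.0 / D ** 0.5, attn_mask=attn_mask)
print(context.shape)    # torch.Size([2, 4, 16])
print(attention.shape)  # torch.Size([2, 4, 6])
```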
### Implementation in Keras
```python
# coding=utf8
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Keras Layer that implements an Attention mechanism for temporal data.

        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        # self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        input_shape = K.int_shape(x)
        features_dim = self.features_dim
        # step_dim = self.step_dim
        step_dim = input_shape[1]

        # Score each timestep by projecting its features onto the learned vector W
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b[:input_shape[1]]

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases, especially in the early stages of training, the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # return input_shape[0], input_shape[-1]
        return input_shape[0], self.features_dim
# end Attention
```

My own reading of this Keras version: K, Q, and V correspond to the input x, the learned weight W, and x again, and the √d_k scaling is replaced with a tanh activation to keep the scores stable...?

##### Differences in Usage (roughly speaking)

Just my impression:

- The **PyTorch** Scaled Dot-Product Attention implementation is meant to be plugged into multi-head attention, **i.e. into the overall Transformer architecture**
- The **Keras** implementation is meant to be used **as a Layer**, simply applying attention over its input (a minimal usage sketch follows this list)
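Following the layer's own docstring, a minimal sketch of the Layer-style usage of the `Attention` class defined above (the vocabulary size, embedding size, and unit counts are arbitrary assumptions):

```python
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

maxlen = 100          # assumed padded sequence length
vocab_size = 20000    # assumed vocabulary size

model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=maxlen))
model.add(LSTM(64, return_sequences=True))   # Attention sits on top of an RNN with return_sequences=True
model.add(Attention(maxlen))                 # collapses (samples, steps, features) -> (samples, features)
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()
```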
### Building Models with Attention (Keras)

##### Attention RNN Model

```python
# The Attention layer code is given above
import keras
from keras import Model
from keras.layers import *
from JoinAttLayer import Attention


class TextClassifier():

    def model(self, embeddings_matrix, maxlen, word_index, num_class):
        inp = Input(shape=(maxlen,))
        encode = Bidirectional(CuDNNGRU(128, return_sequences=True))
        encode2 = Bidirectional(CuDNNGRU(128, return_sequences=True))
        attention = Attention(maxlen)
        x_4 = Embedding(len(word_index) + 1,
                        embeddings_matrix.shape[1],
                        weights=[embeddings_matrix],
                        input_length=maxlen,
                        trainable=True)(inp)
        x_3 = SpatialDropout1D(0.2)(x_4)
        x_3 = encode(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = encode2(x_3)
        x_3 = Dropout(0.2)(x_3)
        avg_pool_3 = GlobalAveragePooling1D()(x_3)
        max_pool_3 = GlobalMaxPooling1D()(x_3)
        attention_3 = attention(x_3)
        x = keras.layers.concatenate([avg_pool_3, max_pool_3, attention_3], name="fc")
        x = Dense(num_class, activation="sigmoid")(x)
        adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                     epsilon=1e-08, amsgrad=True)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam)
        return model
```
##### Attention RCNN Model
```python
import keras
from keras import Model
from keras.layers import *
from JoinAttLayer import Attention


class TextClassifier():

    def model(self, embeddings_matrix, maxlen, word_index, num_class):
        inp = Input(shape=(maxlen,))
        encode = Bidirectional(GRU(1, return_sequences=True))
        encode2 = Bidirectional(GRU(1, return_sequences=True))
        attention = Attention(maxlen)
        x_4 = Embedding(len(word_index) + 1,
                        embeddings_matrix.shape[1],
                        weights=[embeddings_matrix],
                        input_length=maxlen,
                        trainable=True)(inp)
        x_3 = SpatialDropout1D(0.2)(x_4)
        x_3 = encode(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = encode2(x_3)
        x_3 = Dropout(0.2)(x_3)
        x_3 = Conv1D(64, kernel_size=3, padding="valid",
                     kernel_initializer="glorot_uniform")(x_3)
        x_3 = Dropout(0.2)(x_3)
        avg_pool_3 = GlobalAveragePooling1D()(x_3)
        max_pool_3 = GlobalMaxPooling1D()(x_3)
        attention_3 = attention(x_3)
        x = keras.layers.concatenate([avg_pool_3, max_pool_3, attention_3])
        x = Dense(num_class, activation="sigmoid")(x)
        adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam)
        return model
```
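A hedged sketch of how either `TextClassifier` above might be wired up; the embedding matrix, word index, sequence length, and class count below are dummy placeholders:

```python
import numpy as np

# Dummy inputs, purely for illustration
word_index = {"hello": 1, "world": 2}                          # assumed tokenizer vocabulary
embeddings_matrix = np.random.rand(len(word_index) + 1, 300)   # assumed 300-d embeddings
maxlen = 100                                                   # assumed padded sequence length
num_class = 4                                                  # assumed number of labels

clf = TextClassifier()
model = clf.model(embeddings_matrix, maxlen, word_index, num_class)
model.summary()

# Training would then look something like:
# model.fit(X_train, y_train, batch_size=64, epochs=10,
#           validation_data=(X_val, y_val))
```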
##### Capsule Model
### Training Tricks
- Loss function: rather than a seq2seq formulation, simply split the task into 20 separate models, each with its loss function set to categorical crossentropy
- Early stopping (see the sketch at the end of this section for combining it with class weights)
- Class weights
- EMA (exponential moving average of the weights)
- Learning rate
- Max length (the maximum sentence length used for padding)
- If you train all 20 models in one run, remember to use Python's gc and Keras's clear_session:
```python
import gc
from keras import backend as K

del model1
del history
gc.collect()
K.clear_session()
```
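For the early-stopping and class-weight items above, a minimal Keras sketch (the patience, the weight values, and the `X_train`/`y_train`/`X_val`/`y_val` variables are assumptions for illustration; `model` is one of the compiled TextClassifier models from above):

```python
from keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 3 epochs (patience is an assumption)
early_stop = EarlyStopping(monitor='val_loss', patience=3)

# Up-weight rarer classes; the actual weights would come from the label distribution
class_weight = {0: 1.0, 1: 2.5, 2: 1.0, 3: 4.0}

model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          batch_size=64, epochs=20,
          class_weight=class_weight,
          callbacks=[early_stop])
```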