import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True
def forward(self, src, tgt, src_mask, tgt_mask):
    "Take in and process masked src and target sequences."
    return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
def forward(self, x, memory, src_mask, tgt_mask):
    for layer in self.layers:
        x = layer(x, memory, src_mask, tgt_mask)
    return self.norm(x)
In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. As in the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization.
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
def forward(self, x, memory, src_mask, tgt_mask):
    "Follow Figure 1 (right) for connections."
    m = memory
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
    return self.sublayer[2](x, self.feed_forward)
We also modify the self-attention sub-layer in the decoder to prevent positions from attending to subsequent positions. This masking, combined with the fact that the output embeddings are offset by one position, ensures that the predictions for position $i$ can depend only on the known outputs at positions less than $i$.
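The `subsequent_mask` helper used below (and again in `make_std_mask`) is not reproduced above; a minimal sketch consistent with that usage, which marks as True exactly the positions that may be attended to:

def subsequent_mask(size):
    "Mask out subsequent positions (the upper triangle above the diagonal)."
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    # Positions that are allowed to be attended to are marked True.
    return mask == 0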
def example_mask():
    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": subsequent_mask(20)[0][x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(20)
            for x in range(20)
        ]
    )
An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.
We call this particular attention "Scaled Dot-Product Attention". The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$. We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.
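The `attention` function called later inside `MultiHeadedAttention.forward` is not reproduced above; a minimal sketch that follows this description (scaled scores, optional mask, softmax, optional dropout) and returns the `(output, weights)` pair that the later code expects:

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot-Product Attention'."
    d_k = query.size(-1)
    # Dot products of queries with keys, scaled by sqrt(d_k).
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Block illegal connections with a large negative value before softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn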
The two most commonly used attention functions are additive attention and dot-product (multiplicative) attention. The latter is identical to ours, except for the scaling factor of $\frac{1}{\sqrt{d_k}}$. Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code.
While the two mechanisms perform similarly for small values of $d_k$, additive attention outperforms dot-product attention without scaling for larger values of $d_k$. We suspect that for large values of $d_k$, the dot products grow large in magnitude, pushing the softmax function into regions where its gradients are extremely small. (To illustrate why the dot products get large, assume that the components of $q$ and $k$ are independent random variables with mean $0$ and variance $1$. Then their dot product, $q \cdot k = \sum_{i=1}^{d_k} q_i k_i$, has mean $0$ and variance $d_k$.) To counteract this effect, we scale the dot products by $\frac{1}{\sqrt{d_k}}$.
Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.
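In symbols, as given in the paper:

$$
\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \ldots, \mathrm{head}_h)W^O,
\qquad \mathrm{head}_i = \mathrm{Attention}(QW_i^Q, KW_i^K, VW_i^V)
$$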
Where the projections are parameter matrices $W_i^Q \in \mathbb{R}^{d_{\text{model}} \times d_k}$, $W_i^K \in \mathbb{R}^{d_{\text{model}} \times d_k}$, $W_i^V \in \mathbb{R}^{d_{\text{model}} \times d_v}$ and $W^O \in \mathbb{R}^{hd_v \times d_{\text{model}}}$. In this work we employ $h=8$ parallel attention layers, or heads. For each of these we use $d_k = d_v = d_{\text{model}}/h = 64$. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) "Concat" using a view and apply a final linear.
        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )
        del query
        del key
        del value
        return self.linears[-1](x)
Applications of Attention in our Model
The Transformer uses multi-head attention in three different ways.
In the "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models.
The encoder contains self-attention layers. In a self-attention layer, all of the keys, values, and queries come from the output of the previous layer. Each position in the encoder can attend to all positions in the previous layer of the encoder.
Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside scaled dot-product attention by masking out (setting to $-\infty$) all values in the input of the softmax which correspond to illegal connections.
Position-wise Feed-Forward Networks
In addition to attention sub-layers, each layer in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. It consists of two linear transformations with a ReLU activation in between.
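In formula form this is $\mathrm{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$. The `PositionwiseFeedForward` module referenced in `make_model` below is not reproduced above; a minimal sketch, assuming an inner dimension `d_ff` and dropout between the two transformations:

class PositionwiseFeedForward(nn.Module):
    "Implements FFN(x) = max(0, x W1 + b1) W2 + b2."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Expand to d_ff, apply ReLU and dropout, project back to d_model.
        return self.w_2(self.dropout(self.w_1(x).relu()))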
Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{\text{model}}$. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation. In the embedding layers, we multiply those weights by $\sqrt{d_{\text{model}}}$.
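`make_model` below also relies on `Embeddings` and `Generator` modules that do not appear above; minimal sketches consistent with this description (a scaled embedding lookup, and a linear projection followed by log-softmax):

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale embeddings by sqrt(d_model), as described above.
        return self.lut(x) * math.sqrt(self.d_model)


class Generator(nn.Module):
    "Standard linear + softmax generation step."

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return log_softmax(self.proj(x), dim=-1)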
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
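The `make_std_mask` static method below belongs to a `Batch` object that holds a batch of source/target sentences together with their masks; the rest of that class does not appear above. A minimal sketch, assuming a padding index `pad` and the attributes (`src`, `src_mask`, `tgt`, `tgt_y`, `tgt_mask`, `ntokens`) that `run_epoch` and `data_gen` use later:

class Batch:
    """Object for holding a batch of data with mask during training."""

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is not None:
            # Shift the target: feed tgt[:, :-1], predict tgt[:, 1:].
            self.tgt = tgt[:, :-1]
            self.tgt_y = tgt[:, 1:]
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
            self.ntokens = (self.tgt_y != pad).data.sum()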
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
            tgt_mask.data
        )
        return tgt_mask
Training Loop
Next we create a generic training and scoring function to keep track of loss. We pass in a generic loss-compute function that also handles parameter updates.
class TrainState:
    """Track number of steps, examples, and tokens processed"""

    step: int = 0  # Steps in the current epoch
    accum_step: int = 0  # Number of gradient accumulation steps
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=TrainState(),
):
    """Train a single epoch"""
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(
            batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
        )
        loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        # loss_node = loss_node / accum_iter
        if mode == "train" or mode == "train+log":
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 40 == 1 and (mode == "train" or mode == "train+log"):
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            start = time.time()
            tokens = 0
        del loss
        del loss_node
    return total_loss / total_tokens, train_state
Training Data and Batching
We trained on the standard WMT 2014 English-German dataset, consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding, which has a shared source-target vocabulary of about 37,000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset, consisting of 36 million sentences, and split tokens into a 32,000 word-piece vocabulary.
Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs with approximately 25,000 source tokens and 25,000 target tokens.
def rate(step, model_size, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )
# we have 3 examples in opts list.
for idx, example in enumerate(opts):
    # run 20000 epoch for each example
    optimizer = torch.optim.Adam(
        dummy_model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer, lr_lambda=lambda step: rate(step, *example)
    )
    tmp = []
    # take 20K dummy training steps, save the learning rate at each step
    for step in range(20000):
        tmp.append(optimizer.param_groups[0]["lr"])
        optimizer.step()
        lr_scheduler.step()
    learning_rates.append(tmp)
learning_rates = torch.tensor(learning_rates)
# Enable altair to handle more than 5000 rows
alt.data_transformers.disable_max_rows()
We implement label smoothing using the KL divergence loss. Instead of using a one-hot target distribution, we create a distribution that has confidence of the correct word and the rest of the smoothing mass distributed throughout the vocabulary.
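The `LabelSmoothing` module itself is not reproduced above; a minimal sketch of one way to build such a smoothed target distribution and feed it to `nn.KLDivLoss` (the module is assumed to receive log-probabilities):

class LabelSmoothing(nn.Module):
    "Label smoothing implemented with a KL divergence loss."

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size

    def forward(self, x, target):
        # x: (batch, vocab) log-probabilities; target: (batch,) token indices.
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        # Spread the smoothing mass over the vocabulary, excluding the
        # correct word and the padding index.
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero out rows whose target is padding.
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.numel() > 0:
            true_dist.index_fill_(0, mask.squeeze(1), 0.0)
        return self.criterion(x, true_dist.clone().detach())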
def data_gen(V, batch_size, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.randint(1, V, size=(batch_size, 10))
        data[:, 0] = 1
        src = data.requires_grad_(False).clone().detach()
        tgt = data.requires_grad_(False).clone().detach()
        yield Batch(src, tgt, 0)
Loss Computation
class SimpleLossCompute:
    "A simple loss compute and train function."
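The body of the class does not appear above; a plausible sketch of its methods, assuming a `generator` (the final linear + log-softmax) and a `criterion` such as the LabelSmoothing module, and returning the `(loss, loss_node)` pair that `run_epoch` expects:

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        # Project decoder output to vocabulary log-probabilities, then
        # compute the smoothed loss normalized by the number of tokens.
        x = self.generator(x)
        sloss = (
            self.criterion(
                x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
            )
            / norm
        )
        return sloss.data * norm, sloss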
if is_interactive_notebook():
    # global variables used later in the script
    spacy_de, spacy_en = show_example(load_tokenizers)
    vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])
def average(model, models):
    "Average models into model"
    for ps in zip(*[m.parameters() for m in [model] + models]):
        # Copy the element-wise mean of the corresponding parameters
        # from `models` into `model`.
        ps[0].data.copy_(torch.stack([p.data for p in ps[1:]]).mean(dim=0))
        src_tokens = [
            vocab_src.get_itos()[x] for x in rb.src[0] if x != pad_idx
        ]
        tgt_tokens = [
            vocab_tgt.get_itos()[x] for x in rb.tgt[0] if x != pad_idx
        ]

        print(
            "Source Text (Input) : "
            + " ".join(src_tokens).replace("\n", "")
        )
        print(
            "Target Text (Ground Truth) : "
            + " ".join(tgt_tokens).replace("\n", "")
        )
        model_out = greedy_decode(model, rb.src, rb.src_mask, 72, 0)[0]
        model_txt = (
            " ".join(
                [vocab_tgt.get_itos()[x] for x in model_out if x != pad_idx]
            ).split(eos_string, 1)[0]
            + eos_string
        )
        print("Model Output : " + model_txt.replace("\n", ""))
        results[idx] = (rb, src_tokens, tgt_tokens, model_out, model_txt)
    return results
def run_model_example(n_examples=5):
    global vocab_src, vocab_tgt, spacy_de, spacy_en
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    "convert a dense matrix to a data frame with row and column indices"
    return pd.DataFrame(
        [
            (
                r,
                c,
                float(m[r, c]),
                "%.3d %s"
                % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
                "%.3d %s"
                % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
            )
            for r in range(m.shape[0])
            for c in range(m.shape[1])
            if r < max_row and c < max_col
        ],
        # if float(m[r,c]) != 0 and r < max_row and c < max_col],
        columns=["row", "column", "value", "row_token", "col_token"],
    )
def viz_encoder_self():
    model, example_data = run_model_example(n_examples=1)
    example = example_data[
        len(example_data) - 1
    ]  # batch object for the final example