Keshawn_lu's Blog

Andrew Ng Team NLP C4_W1_Assignment

2021/08/28

Task: build a neural machine translation model (English ↔ German)

Part 1: Data preparation

1.1 Importing libraries

from termcolor import colored
import random
import numpy as np

import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

1.2 Loading the data

# Get generator function for the training set
train_stream_fn = trax.data.TFDS('opus/medical',
                                 data_dir='./data/',
                                 keys=('en', 'de'),
                                 eval_holdout_size=0.01, # 1% for eval
                                 train=True)

# Get generator function for the eval set
eval_stream_fn = trax.data.TFDS('opus/medical',
                                data_dir='./data/',
                                keys=('en', 'de'),
                                eval_holdout_size=0.01, # 1% for eval
                                train=False)

# Call the generator functions to get the streams used below
train_stream = train_stream_fn()
eval_stream = eval_stream_fn()

1.3 Tokenization and formatting

Split each string into subword tokens and convert them to integers, keeping only the basic subwords in the vocabulary:

  • For example, if the vocabulary would contain fear, some, and fearsome, it is enough to keep fear and some, because fearsome can be built by combining the other subwords.
# global variables that state the filename and directory of the vocabulary file
VOCAB_FILE = 'ende_32k.subword'
VOCAB_DIR = 'data/'

# Tokenize the dataset.
tokenized_train_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(eval_stream)

print(next(tokenized_train_stream))
(array([ 2326, 13139,   605,  9214,  3337,  7932,  1047,    15,  1489,
4318, 6304, 331, 2326, 31329, 11722, 5, 16276, 14026,
2801, 11765, 14446, 363, 21981, 219, 26382, 500, 30650,
4729, 992]),
array([ 2326, 13139, 605, 9214, 3337, 7932, 1047, 15, 1489,
4318, 6304, 331, 2326, 31329, 11722, 5, 16276, 14026,
2801, 11765, 14446, 363, 21981, 219, 26382, 500, 30650,
4729, 992]))

Next, append an end-of-sentence (EOS) token to each sentence; this helps during inference/prediction.

EOS = 1

# generator helper function to append EOS to each sentence
def append_eos(stream):
    for (inputs, targets) in stream:
        inputs_with_eos = list(inputs) + [EOS]
        targets_with_eos = list(targets) + [EOS]
        yield np.array(inputs_with_eos), np.array(targets_with_eos)

# append EOS to the train data
tokenized_train_stream = append_eos(tokenized_train_stream)

# append EOS to the eval data
tokenized_eval_stream = append_eos(tokenized_eval_stream)

Finally, filter out overly long sentences by capping the number of tokens per sentence.

filtered_train_stream = trax.data.FilterByLength(
    max_length=256, length_keys=[0, 1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_eval_stream)

# print a sample input-target pair of tokenized sentences
train_input, train_target = next(filtered_train_stream)
print(colored('Single tokenized example input:', 'red'), train_input)
print(colored('Single tokenized example target:', 'red'), train_target)
Single tokenized example input: [ 8569  4094  2679 32826 22527     5 30650  4729   992     1]
Single tokenized example target: [12647 19749 70 32826 10008 5 30650 4729 992 1]

1.4 Tokenize and detokenize helper functions

def tokenize(input_str, vocab_file=None, vocab_dir=None):
    """Encodes a string to an array of integers.

    Args:
        input_str (str): human-readable string to encode
        vocab_file (str): filename of the vocabulary text file
        vocab_dir (str): path to the vocabulary file

    Returns:
        numpy.ndarray: tokenized version of the input string
    """

    EOS = 1

    # Use the trax.data.tokenize method. It takes streams and returns streams,
    # we get around it by making a 1-element stream with `iter`.
    inputs = next(trax.data.tokenize(iter([input_str]),
                                     vocab_file=vocab_file, vocab_dir=vocab_dir))

    # Mark the end of the sentence with EOS
    inputs = list(inputs) + [EOS]

    # Add a batch dimension at the front
    batch_inputs = np.reshape(np.array(inputs), [1, -1])

    return batch_inputs


def detokenize(integers, vocab_file=None, vocab_dir=None):
    """Decodes an array of integers to a human readable string.

    Args:
        integers (numpy.ndarray): array of integers to decode
        vocab_file (str): filename of the vocabulary text file
        vocab_dir (str): path to the vocabulary file

    Returns:
        str: the decoded sentence.
    """

    # Remove the dimensions of size 1
    integers = list(np.squeeze(integers))

    EOS = 1

    # Remove the EOS token and everything after it
    if EOS in integers:
        integers = integers[:integers.index(EOS)]

    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)
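
A quick sanity check (assuming the vocabulary files above are in place) is to round-trip a sentence through the two helpers; the exact token ids depend on the vocabulary file:

# Hypothetical round-trip check of the helpers above.
sentence = 'I love languages.'
tokens = tokenize(sentence, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
print(tokens)   # a 1 x n array ending with the EOS id 1
print(detokenize(tokens, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))   # 'I love languages.'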

1.5 Bucketing

We group sentences of similar length together so that only minimal padding is needed, as shown in the figure below:

https://pic.imgdb.cn/item/612097e74907e2d39c475e43.png

boundaries =  [8,   16,  32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16,  8,   4,  2]

# Create the generators.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_eval_stream)

# Add masking for the padding (0s).
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)
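
To see what the generators produce, you can peek at one batch; this is only an illustrative check, and the exact shapes depend on which length bucket the batch comes from:

# Grab one batch from the bucketed train stream.
# After AddLossWeights each element is an (inputs, targets, mask) triple.
input_batch, target_batch, mask_batch = next(train_batch_stream)

print('input_batch shape:', input_batch.shape)    # e.g. (32, 64): batch of 32, padded to length 64
print('target_batch shape:', target_batch.shape)
print('mask shape:', mask_batch.shape)            # 0s mark the padding positions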

Part 2: The neural machine translation model

2.1 Input encoder

Create embeddings for the input tokens and feed them through a stack of LSTM layers to produce the output activations.

def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """ Input encoder runs on the input sentence and creates
    activations that will be the keys and values for attention.

    Args:
        input_vocab_size: int: vocab size of the input
        d_model: int: depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
    Returns:
        tl.Serial: The input encoder
    """

    input_encoder = tl.Serial(
        tl.Embedding(input_vocab_size, d_model),
        [tl.LSTM(d_model) for _ in range(n_encoder_layers)]
    )

    return input_encoder

2.2 Pre-attention decoder

Runs on the targets and produces the activations that will be used as queries in attention.

  • tl.ShiftRight: prepends a start token to the targets, e.g. [8, 34, 12] → [0, 8, 34, 12]
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """ Pre-attention decoder runs on the targets and creates
    activations that are used as queries in attention.

    Args:
        mode: str: 'train' or 'eval'
        target_vocab_size: int: vocab size of the target
        d_model: int: depth of embedding (n_units in the LSTM cell)
    Returns:
        tl.Serial: The pre-attention decoder
    """

    pre_attention_decoder = tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(target_vocab_size, d_model),
        tl.LSTM(d_model)
    )

    return pre_attention_decoder

2.3 Preparing the inputs for the attention layer

def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Prepare queries, keys, values and mask for attention.

    Args:
        encoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the input encoder
        decoder_activations fastnp.array(batch_size, padded_input_length, d_model): output from the pre-attention decoder
        inputs fastnp.array(batch_size, padded_input_length): padded input tokens

    Returns:
        queries, keys, values and mask for attention.
    """

    # the keys and values come from the encoder activations
    keys = encoder_activations
    values = encoder_activations

    # the queries come from the pre-attention decoder activations
    queries = decoder_activations

    # generate the mask to distinguish real tokens from padding
    # hint: inputs is 1 for real tokens and 0 where they are padding
    mask = (inputs != 0)

    # add axes to the mask for attention heads and decoder length.
    mask = fastnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))

    # broadcast so mask shape is [batch size, attention heads, decoder-len, encoder-len].
    # note: for this assignment, attention heads is set to 1.
    mask = mask + fastnp.zeros((1, 1, decoder_activations.shape[1], 1))

    return queries, keys, values, mask
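
As a small illustration of the mask broadcasting (using plain NumPy instead of the model's fastnp arrays), consider one padded sentence of length 4 and a decoder length of 3:

# Toy example: one sentence with two real tokens and two padding zeros.
toy_inputs = np.array([[5, 7, 0, 0]])          # shape (1, 4)
toy_mask = (toy_inputs != 0)                   # [[True, True, False, False]]

# add axes for the attention heads and the decoder length
toy_mask = np.reshape(toy_mask, (1, 1, 1, 4))

# broadcast against a decoder length of 3
toy_mask = toy_mask + np.zeros((1, 1, 3, 1))

print(toy_mask.shape)   # (1, 1, 3, 4): [batch, heads, decoder-len, encoder-len]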

2.4 Putting the machine translation model together

https://pic.imgdb.cn/item/61209b8e4907e2d39c50589d.png

def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    """Returns an LSTM sequence-to-sequence model with attention.

    The input to the model is a pair (input tokens, target tokens), e.g.,
    an English sentence (tokenized) and its translation into German (tokenized).

    Args:
        input_vocab_size: int: vocab size of the input
        target_vocab_size: int: vocab size of the target
        d_model: int: depth of embedding (n_units in the LSTM cell)
        n_encoder_layers: int: number of LSTM layers in the encoder
        n_decoder_layers: int: number of LSTM layers in the decoder after attention
        n_attention_heads: int: number of attention heads
        attention_dropout: float, dropout for the attention layer
        mode: str: 'train', 'eval' or 'predict', predict mode is for fast inference

    Returns:
        An LSTM sequence-to-sequence model with attention.
    """

    # Step 0: call the helper function to create layers for the input encoder
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    # Step 0: call the helper function to create layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Step 1: create a serial network
    model = tl.Serial(

        # Step 2: copy input tokens and target tokens as they will be needed later.
        tl.Select([0, 1, 0, 1]),

        # Step 3: run input encoder on the input and pre-attention decoder on the target.
        tl.Parallel(input_encoder, pre_attention_decoder),

        # Step 4: prepare queries, keys, values and mask for attention.
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        # Step 5: run the AttentionQKV layer,
        # nested inside a Residual layer so that it adds to the pre-attention decoder activations (i.e. queries)
        tl.Residual(tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode)),

        # Step 6: drop the attention mask (i.e. index = None)
        tl.Select([0, 2]),

        # Step 7: run the rest of the RNN decoder
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],

        # Step 8: prepare output by making it the right size
        tl.Dense(target_vocab_size),

        # Step 9: Log-softmax for output
        tl.LogSoftmax()
    )

    return model
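
An optional sanity check is to instantiate the model with its default hyperparameters and print the layer structure; this only builds the layer tree, so it is cheap to run:

# Build the model and inspect its layers.
model = NMTAttn()
print(model)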

Part 3: Training the model

3.1 TrainTask

Computes the loss and updates the weights.

train_task = training.TrainTask(

    # use the train batch stream as labeled data
    labeled_data=train_batch_stream,

    # use the cross entropy loss
    loss_layer=tl.CrossEntropyLoss(),

    # use the Adam optimizer with a learning rate of 0.01
    optimizer=trax.optimizers.Adam(0.01),

    # use a learning-rate schedule with 1000 warmup steps followed by rsqrt decay
    lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),

    # save a checkpoint every 10 steps
    n_steps_per_checkpoint=10,
)

3.2 EvalTask

Lets us see how the model is doing while it trains, by tracking the cross-entropy loss and the accuracy.

eval_task = training.EvalTask(
    labeled_data=eval_batch_stream,
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

3.3 Loop

# define the output directory
output_dir = 'output_dir/'

# define the training loop
training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

training_loop.run(10)

Part 4: Testing the model

First, load the model we just trained:

# instantiate the model we built in eval mode
model = NMTAttn(mode='eval')

# initialize weights from a pre-trained model
model.init_from_file("model.pkl.gz", weights_only=True)
model = tl.Accelerate(model)

4.1 Greedy decoding: pick the highest-probability token

Implement next_symbol(), which returns the index of the next token:

def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    """Returns the index of the next token.

    Args:
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        input_tokens (np.ndarray 1 x n_tokens): tokenized representation of the input sentence
        cur_output_tokens (list): tokenized representation of previously translated words
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)

    Returns:
        int: index of the next token in the translated sentence
        float: log probability of the next symbol
    """

    # set the length of the current output tokens
    token_length = len(cur_output_tokens)

    # calculate the next power of 2 for the padding length
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))

    # pad the current output tokens up to padded_length
    padded = cur_output_tokens + [0] * (padded_length - token_length)

    # add a batch dimension at axis 0
    padded_with_batch = np.expand_dims(padded, axis=0)

    # get the model prediction (remember to use the `NMTAttn` argument defined above)
    output, _ = NMTAttn((input_tokens, padded_with_batch))

    # get log probabilities from the last token output
    log_probs = output[0, token_length, :]

    # get the next symbol by getting a logsoftmax sample
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))

    return symbol, float(log_probs[symbol])
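
The power-of-2 padding simply keeps the number of distinct input shapes small as the output grows; a quick worked example of the formula above:

# Padded length grows in powers of 2 as the output gets longer.
for token_length in range(1, 6):
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))
    print(token_length, '->', padded_length)
# 1 -> 2, 2 -> 4, 3 -> 4, 4 -> 8, 5 -> 8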

By calling the function above repeatedly, one token at a time, we can translate an entire sentence:

def sampling_decode(input_sentence, NMTAttn=None, temperature=0.0, vocab_file=None, vocab_dir=None):
    """Returns the translated sentence.

    Args:
        input_sentence (str): sentence to translate.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        tuple: (list, float, str)
            list of int: tokenized version of the translated sentence
            float: log probability of the translated sentence
            str: the translated sentence
    """

    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)

    # initialize the list of output tokens
    cur_output_tokens = []

    cur_output = 0

    EOS = 1

    # keep generating tokens until the end-of-sentence token is produced
    while cur_output != EOS:
        cur_output, log_prob = next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature)
        cur_output_tokens.append(cur_output)

    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir)

    return cur_output_tokens, log_prob, sentence
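
With the eval-mode model loaded above, a greedy translation can then be obtained like this; the exact German output depends on the trained weights:

# Greedy decoding (temperature 0.0) of a single sentence.
tokens, log_prob, translation = sampling_decode(
    'I love languages.', model, temperature=0.0,
    vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
print(translation)   # e.g. 'Ich liebe Sprachen.'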

4.2 Minimum Bayes-Risk decoding

  • Randomly draw several sample translations
  • Score each sample against all of the other samples
  • Pick the sample with the highest score
  1. First, generate the samples
def generate_samples(sentence, n_samples, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Generates samples using sampling_decode()

    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        tuple: (list, list)
            list of lists: token list per sample
            list of floats: log probability per sample
    """
    # define lists to contain samples and probabilities
    samples, log_probs = [], []

    for _ in range(n_samples):

        sample, logp, _ = sampling_decode(sentence, NMTAttn, temperature, vocab_file=vocab_file, vocab_dir=vocab_dir)

        samples.append(sample)
        log_probs.append(logp)

    return samples, log_probs
  2. Score the overlap between pairs of samples
def jaccard_similarity(candidate, reference):
    """Returns the Jaccard similarity between two token lists

    Args:
        candidate (list of int): tokenized version of the candidate translation
        reference (list of int): tokenized version of the reference translation

    Returns:
        float: overlap between the two token lists
    """

    can_unigram_set, ref_unigram_set = set(candidate), set(reference)

    # intersection of the two sets
    joint_elems = can_unigram_set.intersection(ref_unigram_set)

    # union of the two sets
    all_elems = can_unigram_set.union(ref_unigram_set)

    # compute the score: intersection size over union size
    overlap = len(joint_elems) / len(all_elems)

    return overlap
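
For example, two toy token lists that share three of four distinct tokens overall:

# Three shared tokens out of four distinct tokens -> 0.75
print(jaccard_similarity([1, 2, 3], [1, 2, 3, 4]))   # 0.75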

The ROUGE-1 score used below is the unigram F1 score: precision is the token overlap divided by the length of the system translation, recall is the overlap divided by the length of the reference, and the score is 2 * precision * recall / (precision + recall):

from collections import Counter

def rouge1_similarity(system, reference):
    """Returns the ROUGE-1 score between two token lists

    Args:
        system (list of int): tokenized version of the system translation
        reference (list of int): tokenized version of the reference translation

    Returns:
        float: overlap between the two token lists
    """

    # frequency dictionaries of the system and reference tokens
    sys_counter = Counter(system)
    ref_counter = Counter(reference)

    # count how often each system token also appears in the reference
    overlap = 0

    for token in sys_counter:
        token_count_sys = sys_counter.get(token, 0)
        token_count_ref = ref_counter.get(token, 0)

        overlap += min(token_count_sys, token_count_ref)

    # get the precision
    precision = overlap / sum(sys_counter.values())

    # get the recall
    recall = overlap / sum(ref_counter.values())

    if precision + recall != 0:
        rouge1_score = 2 * precision * recall / (precision + recall)
    else:
        rouge1_score = 0

    return rouge1_score
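
A small check: the toy lists below have an overlap of 3 tokens, precision 3/3 and recall 3/4, so the score is 2 * 1.0 * 0.75 / 1.75 ≈ 0.857:

print(rouge1_similarity([1, 2, 3], [1, 2, 3, 4]))   # ~0.857
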
  3. Compare each sample against all of the other samples and compute the average score
def average_overlap(similarity_fn, samples, *ignore_params):
    """Returns the arithmetic mean of each candidate sentence in the samples

    Args:
        similarity_fn (function): similarity function used to compute the overlap
        samples (list of lists): tokenized version of the translated sentences
        *ignore_params: additional parameters will be ignored

    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample
    """

    scores = {}

    for index_candidate, candidate in enumerate(samples):

        overlap = 0.0

        for index_sample, sample in enumerate(samples):

            # skip if the candidate is compared against itself
            if index_candidate == index_sample:
                continue

            sample_overlap = similarity_fn(candidate, sample)

            overlap += sample_overlap

        # average the overlap over the number of other samples
        score = overlap / (len(samples) - 1)
        scores[index_candidate] = score

    return scores
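
With three toy samples and Jaccard similarity, each candidate is scored by its average overlap with the other two (the numbers below are worked out by hand and shown rounded):

samples_toy = [[1, 2, 3], [1, 2, 4], [1, 2, 4, 5]]
print(average_overlap(jaccard_similarity, samples_toy))
# approximately {0: 0.45, 1: 0.625, 2: 0.575}
# e.g. candidate 0: (0.5 + 0.4) / 2 = 0.45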

We can also use a weighted average, where each sample's contribution is weighted by its probability:

def weighted_avg_overlap(similarity_fn, samples, log_probs):
    """Returns the weighted mean of each candidate sentence in the samples

    Args:
        similarity_fn (function): similarity function used to compute the overlap
        samples (list of lists): tokenized version of the translated sentences
        log_probs (list of float): log probability of the translated sentences

    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample
    """

    scores = {}

    for index_candidate, candidate in enumerate(samples):

        overlap, weight_sum = 0.0, 0.0

        for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):

            # skip if the candidate is compared against itself
            if index_candidate == index_sample:
                continue

            # convert the log probability to a probability and add it to the weight sum
            sample_p = float(np.exp(logp))
            weight_sum += sample_p

            # weight the overlap by the sample probability
            sample_overlap = similarity_fn(candidate, sample)
            overlap += sample_p * sample_overlap

        # normalize by the total weight
        score = overlap / weight_sum
        scores[index_candidate] = score

    return scores
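
Using the same toy samples with some made-up log probabilities, higher-probability samples now contribute more to each score:

samples_toy = [[1, 2, 3], [1, 2, 4], [1, 2, 4, 5]]
log_probs_toy = [0.4, 0.2, 0.3]
print(weighted_avg_overlap(jaccard_similarity, samples_toy, log_probs_toy))
# approximately {0: 0.45, 1: 0.62, 2: 0.56}
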
  4. Put the previous steps together in the mbr_decode() function
def mbr_decode(sentence, n_samples, score_fn, similarity_fn, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Returns the translated sentence using Minimum Bayes Risk decoding

    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        score_fn (function): function that generates the score for each sample
        similarity_fn (function): function used to compute the overlap between a pair of samples
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        str: the translated sentence
    """

    # generate the samples
    samples, log_probs = generate_samples(sentence, n_samples, NMTAttn, temperature, vocab_file, vocab_dir)

    # score each sample against the others
    scores = score_fn(similarity_fn, samples, log_probs)

    # find the sample with the highest score
    max_index = max(scores, key=scores.get)

    # detokenize the token list associated with the max_index
    translated_sentence = detokenize(samples[max_index], vocab_file, vocab_dir)

    return (translated_sentence, max_index, scores)
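
Finally, the whole pipeline can be tried end to end (the English sentence here is just an arbitrary example); with temperature 0.6 the samples differ between runs, so the output may vary:

# Translate a sentence with MBR decoding over 4 sampled candidates.
translation, max_index, scores = mbr_decode(
    'She speaks English and German.', 4,
    weighted_avg_overlap, jaccard_similarity,
    model, temperature=0.6,
    vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
print(translation)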