Keshawn_lu's Blog

Andrew Ng's NLP Specialization: C3_W1_Assignment

2021/08/28

Task: sentiment analysis with a deep neural network

Part 1: Preparing the data

1.1 Prepare the training and validation sets with an 8:2 split

import numpy as np

# load_tweets and process_tweet are helper functions shipped with the assignment
all_positive_tweets, all_negative_tweets = load_tweets()

print(f"The number of positive tweets: {len(all_positive_tweets)}")
print(f"The number of negative tweets: {len(all_negative_tweets)}")

val_pos = all_positive_tweets[4000:]    # generating validation set for positive tweets
train_pos = all_positive_tweets[:4000]  # generating training set for positive tweets

val_neg = all_negative_tweets[4000:]    # generating validation set for negative tweets
train_neg = all_negative_tweets[:4000]  # generating training set for negative tweets

train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Set the labels for the training set (1 for positive, 0 for negative)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

# Set the labels for the validation set (1 for positive, 0 for negative)
val_y = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")
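
With the NLTK twitter_samples corpus the course uses (5,000 tweets per class), the prints above should give:

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000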

1.2 Build the vocabulary, mapping each word to a unique integer ID

Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

# Note that we build the vocab using only the training data
for tweet in train_x:
    processed_tweet = process_tweet(tweet)
    for word in processed_tweet:
        if word not in Vocab:
            Vocab[word] = len(Vocab)

print("Total words in vocab are", len(Vocab))
display(Vocab)
The dictionary Vocab will look like this:

{'__PAD__': 0,
'__</e>__': 1,
'__UNK__': 2,
'followfriday': 3,
'top': 4,
'engag': 5,
...
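
process_tweet is provided by the assignment's utility module and is not reproduced in this post. Below is a minimal sketch of the standard course preprocessing (strip stock tickers, retweet marks, links, and the hash sign; tokenize; lowercase; drop stopwords and punctuation; stem), assuming NLTK is available:

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    """Clean a raw tweet and return a list of stemmed tokens."""
    tweet = re.sub(r'\$\w*', '', tweet)                # remove stock tickers like $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)             # remove old-style retweet text "RT"
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # remove hyperlinks
    tweet = re.sub(r'#', '', tweet)                    # remove the hash sign only
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    return [stemmer.stem(token) for token in tokenizer.tokenize(tweet)
            if token not in stopwords_english and token not in string.punctuation]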

1.3 Convert each tweet to a tensor

Example

Given an input tweet:

'@happypuppy, is Maria happy?'

the relevant words are extracted into a list:

['maria', 'happy']

which is then mapped to vocabulary IDs:

[2, 56]

  • Since 'maria' is not in the vocabulary, it is mapped to __UNK__ (ID 2).
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    '''
    Input:
        tweet - A string containing a tweet
        vocab_dict - The words dictionary
        unk_token - The special string for unknown tokens
        verbose - Print info during runtime
    Output:
        tensor_l - A python list of unique integer IDs representing the processed tweet
    '''

    word_l = process_tweet(tweet)

    if verbose:
        print("List of words from the processed tweet:")
        print(word_l)

    tensor_l = []
    unk_ID = vocab_dict[unk_token]

    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")

    # Map each word to its vocabulary ID, falling back to __UNK__
    for word in word_l:
        if word not in vocab_dict:
            word_ID = unk_ID
        else:
            word_ID = vocab_dict[word]

        tensor_l.append(word_ID)

    return tensor_l
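
Applying the function to the example tweet from above (the exact ID for 'happy' depends on your vocabulary; 2 and 56 match the course notebook):

print(tweet_to_tensor('@happypuppy, is Maria happy?', vocab_dict=Vocab))
# [2, 56]  -- 'maria' is out of vocabulary, so it maps to __UNK__ (ID 2)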

1.4 Create the batch generator

import random as rnd  # used for shuffling the index lists

def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    '''
    Input:
        data_pos - Set of positive examples
        data_neg - Set of negative examples
        batch_size - number of samples per batch. Must be even
        loop - True or False
        vocab_dict - The words dictionary
        shuffle - Shuffle the data order
    Yield:
        inputs - Subset of positive and negative examples
        targets - The corresponding labels for the subset
        example_weights - An array specifying the importance of each example
    '''
    # Each batch is half positive and half negative, so batch_size must be even
    assert batch_size % 2 == 0

    # Number of examples to take from each class (// is integer division)
    n_to_take = batch_size // 2

    pos_index = 0
    neg_index = 0

    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)

    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))

    # Shuffle the index lists if requested
    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)

    stop = False

    while not stop:

        batch = []

        # First part: Pack n_to_take positive examples
        for i in range(n_to_take):

            # If we have run past the end of the positive data
            if pos_index >= len_data_pos:
                if not loop:
                    stop = True
                    break

                # To keep looping over the data, reset the index to 0
                pos_index = 0

                if shuffle:
                    rnd.shuffle(pos_index_lines)

            # Fetch the tweet at the (possibly shuffled) index
            tweet = data_pos[pos_index_lines[pos_index]]

            # Convert the tweet into its vocabulary IDs
            tensor = tweet_to_tensor(tweet, vocab_dict)

            batch.append(tensor)

            pos_index = pos_index + 1

        # Second part: Pack n_to_take negative examples
        for i in range(n_to_take):
            if neg_index >= len_data_neg:
                if not loop:
                    stop = True
                    break

                neg_index = 0

                if shuffle:
                    rnd.shuffle(neg_index_lines)

            tweet = data_neg[neg_index_lines[neg_index]]

            tensor = tweet_to_tensor(tweet, vocab_dict)

            batch.append(tensor)

            neg_index = neg_index + 1

        if stop:
            break

        # Note: pos_index and neg_index were already advanced inside the loops
        # above; the course template adds another n_to_take to each index here,
        # which silently skips half of the data, so that step is dropped.

        # Longest tweet in the batch, so the others can be padded to match
        max_len = max([len(t) for t in batch])

        tensor_pad_l = []

        # Pad each tensor with 0 (the __PAD__ token) up to max_len
        for tensor in batch:
            n_pad = max_len - len(tensor)  # number of pad tokens needed

            pad_l = [0] * n_pad  # the padding itself

            tensor_pad = tensor + pad_l
            tensor_pad_l.append(tensor_pad)

        # Convert the padded batch to a numpy array
        inputs = np.array(tensor_pad_l)

        # Targets for the positive examples are all 1
        target_pos = [1] * n_to_take

        # Targets for the negative examples are all 0
        target_neg = [0] * n_to_take

        target_l = target_pos + target_neg

        targets = np.array(target_l)

        # Give every example the same weight
        example_weights = np.ones_like(targets)

        yield inputs, targets, example_weights
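
Parts 3 and 5 below call train_generator, val_generator, and test_generator. These are thin wrappers around data_generator; the sketch below follows the course notebook, so treat the exact signatures as an assumption:

# train/val generators loop forever (loop=True); the test generator
# makes a single pass (loop=False) so that iteration terminates.
def train_generator(batch_size, shuffle=False):
    return data_generator(train_pos, train_neg, batch_size, True, Vocab, shuffle)

def val_generator(batch_size, shuffle=False):
    return data_generator(val_pos, val_neg, batch_size, True, Vocab, shuffle)

def test_generator(batch_size, shuffle=False):
    return data_generator(val_pos, val_neg, batch_size, False, Vocab, shuffle)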

Part 2: Writing your own layers

2.1 Implementing the Relu class

https://pic.imgdb.cn/item/6112207c5132923bf817cb37.png

from trax.layers.base import Layer  # base class for custom trax layers

class Relu(Layer):
    """Relu activation function implementation"""
    def forward(self, x):
        '''
        Input:
            - x (a numpy array): the input
        Output:
            - activation (numpy array): all positive or 0 version of x
        '''
        activation = np.maximum(x, 0)
        return activation
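
A quick sanity check of the forward pass:

x = np.array([[-2., -1., 0., 1., 2.]])
print(Relu().forward(x))  # [[0. 0. 0. 1. 2.]]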

2.2 Implementing the Dense class

https://pic.imgdb.cn/item/611220665132923bf8179dfd.png

Where:

  • the random values have standard deviation stdev = 0.1
  • n_units: the number of units
  • the weight matrix is generated by trax.fastmath.random.normal(key, shape, dtype=np.float32)
    • key can be generated with trax.fastmath.random.get_prng(seed=...)
    • shape is a tuple (n_rows, n_cols)
      • n_rows is the number of columns of x (so the two can be multiplied); x may be (row, col) or (batch_size, row, col), so taking the last dimension (input_shape[-1]) covers both cases
      • n_cols is n_units, the number of units
    • dtype is the numeric type of the matrix entries
class Dense(Layer):
    """
    A dense (fully-connected) layer.
    """
    def __init__(self, n_units, init_stdev=0.1):
        self._n_units = n_units
        self._init_stdev = init_stdev  # was hard-coded to 0.1, which ignored the argument

    def forward(self, x):
        dense = np.dot(x, self.weights)
        return dense

    def init_weights_and_state(self, input_signature, random_key):
        # The input_signature has a .shape attribute that gives the shape as a tuple
        input_shape = input_signature.shape

        # Draw the weight matrix from a normal distribution
        # and scale it by the standard deviation 'init_stdev'
        w = self._init_stdev * trax.fastmath.random.normal(
            key=random_key, shape=(input_shape[-1], self._n_units))

        self.weights = w
        return self.weights
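
A small smoke test, using trax.shapes.signature to build the input signature (the seed and input values here are arbitrary):

from trax import shapes

z = np.array([[2.0, 7.0, 25.0]])  # input of shape (1, 3)
dense_layer = Dense(n_units=10)
random_key = trax.fastmath.random.get_prng(seed=0)

dense_layer.init_weights_and_state(shapes.signature(z), random_key)
print(dense_layer.forward(z).shape)  # (1, 10)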

2.3 Implementing the classifier

https://pic.imgdb.cn/item/6112240c5132923bf81f8935.png

from trax import layers as tl

def classifier(vocab_size=len(Vocab), embedding_dim=256, output_dim=2, mode='train'):

    # create the embedding layer
    embed_layer = tl.Embedding(
        vocab_size=vocab_size,    # Size of the vocabulary
        d_feature=embedding_dim)  # Embedding dimension

    # Create a mean layer, to create an "average" word embedding
    mean_layer = tl.Mean(axis=1)

    # Create a dense layer, one unit for each output
    dense_output_layer = tl.Dense(n_units=output_dim)

    # Create the log softmax layer (no parameters needed)
    log_softmax_layer = tl.LogSoftmax()

    # Combine the layers into a Serial model
    model = tl.Serial(
        embed_layer,          # embedding layer
        mean_layer,           # mean layer
        dense_output_layer,   # dense output layer
        log_softmax_layer     # log softmax layer
    )

    # return the assembled model
    return model
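
Instantiating and printing the model shows the layer stack; the 9088 in the embedding name is the vocabulary size the course notebook reports, so your number may differ:

tmp_model = classifier()
print(tmp_model)
# Serial[
#   Embedding_9088_256
#   Mean
#   Dense_2
#   LogSoftmax
# ]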

Part 3: Training the model

3.1 Define the TrainTask, EvalTask, and Loop

import trax
from trax.supervised import training

batch_size = 16
rnd.seed(271)

train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

eval_task = training.EvalTask(
    labeled_data=val_generator(batch_size=batch_size, shuffle=True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

model = classifier()

3.2 Implement the training function

  • In practice, with trax==1.3.9 the keyword argument must be named eval_tasks rather than eval_task.
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    '''
    Input:
        classifier - the model you are building
        train_task - Training task
        eval_task - Evaluation task
        n_steps - the number of training steps to run
        output_dir - folder to save your files
    Output:
        training_loop - a trax training.Loop
    '''
    training_loop = training.Loop(
        classifier,             # The learning model
        train_task,             # The training task
        eval_tasks=eval_task,   # The evaluation task (some trax versions expect a list: [eval_task])
        output_dir=output_dir)  # The output directory

    training_loop.run(n_steps=n_steps)

    return training_loop
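
A typical call, mirroring the notebook; the checkpoint directory is an arbitrary choice:

import os

output_dir = os.path.expanduser('~/model/')  # hypothetical checkpoint directory
training_loop = train_model(model, train_task, eval_task, 100, output_dir)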

Part 4: Computing accuracy

def compute_accuracy(preds, y, y_weights):
    """
    Input:
        preds: a tensor of shape (dim_batch, output_dim)
        y: a tensor of shape (dim_batch,) with the true labels
        y_weights: an np.ndarray with a weight for each example
    Output:
        accuracy: a float between 0-1
        weighted_num_correct (np.float32): Sum of the weighted correct predictions
        sum_weights (np.float32): Sum of the weights
    """

    # True where the positive score exceeds the negative score
    is_pos = preds[:, 1] > preds[:, 0]

    # Convert the booleans to ints (0 or 1)
    is_pos_int = is_pos.astype(np.int32)

    # Compare the predictions against the true labels
    correct = (is_pos_int == y)

    # Count the sum of the weights.
    sum_weights = np.sum(y_weights)

    # Convert the booleans to floats so they can be weighted
    correct_float = correct.astype(np.float32)

    # Multiply each prediction with its corresponding weight.
    weighted_correct_float = correct_float * y_weights

    weighted_num_correct = np.sum(weighted_correct_float)

    # Weighted fraction of correct predictions
    accuracy = weighted_num_correct / sum_weights

    return accuracy, weighted_num_correct, sum_weights
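
A toy check of the function, assuming log-softmax style scores (two examples, both classified correctly):

preds = np.array([[-2.3, -0.1],   # column 1 > column 0: predicts positive
                  [-0.1, -2.3]])  # column 0 > column 1: predicts negative
y = np.array([1, 0])
y_weights = np.ones_like(y)

acc, num_correct, total = compute_accuracy(preds, y, y_weights)
print(acc)  # 1.0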

Part 5: Testing the model on the validation set

  • Each batch has the form (X, Y, weights):

    • X is the tweet tensors
    • Y is the true labels, positive or negative
    • weights are the corresponding example weights

    Test the model and return its accuracy:

def test_model(generator, model):
    '''
    Input:
        generator: an iterator instance that provides batches of inputs and targets
        model: a model instance
    Output:
        accuracy: float corresponding to the accuracy
    '''

    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0

    for batch in generator:

        # X: padded tweet tensors
        inputs = batch[0]

        # Y: true labels
        targets = batch[1]

        # weights: per-example weights
        example_weight = batch[2]

        # Let the model make its predictions
        pred = model(inputs)

        # Accumulate the weighted correct counts and weights for this batch
        batch_accuracy, batch_num_correct, batch_num_pred = compute_accuracy(
            pred, targets, example_weight)

        total_num_correct += batch_num_correct
        total_num_pred += batch_num_pred

    # Overall accuracy across all batches
    accuracy = total_num_correct / total_num_pred

    return accuracy
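
Evaluation on the held-out split then looks like this; test_generator is the single-pass wrapper sketched at the end of section 1.4 (so the loop in test_model terminates), and training_loop.eval_model is trax's copy of the trained model in evaluation mode:

model = training_loop.eval_model
accuracy = test_model(test_generator(batch_size=16), model)
print(f'The accuracy of your model on the validation set is {accuracy:.4f}')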