# Constructor parameters of the model defined below:
#   input_dim   - dimension of each input vector. The embeddings come from
#                 Google's pre-trained BERT, so this should be 768. The
#                 original plan was to fine-tune BERT as well, but for now it
#                 is kept frozen (no fine-tuning).
#   hidden_size - dimension of the GRU layers, i.e. the number of units in
#                 each GRU layer.
#   out_size    - output dimension: 1 for binary classification or
#                 regression, n for an n-class (n > 2) classification problem.
#   n_layers    - number of stacked GRU layers.
#   batch_size  - number of samples fed to the network per training step.
import torch

'''
Author:Eric_TRF
Date:2020 /09 /03
'''


class HomorNetv2(torch.nn.Module):
    """Bidirectional GRU followed by two fully connected layers.

    Args:
        input_dim: Size of each input feature vector.
        hidden_size: Number of hidden units per GRU layer and direction.
        out_size: Size of the final output vector.
        n_layers: Number of stacked GRU layers.
        batch_size: Batch size used by ``init_hidden`` to size the initial
            hidden state.
    """

    def __init__(self, input_dim, hidden_size, out_size, n_layers=1, batch_size=1):
        super(HomorNetv2, self).__init__()

        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.out_size = out_size

        # batch_first=True: inputs must be laid out as (batch, seq, feature).
        self.gru = torch.nn.GRU(input_dim, hidden_size, n_layers,
                                batch_first=True, bidirectional=True)

        # The GRU is bidirectional, so its output feature size is
        # hidden_size * 2.
        self.fc1 = torch.nn.Linear(hidden_size * 2, 300)

        # Second fully connected layer, projecting to the output size.
        self.fc2 = torch.nn.Linear(300, out_size)

    def forward(self, word_inputs, hidden=None):
        """Run the network on a batch of sequences.

        Args:
            word_inputs: Tensor of shape (batch, seq, input_dim).
            hidden: Initial GRU hidden state of shape
                (2 * n_layers, batch, hidden_size), e.g. the value returned
                by ``init_hidden``. ``None`` lets the GRU start from zeros.

        Returns:
            A tuple ``(output, hidden)``: ``output`` has shape
            (batch, out_size) and is computed from the last time step only;
            ``hidden`` is the final GRU hidden state.
        """
        # `output` holds the top GRU layer's state at every time step;
        # `hidden` holds the final state of every layer and direction.
        output, hidden = self.gru(word_inputs, hidden)

        # Only the last time step is returned, so select it before the
        # linear layers instead of projecting every step and discarding
        # all but one. The linear layers act per position, so the result
        # is the same.
        last_step = output[:, -1, :]
        output = self.fc2(self.fc1(last_step))

        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state for the GRU.

        This is not a learned part of the model: the first step of a
        sequence has no previous context, so a zero context is supplied.
        The tensor is created on the same device as the model parameters
        (CPU if the module has none) rather than on a hard-coded device.

        Returns:
            Zero tensor of shape (2 * n_layers, batch_size, hidden_size).
        """
        first_param = next(self.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        return torch.zeros(2 * self.n_layers, self.batch_size, self.hidden_size,
                           device=device)

训练神经网络 我使用的是Google Colab Notebook来运行我的代码,使用该Notebook可以免除配置环境的众多烦恼。运行我的代码的时候请注意先安装transformers,并且使用Google的GPU加速。读取数据集以后,训练用的tensor是通过BERT获得的embedding,label则是每个新闻标题的幽默程度,幽默程度的得分在0-3之间。在以下代码中我们获得了整个数据集的train_tensor以及label_tensor。

import re
import csv

# Training sentences (with BERT's special tokens added) and their scores.
train = []
train_labels = []

# Read the training CSV. The context manager guarantees the file is closed
# even if a row fails to parse; newline="" is the csv module's documented
# way to open a file handed to csv.reader.
with open(r"./drive/My Drive/MyModels/train.csv", "r", encoding="UTF-8", newline="") as csvFile:
    reader = csv.reader(csvFile)
    for item in reader:
        # Skip the header row.
        if reader.line_num == 1:
            continue
        # Substitute the <...> span of the original headline (item[1]) with
        # the edit word (item[2]). A callable replacement inserts the word
        # literally, so a backslash in the data is not read as an escape.
        edited_sentence = re.sub(u"\\<.*?\\>", lambda _match: item[2], item[1])
        train.append("[CLS] " + edited_sentence + " [SEP]")  # training corpus
        train_labels.append(float(item[-1]))  # last field is the score

# Use BERT to get the sentence embeddings.
import torch
import numpy as np
import random
from torch import nn
from transformers import BertTokenizer,BertModel


def fix_seed(seed=234):
    """Seed torch (CPU and CUDA), NumPy and `random` with the same value."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


fix_seed()  # fix the seed

# Load the BERT tokenizer and model.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bertmodel = BertModel.from_pretrained('bert-base-cased',from_tf=True)  # load the TF model for PyTorch
max_len = 20  # every headline is cut or zero-padded to this many tokens

# Use the GPU when present and fall back to the CPU otherwise, so the
# tensors below always live on the same device as the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bertmodel.to(device)

# Put the model into evaluation mode.
bertmodel.eval()

# Per-sentence embeddings are collected in a list and concatenated once at
# the end; concatenating inside the loop re-copies the growing tensor on
# every iteration.
sentence_vectors = []
for masked_sentence in train:
    tokenized_text = tokenizer.tokenize(masked_sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # One "1" per token.
    # NOTE(review): this is passed as the second positional argument of the
    # model; in current `transformers` that slot appears to be
    # attention_mask rather than token_type_ids - confirm against the
    # installed version.
    segments_ids = [1] * len(tokenized_text)
    # Convert the lists to tensors and call the BERT model.
    tokens_tensor = torch.tensor([indexed_tokens], device=device)
    segments_tensors = torch.tensor([segments_ids], device=device)
    with torch.no_grad():
        outputs = bertmodel(tokens_tensor, segments_tensors)
    # First model output: one vector per token for this single sentence.
    word_vectors = outputs[0]
    seq_len = word_vectors.size(1)
    if seq_len >= max_len:
        # The headline is at least max_len tokens long: truncate it.
        word_vectors = word_vectors[:, :max_len, :]
    else:
        # Otherwise pad with zero vectors up to max_len.
        padding_vector = torch.zeros(1, max_len - seq_len, word_vectors.size(2),
                                     dtype=word_vectors.dtype, device=device)
        word_vectors = torch.cat((word_vectors, padding_vector), 1)
    sentence_vectors.append(word_vectors)

if sentence_vectors:
    # Stack all sentences along the batch dimension.
    train_tensor = torch.cat(sentence_vectors, 0)
else:
    # No data rows: keep a well-formed, empty tensor (768 is the embedding
    # size this script assumes for bert-base-cased).
    train_tensor = torch.empty(0, max_len, 768, device=device)

label_tensor = torch.tensor(train_labels, dtype=torch.float32, device=device)
print(train_tensor.size())
print(label_tensor.size())

现在展示train.csv的样本: 第一列是新闻标题的id,第二列是新闻标题的原文,第三列是人工对<>中单词的替换,第四列不重要,最后一列就是所有打分者给这个修改后的标题的幽默分数的平均分,平均分越高则代表修改后的标题越幽默。 设定Hyperparameters并开始训练 我们开始训练了。

embedding_dim = 768
hidden_size = 768
num_layers = 5
batchsize = 128
output_dim = 1
# Indices of all training samples; batches are drawn from this list.
len_list = list(range(len(train_tensor)))

model = HomorNetv2(embedding_dim, hidden_size, output_dim, num_layers, batchsize)
if torch.cuda.is_available():
    model = model.cuda()

# Set the hyperparameters.
LR = 1e-3
criterion = nn.MSELoss()  # mean squared error as the training objective
optimizer = torch.optim.Adam(model.parameters(), lr=LR)  # optimizer
# Upper bound on training steps; each step below uses one random batch.
EPOCH = 40000


def accuracy(output, target):
    """Return the fraction of entries where round(sigmoid(output)) == target.

    NOTE(review): the rounded sigmoid is always 0 or 1, so this can only
    match targets that are exactly 0 or 1. With the 0-3 scores this script
    trains on, the value is probably not a meaningful metric - confirm
    before relying on it.
    """
    output = torch.round(torch.sigmoid(output))
    correct = (output == target).float()
    acc = correct.mean()
    return acc


def get_batch(batchsize, len_list, train_tensors, label_tensors):
    """Draw a random batch for training.

    Args:
        batchsize: Number of samples to draw (without replacement).
        len_list: List of candidate sample indices.
        train_tensors: Tensor indexed by sample along dimension 0.
        label_tensors: Tensor of labels indexed by sample along dimension 0.

    Returns:
        A tuple ``(batch_inputs, batch_labels)`` holding the selected rows
        in the order they were sampled.

    Raises:
        ValueError: From ``random.sample`` if ``batchsize`` exceeds
            ``len(len_list)``.
    """
    batch_index_list = random.sample(len_list, batchsize)
    # Gather all selected rows in one indexing operation instead of
    # concatenating them one sample at a time.
    index = torch.tensor(batch_index_list, dtype=torch.long,
                         device=train_tensors.device)
    batchtesnor_for_training = train_tensors[index]
    batchlabeltensor_for_training = label_tensors[index.to(label_tensors.device)]
    return batchtesnor_for_training, batchlabeltensor_for_training


print("Begin Training")
for i in range(0, EPOCH + 1):
    model.train()
    initial_hidden = model.init_hidden()
    batchtesnor_for_training, batch_label_tensors = get_batch(
        batchsize, len_list, train_tensor, label_tensor)

    # Zero the gradients, as they are not cleared automatically.
    optimizer.zero_grad()

    predictions, _ = model(batchtesnor_for_training, initial_hidden)

    # Predictions have shape (batch size, 1); drop the size-1 dimension so
    # they line up with the labels before computing the loss.
    predictions = predictions.squeeze(1)
    loss = criterion(predictions, batch_label_tensors)
    train_loss = loss.item()

    # Compute accuracy (see the note on `accuracy`).
    train_acc = accuracy(predictions, batch_label_tensors)

    # Calculate the gradient of each parameter.
    loss.backward()

    # Update the parameters using the gradients and the optimizer algorithm.
    optimizer.step()

    print(f'| Epoch: {i:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')

    # Stop early once the batch loss is tiny, but only after 20000 steps.
    if train_loss <= 0.0005 and i >= 20000:
        break

# Saves the whole model object (not just its state_dict).
torch.save(model, "drive/My Drive/MyModels/HumorNetv2.pkl")

总结笔记 在搭建GRU的过程中踩了许多坑,主要是维度不匹配以及输入不匹配的问题。 在这里需要注意的就是:

如果神经网络GRU层是双向的,那么其输出会是hidden_size×2,所以之后线性层的输入就应该是hidden_size×2,而不是hidden_size,如果神经网络是单向的,那么就不需要乘2。 # 这里指定了BATCH FIRST,所以输入时BATCH应该在第一维度 self.gru = torch.nn.GRU(input_dim, hidden_size, n_layers, batch_first=True,bidirectional=True) # 加了一个线性层,全连接 self.fc1 = torch.nn.Linear(hidden_size*2, 300)


初始隐藏状态hidden是一个三维张量:第一个维度是 direction×n_layers(双向时direction为2),第二个维度是训练一轮时输入的样本数目(batch_size),第三个维度是声明的GRU的维度(hidden_size);此外,torch.zeros的device参数声明该张量在GPU还是CPU上。

def init_hidden(self):
    """Return an all-zero initial hidden state for the GRU.

    This is not a learned part of the model: the first step of a sequence
    has no previous context, so a zero context is supplied. The tensor is
    created on the same device as the module's parameters (CPU if the
    module has none) rather than on a hard-coded device.

    Returns:
        Zero tensor of shape (2 * n_layers, batch_size, hidden_size).
    """
    first_param = next(self.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    return torch.zeros(2 * self.n_layers, self.batch_size, self.hidden_size,
                       device=device)