nlp-beginner task2: Text Classification with Deep Learning, part 2 (PyTorch + textRNN)


https://github.com/FudanNLP/nlp-beginner


1. Recap

Last time I implemented textCNN, but the results were not great. My guess is that randomly sampling train_data was to blame: I never figured out how to update on a group of samples at once. Today I finally sorted out the relationship between epoch and batch. PyTorch's gradient accumulation mechanism can be used to do mini-batch gradient descent even on variable-length data, so the textCNN from the previous post was basically run just for fun; I may go back and redo it when I have time. Also note that an epoch only counts as finished once every sample has been seen...

So the mini-batch setup used here is:

train_data_number = 156060 * 0.8 = 124848
iter_num = 10
batch_size = int(train_data_number / iter_num) = 12484
epoch = 10
update_num = epoch * iter_num = 100

The number of parameter updates thus drops to 100. Previously a single epoch meant 124848 updates, so the model presumably dove head-first into some local minimum... no wonder the results were so poor.
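Roughly, the accumulation idea in code (just a sketch; dummy names like samples and loss_fn are placeholders, the actual code is in 2.1 below):

# Minimal sketch of mini-batch gradient descent via gradient accumulation
# for variable-length samples. samples / loss_fn are placeholders.
def run_one_epoch(model, samples, loss_fn, optimizer, batch_size):
    for start in range(0, len(samples), batch_size):
        model.zero_grad()                                  # clear accumulated gradients once per batch
        for x, y in samples[start:start + batch_size]:
            loss = loss_fn(model(x), y) / batch_size       # scale so the accumulated sum is the batch mean
            loss.backward()                                # gradients accumulate in .grad
        optimizer.step()                                   # one parameter update per batch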

 

2. textRNN

Reference

Word vectors are initialized directly from glove.6B.50d.txt in GloVe's glove.6B.zip.
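As a side note, instead of manually copying the pretrained matrix into the embedding weights (which is what the code in 2.1 does), nn.Embedding.from_pretrained can build the layer directly. Just a sketch, assuming pretrained_vec is the (vocab_size, 50) numpy array built below:

# Sketch only: trainable embedding layer built from a pretrained matrix.
import torch
import torch.nn as nn

embed = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_vec).float(), freeze=False)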

2.1. Code

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
import copy

# data loading, same as before
read_data = pd.read_table('../train.tsv')
data = []
data_len = read_data.shape[0]
for i in range(data_len):
    data.append([read_data['Phrase'][i].lower().split(' '), read_data['Sentiment'][i]])

word_to_ix = {}  # assign an index to every word
ix_to_word = {}
word_set = set()
for sent, _ in data:
    for word in sent:
        if word not in word_to_ix:
            ix_to_word[len(word_to_ix)] = word
            word_to_ix[word] = len(word_to_ix)
            word_set.add(word)
unk = '<unk>'
ix_to_word[len(word_to_ix)] = unk
word_to_ix[unk] = len(word_to_ix)
word_set.add(unk)

torch.manual_seed(6)  # torch seed; affects parameter init and random_split below
train_len = int(0.8 * data_len)
test_len = data_len - train_len
train_data, test_data = random_split(data, [train_len, test_len])  # split the dataset
# print(type(train_data))  # torch.utils.data.dataset.Subset
train_data = list(train_data)
test_data = list(test_data)

# hyperparameter dict, convenient for tuning
args = {
    'vocab_size': len(word_to_ix),   # vocabulary size; Embedding needs it to build the lookup table
    'embedding_size': 50,            # dimensions (features) per word vector
    'hidden_size': 16,
    'type_num': 5,                   # number of classes
    'train_batch_size': int(train_len / 10),
    # 'test_batch_size': int(test_len / 10)
}

f = open('../glove.6B.50d.txt', 'r', encoding='utf-8')
line = f.readline()
glove_word2vec = {}
pretrained_vec = []
while line:
    line = line.split()
    word = line[0]
    if word in word_set:
        glove_word2vec[word] = [float(v) for v in line[1:]]
    line = f.readline()
unk_num = 0
for i in range(args['vocab_size']):
    if ix_to_word[i] in glove_word2vec:
        pretrained_vec.append(glove_word2vec[ix_to_word[i]])
    else:
        pretrained_vec.append(glove_word2vec[unk])
        unk_num += 1
print(unk_num, args['vocab_size'])
pretrained_vec = np.array(pretrained_vec)

# drop the trailing partial batch; keeping it skews the loss quite a bit
# (e.g. the first 10 batches sit around 1e-4 and the last one jumps to 1e-2)
train_len = int(int(train_len / args['train_batch_size']) * args['train_batch_size'])

class textRNN(nn.Module):
    def __init__(self, args):
        super(textRNN, self).__init__()
        self.vocab_size = args['vocab_size']
        self.embedding_size = args['embedding_size']
        self.hidden_size = args['hidden_size']
        self.type_num = args['type_num']
        self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embed_fine_tune.weight.requires_grad = True
        # effectively an embedding_size * hidden_size matrix
        self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.type_num)  # fully connected layer

    def forward(self, hidden, x):
        x = self.embed_fine_tune(x)
        x = x.transpose(0, 1)  # per the reference, nn.RNN expects batch_size in dim 1; batch is 1 here anyway
        out, hidden = self.rnn(x, hidden)
        # print(out.size())  # (seq_len, batch, hidden_size)
        out = out[-1]  # only the last time step's output is used for classification
        # print(out.size())  # [1, 16]
        return F.log_softmax(self.linear(out), dim=1)

model = textRNN(args)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def match(test_batch):
    acc = 0
    with torch.no_grad():
        for instance, label in test_batch:
            inputs1 = [word_to_ix[word] for word in instance]
            inputs1 = torch.LongTensor(inputs1).view(1, -1)
            hidden = torch.zeros(1, 1, args['hidden_size'])
            log_probs = model(hidden, inputs1)
            b = torch.argmax(log_probs, dim=1)
            if b[0] == label:
                acc += 1
    print('acc = %.4lf%%' % (acc / test_len * 100))

def train(batch_data, batch_size):
    model.zero_grad()
    for instance, label in batch_data:
        inputs1 = [word_to_ix[word] for word in instance]  # map each word to its index first
        inputs1 = torch.LongTensor(inputs1).view(1, -1)
        target = torch.LongTensor([label])
        hidden = torch.zeros(1, 1, args['hidden_size'])
        log_probs = model(hidden, inputs1)
        loss = loss_function(log_probs, target) / batch_size  # divide each sample's loss by batch_size
        loss.backward()
        print('    loss = %.4lf' % loss)  # watch the loss; it mostly oscillates around 1e-4
    optimizer.step()  # moved outside the loop: update once after the whole batch

# match(test_data)  # initial 11.7455%
for epoch in range(10):
    print('now in epoch %d...' % epoch)
    random.shuffle(train_data)  # reshuffle each epoch
    for i in range(0, train_len, args['train_batch_size']):
        train(train_data[i: i + args['train_batch_size']], args['train_batch_size'])
    match(test_data)
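As an aside, the per-sample loop is the slow part; in principle each batch could be padded and pushed through the RNN in a single call with pack_padded_sequence. This is only a sketch reusing word_to_ix, args and model from above, and is not what produced the results below:

# Sketch of padded + packed batching; not used for the reported results.
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def batch_forward(batch_data):
    seqs = [torch.LongTensor([word_to_ix[w] for w in sent]) for sent, _ in batch_data]
    lengths = torch.tensor([len(s) for s in seqs])
    labels = torch.LongTensor([label for _, label in batch_data])
    padded = pad_sequence(seqs, batch_first=True)                       # (batch, max_len)
    emb = model.embed_fine_tune(padded)                                 # (batch, max_len, 50)
    packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
    _, hidden = model.rnn(packed)                                       # hidden: (1, batch, hidden_size)
    logits = model.linear(hidden[-1])                                   # (batch, type_num)
    return logits, labels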

 

2.2. Results

Running the full training set is indeed a bit slow, but accuracy finally breaks the 60% barrier: the best run reaches acc = 64.39% at epoch = 8. That would squeeze into 173/860 on the leaderboard from six years ago (sigh, still beaten by people from six years back), although this is measured on a held-out split of the training data rather than the real test set.

 

3. textBiRNN

3.1. Code (model + training + testing)

class textRNN(nn.Module):
    def __init__(self, args):
        super(textRNN, self).__init__()
        self.vocab_size = args['vocab_size']
        self.embedding_size = args['embedding_size']
        self.hidden_size = args['hidden_size']
        self.type_num = args['type_num']
        self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embed_fine_tune.weight.requires_grad = True
        self.embed_fine_tune2 = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embed_fine_tune2.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embed_fine_tune2.weight.requires_grad = True
        self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
        self.rnn2 = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
        self.linear = nn.Linear(self.hidden_size * 2, self.type_num)

    def forward(self, hidden, hidden2, x, x2):
        x = self.embed_fine_tune(x)
        x = x.transpose(0, 1)
        x2 = self.embed_fine_tune2(x2)
        x2 = x2.transpose(0, 1)
        out, hidden = self.rnn(x, hidden)
        out2, hidden2 = self.rnn2(x2, hidden2)
        # print(out.size())  # (seq_len, batch, hidden_size)
        out = out[-1]
        out2 = out2[-1]
        # print(out.size())  # [1, 16]
        out = torch.cat((out, out2), dim=1)
        return F.log_softmax(self.linear(out), dim=1)

model = textRNN(args)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def match(test_batch):
    acc = 0
    with torch.no_grad():
        for instance, label in test_batch:
            inputs1 = [word_to_ix[word] for word in instance]
            inputs2 = copy.deepcopy(inputs1)
            inputs2.reverse()
            inputs1 = torch.LongTensor(inputs1).view(1, -1)
            inputs2 = torch.LongTensor(inputs2).view(1, -1)
            hidden = torch.zeros(1, 1, args['hidden_size'])
            hidden2 = torch.zeros(1, 1, args['hidden_size'])
            log_probs = model(hidden, hidden2, inputs1, inputs2)
            b = torch.argmax(log_probs, dim=1)
            if b[0] == label:
                acc += 1
    print('acc = %.4lf%%' % (acc / test_len * 100))

def train(batch_data, batch_size):
    model.zero_grad()
    for instance, label in batch_data:
        inputs1 = [word_to_ix[word] for word in instance]  # map each word to its index first
        inputs2 = copy.deepcopy(inputs1)
        inputs2.reverse()
        inputs1 = torch.LongTensor(inputs1).view(1, -1)
        inputs2 = torch.LongTensor(inputs2).view(1, -1)
        target = torch.LongTensor([label])
        hidden = torch.zeros(1, 1, args['hidden_size'])
        hidden2 = torch.zeros(1, 1, args['hidden_size'])
        log_probs = model(hidden, hidden2, inputs1, inputs2)
        loss = loss_function(log_probs, target) / batch_size
        loss.backward()
        print('    loss = %.4lf' % loss)
    optimizer.step()

# match(test_data)  # initial 11.7455%
for epoch in range(10):
    print('now in epoch %d...' % epoch)
    random.shuffle(train_data)
    for i in range(0, train_len, args['train_batch_size']):
        train(train_data[i: i + args['train_batch_size']], args['train_batch_size'])
    match(test_data)
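For the record, nn.RNN also supports bidirectional=True, which avoids keeping two RNNs and a hand-reversed copy of the input. A sketch of that variant (the class name textBiRNN is just for illustration; the results below come from the two-RNN version above, not this one):

# Sketch of the same idea using nn.RNN(bidirectional=True).
class textBiRNN(nn.Module):
    def __init__(self, args, pretrained_vec):
        super(textBiRNN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(
            torch.from_numpy(pretrained_vec).float(), freeze=False)
        self.rnn = nn.RNN(input_size=args['embedding_size'],
                          hidden_size=args['hidden_size'],
                          bidirectional=True)
        self.linear = nn.Linear(args['hidden_size'] * 2, args['type_num'])

    def forward(self, x):                      # x: (1, seq_len)
        x = self.embed(x).transpose(0, 1)      # (seq_len, 1, embedding_size)
        out, _ = self.rnn(x)                   # out: (seq_len, 1, 2 * hidden_size)
        # forward direction's last step + backward direction's first step
        feat = torch.cat((out[-1, :, :self.rnn.hidden_size],
                          out[0, :, self.rnn.hidden_size:]), dim=1)
        return F.log_softmax(self.linear(feat), dim=1)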

 

3.2. Results

So slow... but it does improve by about 1%: the best is acc = 65.33% at epoch = 8, which would be roughly 115/860 on that old leaderboard.

 

3.3. A small experiment

In "Reasoning about Entailment with Neural Attention" I saw an approach where words missing from the pretrained vocabulary are initialized to random vectors, the pretrained part is frozen, and only the randomly initialized part is fine-tuned. It felt like this might work better than mapping every unknown word to a single <unk> vector, so I gave it a try. The tricky part is that I did not know how to pull out the randomly initialized words without losing word order, so I brute-forced it by concatenating the embeddings token by token. The price, of course, is that it becomes unbearably slow. After 4 epochs the results were about the same as the unified <unk> approach, even slightly worse, so I stopped there. Truly "one cup of tea, one pack of cigarettes, one model running all day"; this section is just a record of the brute-force attempt.

The training and testing code is unchanged, so it is not repeated here. Another day of wrestling with shapes; some of the view calls are probably unnecessary, but I added them anyway just in case.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
import copy

read_data = pd.read_table('../train.tsv')
data = []
data_len = read_data.shape[0]
for i in range(data_len):
    data.append([read_data['Phrase'][i].lower().split(' '), read_data['Sentiment'][i]])

word_to_ix = {}  # assign an index to every word
ix_to_word = {}
unk_ix_to_ix = {}
word_set = set()
for sent, _ in data:
    for word in sent:
        if word not in word_to_ix:
            ix_to_word[len(word_to_ix)] = word
            word_to_ix[word] = len(word_to_ix)
            word_set.add(word)
unk = '<unk>'
ix_to_word[len(word_to_ix)] = unk
word_to_ix[unk] = len(word_to_ix)
word_set.add(unk)

torch.manual_seed(6)  # torch seed; affects parameter init and random_split below
train_len = int(0.8 * data_len)
test_len = data_len - train_len
train_data, test_data = random_split(data, [train_len, test_len])  # split the dataset
# print(type(train_data))  # torch.utils.data.dataset.Subset
train_data = list(train_data)
test_data = list(test_data)

# hyperparameter dict, convenient for tuning
args = {
    'vocab_size': len(word_to_ix),   # vocabulary size; Embedding needs it to build the lookup table
    'embedding_size': 50,            # dimensions (features) per word vector
    'hidden_size': 16,
    'type_num': 5,                   # number of classes
    'train_batch_size': int(train_len / 10),
    # 'test_batch_size': int(test_len / 10)
}

f = open('../glove.6B.50d.txt', 'r', encoding='utf-8')
line = f.readline()
glove_word2vec = {}
pretrained_vec = []
while line:
    line = line.split()
    word = line[0]
    if word in word_set:
        glove_word2vec[word] = [float(v) for v in line[1:]]
    line = f.readline()
unk_num = 0
for i in range(args['vocab_size']):
    if ix_to_word[i] in glove_word2vec:
        pretrained_vec.append(glove_word2vec[ix_to_word[i]])
    else:
        pretrained_vec.append(glove_word2vec[unk])
        unk_ix_to_ix[i] = unk_num   # map vocab index -> slot in the randomly initialized table
        unk_num += 1
print(unk_num, args['vocab_size'])
pretrained_vec = np.array(pretrained_vec)

train_len = int(int(train_len / args['train_batch_size']) * args['train_batch_size'])

class textRNN(nn.Module):
    def __init__(self, args):
        super(textRNN, self).__init__()
        self.vocab_size = args['vocab_size']
        self.embedding_size = args['embedding_size']
        self.hidden_size = args['hidden_size']
        self.type_num = args['type_num']
        self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embed_fine_tune.weight.requires_grad = False       # freeze the pretrained part
        self.embed_fine_tune_unk = nn.Embedding(unk_num, self.embedding_size)
        self.embed_fine_tune_unk.weight.requires_grad = True    # fine-tune the random part
        self.embed_fine_tune2 = nn.Embedding(self.vocab_size, self.embedding_size)
        self.embed_fine_tune2.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embed_fine_tune2.weight.requires_grad = False
        self.embed_fine_tune_unk2 = nn.Embedding(unk_num, self.embedding_size)
        self.embed_fine_tune_unk2.weight.requires_grad = True
        self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
        self.rnn2 = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
        self.linear = nn.Linear(self.hidden_size * 2, self.type_num)

    def forward(self, hidden, hidden2, input, input2):
        # look up the first token, then concatenate the rest one by one,
        # using the trainable table for words missing from GloVe
        if int(input[0][0]) in unk_ix_to_ix:
            x = self.embed_fine_tune_unk(torch.LongTensor([unk_ix_to_ix[int(input[0][0])]]).view(1))
        else:
            x = self.embed_fine_tune(input[0][0].view(1)).view(1, -1)
        # print(x.size())
        for word in input[0][1:]:
            if int(word) in unk_ix_to_ix:
                x = torch.cat((x, self.embed_fine_tune_unk(torch.LongTensor([unk_ix_to_ix[int(word)]])).view(1, -1)), dim=0)
            else:
                x = torch.cat((x, self.embed_fine_tune(word.view(1)).view(1, -1)), dim=0)
        if int(input2[0][0]) in unk_ix_to_ix:
            x2 = self.embed_fine_tune_unk2(torch.LongTensor([unk_ix_to_ix[int(input2[0][0])]]).view(1))
        else:
            x2 = self.embed_fine_tune2(input2[0][0].view(1)).view(1, -1)
        for word in input2[0][1:]:
            if int(word) in unk_ix_to_ix:
                x2 = torch.cat((x2, self.embed_fine_tune_unk2(torch.LongTensor([unk_ix_to_ix[int(word)]])).view(1, -1)), dim=0)
            else:
                # print(word.size())
                x2 = torch.cat((x2, self.embed_fine_tune2(word.view(1)).view(1, -1)), dim=0)
        # print(x.size())
        x = x.view(1, x.shape[0], x.shape[1])
        x2 = x2.view(1, x2.shape[0], x2.shape[1])
        x = x.transpose(0, 1)
        x2 = x2.transpose(0, 1)
        out, hidden = self.rnn(x, hidden)
        out2, hidden2 = self.rnn2(x2, hidden2)
        # print(out.size())  # (seq_len, batch, hidden_size)
        out = out[-1]
        out2 = out2[-1]
        # print(out.size())  # [1, 16]
        out = torch.cat((out, out2), dim=1)
        return F.log_softmax(self.linear(out), dim=1)

...
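A likely faster way to get the same "freeze the GloVe words, fine-tune the rest" effect without concatenating token by token would be to keep two embedding tables and select between them with a mask. This is only a sketch reusing pretrained_vec and unk_ix_to_ix from above; I have not run it against the results reported here:

# Sketch: frozen embeddings for in-GloVe words, trainable embeddings for the rest,
# selected with a mask instead of a per-token cat loop. Untested here.
class PartiallyFrozenEmbedding(nn.Module):
    def __init__(self, pretrained_vec, unk_ix_to_ix, embedding_size):
        super(PartiallyFrozenEmbedding, self).__init__()
        vocab_size = pretrained_vec.shape[0]
        self.frozen = nn.Embedding.from_pretrained(
            torch.from_numpy(pretrained_vec).float(), freeze=True)
        self.trainable = nn.Embedding(len(unk_ix_to_ix), embedding_size)
        # map each vocab index to its slot in the trainable table (0 if not OOV)
        remap = torch.zeros(vocab_size, dtype=torch.long)
        is_oov = torch.zeros(vocab_size, dtype=torch.bool)
        for ix, unk_ix in unk_ix_to_ix.items():
            remap[ix] = unk_ix
            is_oov[ix] = True
        self.register_buffer('remap', remap)
        self.register_buffer('is_oov', is_oov)

    def forward(self, x):                       # x: (batch, seq_len) of vocab indices
        mask = self.is_oov[x].unsqueeze(-1)     # (batch, seq_len, 1)
        return torch.where(mask, self.trainable(self.remap[x]), self.frozen(x))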

 
