https://github.com/FudanNLP/nlp-beginner
1. Recap
I implemented textCNN earlier, but the results were not great, presumably because train_data was being sampled randomly one example at a time. I had never figured out how to take one descent step over a group of samples; only today did I sort out the relationship between epoch and batch: PyTorch's gradient accumulation mechanism can be used to do mini-batch gradient descent on variable-length data. So the textCNN from the previous post was just run for fun, and I'll go back and redo it when I have time. Also note that an epoch only counts as finished once every sample has been processed...
So the mini-batch setup used here is
train_data_number = 156060 * 0.8 = 124848
iter_num = 10
batch_size = int(train_data_number / iter_num) = 12484
epoch = 10
update_num = epoch * iter_num = 100
The number of updates thus drops to 100. Previously a single epoch meant 124848 updates, so the model presumably charged head-first into some local minimum... no wonder the results were so poor. A minimal sketch of the gradient-accumulation idea is given below.
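The sketch only illustrates the accumulation pattern; model, loss_function, optimizer and batch_data are placeholder names here, not the objects defined later in this post.

def train_one_batch(model, loss_function, optimizer, batch_data):
    # batch_data: list of (input, target) pairs; the inputs may have different lengths
    model.zero_grad()  # clear accumulated gradients once per batch
    for x, y in batch_data:
        loss = loss_function(model(x), y) / len(batch_data)  # scale so the accumulated gradient is a batch average
        loss.backward()  # gradients from each sample add up in .grad
    optimizer.step()  # a single parameter update per batch, not per sample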
2. textRNN
Reference
The word vectors are initialized directly from glove.6B.50d.txt in GloVe's glove.6B.zip.
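As a side note, loading a pretrained matrix into an embedding layer can also be done in one step with nn.Embedding.from_pretrained; a minimal sketch with a random placeholder matrix (the actual GloVe loading code is in the script below):

import torch
import torch.nn as nn

pretrained = torch.randn(100, 50)  # placeholder for a (vocab_size, embedding_size) GloVe matrix
embed = nn.Embedding.from_pretrained(pretrained, freeze=False)  # freeze=False keeps the vectors trainable
print(embed(torch.LongTensor([0, 1, 2])).shape)  # torch.Size([3, 50])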
2.1. Code
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
import copy
# read the data, same as before
read_data = pd.read_table('../train.tsv')
data = []
data_len = read_data.shape[0]
for i in range(data_len):
data.append([read_data['Phrase'][i].lower().split(' '), read_data['Sentiment'][i]])
word_to_ix = {} # assign an index to each word
ix_to_word = {}
word_set = set()
for sent, _ in data:
for word in sent:
if word not in word_to_ix:
ix_to_word[len(word_to_ix)] = word
word_to_ix[word] = len(word_to_ix)
word_set.add(word)
unk = '<unk>'
ix_to_word[len(word_to_ix)] = unk
word_to_ix[unk] = len(word_to_ix)
word_set.add(unk)
torch.manual_seed(6) # set torch's seed; it affects the parameter initialization and random_split below
train_len = int(0.8 * data_len)
test_len = data_len - train_len
train_data, test_data = random_split(data, [train_len, test_len]) # split the dataset
# print(type(train_data)) # torch.utils.data.dataset.Subset
train_data = list(train_data)
test_data = list(test_data)
# hyperparameter dict, to make tuning easier
args = {
'vocab_size': len(word_to_ix), # vocabulary size; the embedding layer needs it to build the word vectors
'embedding_size': 50, # dimensionality (number of features) of each word vector
'hidden_size': 16,
'type_num': 5, # number of classes
'train_batch_size': int(train_len / 10),
# 'test_batch_size': int(test_len / 10)
}
f = open('../glove.6B.50d.txt', 'r', encoding='utf-8')
line = f.readline()
glove_word2vec = {}
pretrained_vec = []
while line:
line = line.split()
word = line[0]
if word in word_set:
glove_word2vec[word] = [float(v) for v in line[1:]]
line = f.readline()
unk_num = 0
for i in range(args['vocab_size']):
if ix_to_word[i] in glove_word2vec:
pretrained_vec.append(glove_word2vec[ix_to_word[i]])
else:
pretrained_vec.append(glove_word2vec[unk])
unk_num += 1
print(unk_num, args['vocab_size'])
pretrained_vec = np.array(pretrained_vec)
train_len = int(int(train_len / args['train_batch_size']) * args['train_batch_size']) # drop the trailing samples that do not fill a whole batch; keeping them distorts the loss quite a bit, e.g. the first 10 batches sit around the 1e-4 level and the final partial batch jumps straight to 1e-2
class textRNN(nn.Module):
def __init__(self, args):
super(textRNN, self).__init__()
self.vocab_size = args['vocab_size']
self.embedding_size = args['embedding_size']
self.hidden_size = args['hidden_size']
self.type_num = args['type_num']
self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
self.embed_fine_tune.weight.requires_grad = True # fine-tune the pretrained vectors
self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size) # essentially a self.embedding_size * self.hidden_size matrix
self.linear = nn.Linear(self.hidden_size, self.type_num) # fully connected layer
def forward(self, hidden, x):
x = self.embed_fine_tune(x)
x = x.transpose(0, 1) # per the reference, PyTorch's RNN expects batch_size in the second dimension; here it is always 1 anyway
out, hidden = self.rnn(x, hidden)
# print(out.size()) # seq_len * batch_size * hidden_size
out = out[-1] # only use the last time step's output for classification
# print(out.size()) # [1, 16]
return F.log_softmax(self.linear(out), dim=1)
model = textRNN(args)
loss_function = nn.NLLLoss() # the model already outputs log-probabilities, so NLLLoss is the matching criterion
optimizer = optim.Adam(model.parameters(), lr=0.01)
def match(test_batch):
acc = 0
with torch.no_grad():
for instance, label in test_batch:
inputs1 = [word_to_ix[word] for word in instance]
inputs1 = torch.LongTensor(inputs1).view(1, -1)
hidden = torch.zeros(1, 1, args['hidden_size'])
log_probs = model(hidden, inputs1)
b = torch.argmax(log_probs, dim=1)
if b[0] == label:
acc += 1
print('acc = %.4lf%%' % (acc / test_len * 100))
def train(batch_data, batch_size):
model.zero_grad()
for instance, label in batch_data:
inputs1 = [word_to_ix[word] for word in instance] # each word must first be converted to its index
inputs1 = torch.LongTensor(inputs1).view(1, -1)
target = torch.LongTensor([label])
hidden = torch.zeros(1, 1, args['hidden_size'])
log_probs = model(hidden, inputs1)
loss = loss_function(log_probs, target) / batch_size # divide each sample's loss by batch_size so the accumulated gradient averages over the batch
loss.backward()
print(' loss = %.4lf' % loss) # watch the loss, although it mostly just oscillates around the 1e-4 level
optimizer.step() # moved outside the sample loop: take one descent step after the whole batch has accumulated gradients
# match(test_data) # initial accuracy 11.7455%
for epoch in range(10):
print('now in epoch %d...' % epoch)
random.shuffle(train_data) # reshuffle the training data every epoch
for i in range(0, train_len, args['train_batch_size']):
train(train_data[i: i + args['train_batch_size']], args['train_batch_size'])
match(test_data)
2.2. Results
Running on the full training set is indeed a bit slow, but accuracy finally breaks the 60% barrier: the best is acc = 64.39% at epoch = 8. That would just squeeze into 173/860 on the leaderboard from six years ago (sadly, still thrashed by people from six years back), although this is measured on my held-out pseudo test set.
3. textBiRNN
3.1. Code (model + training + testing)
class textRNN(nn.Module):
def __init__(self, args):
super(textRNN, self).__init__()
self.vocab_size = args['vocab_size']
self.embedding_size = args['embedding_size']
self.hidden_size = args['hidden_size']
self.type_num = args['type_num']
self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
self.embed_fine_tune.weight.requires_grad = True # fine-tune the forward embedding
self.embed_fine_tune2 = nn.Embedding(self.vocab_size, self.embedding_size)
self.embed_fine_tune2.weight.data.copy_(torch.from_numpy(pretrained_vec))
self.embed_fine_tune2.weight.requires_grad = True # fine-tune the backward embedding
self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
self.rnn2 = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
self.linear = nn.Linear(self.hidden_size * 2, self.type_num)
def forward(self, hidden, hidden2, x, x2):
x = self.embed_fine_tune(x)
x = x.transpose(0, 1)
x2 = self.embed_fine_tune2(x2)
x2 = x2.transpose(0, 1)
out, hidden = self.rnn(x, hidden)
out2, hidden2 = self.rnn2(x2, hidden2)
# print(out.size()) # seq_len * batch_size * hidden_size
out = out[-1]
out2 = out2[-1]
# print(out.size()) # [1, 16]
out = torch.cat((out, out2), dim=1)
return F.log_softmax(self.linear(out), dim=1)
model = textRNN(args)
loss_function = nn.NLLLoss() # the model already outputs log-probabilities, so NLLLoss is the matching criterion
optimizer = optim.Adam(model.parameters(), lr=0.01)
def match(test_batch):
acc = 0
with torch.no_grad():
for instance, label in test_batch:
inputs1 = [word_to_ix[word] for word in instance]
inputs2 = copy.deepcopy(inputs1)
inputs2.reverse()
inputs1 = torch.LongTensor(inputs1).view(1, -1)
inputs2 = torch.LongTensor(inputs2).view(1, -1)
hidden = torch.zeros(1, 1, args['hidden_size'])
hidden2 = torch.zeros(1, 1, args['hidden_size'])
log_probs = model(hidden, hidden2, inputs1, inputs2)
b = torch.argmax(log_probs, dim=1)
if b[0] == label:
acc += 1
print('acc = %.4lf%%' % (acc / test_len * 100))
def train(batch_data, batch_size):
model.zero_grad()
for instance, label in batch_data:
inputs1 = [word_to_ix[word] for word in instance] # each word must first be converted to its index
inputs2 = copy.deepcopy(inputs1)
inputs2.reverse()
inputs1 = torch.LongTensor(inputs1).view(1, -1)
inputs2 = torch.LongTensor(inputs2).view(1, -1)
target = torch.LongTensor([label])
hidden = torch.zeros(1, 1, args['hidden_size'])
hidden2 = torch.zeros(1, 1, args['hidden_size'])
log_probs = model(hidden, hidden2, inputs1, inputs2)
loss = loss_function(log_probs, target) / batch_size
loss.backward()
print(' loss = %.4lf' % loss)
optimizer.step()
# match(test_data) # initial accuracy 11.7455%
for epoch in range(10):
print('now in epoch %d...' % epoch)
random.shuffle(train_data) # reshuffle the training data every epoch
for i in range(0, train_len, args['train_batch_size']):
train(train_data[i: i + args['train_batch_size']], args['train_batch_size'])
match(test_data)
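As an aside, the same structure can be expressed with PyTorch's built-in bidirectional mode instead of reversing the input by hand and running two separate RNNs. A minimal sketch using the args dict from above (this is not the code the results below were produced with):

rnn = nn.RNN(input_size=args['embedding_size'], hidden_size=args['hidden_size'], bidirectional=True)
linear = nn.Linear(args['hidden_size'] * 2, args['type_num'])
x = torch.randn(7, 1, args['embedding_size'])  # (seq_len, batch, embedding_size), batch = 1 as above
h0 = torch.zeros(2, 1, args['hidden_size'])  # 2 = num_layers * num_directions
out, hn = rnn(x, h0)  # out: (seq_len, batch, 2 * hidden_size)
feat = torch.cat((hn[0], hn[1]), dim=1)  # final forward and backward hidden states, (batch, 2 * hidden_size)
log_probs = F.log_softmax(linear(feat), dim=1)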
3.2. Results
Quite slow... but roughly a 1% gain: the best is acc = 65.33% at epoch = 8, which would be 115/860 on the old leaderboard.
3.3. A small experiment
In "Reasoning about Entailment with Neural Attention" I saw a scheme where words missing from the pretrained vocabulary are initialized to random vectors, the pretrained part is frozen, and only the randomly initialized part is fine-tuned. It felt like this could work better than mapping everything to a single <unk> vector, so I gave it a try. The tricky part is that I did not know how to pick out the randomly initialized tokens without losing the word order, so here I simply brute-force cat the embeddings together token by token. The price, of course, is that it is painfully slow. Over 4 epochs the results were about the same as the unified <unk> approach, even a touch worse, so I did not keep running it. Truly a case of "a cup of tea, a pack of cigarettes, one model runs all day". This is just a record of the brute-force approach; see the note after the code for a vectorized alternative.
The training and testing parts are unchanged, so I won't paste them again. It was another day of battling tensor shapes; some of the view calls are actually unnecessary, but they stay in for now.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import random
import copy
read_data = pd.read_table('../train.tsv')
data = []
data_len = read_data.shape[0]
for i in range(data_len):
data.append([read_data['Phrase'][i].lower().split(' '), read_data['Sentiment'][i]])
word_to_ix = {} # assign an index to each word
ix_to_word = {}
unk_ix_to_ix = {}
word_set = set()
for sent, _ in data:
for word in sent:
if word not in word_to_ix:
ix_to_word[len(word_to_ix)] = word
word_to_ix[word] = len(word_to_ix)
word_set.add(word)
unk = '<unk>'
ix_to_word[len(word_to_ix)] = unk
word_to_ix[unk] = len(word_to_ix)
word_set.add(unk)
torch.manual_seed(6) # set torch's seed; it affects the parameter initialization and random_split below
train_len = int(0.8 * data_len)
test_len = data_len - train_len
train_data, test_data = random_split(data, [train_len, test_len]) # split the dataset
# print(type(train_data)) # torch.utils.data.dataset.Subset
train_data = list(train_data)
test_data = list(test_data)
# hyperparameter dict, to make tuning easier
args = {
'vocab_size': len(word_to_ix), # vocabulary size; the embedding layer needs it to build the word vectors
'embedding_size': 50, # dimensionality (number of features) of each word vector
'hidden_size': 16,
'type_num': 5, # number of classes
'train_batch_size': int(train_len / 10),
# 'test_batch_size': int(test_len / 10)
}
f = open('../glove.6B.50d.txt', 'r', encoding='utf-8')
line = f.readline()
glove_word2vec = {}
pretrained_vec = []
while line:
line = line.split()
word = line[0]
if word in word_set:
glove_word2vec[word] = [float(v) for v in line[1:]]
line = f.readline()
unk_num = 0
for i in range(args['vocab_size']):
if ix_to_word[i] in glove_word2vec:
pretrained_vec.append(glove_word2vec[ix_to_word[i]])
else:
pretrained_vec.append(glove_word2vec[unk])
unk_ix_to_ix[i] = unk_num
unk_num += 1
print(unk_num, args['vocab_size'])
pretrained_vec = np.array(pretrained_vec)
train_len = int(int(train_len / args['train_batch_size']) * args['train_batch_size'])
class textRNN(nn.Module):
def __init__(self, args):
super(textRNN, self).__init__()
self.vocab_size = args['vocab_size']
self.embedding_size = args['embedding_size']
self.hidden_size = args['hidden_size']
self.type_num = args['type_num']
self.embed_fine_tune = nn.Embedding(self.vocab_size, self.embedding_size)
self.embed_fine_tune.weight.data.copy_(torch.from_numpy(pretrained_vec))
self.embed_fine_tune.weight.requires_grad = False # freeze the pretrained vectors
self.embed_fine_tune_unk = nn.Embedding(unk_num, self.embedding_size)
self.embed_fine_tune_unk.weight.requires_grad = True # fine-tune the randomly initialized OOV vectors
self.embed_fine_tune2 = nn.Embedding(self.vocab_size, self.embedding_size)
self.embed_fine_tune2.weight.data.copy_(torch.from_numpy(pretrained_vec))
self.embed_fine_tune2.weight.requires_grad = False # freeze the pretrained vectors
self.embed_fine_tune_unk2 = nn.Embedding(unk_num, self.embedding_size)
self.embed_fine_tune_unk2.weight.requires_grad = True # fine-tune the randomly initialized OOV vectors
self.rnn = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
self.rnn2 = nn.RNN(input_size=self.embedding_size, hidden_size=self.hidden_size)
self.linear = nn.Linear(self.hidden_size * 2, self.type_num)
def forward(self, hidden, hidden2, input, input2):
if int(input[0][0]) in unk_ix_to_ix:
x = self.embed_fine_tune_unk(torch.LongTensor([unk_ix_to_ix[int(input[0][0])]]).view(1))
else:
x = self.embed_fine_tune(input[0][0].view(1)).view(1, -1)
# print(x.size())
for word in input[0][1:]:
if int(word) in unk_ix_to_ix:
x = torch.cat((x, self.embed_fine_tune_unk(torch.LongTensor([unk_ix_to_ix[int(word)]])).view(1, -1)), dim=0)
else:
x = torch.cat((x, self.embed_fine_tune(word.view(1)).view(1, -1)), dim=0)
if int(input2[0][0]) in unk_ix_to_ix:
x2 = self.embed_fine_tune_unk2(torch.LongTensor([unk_ix_to_ix[int(input2[0][0])]]).view(1))
else:
x2 = self.embed_fine_tune2(input2[0][0].view(1)).view(1, -1)
for word in input2[0][1:]:
if int(word) in unk_ix_to_ix:
x2 = torch.cat((x2, self.embed_fine_tune_unk2(torch.LongTensor([unk_ix_to_ix[int(word)]])).view(1, -1)), dim=0)
else:
# print(word.size())
x2 = torch.cat((x2, self.embed_fine_tune2(word.view(1)).view(1, -1)), dim=0)
# print(x.size())
x = x.view(1, x.shape[0], x.shape[1])
x2 = x2.view(1, x2.shape[0], x2.shape[1])
x = x.transpose(0, 1)
x2 = x2.transpose(0, 1)
out, hidden = self.rnn(x, hidden)
out2, hidden2 = self.rnn2(x2, hidden2)
# print(out.size()) # seq_len * batch_size * hidden_size
out = out[-1]
out2 = out2[-1]
# print(out.size()) # [1, 16]
out = torch.cat((out, out2), dim=1)
return F.log_softmax(self.linear(out), dim=1)
...
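As a follow-up on the difficulty mentioned above (picking out the randomly initialized tokens without losing word order): one vectorized alternative is to look up both embedding tables for the whole sequence and select per position with a boolean mask. This is only a sketch under the assumptions spelled out in the comments; unk_index and mixed_embed are hypothetical names, not part of the code above.

# Assumptions for this sketch (all hypothetical, mirroring the setup above):
#   unk_index: LongTensor of shape (vocab_size,); for OOV words it holds the row in the
#              unk table (built from unk_ix_to_ix), for pretrained words it holds -1
#   embed_pre: nn.Embedding(vocab_size, embedding_size) with weight.requires_grad = False
#   embed_unk: nn.Embedding(unk_num, embedding_size), trainable
def mixed_embed(tokens, unk_index, embed_pre, embed_unk):
    # tokens: LongTensor of shape (1, seq_len) holding vocabulary indices, order preserved
    is_unk = (unk_index[tokens] >= 0).unsqueeze(-1)  # (1, seq_len, 1) boolean mask
    pre = embed_pre(tokens)  # (1, seq_len, embedding_size), frozen pretrained vectors
    unk = embed_unk(unk_index[tokens].clamp(min=0))  # (1, seq_len, embedding_size), trainable vectors
    return torch.where(is_unk, unk, pre)  # per-position choice keeps the word order intact

The rest of the forward pass could then proceed exactly as in the plain textBiRNN, with no per-token Python loop.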