相关原理可参见:文本表示(一)—— word2vec(skip-gram、CBOW)、GloVe、Transformer、BERT
其它相关链接:
文本表示(三)—— fastText 词向量调用代码
二话不说,直接上代码
import pandas as pd
import jieba
import re
from gensim.models import Word2Vec

embedding_size = 200  # dimensionality of the trained word vectors
iters = 10  # number of training passes over the corpus
min_count = 5  # words seen fewer times than this are dropped from the vocabulary
model_file = 'word2vec_{}'.format(embedding_size)

# Sentence delimiters: newline plus sentence-ending punctuation. Both the
# full-width (Chinese) and ASCII forms are listed so either style ends a
# sentence. A character class is used instead of a capturing group so that
# re.split() does not hand the delimiters back as separate "sentences", and
# the trailing `+` collapses runs of delimiters into a single split point.
splitter = r'[。!?;!?;\n]+'
_sentence_re = re.compile(splitter)

corpus_file = '/data/hanxuhong/nlp/anhuanyuan/data/hse语料.csv'
text_column = '文本描述'


# Corpus loading
class TextLoader(object):
    """Re-iterable corpus yielding one jieba-tokenised sentence per item.

    Word2Vec walks the corpus several times (once to build the vocabulary and
    once per epoch), so the object must support repeated iteration. The CSV
    column is read on first use and cached, so later passes do not re-parse
    the file.

    Args:
        file_name: Path of the CSV file holding the raw text.
        column: Name of the CSV column that contains the text.
    """

    def __init__(self, file_name=corpus_file, column=text_column):
        self.file_name = file_name
        self.column = column
        self._texts = None  # filled lazily by _load_texts()

    def _load_texts(self):
        """Return the non-null cells of the text column as a list of str."""
        if self._texts is None:
            df = pd.read_csv(self.file_name, sep=',', encoding='utf-8')
            # dropna() removes missing cells directly instead of comparing
            # their string form against 'nan'.
            self._texts = df[self.column].dropna().astype(str).tolist()
        return self._texts

    def __iter__(self):
        """Yield each non-empty sentence as a list of jieba tokens."""
        for text in self._load_texts():
            for sentence in _sentence_re.split(text):
                sentence = sentence.strip()
                if not sentence:
                    # Text that starts or ends with a delimiter produces
                    # empty pieces; they carry no training signal.
                    continue
                yield list(jieba.cut(sentence))


corpus = TextLoader()
# sg=1 selects the skip-gram architecture.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs`.
model = Word2Vec(corpus, workers=20, size=embedding_size, sg=1, iter=iters, min_count=min_count)
model.save(model_file)