import io
import os
from gluonnlp.data.dataset import Dataset
def line_splitter(s):
"""Split a string at newlines.
按行分割字符串
Parameters
----------
s : str
The string to be split
Returns
--------
List[str]
List of strings. Obtained by calling s.splitlines().
"""
return s.splitlines()
def whitespace_splitter(s):
"""Split a string at whitespace (space, tab, newline, return, formfeed).
按空白格分割字符串
Parameters
----------
s : str
The string to be split
Returns
--------
List[str]
List of strings. Obtained by calling s.split().
"""
return s.split()
def wordtoword_splitter(s):
"""按字分割"""
return list(s)
def _corpus_dataset_process(s, bos, eos):
"""字符串前后是否加bos,eos标识符"""
tokens = [bos] if bos else []
tokens.extend(s)
if eos:
tokens.append(eos)
return tokens
def concat_sequence(sequences):
"""Concatenate sequences of tokens into a single flattened list of tokens.
拼接列表中的列表变成一个列表,即多个句子拼接成一个句子
Parameters
----------
sequences : list of list of object
Sequences of tokens, each of which is an iterable of tokens.
Returns
-------
Flattened list of tokens.
"""
return [token for seq in sequences for token in seq if token]
class SimpleDataset(Dataset):
"""Simple Dataset wrapper for lists and arrays.
Parameters
----------
data : dataset-like object
Any object that implements `len()` and `[]`.
"""
def __init__(self, data):
self._data = data
def __len__(self):
return len(self._data)
def __getitem__(self, idx):
return self._data[idx]
class CorpusDataset(SimpleDataset):
"""Common text dataset that reads a whole corpus based on provided sample splitter
and word tokenizer.
The returned dataset includes samples, each of which can either be a list of tokens if tokenizer
is specified, or otherwise a single string segment produced by the sample_splitter.
Parameters
----------
filename : str or list of str
Path to the input text file or list of paths to the input text files.
encoding : str, default 'utf8'
File encoding format.
    flatten : bool, default False
        Whether to return all samples as flattened tokens. If True, each sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and
        `eos` will be added in empty samples.
sample_splitter : function, default str.splitlines
A function that splits the dataset string into samples.
tokenizer : function or None, default str.split
A function that splits each sample string into list of tokens. If None, raw samples are
returned according to `sample_splitter`.
bos : str or None, default None
The token to add at the beginning of each sequence. If None, or if tokenizer is not
specified, then nothing is added.
eos : str or None, default None
The token to add at the end of each sequence. If None, or if tokenizer is not
specified, then nothing is added.
"""
def __init__(self, filename, encoding='utf8', flatten=False, skip_empty=True,
sample_splitter=line_splitter, tokenizer=whitespace_splitter,
bos=None, eos=None):
assert sample_splitter, 'sample_splitter must be specified.'
if not isinstance(filename, (tuple, list)):
filename = (filename,)
self._filenames = [os.path.expanduser(f) for f in filename]
self._encoding = encoding
self._flatten = flatten
self._skip_empty = skip_empty
self._sample_splitter = sample_splitter
self._tokenizer = tokenizer
self._bos = bos
self._eos = eos
super(CorpusDataset, self).__init__(self._read())
def _read(self):
all_samples = []
for filename in self._filenames:
with io.open(filename, 'r', encoding=self._encoding) as fin:
content = fin.read()
samples = (s.strip() for s in self._sample_splitter(content))
if self._tokenizer:
samples = [
_corpus_dataset_process(self._tokenizer(s), self._bos, self._eos)
for s in samples if s or not self._skip_empty
]
if self._flatten:
samples = concat_sequence(samples)
elif self._skip_empty:
samples = [s for s in samples if s]
all_samples += samples
return all_samples
if __name__ == '__main__':
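    # A minimal sketch exercising the three splitter helpers on inline
    # strings (the sample text here is made up for illustration):
    text = 'hello world\nfoo bar'
    print(line_splitter(text))        # ['hello world', 'foo bar']
    print(whitespace_splitter(text))  # ['hello', 'world', 'foo', 'bar']
    print(wordtoword_splitter('北京'))  # ['北', '京']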
sd = SimpleDataset([1, 2, 3])
print(sd[0])
print(list(sd))
    d = []
    sd = SimpleDataset([[1, 2, 3], [1, 2, 3]])
    print('sd[0]:', sd[0])
    d.extend(sd)  # extend() iterates the dataset, copying each sample out into the plain list
    print('d:', d)
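    # A short sketch of the two private helpers that CorpusDataset._read
    # combines (the token values below are made up for illustration):
    print(_corpus_dataset_process(['a', 'b'], '<bos>', '<eos>'))  # ['<bos>', 'a', 'b', '<eos>']
    print(_corpus_dataset_process(['a', 'b'], None, None))        # ['a', 'b']
    print(concat_sequence([['a', 'b'], [], ['c']]))  # ['a', 'b', 'c'] -- falsy tokens are dropped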
    cd = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter)
    print(list(cd))
    cd1 = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter, flatten=True, eos='<end>')
    print(list(cd1))
    # Expected output of the SimpleDataset examples above:
    #   1
    #   [1, 2, 3]
    #   sd[0]: [1, 2, 3]
    #   d: [[1, 2, 3], [1, 2, 3]]
    # The two CorpusDataset printouts depend on the contents of testcorpus.txt:
    # with wordtoword_splitter each sample is a list of single characters, and
    # with flatten=True and eos='<end>' all samples are joined into one flat
    # token list with '<end>' closing each sentence.
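    # A self-contained CorpusDataset example that does not rely on
    # testcorpus.txt existing: write a small temporary corpus first
    # (the file contents are made up for illustration):
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='utf8') as tmp:
        tmp.write('hello world\nfoo bar\n')
        tmp_path = tmp.name
    cd2 = CorpusDataset(tmp_path, flatten=True, bos='<bos>', eos='<eos>')
    print(list(cd2))
    # ['<bos>', 'hello', 'world', '<eos>', '<bos>', 'foo', 'bar', '<eos>']
    os.remove(tmp_path)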