Data Preprocessing
1. Reading the text data (sentiment analysis)
# Join the current working directory with the folder that holds the data
# to get the path of the raw dataset
import os

data_dir = os.path.join(os.getcwd(), 'aclImdb')
data_dir
## Read the data
def read_data(data_dir, is_train=True):
    # data stores the review texts
    # labels stores each review's label: 1 = positive, 0 = negative
    data = []
    labels = []
    for label in ('neg', 'pos'):
        # Join data_dir, 'train' (or 'test'), and the label folder to get the
        # directory that holds the 12,500 text files of that class
        folder_dir = os.path.join(data_dir, 'train' if is_train else 'test', label)
        # Iterate over every text file
        for file in os.listdir(folder_dir):
            with open(os.path.join(folder_dir, file), 'r', encoding='utf-8') as f:
                # read() reads the whole file at once and returns one str, not a list;
                #   this fits here because each file stores its review on a single line
                # readline() reads one line of the file at a time and returns a str
                # readlines() reads the whole file, one str per line, returning a list
                #   (contrast with read())
                sentence = f.read().replace('\n', '')
                data.append(sentence)
                labels.append(1 if label == 'pos' else 0)
    return data, labels
train_data = read_data(data_dir, is_train=True)
print('Number of training examples:', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label:', y, 'review:', x[0:60])
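As a quick sanity check (an optional sketch, not part of the original notes; it assumes the (data, labels) pair returned by read_data above), the label distribution can be inspected with collections.Counter — the full IMDb training split should hold 12,500 reviews per class:

import collections

# Count how many negative (0) and positive (1) labels were read in
print(collections.Counter(train_data[1]))  # e.g. Counter({0: 12500, 1: 12500})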
2. Preprocessing the dataset
(1) First convert train_data into token form:
[
    ['I', 'like', 'you', ...],
    ['I', 'hate', 'you', ...],
    ...
]
A nested list: each inner list holds the tokens of one review.
def tokenize(lines, token='word'):
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('Unknown token type')

train_tokens = tokenize(train_data[0])
train_tokens[0]
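A tiny illustration of the two tokenization modes (the toy sentences below are made up for this example, not taken from the IMDb data):

toy = ['I like you', 'I hate you']
print(tokenize(toy))                   # [['I', 'like', 'you'], ['I', 'hate', 'you']]
print(tokenize(['hi'], token='char'))  # [['h', 'i']]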
(2) Build a Vocab class that turns the tokens of each review into a vocabulary containing token2id, id2token, and token_freq.
This helper function is used inside the Vocab class: it counts how often every token appears and sorts the counts from most to least frequent.
import collections

def count_corpus(tokens):
    # If tokens is nested, e.g. [['I', 'like', 'you'], [...], ...],
    # flatten it into a single list of tokens first
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    # Count every token and return the counts sorted from most to least frequent
    return collections.Counter(tokens).most_common()
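For example, on a toy nested input (tokens with equal counts keep their first-seen order):

print(count_corpus([['I', 'like', 'you'], ['I', 'hate', 'you']]))
# [('I', 2), ('you', 2), ('like', 1), ('hate', 1)]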
# Build a Vocab class that turns tokens into a vocabulary with token2id, id2token, token_freq
class Vocab:
    # min_freq: tokens that occur less often than this are dropped
    # reversed_tokens: reserved tokens (e.g. '<pad>') placed at the front of id2token
    def __init__(self, tokens=None, min_freq=0, reversed_tokens=None):
        if tokens is None:
            tokens = []
        if reversed_tokens is None:
            reversed_tokens = []
        self._tokens_freq = count_corpus(tokens)
        # Initialize the id2token list and the token2id dict
        self.id2token = ['<UNK>'] + reversed_tokens
        self.token2id = {token: id for id, token in enumerate(self.id2token)}
        # Append tokens in the order given by self._tokens_freq (sorted by frequency)
        for token, freq in self._tokens_freq:
            if freq < min_freq:
                break
            else:
                self.id2token.append(token)
                self.token2id[token] = len(self.id2token) - 1

    def __len__(self):
        return len(self.id2token)

    # __getitem__ runs when vocab[...] is used; by default it maps tokens to ids
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            # Return the token's id if it exists, otherwise self.unk = 0
            return self.token2id.get(tokens, self.unk)
        # If the input is a list, recurse so every token in it is mapped to an id
        return [self.__getitem__(token) for token in tokens]

    # __getitem__ above is the shorthand for token -> id; to_tokens below does id -> token
    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.id2token[indices]
        return [self.id2token[indice] for indice in indices]

    # @property lets these be accessed without parentheses, e.g. vocab.unk returns 0
    @property
    def unk(self):
        return 0

    @property
    def token_freq(self):
        return self._tokens_freq

vocab = Vocab(train_tokens, min_freq=5, reversed_tokens=['<pad>'])
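A quick look at how the resulting vocab behaves (the exact ids and counts depend on the corpus, so the values hinted at in the comments are only illustrative):

print(len(vocab))                 # vocabulary size
print(vocab.token_freq[:3])       # three most frequent tokens and their counts
print(vocab.unk, vocab['<pad>'])  # 0 and 1: '<UNK>' and the reserved '<pad>' sit at the front
ids = vocab[['this', 'movie', 'averyrareword']]
print(ids)                        # unknown / low-frequency tokens map to 0
print(vocab.to_tokens(ids))       # map the ids back to tokens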
(3) Turn the input into the [batch_size, seq_len] shape the model expects
import matplotlib.pyplot as plt

plt.hist([len(line) for line in train_tokens], bins=range(0, 1000, 50))
The sequence lengths have a fat tail: most reviews are shorter than 500 tokens, so we truncate sequences longer than 500 and pad the shorter ones.
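To put a number on "most are within 500" (an optional check, not in the original notes):

lengths = [len(line) for line in train_tokens]
share = sum(l <= 500 for l in lengths) / len(lengths)
print(f'{share:.1%} of the training reviews have at most 500 tokens')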
def truncate_pad(line, num_steps, padding_token):
    # Truncate if longer than num_steps, otherwise pad up to num_steps
    if len(line) > num_steps:
        return line[:num_steps]
    return line + [padding_token] * (num_steps - len(line))
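Its behaviour on toy inputs:

print(truncate_pad([1, 2, 3], 5, 0))           # [1, 2, 3, 0, 0]  (padded)
print(truncate_pad([1, 2, 3, 4, 5, 6], 5, 0))  # [1, 2, 3, 4, 5]  (truncated)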
import torch

# vocab[line] goes through Vocab.__getitem__, which looks every token of line up in token2id
train_features = torch.tensor([truncate_pad(vocab[line], 500, vocab['<pad>'])
                               for line in train_tokens])
train_features.size()
This gives a [number of examples, seq_len] tensor: the first dimension has not been split into batches yet, but seq_len is already fixed at 500.
Next, build the data iterator.
from torch.utils import data

# is_train controls whether the DataLoader shuffles later on: the training set is shuffled
def load_array(data_arrays, batch_size, is_train=True):
    # TensorDataset wraps the tuple (train_features, train_labels) so that indexing it
    # with an index returns the (feature, label) pair at that position
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)
train_iter = load_array((train_features, torch.tensor(train_data[1])), 64)
for train_X, train_y in train_iter:
    print(f'shape of train_X: {train_X.shape}\n')
    print(f'shape of train_y: {train_y.shape}\n')
    break
print(f'total number of batches: {len(train_iter)}')
Putting all of the functions above together, we finally get train_iter, test_iter, and vocab, which can be fed directly into the model.
#@save
def load_data_imdb(batch_size, num_steps=500):
    """Return the data iterators and the vocabulary of the IMDb review dataset"""
    data_dir = os.path.join(os.getcwd(), 'aclImdb')
    train_data = read_data(data_dir, True)
    test_data = read_data(data_dir, False)
    train_tokens = tokenize(train_data[0], token='word')
    test_tokens = tokenize(test_data[0], token='word')
    # Reserve '<pad>' as in section (2); otherwise vocab['<pad>'] below would silently
    # fall back to the <UNK> id
    vocab = Vocab(train_tokens, min_freq=5, reversed_tokens=['<pad>'])
    train_features = torch.tensor([truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = load_array((train_features, torch.tensor(train_data[1])),
                            batch_size)
    test_iter = load_array((test_features, torch.tensor(test_data[1])),
                           batch_size,
                           is_train=False)
    return train_iter, test_iter, vocab
train_iter2, test_iter2, vocab2 = load_data_imdb(batch_size=128, num_steps=550)
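A quick shape check on the returned iterators (only a sanity-check sketch; the first batch should match the batch_size and num_steps passed in):

for X, y in train_iter2:
    print(X.shape, y.shape)  # expected: torch.Size([128, 550]) torch.Size([128])
    break
print(len(vocab2))  # size of the vocabulary built from the training tokens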