1. Import modules
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os  # operating system utilities, used here for file paths
import zipfile  # to read the compressed corpus file
import numpy as np
from tempfile import gettempdir
from six.moves import urllib
from six.moves import xrange  # Python 2/3 compatible range
import random  # to shuffle context-word positions when sampling
import tensorflow as tf
2. Get the file, extract it, and do the initial processing
def zip_file(filename, expected_bytes):  # verify and extract the corpus archive
    # expected_bytes is used to check that the file is complete and intact;
    # filename is the corpus file used to train the model (the corpus is just a body of text)
    local_filename = os.path.join('.', filename)  # path to the input file
    statinfo = os.stat(local_filename)  # file attributes ("stat" = statistics)
    if statinfo.st_size == expected_bytes:  # compare the actual size with the expected size to detect a corrupted file
        print('good file', filename)
    else:
        raise Exception('bad file ' + filename)
    # extract the archive
    with zipfile.ZipFile(local_filename) as f:  # the with-statement handles cleanup and exceptions for us
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # decompress, decode to a string, split into a word list
    return data
Inspect the corpus
vocabulary = zip_file('text8.zip', 31344016)  # every word of the training corpus (English text, so no word segmentation is needed)
print('Data size', len(vocabulary))  # total number of words
vocabulary_size = 50000  # keep only the 50,000 most frequent words (the dimensionality of the one-hot input); rare words carry little signal
# in a real project this cutoff would be chosen from corpus statistics
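Before building the dataset it helps to see what the split-and-count step produces. Below is a minimal sketch with a made-up sentence standing in for the text8 contents (the sample list is purely illustrative):
sample = 'the quick brown fox jumps over the lazy dog the fox'.split()  # hypothetical mini-corpus, not the real data
print(sample[:5])  # ['the', 'quick', 'brown', 'fox', 'jumps']
print(collections.Counter(sample).most_common(2))  # [('the', 3), ('fox', 2)]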
3. Build the input data (batches)
# count the words, turn the raw word list into structured data, and later generate batch samples from it
def build_dataset(words, n_words):  # words: the raw corpus (vocabulary); n_words: vocabulary_size
    count = [['UNK', -1]]  # count[] holds the word-frequency statistics
    # UNK (unknown) stands for every filtered-out rare word; -1 is a placeholder count filled in below
    count.extend(collections.Counter(words).most_common(n_words - 1))
    # extend appends the n_words - 1 most common words after the UNK entry
    dictionary = {}  # word -> id; the more frequent the word, the smaller its id
    i = 0
    for word, _ in count:
        dictionary[word] = i
        i += 1
    data = []  # the corpus re-encoded as ids
    unk_count = 0  # how many words were replaced by UNK
    for word in words:  # words: the raw corpus
        index = dictionary.get(word, 0)  # default: if the word is not in the dictionary, return 0 (UNK)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    # swap keys and values; zip pairs them back up
    return data, count, dictionary, reversed_dictionary
Inspect the word counts
data,count,dictionary,reversed_dictionary = build_dataset(vocabulary,vocabulary_size)
print('most common words:', count[:5])  # the five most frequent entries
data_index = 0  # position pointer into the encoded corpus
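To make the two mappings concrete, here is a small sketch with a toy dictionary (the word ids are made up; real ids come from build_dataset and depend on corpus frequencies):
toy_dictionary = {'UNK': 0, 'the': 1, 'of': 2, 'fox': 3}  # hypothetical ids, for illustration only
toy_reversed = dict(zip(toy_dictionary.values(), toy_dictionary.keys()))
encoded = [toy_dictionary.get(w, 0) for w in ['the', 'fox', 'jumps']]
print(encoded)  # [1, 3, 0] -- 'jumps' is not in the toy vocabulary, so it falls back to UNK
print([toy_reversed[i] for i in encoded])  # ['the', 'fox', 'UNK']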
4. Skip-gram: generating the batch samples
def generate_batch(batch_size, num_skips, skip_window):  # skip_window: how many words to look at on each side of the center word
    global data_index
    assert batch_size % num_skips == 0  # num_skips (input, label) pairs are drawn from each window
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)  # training inputs, a single row
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # training labels, a single column
    span = 2 * skip_window + 1  # length of the sampling window
    buffer = collections.deque(maxlen=span)  # double-ended queue holding the current window of word ids
    if data_index + span > len(data):
        data_index = 0  # the corpus is reused cyclically
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        # positions of the context around the center word, e.g. [0, 1, 2, 4, 5, 6]
        random.shuffle(context_words)  # sample the context positions in random order
        words_to_use = collections.deque(context_words)
        for j in range(num_skips):
            batch[i * num_skips + j] = buffer[skip_window]  # the center word
            context_word = words_to_use.pop()
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])  # reached the end of the corpus: refill the window from the start
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels  # inputs and labels are returned separately
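A quick way to check the function is to print a few (input, label) pairs. The exact words shown depend on the corpus, so this is only a sketch of the expected shape of the output:
demo_batch, demo_labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)  # small demo call
for i in range(8):
    print(demo_batch[i], reversed_dictionary[demo_batch[i]],
          '->', demo_labels[i, 0], reversed_dictionary[demo_labels[i, 0]])
data_index = 0  # reset the global pointer before the real training run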
5. Hyperparameters for the network
batch_size = 128  # mini-batch size
embedding_size = 300  # number of hidden-layer nodes, i.e. the dimensionality of the word vectors (word2vec)
skip_window = 2  # how many words to consider on each side of the center word
num_skips = 2  # how many labels to sample per center word (kept small; an empirical value)
num_sampled = 64  # number of negative samples used to approximate the softmax
valid_size = 16  # size of the validation set
valid_window = 100  # validation words are drawn from ids below 100, i.e. the most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
graph = tf.Graph()  # intuitively, all the tensors defined below make up one computation graph
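Because the validation ids are all below valid_window, they map back to very frequent words; the exact ids (and therefore words) change from run to run:
print(valid_examples)  # 16 distinct ids in [0, 100)
print([reversed_dictionary[i] for i in valid_examples])  # the words used later for the nearest-neighbour printouts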
6. Build the network
with graph.as_default():
    train_input = tf.placeholder(tf.int32, shape=[batch_size])  # placeholder (one row) for the batch inputs
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])  # placeholder (one column) for the batch labels
    with tf.device('/cpu:0'):
        # embeddings is the weight matrix holding one vector per word; random_uniform gives a uniform initialization
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # pick out the sub-matrix of weights that corresponds to the words in this batch
        embed = tf.nn.embedding_lookup(embeddings, train_input)
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        # truncated_normal is a normal distribution with the tails cut off; stddev=1.0/math.sqrt(embedding_size) is a well-known initialization trick
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))  # the bias vector
    # build the loss function
    # NCE: noise-contrastive estimation
    # it approximates the softmax (average cross-entropy) loss instead of computing it over the whole vocabulary
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    # optimizer: gradient descent with a learning rate of 1.0
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)  # small steps to avoid oscillation
    # for evaluation only: normalize the embeddings and measure word-to-word distance as the cosine of the angle
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           normalized_embeddings,
                           transpose_b=True)  # dot products of unit vectors = cosine similarities
    init = tf.global_variables_initializer()  # op that initializes all global variables
num_steps = 100000  # number of training iterations
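The two non-obvious ops above are embedding_lookup and the cosine-similarity matmul; restated in plain NumPy with tiny made-up shapes (independent of the graph, for intuition only):
rng = np.random.RandomState(0)
W = rng.uniform(-1.0, 1.0, size=(10, 4))  # toy embedding matrix: 10 words, 4 dimensions
ids = np.array([3, 7, 3])  # a toy batch of word ids
lookup = W[ids]  # embedding_lookup is just row selection
norm_np = np.sqrt(np.sum(np.square(W), axis=1, keepdims=True))
W_unit = W / norm_np  # rows normalized to unit length
cosine = W_unit[ids].dot(W_unit.T)  # dot products of unit vectors = cosine similarities
print(lookup.shape, cosine.shape)  # (3, 4) (3, 10)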
7. Training
with tf.Session(graph=graph) as session:
    init.run()
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_input: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        # run() evaluates both ops: the optimizer step and the loss for this batch
        average_loss += loss_val
        if step % 2000 == 0:  # report the average loss every 2000 steps
            if step > 0:
                average_loss /= 2000
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()  # the normalized word vectors: the final output
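Once training finishes, the word vectors can be queried directly with NumPy. A sketch of a nearest-neighbour lookup (the query word 'three' is an arbitrary choice and only works if it is present in dictionary):
query = 'three'  # assumed to be a frequent word that appears in dictionary
vec = final_embeddings[dictionary[query]]
scores = final_embeddings.dot(vec)  # cosine similarities, since every row has unit length
for idx in (-scores).argsort()[1:9]:  # skip the query word itself
    print(reversed_dictionary[idx], scores[idx])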