Deep LSTM siamese network for text similarity源码分析-CFANZ编程社区

上文讲解了基于 Siamese LSTM 来计算两个句子或单词之间相关性的原理，以及个人的一些经验；这里来分析一下源码是怎么实现的：

Github实现地址：https://github.com/dhwajraj/deep-siamese-text-similarity

具体实现的是文章《Siamese Recurrent Architectures for Learning Sentence Similarity》所提出的网络结构，实现代码主要分为模型定义 siamese_network.py、数据处理 input_helpers.py、模型训练 train.py 三个部分。

第一部分是输入训练样本处理函数input_helpers.py：

class InputHelper(object):
    """Load tab-separated sentence pairs and prepare them for the Siamese model.

    The methods read raw text pairs, build or restore a vocabulary, convert
    text to id sequences and yield mini-batches.  The class keeps no instance
    state.  It expects ``np`` (numpy), ``random`` (``random.random``), ``gc``
    and ``MyVocabularyProcessor`` to be available at module level.
    """

    def getTsvData(self, filepath):
        """Read positive pairs from a TSV file and synthesise negative pairs.

        Every line with at least two tab-separated fields contributes one
        positive pair (label 1) built from its first two fields, lower-cased.
        Lines with fewer than two fields are skipped.

        Args:
            filepath: path of the training file.

        Returns:
            ``(x1, x2, y)`` as numpy arrays: left texts, right texts, labels.
            With N positive pairs the result holds N positives followed by
            2*N generated negatives.
        """
        print("Loading training data from "+filepath)
        x1=[]
        x2=[]
        y=[]
        # positive samples from file; `with` guarantees the handle is closed
        with open(filepath) as tsv_file:
            for line in tsv_file:
                l=line.strip().split("\t")
                if len(l)<2:
                    continue
                # Randomly decide which sentence goes on which side.
                if random() > 0.5:
                    x1.append(l[0].lower())
                    x2.append(l[1].lower())
                else:
                    x1.append(l[1].lower())
                    x2.append(l[0].lower())
                # Pairs found in the file are treated as positive samples.
                y.append(1)#np.array([0,1]))
        # generate random negative samples: pair every sentence with the
        # sentence found at the same position of a shuffled copy.
        # NOTE(review): a sentence can be paired with itself or with its true
        # partner by chance and is still labelled 0.
        combined = np.asarray(x1+x2)
        shuffle_indices = np.random.permutation(np.arange(len(combined)))
        combined_shuff = combined[shuffle_indices]
        for left, right in zip(combined, combined_shuff):
            x1.append(left)
            x2.append(right)
            y.append(0) #np.array([1,0]))
        return np.asarray(x1),np.asarray(x2),np.asarray(y)

    def getTsvTestData(self, filepath):
        """Read labelled pairs from a TSV file.

        Every line with at least three tab-separated fields is read as
        ``label, text1, text2``; shorter lines are skipped.  Texts are
        lower-cased and the label is converted with ``int``.

        Args:
            filepath: path of the labelled file.

        Returns:
            ``(x1, x2, y)`` as numpy arrays.

        Raises:
            ValueError: if a label field cannot be parsed as an integer.
        """
        print("Loading testing/labelled data from "+filepath)
        x1=[]
        x2=[]
        y=[]
        with open(filepath) as tsv_file:
            for line in tsv_file:
                l=line.strip().split("\t")
                if len(l)<3:
                    continue
                x1.append(l[1].lower())
                x2.append(l[2].lower())
                y.append(int(l[0])) #np.array([0,1]))
        return np.asarray(x1),np.asarray(x2),np.asarray(y)

    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
        """Generate the mini-batches used for training.

        Args:
            data: sequence of samples; converted with ``np.asarray``.
            batch_size: maximum number of samples per batch.
            num_epochs: number of passes over ``data``.
            shuffle: when true, reshuffle the samples at every epoch.

        Yields:
            Consecutive slices of the (optionally shuffled) data.  The last
            batch of an epoch may be shorter than ``batch_size`` but is never
            empty.
        """
        data = np.asarray(data)
        # Debug output kept from the original implementation.
        print(data)
        print(data.shape)
        data_size = len(data)
        # Ceiling division: the previous ``int(n / batch_size) + 1`` produced
        # an empty trailing batch whenever n was a multiple of batch_size.
        num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
            else:
                shuffled_data = data
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]

    def dumpValidation(self,x1_text,x2_text,y,shuffled_index,dev_idx,i):
        """Write the validation part of the raw text pairs to a file.

        The inputs are reordered with ``shuffled_index`` and everything from
        position ``dev_idx`` onwards is written to ``validation.txt<i>`` in
        the current working directory, one ``label<TAB>text1<TAB>text2`` line
        per pair.

        Args:
            x1_text, x2_text, y: numpy arrays of raw texts and labels.
            shuffled_index: index array applied to all three inputs.
            dev_idx: slice start (negative values count from the end).
            i: suffix appended to the output file name.
        """
        print("dumping validation "+str(i))
        x1_dev=x1_text[shuffled_index][dev_idx:]
        x2_dev=x2_text[shuffled_index][dev_idx:]
        y_dev=y[shuffled_index][dev_idx:]
        # The context manager closes the file; no explicit close() is needed.
        with open('validation.txt'+str(i),'w') as f:
            for text1,text2,label in zip(x1_dev,x2_dev,y_dev):
                f.write(str(label)+"\t"+text1+"\t"+text2+"\n")

    # Data Preparation
    # ==================================================

    def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
        """Build the vocabulary and the train/dev split from a training file.

        Args:
            training_paths: path handed to ``getTsvData``.
            max_document_length: passed to ``MyVocabularyProcessor``.
            percent_dev: percentage of samples (rounded up) held out as the
                dev set, taken from the end of the shuffled data.
            batch_size: used only to compute the number of full batches.

        Returns:
            ``(train_set, dev_set, vocab_processor, sum_no_of_batches)`` where
            both sets are ``(x1, x2, y)`` tuples and ``sum_no_of_batches`` is
            ``len(y_train) // batch_size``.

        Side effects:
            Reseeds numpy's global RNG with 131 and writes ``validation.txt0``
            through ``dumpValidation``.
        """
        # Load the raw training pairs.
        x1_text, x2_text, y=self.getTsvData(training_paths)

        # Build the vocabulary from every sentence of the training data.
        # Original author's remark: the vocabulary is built from tokens in
        # order of appearance, which is cheap but not very accurate; training
        # embeddings separately (e.g. word2vec) is suggested instead.
        # NOTE(review): MyVocabularyProcessor is defined elsewhere; it is
        # presumably fitted eagerly by fit_transform -- confirm.
        print("Building vocabulary")
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor.fit_transform(np.concatenate((x2_text,x1_text),axis=0))
        print("Length of loaded vocabulary ={}".format( len(vocab_processor.vocabulary_)))
        x1 = np.asarray(list(vocab_processor.transform(x1_text)))
        x2 = np.asarray(list(vocab_processor.transform(x2_text)))
        # Randomly shuffle data (fixed seed keeps the split reproducible)
        np.random.seed(131)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x1_shuffled = x1[shuffle_indices]
        x2_shuffled = x2[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        del x1
        del x2
        # Same dev size as the original ``-1*len*percent_dev//100`` (rounded
        # up), but expressed as a non-negative split position: a negative
        # index of 0 used to send *everything* to the dev set.
        total = len(y_shuffled)
        dev_size = -int((-1*total*percent_dev)//100)
        dev_idx = min(max(total - dev_size, 0), total)
        # Split into training and dev parts and dump the raw dev pairs.
        self.dumpValidation(x1_text,x2_text,y,shuffle_indices,dev_idx,0)
        # TODO: This is very crude, should use cross-validation
        x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
        x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
        y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
        print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
        sum_no_of_batches = len(y_train)//batch_size
        train_set=(x1_train,x2_train,y_train)
        dev_set=(x1_dev,x2_dev,y_dev)
        gc.collect()
        return train_set,dev_set,vocab_processor,sum_no_of_batches

    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        """Load labelled test pairs and encode them with a saved vocabulary.

        Args:
            data_path: labelled TSV file read by ``getTsvTestData``.
            vocab_path: location passed to ``vocab_processor.restore``.
            max_document_length: passed to ``MyVocabularyProcessor``.

        Returns:
            ``(x1, x2, y)``: the two encoded inputs and the labels.
        """
        x1_temp,x2_temp,y = self.getTsvTestData(data_path)

        # Build vocabulary
        # NOTE(review): the freshly built processor is replaced right away by
        # whatever ``restore`` returns.
        vocab_processor = MyVocabularyProcessor(max_document_length,min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        print(len(vocab_processor.vocabulary_))

        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        del vocab_processor
        gc.collect()
        return x1,x2, y

第二部分是模型的定义siamese_network.py：

class SiameseLSTM(object):
    """
    A LSTM based deep Siamese network for text similarity.
    Uses an character embedding layer, followed by a biLSTM and Energy Loss layer.

    NOTE: the graph is built with the legacy (pre-1.0) TensorFlow API used
    throughout this file (``tf.split(dim, num, value)``, ``tf.sub``,
    ``tf.div``, ``keep_dims``, ``tf.nn.bidirectional_rnn``).
    """

    def BiRNN(self, x, dropout, scope, embedding_size, sequence_length):
        """Build one stacked bidirectional LSTM encoder.

        Args:
            x: embedded input; per the comments below its shape is
                (batch_size, sequence_length, embedding_size).
            dropout: keep probability applied to the output of every layer.
            scope: suffix for the name/variable scopes; different suffixes
                give different variable scopes.
            embedding_size: size of the last dimension of ``x``.
            sequence_length: number of time steps; also used as the LSTM
                hidden size (``n_hidden = n_steps``).

        Returns:
            The last element of the output list returned by
            ``tf.nn.bidirectional_rnn``.
        """
        # Original author's remark: tf.nn.bidirectional_rnn() is inconvenient
        # when the network structure has to be modified or during training.
        n_input=embedding_size
        n_steps=sequence_length
        # NOTE(review): the hidden size is tied to the sequence length; the
        # constructor's ``hidden_units`` argument is not used here.
        n_hidden=n_steps
        n_layers=3
        # Prepare data shape to match `bidirectional_rnn` function requirements
        # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size)
        # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
        # Permuting batch_size and n_steps
        x = tf.transpose(x, [1, 0, 2])
        # Reshape to (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, n_input])
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, n_steps, x)
        print(x)
        # Define lstm cells with tensorflow
        # Forward direction cell (the original author notes that a GRU cell
        # can be substituted and trained faster in their experience)
        with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope):
            print(tf.get_variable_scope().name)
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell,output_keep_prob=dropout)
            lstm_fw_cell_m=tf.nn.rnn_cell.MultiRNNCell([lstm_fw_cell]*n_layers, state_is_tuple=True)
        # Backward direction cell
        with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
            print(tf.get_variable_scope().name)
            bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell,output_keep_prob=dropout)
            lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell([lstm_bw_cell]*n_layers, state_is_tuple=True)
        # Get lstm cell output: wire the forward and backward stacks together
        with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
            outputs, _, _ = tf.nn.bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32)
        # Only the output of the last time step is used.
        # Original author's remark: for sentences shorter than the maximum
        # length the tail is zero padding, so the last step carries noise;
        # averaging over time steps (optionally followed by a dense layer)
        # may work better.
        return outputs[-1]

    def contrastive_loss(self, y,d,batch_size):
        """Contrastive loss over a batch of distances.

        Pairs with ``y == 1`` are penalised by ``d**2``; pairs with ``y == 0``
        are penalised by ``max(1 - d, 0)**2`` (margin 1).

        Args:
            y: labels, 1 for similar pairs and 0 for dissimilar pairs.
            d: distances, same shape as ``y``.
            batch_size: constant used to normalise the summed loss.

        Returns:
            Scalar tensor ``sum(loss_terms) / batch_size / 2``.
        """
        tmp= y *tf.square(d)
        #tmp= tf.mul(y,tf.square(d))
        tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0))
        return tf.reduce_sum(tmp +tmp2)/batch_size/2

    def __init__(
      self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size):
        """Build the complete graph: embeddings, two encoders, distance, loss.

        Args:
            sequence_length: length of every input id sequence.
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            hidden_units: accepted for interface compatibility; not used.
            l2_reg_lambda: accepted for interface compatibility; not used.
            batch_size: normalisation constant of the loss.
        """
        # Placeholders for input, output and dropout
        self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
        self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
        self.input_y = tf.placeholder(tf.float32, [None], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        # NOTE(review): never added to the loss below.
        l2_loss = tf.constant(0.0, name="l2_loss")

        # Map both id sequences to embedding sequences with a shared matrix.
        with tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                trainable=True,name="W")
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
            #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)
            self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)
            #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1)

        # Build one bidirectional LSTM stack per input and take its output.
        with tf.name_scope("output"):
            # The two stacks live in different variable scopes ("side1" /
            # "side2"), so each side has its own set of parameters.
            self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length)
            self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length)
            # Normalised Euclidean distance:
            #   ||out1 - out2|| / (||out1|| + ||out2||)
            # NOTE(review): the denominator is zero when both outputs are
            # all-zero vectors.
            self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.sub(self.out1,self.out2)),1,keep_dims=True))
            self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.out1),1,keep_dims=True)),tf.sqrt(tf.reduce_sum(tf.square(self.out2),1,keep_dims=True))))
            self.distance = tf.reshape(self.distance, [-1], name="distance")
        with tf.name_scope("loss"):
            # Compare the distance with the target label.
            self.loss = self.contrastive_loss(self.input_y,self.distance, batch_size)
        with tf.name_scope("accuracy"):
            # Comparing the raw float distance with the label for equality is
            # practically never true.  Threshold first: distance < 0.5 means
            # "similar" (1), otherwise 0 -- the same rule train.py applies.
            predictions = tf.cast(tf.less(self.distance, 0.5), tf.float32)
            correct_predictions = tf.equal(predictions, self.input_y)
            self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

最后一部分是模型训练 train.py：

#! /usr/bin/env python
# Training script for the Siamese LSTM: parses the flags, loads the data,
# builds the graph, then alternates training steps with periodic dev-set
# evaluation and checkpointing.  Uses the legacy (pre-1.0) TensorFlow API.

import tensorflow as tf
import numpy as np
import re
import os
import time
import datetime
import gc
from input_helpers import InputHelper
from siamese_network import SiameseLSTM
from tensorflow.contrib import learn
import gzip
from random import random

# Parameters
# ==================================================
# Hyper-parameters (help texts state the real default values)
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 100)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
tf.flags.DEFINE_string("training_files", "person_match.train2", "training file (default: person_match.train2)")
tf.flags.DEFINE_integer("hidden_units", 50, "Number of hidden units in softmax regression layer (default:50)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 300, "Number of training epochs (default: 300)")
tf.flags.DEFINE_integer("evaluate_every", 1000, "Evaluate model on dev set after this many steps (default: 1000)")
tf.flags.DEFINE_integer("checkpoint_every", 1000, "Save model after this many steps (default: 1000)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# Print the value of every hyper-parameter
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files is None:
    print("Input Files List is empty. use --training_files argument.")
    exit()

# Maximum sequence length handed to the vocabulary processor.
# NOTE(review): per the original author, longer inputs are truncated and
# shorter ones zero-padded -- that happens in the processor, confirm there.
max_document_length=30
inpH = InputHelper()
# Load the training samples and build the vocabulary (10% dev split)
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size)

# Training
# ==================================================
print("starting graph def")
# Configure and start the graph
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    print("started session")
    with sess.as_default():
        # Build the Siamese model
        siameseModel = SiameseLSTM(
            sequence_length=max_document_length,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            hidden_units=FLAGS.hidden_units,
            l2_reg_lambda=FLAGS.l2_reg_lambda,
            batch_size=FLAGS.batch_size)

        # Define Training procedure: global step counter and Adam optimizer
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        print("initialized siameseModel object")

    # Compute the gradients together with their variables ...
    grads_and_vars=optimizer.compute_gradients(siameseModel.loss)
    # ... and create the training op that applies them.
    tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    print("defined training_ops")
    # Keep track of gradient values and sparsity (optional, for TensorBoard)
    grad_summaries = []
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.merge_summary(grad_summaries)
    print("defined gradient summaries")
    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)

    # Write vocabulary
    vocab_processor.save(os.path.join(checkpoint_dir, "vocab"))

    # Initialize all variables
    sess.run(tf.initialize_all_variables())

    print("init all variables")
    graph_def = tf.get_default_graph().as_graph_def()
    graphpb_txt = str(graph_def)
    with open(os.path.join(checkpoint_dir, "graphpb.txt"), 'w') as f:
        f.write(graphpb_txt)

    def make_feed_dict(x1_batch, x2_batch, y_batch, keep_prob):
        """Build a feed dict, randomly swapping the two input sides."""
        if random()>0.5:
            first, second = x1_batch, x2_batch
        else:
            first, second = x2_batch, x1_batch
        return {
            siameseModel.input_x1: first,
            siameseModel.input_x2: second,
            siameseModel.input_y: y_batch,
            siameseModel.dropout_keep_prob: keep_prob,
        }

    def predict_labels(dist):
        """Map distances to labels: below 0.5 -> 1 (similar), otherwise 0."""
        return (dist < 0.5).astype(dist.dtype)

    def train_step(x1_batch, x2_batch, y_batch):
        """
        A single training step
        """
        feed_dict = make_feed_dict(x1_batch, x2_batch, y_batch, FLAGS.dropout_keep_prob)
        _, step, loss, accuracy, dist = sess.run([tr_op_set, global_step, siameseModel.loss, siameseModel.accuracy, siameseModel.distance],  feed_dict)
        time_str = datetime.datetime.now().isoformat()
        # The reported accuracy is recomputed from the thresholded distances.
        d = predict_labels(dist)
        accuracy = np.mean(np.asarray(y_batch)==d)
        print("TRAIN {}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        print("{} {} {}".format(y_batch, dist, d))

    def dev_step(x1_batch, x2_batch, y_batch):
        """
        A single evaluation step on a dev batch; returns the batch accuracy.
        """
        # Dropout must be disabled while evaluating, hence keep_prob = 1.0.
        feed_dict = make_feed_dict(x1_batch, x2_batch, y_batch, 1.0)
        step, loss, accuracy, dist = sess.run([global_step, siameseModel.loss, siameseModel.accuracy, siameseModel.distance],  feed_dict)
        time_str = datetime.datetime.now().isoformat()
        d = predict_labels(dist)
        accuracy = np.mean(np.asarray(y_batch)==d)
        print("DEV {}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
        print("{} {} {}".format(y_batch, dist, d))
        return accuracy

    # Generate the training batches and enter the main loop
    batches=inpH.batch_iter(
                list(zip(train_set[0], train_set[1], train_set[2])), FLAGS.batch_size, FLAGS.num_epochs)

    max_validation_acc=0.0
    for nn in range(sum_no_of_batches*FLAGS.num_epochs):
        batch = next(batches)
        if len(batch)<1:
            continue
        x1_batch,x2_batch, y_batch = zip(*batch)
        if len(y_batch)<1:
            continue
        # Update the parameters and read back the current step
        train_step(x1_batch, x2_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        # Sum (not mean) of the per-batch dev accuracies of this step;
        # stays 0.0 on steps without evaluation.
        sum_acc=0.0
        if current_step % FLAGS.evaluate_every == 0:
            print("\nEvaluation:")
            dev_batches = inpH.batch_iter(list(zip(dev_set[0],dev_set[1],dev_set[2])), FLAGS.batch_size, 1)
            for db in dev_batches:
                if len(db)<1:
                    continue
                x1_dev_b,x2_dev_b,y_dev_b = zip(*db)
                if len(y_dev_b)<1:
                    continue
                acc = dev_step(x1_dev_b, x2_dev_b, y_dev_b)
                sum_acc = sum_acc + acc
        print("")
        if current_step % FLAGS.checkpoint_every == 0:
            # Save only when the dev score did not get worse.
            if sum_acc >= max_validation_acc:
                max_validation_acc = sum_acc
                saver.save(sess, checkpoint_prefix, global_step=current_step)
                tf.train.write_graph(sess.graph.as_graph_def(), checkpoint_prefix, "graph"+str(nn)+".pb", as_text=False)
                print("Saved model {} with sum_accuracy={} checkpoint to {}\n".format(nn, max_validation_acc, checkpoint_prefix))