Machine Learning - Text Clustering Example: KMeans

import os

import gensim
import jieba

from gensim.models.doc2vec import Doc2Vec
from sklearn.cluster import KMeans

TaggededDocument = gensim.models.doc2vec.TaggedDocument


# Two helper functions for reading and writing files
# Write content to a file
def savefile(savepath, content, encode):
    fp = open(savepath, "w", encoding=encode)
    fp.write(content)
    fp.close()


# Read a file and return its content
def readfile(path, encode):
    content = None
    try:
        fp = open(path, "r", encoding=encode)
        content = fp.read()
        fp.close()
    except UnicodeDecodeError:
        print("Error: failed to read file")
    else:
        return content


stop_words_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/中文停用词表.txt'
origin_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/测试文本集/'
cut_combine_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/cut_combine.txt'
corpus_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/corpus.txt'
result_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/own_claasify.txt'

# Segment each raw text with jieba, drop stop words, and save one document per line
def segment():
    # Stop word file has one word per line; keep them in a set for exact-match lookup
    stop_words = set(readfile(stop_words_path, 'UTF-8').splitlines())
    fileList = os.listdir(origin_text_path)
    save = open(cut_combine_path, 'w')
    for file in fileList:
        if not file.startswith("."):
            content_result = ''
            content = readfile(origin_text_path + file, 'GBK')
            if content is None:
                continue
            content_words = jieba.cut(content)
            for content_word in content_words:
                if content_word not in stop_words:
                    content_result = content_result + " " + content_word
            # Strip line breaks so each document occupies exactly one line
            save.write(content_result.replace('\r', '').replace('\n', ''))
            save.write('\n')
    save.close()
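
# A quick, hypothetical illustration of the segmentation used above: jieba.cut
# returns a generator of tokens, while jieba.lcut returns them directly as a list,
# so the stop-word filter could equally be written as a list comprehension, e.g.
#   tokens = jieba.lcut(content)                         # content: one raw document
#   filtered = [t for t in tokens if t not in stop_words]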


def get_datasest():
    with open(cut_combine_path, 'r') as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        l = len(word_list)
        word_list[l - 1] = word_list[l - 1].strip()
        # Before training, wrap each document in the TaggedDocument format:
        # the token list plus a list of tags (here, the document index)
        document = TaggededDocument(word_list, tags=[i])
        x_train.append(document)

    return x_train
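
# Each element of x_train is a TaggedDocument roughly of the form (values illustrative):
#   TaggedDocument(words=['token1', 'token2', ...], tags=[0])
# Doc2Vec learns one vector per tag, so tagging every document with its own index
# yields one vector per document.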


def train(x_train, size=200):
    # Build the Doc2Vec model from x_train, then save it so memory can be released.
    # min_count: ignore words whose total frequency is below this value
    # window: maximum distance between the predicted word and its context words
    # size: dimensionality of the document vectors (renamed vector_size in gensim >= 4.0)
    # negative: number of negative samples
    # workers: number of worker threads
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    # corpus_count is the number of documents; epochs is the number of training passes
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    # Persist the trained model; it can be reloaded later with Doc2Vec.load
    model_dm.save(corpus_text_path)
    return model_dm
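
# A minimal sketch of reusing the saved model later (the sample sentence is made up):
#   model = Doc2Vec.load(corpus_text_path)
#   vec = model.infer_vector(jieba.lcut("这是一条新的测试文本"))  # 200-dimensional vector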


def cluster(x_train):
    infered_vectors_list = []
    print("load doc2vec model...")
    # Load the trained model; printing model_dm shows its configuration,
    # e.g. Doc2Vec(dm/m,d200,n5,w3,s0.001,t4)
    model_dm = Doc2Vec.load(corpus_text_path)
    print("infer train vectors...")
    for text, label in x_train:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)

    print("train kmeans model...")
    kmean_model = KMeans(n_clusters=15)
    kmean_model.fit(infered_vectors_list)
    # Predict a label for every document so labels stays aligned with x_train
    labels = kmean_model.predict(infered_vectors_list)
    print(labels)
    cluster_centers = kmean_model.cluster_centers_

    # Write each document and its cluster label to the result file
    with open(result_text_path, 'w') as wf:
        for i in range(len(x_train)):
            string = ""
            text = x_train[i][0]
            for word in text:
                string = string + word
            string = string + '\t'
            string = string + str(labels[i])
            string = string + '\n'
            wf.write(string)

    return cluster_centers


if __name__ == '__main__':
    segment()
    x_train = get_datasest()
    model_dm = train(x_train)
    cluster_centers = cluster(x_train)
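
As a follow-up, the cluster_centers returned by cluster() can be used to assign a previously unseen document to a cluster. The snippet below is a minimal sketch, assuming the Doc2Vec model was saved to corpus_text_path as above and that plain Euclidean distance to the centers is acceptable; the function name assign_new_document and the sample sentence are made up for illustration.

import numpy as np

def assign_new_document(text, cluster_centers):
    # Tokenize the new text the same way the training corpus was tokenized
    tokens = jieba.lcut(text)
    model = Doc2Vec.load(corpus_text_path)
    vec = model.infer_vector(tokens)
    # Choose the cluster whose center is closest in Euclidean distance
    distances = np.linalg.norm(cluster_centers - vec, axis=1)
    return int(np.argmin(distances))

# Example (hypothetical input):
# print(assign_new_document("一条新的测试文本", cluster_centers))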

 
