Results
Total number of files: 26
Data format: data_set=[[],[],[]], similar to the example below (a nested list, where each sub-list holds the tokens obtained by segmenting one file)
data_set [['载人', '航天', '工程', '专项', '标准', '载人', '航天', '工程', '有效载荷', '生物', '通用', '发布', '实施', '中国', '载人', '航天', '工程', '办公室', '批准', '前言', '标准', '附录', '资料性', '附录', '标准'], ['中国', '载人', '航天', '工程', '办公室', '提出', '标准', '载人', '航天', '工程', '标准化', '管理', '咨询中心', '归口', '标准', '起草', '单位', '飞行', '动物', '植物', '实验', '样品', '飞行', '确认', '实验', '样品', '包含', '生物', '危害', '凡不注', '日期', '版次', '引用', '文件', '最新', '版本', '于本', '标准', '工作', '场所', '物理', '因素', '测量', '紫外', '辐射'],[ '声学', '测量', '常用', '频率', '声学', '倍频程', '分数', '安装', '到位', '测量', '接地', '电阻', '满足要求', '设备', '自带', '接地', '螺钉', '涂胶', '拧紧', '设备', '优先', '安装', '面来', '接地', '电阻', '满足要求', '轻微', '打磨', '设备', '支架', '安装', '方法', '加以解决', '搭接', '设备', '舱体', '安装', '打磨', '安装', '安装', '后应', '实测']]
K-means
LDA clustering
Visualization: an interactive pyLDAvis topic map (saved as topic.html by the code below)
K-means implementation
# Reference: https://www.cnblogs.com/fengfenggirl/p/k-means.html
from __future__ import print_function
import os
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MiniBatchKMeans

fdir = ''      # directory containing the documents
print('文件总数量为{}'.format(len(os.listdir(fdir))))
stoppath = ''  # path to the stopword file
def loadDataset():
    '''Load the text dataset: one string per file.'''
    flist = os.listdir(fdir)
    dataset = []
    for fname in flist:
        f = open(os.path.join(fdir, fname), 'r', encoding='utf_8')
        d = ''
        for line in f.readlines():
            if line != '\n' and line != '':
                d += line
        dataset.append(d)
        f.close()
    return dataset
# Build the stopword list
def buildSW():
    '''Stopword filtering.'''
    typetxt = open(stoppath, encoding='utf_8')  # stopword file
    stoptexts = ['\u3000', '\n', ' ']  # special characters left over from the crawled text
    # build the list from the stopword file
    for word in typetxt:
        word = word.strip()
        stoptexts.append(word)
    return stoptexts
# Build the corpus: corpus = [doc1, doc2, ...]
def buildWB(dataset, stoptexts):
    '''Build the corpus.'''
    corpus = []
    for i in range(0, len(dataset)):
        data = jieba.cut(dataset[i])  # word segmentation
        data_adj = ''
        delete_word = []
        for item in data:
            if item not in stoptexts:  # stopword filtering
                # value = re.compile(r'^[0-9]+$')  # drop pure numbers
                value = re.compile(r'^[\u4e00-\u9fa5]{2,}$')  # keep only Chinese words of 2+ characters
                if value.match(item):
                    data_adj += item + ' '
                else:
                    delete_word.append(item)
        corpus.append(data_adj)  # corpus entry for this document
    # print(corpus)
    return corpus
def transform(dataset, n_features=1000):
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
    X = vectorizer.fit_transform(dataset)
    return X, vectorizer
def train(X, vectorizer, true_k=10, minibatch=False, showLable=False):
    # Train k-means either with MiniBatchKMeans (sampled mini-batches) or standard KMeans on the full data
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    if showLable:
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() instead
        terms = vectorizer.get_feature_names_out()
        print(vectorizer.get_stop_words())
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
        result = list(km.predict(X))
        print('Cluster distribution:')
        print(dict([(i, result.count(i)) for i in result]))
    return -km.score(X)
# Test the clustering quality achievable with different numbers of clusters
def test():
    '''Search for the best parameters.'''
    dataset = loadDataset()
    print("%d documents" % len(dataset))
    stoptexts = buildSW()
    corpus = buildWB(dataset, stoptexts)  # segment and filter before vectorizing
    X, vectorizer = transform(corpus, n_features=500)
    true_ks = []
    scores = []
    for i in range(3, 80, 1):
        score = train(X, vectorizer, true_k=i) / len(dataset)
        print(i, score)
        true_ks.append(i)
        scores.append(score)
    plt.figure(figsize=(8, 4))
    plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
    plt.xlabel("n_clusters")
    plt.ylabel("error")
    plt.legend()
    plt.show()
def out():
    '''Output the clustering result under the chosen parameters.'''
    dataset = loadDataset()
    stoptexts = buildSW()
    corpus = buildWB(dataset, stoptexts)
    X, vectorizer = transform(corpus, n_features=500)
    score = train(X, vectorizer, true_k=5, minibatch=True, showLable=True) / len(dataset)
    print(score)

# test()
out()
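Besides the error curve drawn by test(), silhouette analysis is another common way to pick the number of clusters. The sketch below is not part of the original script; it reuses the transform() output defined above, the helper name choose_k_by_silhouette and the k range 2..10 are made up for illustration.

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def choose_k_by_silhouette(X, k_min=2, k_max=10):
    '''Hypothetical helper: return the k in [k_min, k_max] with the highest silhouette score.'''
    best_k, best_s = k_min, -1.0
    for k in range(k_min, k_max + 1):
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=1)
        labels = km.fit_predict(X)
        s = silhouette_score(X, labels)  # higher is better, range [-1, 1]
        if s > best_s:
            best_k, best_s = k, s
    return best_k

# Usage sketch, following the same pipeline as out():
# corpus = buildWB(loadDataset(), buildSW())
# X, vectorizer = transform(corpus, n_features=500)
# print('best k by silhouette:', choose_k_by_silhouette(X))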
LDA implementation
# Reference: https://blog.csdn.net/weixin_41168304/article/details/122389948
from __future__ import print_function
import os
import re
from multiprocessing import freeze_support
import jieba

fdir = ''      # directory containing the documents
print('文件总数量为{}'.format(len(os.listdir(fdir))))
stoppath = ''  # path to the stopword file

# Load the data as data_set = [[], []], a nested list in which each sub-list
# holds the tokens obtained by segmenting one file
class DataLoad:
    def __init__(self):
        pass

    # Store the file contents in a list; each element is one document
    def loadDataset(self):
        '''Load the text dataset.'''
        flist = os.listdir(fdir)
        dataset = []
        for fname in flist:
            f = open(os.path.join(fdir, fname), 'r', encoding='utf_8')
            d = ''
            for line in f.readlines():
                if line != '\n' and line != '':
                    d += line
            dataset.append(d)
            f.close()
        return dataset
    # Build the stopword list
    def buildSW(self):
        '''Stopword filtering.'''
        typetxt = open(stoppath, encoding='utf_8')  # stopword file
        stoptexts = ['\u3000', '\n', ' ']  # special characters left over from the crawled text
        # build the list from the stopword file
        for word in typetxt:
            word = word.strip()
            stoptexts.append(word)
        return stoptexts
    # Segment each file and store the tokens in a nested list; each sub-list is one document
    def buildWB(self, dataset, stoptexts):
        '''Build the corpus.'''
        corpus = []
        for i in range(0, len(dataset)):
            data = jieba.cut(dataset[i])  # word segmentation
            data_adj = []
            delete_word = []
            for item in data:
                if item not in stoptexts:  # stopword filtering
                    # value = re.compile(r'^[0-9]+$')  # drop pure numbers
                    value = re.compile(r'^[\u4e00-\u9fa5]{2,}$')  # keep only Chinese words of 2+ characters
                    if value.match(item):
                        data_adj.append(item)
                    else:
                        delete_word.append(item)
            corpus.append(data_adj)  # corpus entry for this document
        # print(corpus)
        return corpus
Data = DataLoad()
dataset = Data.loadDataset()
stoptexts = Data.buildSW()
data_set = Data.buildWB(dataset, stoptexts)
print('data_set', data_set[:3])
print('data_set的长度是', len(data_set))
#---------------- LDA model -------------------------
import gensim
from gensim import corpora
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import warnings
warnings.filterwarnings('ignore') # To ignore all warnings that arise here to enhance clarity
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
# Build the dictionary and the vectorized corpus
dictionary = corpora.Dictionary(data_set)  # build the dictionary
corpus = [dictionary.doc2bow(text) for text in data_set]  # each document becomes (word id, count) pairs
num_topics = 10
passes = 3
ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, random_state=1)  # split into num_topics topics
print('每个主题输出15个单词')
print(ldamodel.print_topics(num_topics=num_topics, num_words=15))  # print 15 words per topic
# Compute perplexity
def perplexity(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=15))
    print(ldamodel.log_perplexity(corpus))
    return ldamodel.log_perplexity(corpus)
# Compute coherence
def coherence(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=data_set, dictionary=dictionary, coherence='u_mass')
    print(ldacm.get_coherence())
    return ldacm.get_coherence()
if __name__ == '__main__':
    freeze_support()
    # Plot topic number vs. coherence to choose the best number of topics
    plt.rcParams['font.sans-serif'] = ['SimHei']        # set the Chinese font before creating the text objects
    matplotlib.rcParams['axes.unicode_minus'] = False
    x = range(1, 15)
    # z = [perplexity(i) for i in x]  # use this instead if you prefer perplexity
    y = [coherence(i) for i in x]
    plt.plot(x, y)
    plt.xlabel('主题数目')
    plt.ylabel('coherence大小')
    plt.title('主题-coherence变化情况')
    plt.show()

    # In pyLDAvis >= 3.0 this module was renamed to pyLDAvis.gensim_models
    import pyLDAvis.gensim
    # pyLDAvis.enable_notebook()  # uncomment if running in a notebook
    data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(data, 'topic.html')
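To relate the topics back to the 26 files, a small follow-up snippet (not part of the original code) can read each document's dominant topic from the trained model. It assumes the ldamodel and corpus variables from the script above, and that document order matches os.listdir(fdir).

# Minimal sketch: print each document's dominant topic
for i, bow in enumerate(corpus):
    topic_dist = ldamodel.get_document_topics(bow, minimum_probability=0.0)
    top_topic, prob = max(topic_dist, key=lambda t: t[1])
    print('doc %d -> topic %d (p=%.3f)' % (i, top_topic, prob))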
Paths
Local project path: E:\python project\pythonProject_draftKG\主题聚类
Project git repository: git