import pandas as pd
# Load the raw data set.  The file is still opened with open() (not handed to
# pandas as a path) so it keeps being decoded with the platform's default
# encoding; the `with` block makes sure the handle is closed afterwards.
with open('D://实训课//实训课数据csdn.csv') as csv_file:
    data = pd.read_csv(csv_file, sep=',')
data  # data preview (only displays something in an interactive session)
# Pull out the 'class' column and look at the distinct category labels.
session = data.loc[:, 'class'].values
set(session)
def classma(i, frame=None, classes=None):
    """Print and return the rows that belong to the i-th category.

    Args:
        i: Index into the category list.
        frame: DataFrame with a 'class' column.  Defaults to the module-level
            ``data`` so existing ``classma(i)`` calls keep working.
        classes: Sequence of category labels.  Defaults to the module-level
            ``class_list``.

    Returns:
        The rows of ``frame`` whose 'class' value equals ``classes[i]``.

    Raises:
        IndexError: If ``i`` is out of range for the category list.
    """
    # Resolve the defaults at call time: both globals are assigned at module
    # level and only need to exist once the function is actually called.
    if frame is None:
        frame = data
    if classes is None:
        classes = class_list
    class1 = frame.loc[frame['class'] == classes[i], :]
    print(class1)
    return class1
# classma is the split helper: it uses the 'class' column to select and return the rows of each category.
# Category labels looked up in the 'class' column by classma().
class_list = [
    'ai',
    'algo',
    'big-data',
    'blockchain',
    'hardware',
    'math',
    'miniprog',
]
# To print every category instead of a single one:
#     for i in range(len(class_list)):
#         classma(i)
ai = classma(0)  # rows of the first category in class_list ('ai')
# NOTE(review): this value is used as the directory prefix of the stop-word
# file path; it looks like a placeholder -- confirm the real location.
file_path = 'D:/..csv'
def getStopword(file_path):
    """Read the stop-word list stored under the directory ``file_path``.

    Args:
        file_path: Directory prefix; the file name '/哈工大停用词表 .txt' is
            appended to it verbatim.

    Returns:
        A list with one entry per line of the file, without the line break.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    stop_file = file_path + '/哈工大停用词表 .txt'
    # The context manager closes the handle; the original left it open.
    with open(stop_file, 'r', encoding='UTF-8') as handle:
        # rstrip('\n') rather than line[:-1]: a final line without a trailing
        # newline must not lose its last character.
        return [line.rstrip('\n') for line in handle]
# NOTE(review): the returned stop-word list is discarded here; the list that
# is actually used is loaded again where preProcess() is called.
getStopword(file_path)
import jieba
def preProcess(all_data, stop_list, tokenizer=None):
    """Tokenize every document and remove stop words.

    Args:
        all_data: Object whose ``['content']`` entry is an iterable of
            document strings (e.g. a DataFrame with a 'content' column).
        stop_list: Iterable of words to drop from the token lists.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``jieba.lcut``, which is what the original used.

    Returns:
        A list with one token list per document, in input order.
    """
    if tokenizer is None:
        tokenizer = jieba.lcut
    # Build the lookup set once: testing membership against the raw list costs
    # a full scan of the stop-word list for every single token.
    stop_words = set(stop_list)
    return [
        [word for word in tokenizer(doc.strip()) if word not in stop_words]
        for doc in all_data['content']
    ]
# Tokenize the 'ai' documents with the stop words removed (the stop-word list
# is loaded inline) and print the resulting token lists.
result1 = preProcess(ai, getStopword(file_path))
print(result1)
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora
def calculate(resultx):
    """Build the vocabulary, bag-of-words corpus and TF-IDF weights.

    Args:
        resultx: List of token lists, one per document.

    Returns:
        A 3-tuple ``(token2id, corpus, tfidf)``: the dictionary's token -> id
        mapping, the ``doc2bow`` representation of every document, and the
        un-normalized TF-IDF representation of every document.

    All three values are also printed.
    """
    vocab = corpora.Dictionary(resultx)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in resultx]
    # normalize=False keeps the raw weights instead of unit-length vectors.
    model = TfidfModel(bow_corpus, normalize=False)
    weights = list(model[bow_corpus])
    token_ids = vocab.token2id
    print('词典:', token_ids)
    print('词频:', bow_corpus)
    print('词的tf-idf值:', weights)
    return token_ids, bow_corpus, weights
# Fit the TF-IDF model on the tokenized documents.
idic, corpus, word_tf_tdf = calculate(result1)
# Accumulators appended to by search(): term ids and their TF-IDF weights.
max_pic, max_fre = [], []
def search(resultx, a, tfidf=None):
    """Record the highest-weighted term(s) of document ``a``.

    For every term of document ``a`` whose TF-IDF weight equals the document's
    maximum (ties are all kept), the weight is printed and appended to the
    module-level ``max_fre`` and the term id is appended to ``max_pic``.

    Args:
        resultx: List of token lists, passed to ``calculate`` when ``tfidf``
            is not supplied.
        a: Index of the document to inspect.
        tfidf: Optional precomputed per-document TF-IDF list (the third value
            returned by ``calculate``).  Passing it avoids re-fitting the
            model, and re-printing its output, on every call.

    Returns:
        The module-level accumulators ``(max_pic, max_fre)``.

    Raises:
        IndexError: If ``a`` is out of range.
    """
    if tfidf is None:
        # Backward-compatible default: fit the model for this call.
        _, _, tfidf = calculate(resultx)
    doc_weights = tfidf[a]
    # Compute the maximum once instead of once per term; an empty document
    # yields None and therefore matches nothing below.
    top = max((weight for _, weight in doc_weights), default=None)
    for term_id, weight in doc_weights:
        if weight == top:
            print(top)
            max_fre.append(top)
            max_pic.append(term_id)
    return max_pic, max_fre
# Collect the top TF-IDF term(s) of every document into max_pic / max_fre.
for i in range(len(word_tf_tdf)):
    search(result1, i)
print(max_pic)
print(max_fre)
dictionary_s = idic
# Key words: every dictionary token whose id was recorded as a top term,
# listed in dictionary order.
top_ids = set(max_pic)
key_words = [word for word, term_id in dictionary_s.items() if term_id in top_ids]
# NOTE(review): the last key word is dropped as in the original; the reason is
# not visible in this file -- confirm it is intended.  The guard only avoids
# an IndexError when no key word was found.
if key_words:
    key_words.pop(-1)
print(key_words)
# Build the word -> weight dictionary.
# max_pic[k] and max_fre[k] describe the same term, so the weight has to be
# looked up by term id.  Zipping key_words (dictionary order, de-duplicated)
# against max_fre (document order) attached weights to the wrong words.
best_weight = {}
for term_id, weight in zip(max_pic, max_fre):
    # A term can top several documents; keep its largest weight.
    if term_id not in best_weight or weight > best_weight[term_id]:
        best_weight[term_id] = weight
dict_zip = {word: best_weight[dictionary_s[word]] for word in key_words}
print(dict_zip)
# 绘制词云
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def draw(y):
    """Render a word cloud from a word -> weight mapping and display it.

    Args:
        y: Mapping of words to weights, passed to
            ``WordCloud.generate_from_frequencies``.
    """
    cloud_options = dict(
        background_color='white',  # canvas colour
        width=900,
        height=600,
        max_words=100,             # upper bound on the number of words drawn
        font_path='simhei.ttf',    # font file used for the (Chinese) words
        max_font_size=99,          # largest font size
        min_font_size=16,          # smallest font size
        random_state=50,           # fixed random state for the generator
    )
    my_cloud = WordCloud(**cloud_options).generate_from_frequencies(y)
    # Show the generated image without axes.
    plt.imshow(my_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Render the word cloud for the word -> weight dictionary.
draw(dict_zip)