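# Page residue from the source web page (not part of the script):
# 0 点赞 (likes) · 收藏 (favorite) · 分享 (share) · 微信扫一扫 (scan with WeChat)
#
# python 批量打开数据 — Python: batch-open data files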


import xlrd
import xlwt
from datetime import date,datetime
import os

from sklearn.cluster import KMeans
import collections
import pandas
import numpy
import re





def opp(ggv, sheet_name=u'准确概率计算逻辑-存在竞品对比部分', column=7):
    """Read one column of a worksheet from an Excel workbook.

    Args:
        ggv: Path of the workbook; passed straight to ``xlrd.open_workbook``.
        sheet_name: Name of the worksheet to read. Defaults to the sheet this
            script has always read.
        column: Index of the column handed to ``Sheet.col_values``
            (default 7).

    Returns:
        A list of the column's cell values with the first row dropped
        (presumably a header row -- confirm against the workbooks).

    Raises:
        Whatever ``xlrd`` raises when the file cannot be opened or the
        requested sheet does not exist; nothing is caught here.
    """
    workbook = xlrd.open_workbook(ggv)
    table = workbook.sheet_by_name(sheet_name)
    # Skip the first row so only the rows below it are returned.
    return table.col_values(column)[1:]

#print(opp(ggv))
#fopen = open('file', 'r')




def k_means(pp, n_clusters=3, random_state=None):
    """Return the centre of the most populated K-Means cluster of *pp*.

    The values are clustered in one dimension with scikit-learn's ``KMeans``;
    the centroid of the cluster that holds the most samples is returned,
    truncated to ``int``.

    Args:
        pp: Iterable of hashable values to cluster (the caller passes one
            spreadsheet column).
        n_clusters: Number of clusters to fit (default 3).
        random_state: Forwarded to ``KMeans``; pass an int for reproducible
            results. ``None`` keeps the estimator's default behaviour.

    Returns:
        The dominant cluster's centre as an ``int``, or ``0`` when *pp* does
        not contain more than ``n_clusters`` distinct values.
    """
    values = list(pp)

    # Guard first: with too few distinct values no clustering is attempted.
    if len(set(values)) <= n_clusters:
        return 0

    # KMeans expects a 2-D array of shape (n_samples, n_features).
    samples = numpy.array(values).reshape(-1, 1)
    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(samples)

    # Count members per cluster *label*. bincount is indexed by label, so the
    # position of the maximum can be used directly as a row index into
    # cluster_centers_ (a Counter would be ordered by first appearance of each
    # label instead, which does not line up with the centroid rows).
    sizes = numpy.bincount(estimator.labels_, minlength=n_clusters)
    largest = int(sizes.argmax())

    return int(estimator.cluster_centers_[largest][0])
# Every regular file in the working directory is treated as an input workbook,
# except this script itself and the report written at the end of the run
# (which would otherwise be picked up as an input on the next run and fail).
lis = os.listdir()
lld = [
    name for name in lis
    if name not in ('jjj.py', 'bbcc.xls') and os.path.isfile(name)
]

# Characters stripped from a file name to obtain the city label.
# NOTE(review): '.' is not part of this character class, so a dot in the file
# name survives in the label -- confirm whether that is intended.
name_noise = re.compile(r"[A-Za-z0-9\!\%\[\]\,\。]")

bb = []

for i in lld:
    # One row per workbook: [city label, dominant-cluster price].
    out = k_means(opp(i))
    aa = [name_noise.sub("", i), out]
    bb.append(aa)
    print(aa)

bbcc = pandas.DataFrame(bb, columns=['city', 'price'])

# k_means returns 0 when a workbook has too few distinct values; drop those.
bbcc = bbcc[bbcc['price'] > 0]

bbcc.to_excel('bbcc.xls')


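# Page residue from the source web page (not part of the script):
# 举报 (report) · 相关推荐 (related recommendations) · 0 条评论 (0 comments)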