import xlrd
import xlwt
from datetime import date,datetime
import os
from sklearn.cluster import KMeans
import collections
import pandas
import numpy
import re
def opp(ggv):
    """Read one fixed column from a fixed sheet of an Excel workbook.

    Args:
        ggv: Path of the workbook, passed to ``xlrd.open_workbook``.

    Returns:
        list: The cell values of column index 7 of the sheet named by
        ``sheet_name``, with the first entry dropped (presumably the
        header cell -- confirm against the workbooks).
    """
    sheet_name = u'准确概率计算逻辑-存在竞品对比部分'
    sheet = xlrd.open_workbook(ggv).sheet_by_name(sheet_name)
    # Slice instead of unpacking so an empty column still yields [].
    return sheet.col_values(7)[1:]
#print(opp(ggv))
#fopen = open('file', 'r')
def k_means(pp):
    """Return the centre of the most populated of three k-means clusters.

    The values are clustered in one dimension with ``KMeans(n_clusters=3)``
    and the centre of the cluster that holds the most samples is returned,
    truncated to ``int``.

    Args:
        pp: Iterable of numeric values. The values must be hashable because
            the number of distinct values is determined with ``set``.

    Returns:
        int: The truncated centre of the largest cluster, or ``0`` when
        ``pp`` contains three or fewer distinct values.
    """
    n_clusters = 3
    values = list(pp)

    # Clustering is only attempted with more than three distinct values;
    # otherwise 0 is returned as the "no result" marker.
    if len(set(values)) <= n_clusters:
        return 0

    # Column vector: one sample per row, a single feature per sample.
    samples = numpy.array([values]).T
    estimator = KMeans(n_clusters=n_clusters)  # build the clusterer
    estimator.fit(samples)  # run the clustering

    # Count members per cluster indexed BY LABEL, so the position of the
    # largest count is directly a valid row index into cluster_centers_.
    # Counting with collections.Counter orders the counts by the first
    # appearance of each label instead, which can select the wrong centre.
    sizes = numpy.bincount(estimator.labels_, minlength=n_clusters)
    dominant_label = int(sizes.argmax())

    return int(estimator.cluster_centers_[dominant_label][0])
"""
changzhou=pandas.read_excel('cz.xlsx')
data=changzhou['房价网均价']
"""
# Batch driver: for every workbook in the current working directory, read
# the price column with opp(), reduce it to one representative price with
# k_means(), and collect the results into 'bbcc.xls'.

# Directory entries that are never input workbooks: this script itself and
# the report it writes. Without skipping the report, a second run would try
# to open its own output as an input workbook and fail.
_skipped_entries = {'jjj.py', 'bbcc.xls'}

# Characters removed from a file name to obtain the label stored in the
# 'city' column. Raw string: the pattern is unchanged, but the backslashes
# are no longer invalid string escapes.
# NOTE(review): the ASCII '.' is not part of this class, so the dot of the
# file extension stays in the label -- confirm this is intended.
_name_noise = re.compile(r"[A-Za-z0-9\!\%\[\]\,\。]")

bb = []
for entry in os.listdir():  # all files and directories in the cwd
    # Subdirectories cannot be opened as workbooks, so skip them as well.
    if entry in _skipped_entries or not os.path.isfile(entry):
        continue
    price = k_means(opp(entry))
    city = _name_noise.sub("", entry)
    row = [city, price]
    bb.append(row)
    print(row)

bbcc = pandas.DataFrame(bb, columns=['city', 'price'])
# k_means() returns 0 when it did not cluster; drop those rows.
bbcc = bbcc[bbcc['price'] > 0]
# NOTE(review): writing the legacy '.xls' format needs an xls writer engine
# (xlwt); verify the installed pandas version still supports it.
bbcc.to_excel('bbcc.xls')