0
点赞
收藏
分享

微信扫一扫

机器学习实战:knn海伦约会


机器学习实战:knn海伦约会_数据


机器学习实战:knn海伦约会_数据_02

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy

def autoNorm(x):
    """
    Min-max normalise ``x`` along axis 0 so each column lies in [0, 1].

    A column whose values are all identical has a range of zero. Such a
    column is mapped to 0 instead of being divided by zero, which would
    otherwise fill it with NaN and poison every distance computed later.

    :param x: feature array (must be a ``numpy.ndarray``)
    :return: tuple ``(x_new, ranges, minVals)`` - the normalised array, the
        per-column range (max - min, left as 0 for constant columns) and the
        per-column minimum
    """
    assert isinstance(x, np.ndarray)

    minVals = x.min(axis=0)
    maxVals = x.max(axis=0)
    ranges = maxVals - minVals

    # Use a divisor of 1 wherever the range is 0; the numerator is already 0
    # there, so constant columns normalise to 0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1, ranges)

    x_new = (x - minVals) / safeRanges  # broadcast over the rows

    return x_new, ranges, minVals

def getdata_normal(fp="datingTestSet.txt"):
    """
    Read the raw tab-separated text data set.

    Each non-blank line holds the feature values followed by the class label
    in the last column, separated by tabs.

    :param fp: path of the data file; defaults to ``datingTestSet.txt`` in
        the current working directory
    :return: tuple ``(x, y)`` - ``x`` is a float array built from every
        column except the last, ``y`` is the array of label strings taken
        from the last column
    """
    # The context manager closes the file even if parsing raises.
    with open(fp, mode='r') as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # they would otherwise produce ragged rows and break the 2-D slicing.
        data = [line.strip().split('\t') for line in f if line.strip()]
    data = np.array(data)

    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # documented replacement and yields the same float64 dtype.
    x = data[:, :-1].astype(float)
    y = data[:, -1]

    # Optional feature visualisation (kept disabled):
    # labels = copy.deepcopy(y)
    # dic = {"largeDoses": "r", "smallDoses": 'g', "didntLike": 'b'}
    # for k, v in dic.items():
    #     labels[labels == k] = v
    # plt.scatter(x[:, 0], x[:, 1], 10, labels)
    # plt.title(dic)
    # plt.show()

    return x, y

def knnClaffify(testItem, trainX, trainY, k):
    """
    Classify a single sample with the k-nearest-neighbours rule.

    The Euclidean distance from ``testItem`` to every row of ``trainX`` is
    computed, the ``k`` closest rows vote with their labels, and the label
    with the most votes wins. On a tie the label that was seen first (i.e.
    the one belonging to the nearer neighbour) is returned.

    :param testItem: the single sample to classify
    :param trainX: training-set features, one row per sample
    :param trainY: training-set labels, aligned with ``trainX``
    :param k: number of neighbours that vote; values larger than the
        training set are clamped to the training-set size
    :return: the predicted class label (an element of ``trainY``)
    :raises ValueError: if ``k`` is smaller than 1 or the training set is empty
    """
    if k < 1 or len(trainX) == 0:
        raise ValueError("k must be >= 1 and the training set must not be empty")

    distances = np.sqrt(np.sum((trainX - testItem) ** 2, axis=1))

    ind = np.argsort(distances)

    # Tally the labels of the k closest samples; slicing clamps k to the
    # number of available training samples.
    classCount = {}
    for i in ind[:k]:
        vote = trainY[i]
        classCount[vote] = classCount.get(vote, 0) + 1

    # Choose by vote COUNT, not by label name. ``max`` returns the first
    # maximal key in insertion order, so ties go to the nearest neighbour.
    return max(classCount, key=classCount.get)

def knnTest():
    """
    Evaluate the classifier with a simple hold-out split.

    The data set is loaded and normalised, the first 90% of the rows are
    used as the training set and the remaining rows as the test set. Every
    test row is classified with k=3 and the accuracy is printed.

    :return: None
    """
    features, labels = getdata_normal()
    features = autoNorm(features)[0]

    total = len(features)
    # Split position between training and test rows: training share is 0.9.
    splittest = int(0.9 * total)
    print("分割位置:{},总数:{}".format(splittest, total))

    trainFeatures = features[:splittest]
    trainLabels = labels[:splittest]
    testSize = total - splittest

    trueCount = 0
    for sample, expected in zip(features[splittest:], labels[splittest:]):
        predicted = knnClaffify(sample, trainFeatures, trainLabels, k=3)
        trueCount += (expected == predicted)

    print("正确率:{2}({0}/{1})".format(trueCount, testSize, trueCount / testSize))

def knnForPerson():
    """
    Interactively classify one person.

    The data set is loaded and normalised, three feature values are read
    from standard input, scaled with the training set's minimum and range,
    and the k=3 prediction is printed.

    :return: None
    """
    features, labels = getdata_normal()
    features, ranges, minVals = autoNorm(features)

    # Prompts are asked in order; each answer is converted before the next
    # prompt is shown.
    prompts = ("每年旅行距离:", "玩游戏时间占比:", "每周吃的冰激凌:")
    answers = [float(input(prompt)) for prompt in prompts]

    # Apply the same min-max scaling that was applied to the training data.
    item = (np.array(answers) - minVals) / ranges
    result = knnClaffify(item, features, labels, k=3)
    print("predict: ", result)

if __name__ == '__main__':
    # Script entry point: run the hold-out evaluation first, then the
    # interactive single-person prediction.
    for entryPoint in (knnTest, knnForPerson):
        entryPoint()


举报

相关推荐

0 条评论