KNN算法代码练习-CFANZ编程社区

# Generate the data
# Use the make_blobs function to generate the data points.
# NOTE: the old `sklearn.datasets.samples_generator` module was deprecated in
# scikit-learn 0.22 and removed in 0.24; `sklearn.datasets` exports the same
# function on both old and new versions.
from sklearn.datasets import make_blobs

# Start generating data
# First specify the cluster centers
centers = [[-2, 2], [2, 2], [0, 4]]

# n_samples: total number of sample points to generate
# centers: the cluster centers; the samples are scattered around them
# cluster_std: standard deviation of each cluster
x, y = make_blobs(n_samples=60, centers=centers, cluster_std=0.6)
x   # coordinates of the generated samples (60 rows, 2 columns)

array([[-0.08600408,  4.76782848],
       [ 1.3264166 ,  2.37070694],
       [-1.72062744,  2.02582475],
       [ 1.82236685,  2.37271542],
       [-0.30905406,  3.33599758],
       [-0.82732245,  5.17405475],
       [-1.21796898,  2.71879751],
       [ 2.56344749,  2.46380225],
       [ 0.52656441,  3.7157634 ],
       [-1.71565964,  1.58782602],
       [-1.99734574,  2.57046735],
       [-0.6105748 ,  4.32747884],
       [-2.47076485,  0.88160997],
       [-3.09683627,  1.41810538],
       [-0.52244609,  2.88409006],
       [ 0.39124443,  3.93395445],
       [-1.06694802,  2.32322413],
       [ 1.90409727,  2.65346845],
       [-1.47557156,  1.61860603],
       [-0.04215071,  4.18023282],
       [ 1.89614869,  2.19559042],
       [ 2.02560988,  1.32335769],
       [ 2.91560799,  2.69000016],
       [ 1.6580168 ,  1.93073504],
       [ 2.21438265,  3.15312259],
       [ 1.36771487,  2.02817708],
       [ 1.47486871,  1.63766953],
       [-1.38453032,  2.39746726],
       [-2.72504984,  0.29416243],
       [ 0.21979213,  4.27991644],
       [-1.9511985 ,  2.93460142],
       [-0.95264873,  3.863339  ],
       [ 0.42596559,  4.23156268],
       [ 1.14964804,  2.04579981],
       [ 2.45720763,  1.15868596],
       [ 0.14417972,  3.2681776 ],
       [ 1.63766699,  1.79797613],
       [-0.46723236,  3.56507274],
       [-2.00768023,  2.37756443],
       [-0.77164142,  3.07097342],
       [-2.656692  ,  3.18641188],
       [-1.9520929 ,  2.09099171],
       [ 2.04712466,  1.48390184],
       [-2.75281105,  2.12433143],
       [ 1.73602442,  1.86827883],
       [-1.92312906,  3.43099513],
       [ 2.74801908,  2.44293206],
       [ 1.78400907,  2.74081806],
       [-1.7443908 ,  2.54702662],
       [ 0.6563619 ,  2.89757851],
       [-2.33900649,  1.57665199],
       [ 0.74381669,  3.35677091],
       [-2.98695279,  1.2417663 ],
       [ 2.05425203,  1.89126521],
       [-0.08953169,  4.82684634],
       [ 2.1258002 ,  3.38440537],
       [-1.61529457,  2.17528076],
       [ 0.11167727,  4.1635114 ],
       [ 0.31234374,  4.37822912],
       [-0.11923692,  4.38824902]])

y   # integer cluster label of each sample; the labels 0, 1, 2 correspond to the 3 centers

array([2, 1, 0, 1, 2, 2, 0, 1, 2, 0, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 2, 0, 2, 2, 1, 1, 2, 1, 2, 0, 2, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 2, 0, 2, 0, 1, 2, 1, 0, 2, 2, 2])

len(y)

## Visualize the generated sample data
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(20, 10))
c = np.asarray(centers)   # centers as an array so its columns can be split

# Sample points, colored by cluster label.
# x.T unpacks into the two coordinate columns (first column, second column).
plt.scatter(*x.T, c=y, s=100, cmap='cool')

# Draw the cluster centers: the orange triangles
plt.scatter(*c.T, s=100, marker='^', c='orange')
plt.show()

output6

开始KNN算法训练

from sklearn.neighbors import KNeighborsClassifier

k = 5   # number of neighbors, passed as n_neighbors below
# Train the model on the generated samples x and their cluster labels y
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(x, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Look up the nearest training samples of a new point.
# With return_distance=False only the neighbor indices are returned (they are
# used further down to index into x); no class label is predicted here.
x_sample =[[0, 2]]
neighbors = clf.kneighbors(x_sample, return_distance=False)
neighbors

array([[14, 49, 16, 33, 35]], dtype=int64)

**把待预测的样本及其最近的样本点画出来**

# Plot the query point together with its nearest training samples.
plt.figure(figsize=(16, 10))

plt.scatter(x[:, 0], x[:, 1], c=y, s=100, cmap='cool')    # sample points
plt.scatter(c[:, 0], c[:, 1], s=100, marker='^', c='k')   # cluster centers

# Query point. No `c` values are supplied for it, so a `cmap` argument would
# be ignored (matplotlib emits a warning about it); it is therefore omitted.
query_x, query_y = x_sample[0]
plt.scatter(query_x, query_y, marker='x', s=100)

# Connect the query point to each of its nearest neighbors with a dashed line.
# neighbors[0] holds the neighbor indices, so x[neighbors[0]] selects their rows.
for neighbor_x, neighbor_y in x[neighbors[0]]:
    plt.plot([neighbor_x, query_x], [neighbor_y, query_y], 'k--', linewidth=0.6)
plt.show()

output12

neighbors[0]

array([14, 49, 16, 33, 35], dtype=int64)

基于iris数据集的KNN算法实践

# 加载iris数据集
from sklearn.datasets import load_iris

iris = load_iris()
dir(iris)   # 查看该对象包含的属性或方法

['DESCR', 'data', 'feature_names', 'target', 'target_names']

# Bind the pieces of the loaded dataset to standalone names:
# the feature values and class labels first, then their human-readable names.
iris_data, iris_target = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

iris_data.shape

(150, 4)

iris_data[0]

array([ 5.1,  3.5,  1.4,  0.2])

feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

iris_target.shape

(150,)

target_names

array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

iris_target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 25% of the samples for testing. No random_state is passed, so the
# split (and therefore the results shown below) can differ from run to run.
x_train, x_test, y_train, y_test = train_test_split(iris_data, iris_target, test_size=0.25)
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier on the training portion only
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Get the predicted class labels for the held-out test samples
y_predict = knn.predict(x_test)
y_predict

array([2, 2, 0, 2, 0, 2, 0, 0, 1, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1, 2, 0, 2, 0,
       1, 0, 2, 2, 0, 2, 1, 1, 2, 1, 2, 0, 1, 2, 0])

compare_matrix = y_predict == y_test    # element-wise comparison of the predictions against the true labels
compare_matrix

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

# The class name that each integer label stands for
target_names

array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

# Chinese display names for the classes, indexed by integer label.
# target_names is ['setosa', 'versicolor', 'virginica'], so the order must be
#   0 setosa -> 山鸢尾, 1 versicolor -> 变色鸢尾, 2 virginica -> 维吉尼亚鸢尾.
# NOTE: any previously recorded output that shows '虹膜锦葵' predates this
# correction of the label names.
labels = ['山鸢尾', '变色鸢尾', '维吉尼亚鸢尾']

# Print the true and predicted species for every test sample, numbered from 1
for i, (actual, predicted) in enumerate(zip(y_test, y_predict), start=1):
    print('第%d次测试：真实值是 %s \t 预测值是 %s '%(i, labels[actual], labels[predicted]))

第1次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第2次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第3次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第4次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第5次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第6次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第7次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第8次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第9次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第10次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第11次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第12次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第13次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第14次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第15次测试：真实值是 虹膜锦葵 	 预测值是 变色鸢尾 
第16次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第17次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第18次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第19次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第20次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第21次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第22次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第23次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第24次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第25次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第26次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第27次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第28次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第29次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第30次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第31次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第32次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第33次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第34次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第35次测试：真实值是 山鸢尾 	 预测值是 山鸢尾 
第36次测试：真实值是 虹膜锦葵 	 预测值是 虹膜锦葵 
第37次测试：真实值是 变色鸢尾 	 预测值是 变色鸢尾 
第38次测试：真实值是 山鸢尾 	 预测值是 山鸢尾

# Check the accuracy on the held-out test set
knn.score(x_test, y_test)

0.97368421052631582

from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# SimHei is a Chinese font; presumably selected so that the Chinese axis
# labels below render instead of showing empty boxes.
font = {'family': 'SimHei', 'size':'20'}
plt.rc('font', **font)

k_range = range(1,31)

# For each k from 1 to 30, estimate the error rate as 1 - mean CV accuracy.
# cv=6 requests 6-fold cross-validation: every round trains on 5/6 of the data
# and validates on the remaining 1/6 (the 5:1 ratio). When cv is omitted,
# scikit-learn uses 5 folds since version 0.22 (3 folds before that).
# A fresh estimator is built inside the comprehension so that the names `knn`
# and `k` from earlier cells are not rebound; the original loop overwrote the
# classifier that had been trained above.
k_error = [
    1 - cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors),
        iris_data,
        iris_target,
        cv=6,
    ).mean()
    for n_neighbors in k_range
]

# Plot the error rate against k
plt.plot(k_range, k_error)
plt.xlabel('K的值')
plt.ylabel('错误')
plt.show()

output_32

-----------------------------------------------END------------------------------------------------