定义
输入:n 个样本的集合 X;
输出:样本集合的聚类 C^*。
(1)初始化。令 t=0,随机选择 k 个样本点作为初始聚类中心 m^{(0)} = \big( m_1^{(0)},\cdots,m_l^{(0)},\cdots,m_k^{(0)} \big)。
(2)对样本进行聚类。对固定的类中心 m^{(t)} = \big( m_1^{(t)},\cdots,m_l^{(t)},\cdots,m_k^{(t)} \big),其中 m_l^{(t)} 为类 G_l 的中心,计算每个样本到类中心的距离,将每个样本指派到与其最近的中心的类中,构成聚类结果 C^{(t)}。
(3)计算新的类中心。对聚类结果 C^{(t)},计算当前各个类中的样本的均值,作为新的类中心 m^{(t+1)} = \big( m_1^{(t+1)},\cdots,m_l^{(t+1)},\cdots,m_k^{(t+1)} \big)。
(4)如果迭代收敛或符合停止条件,输出 C^* = C^{(t)};否则,令 t = t+1,返回步骤(2)。
输入空间
T = \left\{ x_1, x_2, \dots, x_N \right\}
import numpy as np
import time
import math
import random
from scipy.special import comb
import matplotlib.pyplot as plt
%matplotlib inline
def load_data(file):
    '''
    Load the iris dataset from a comma-separated file.
    Download: https://download.csdn.net/download/nanxiaotao/89743712

    :param file: path of the file to load
    :return: (Xarray, Ylist) — feature array of shape (n_samples, n_features)
             and the list of class-label strings
    '''
    Xlist = []
    Ylist = []
    # 'with' guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(file) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines: iris.data ends with an empty line that
                # would otherwise produce a ragged row in Xlist.
                continue
            cur = line.split(',')
            label = cur[-1]  # last column is the class label (newline stripped above)
            X = [float(x) for x in cur[:-1]]  # remaining columns are features
            Xlist.append(X)
            Ylist.append(label)
    Xarray = np.array(Xlist)
    return Xarray, Ylist
Xarray, Ylist = load_data('iris.data')  # load iris features and labels from disk
np.shape(Xarray)  # notebook-style expression: inspect the dataset shape
def Normalize(Xarray):
    '''
    Min-max scale every feature column of Xarray into [0, 1], in place.

    INPUT:
    Xarray - (array) feature data array, shape (n_samples, n_features)
    OUTPUT:
    Xarray - (array) the same array, scaled column by column
    '''
    for f in range(Xarray.shape[1]):
        maxf = np.max(Xarray[:, f])
        minf = np.min(Xarray[:, f])
        # Vectorized column update replaces the original per-element Python
        # loop — same arithmetic, one C-level pass per column.
        # NOTE(review): a constant column (maxf == minf) divides by zero,
        # as in the original — confirm inputs always vary per feature.
        Xarray[:, f] = (Xarray[:, f] - minf) / (maxf - minf)
    return Xarray
Xarray = Normalize(Xarray) # min-max scale each feature into [0, 1]
np.shape(Xarray)  # confirm the shape is unchanged by scaling
特征空间(Feature Space)
Xarray[0][0:4]
统计学习方法
模型
y = f(x_1,x_2,x_3,x_4),\quad x_1,x_2,x_3,x_4 \in \chi \subseteq R^n,\quad y \in \{ \text{Iris-setosa}, \text{Iris-versicolor}, \text{Iris-virginica} \}
策略
C^* = \arg\min_{C} \sum_{l=1}^k \sum_{C(i)=l} \big\| x_i - \overline{x}_l \big\|^2
算法
k = 3 # number of clusters
iters = 100 # number of k-means iterations
d_{ij} = \bigg( \sum_{k=1}^m \big| x_{ki} - x_{kj} \big|^2 \bigg)^{\frac{1}{2}}
def cal_distance(xi, xj):
    '''
    Euclidean distance between two feature vectors.

    INPUT:
    xi - (array) the i-th feature row
    xj - (array) the j-th feature row
    OUTPUT:
    dist - (float) Euclidean distance between the two rows
    '''
    # Sum the squared per-coordinate differences, then take the root.
    squared = sum((xi[col] - xj[col]) ** 2 for col in range(len(xi)))
    return math.sqrt(squared)
\overline{x}_l = \big( \overline{x}_{1l}, \overline{x}_{2l}, \cdots, \overline{x}_{ml} \big)
def cal_groupcenter(group, Xarray):
    '''
    Compute the center (per-feature mean) of one cluster.

    INPUT:
    group  - (list) indices of the samples belonging to the cluster
    Xarray - (array) feature data array
    OUTPUT:
    center - (array) new cluster center, the mean vector over `group`
    '''
    center = np.zeros(Xarray.shape[1])
    if group:
        # BUG FIX: the mean must divide the per-cluster sum by the cluster
        # size len(group), not by the total sample count Xarray.shape[0];
        # the original shrank every center toward the origin.
        center = Xarray[group].mean(axis=0)
    # An empty cluster keeps the zero vector, as the original produced.
    return center
def Kmeans(Xarray, k, iters):
    '''
    k-means clustering.

    INPUT:
    Xarray - (array) feature data array
    k      - (int) number of clusters
    iters  - (int) number of iterations to run
    OUTPUT:
    group_dict - (dict) cluster index -> list of member sample indices
    '''
    # Seed the centers with k distinct randomly-chosen samples.
    seed_rows = random.sample(range(Xarray.shape[0]), k)
    centers = [Xarray[row] for row in seed_rows]
    for _ in range(iters):
        group_dict = {c: [] for c in range(k)}
        # Assignment step: each sample joins the cluster of its nearest
        # center (ties break toward the lowest cluster index).
        for n in range(Xarray.shape[0]):
            nearest = min(range(k),
                          key=lambda c: cal_distance(Xarray[n], centers[c]))
            group_dict[nearest].append(n)
        # Update step: recompute every center from its current members.
        for c in range(k):
            centers[c] = cal_groupcenter(group_dict[c], Xarray)
    return group_dict
group_dict = Kmeans(Xarray, k, iters) #进行k均值聚类
模型评估
训练误差
ARI = \dfrac{RI - E[RI]}{\max(RI) - E[RI]},\quad ARI \in [-1,1],其中 RI = \dfrac{a+b}{\binom{n}{2}},\ RI \in [0,1],n 为实例总数,\binom{n}{2} = C_n^2 = \dfrac{n(n-1)}{2}
def Adjusted_Rand_Index(group_dict, Ylist, k):
    '''
    Adjusted Rand Index (ARI) between a clustering and the true labels.

    INPUT:
    group_dict - (dict) cluster index -> list of member sample indices
    Ylist      - (list) true class-label list, one label per sample
    k          - (int) number of clusters / classes
    OUTPUT:
    (float) adjusted Rand index, in [-1, 1]
    '''
    ylabel = list(set(Ylist))
    # Map each sample index to its true-class column index once, so the
    # contingency table fills in O(n) instead of the original
    # O(k^2 * n * |group|) triple loop with list-membership tests.
    y_of = [ylabel.index(y) for y in Ylist]
    # Contingency table: rows = predicted clusters, cols = true classes.
    group_array = np.zeros((k, k))
    for i in range(k):
        for n in group_dict[i]:
            group_array[i][y_of[n]] += 1
    RI = 0  # sum of C(n_ij, 2) over all contingency cells
    sum_i = group_array.sum(axis=1)  # predicted-cluster sizes
    sum_j = group_array.sum(axis=0)  # true-class sizes
    for i in range(k):
        for j in range(k):
            if group_array[i][j] >= 2:
                RI += comb(group_array[i][j], 2)
    # ci / cj: pair counts within predicted clusters / within true classes.
    ci = 0
    cj = 0
    for i in range(k):
        if sum_i[i] >= 2:
            ci += comb(sum_i[i], 2)
    for j in range(k):
        if sum_j[j] >= 2:
            cj += comb(sum_j[j], 2)
    E_RI = ci * cj / comb(len(Ylist), 2)  # expected index under chance
    max_RI = (ci + cj) / 2                # maximum attainable index
    return (RI - E_RI) / (max_RI - E_RI)
ARI = Adjusted_Rand_Index(group_dict, Ylist, k)  # score clustering against true labels
print('Adjusted Rand Index:', ARI)