Python实现线性判别分析鸢尾花数据集或随机生成两个线性可分的数据集-CFANZ编程社区

线性判别分析是一种经典的线性学习方法，在二分类问题上最早由Fisher在1936年提出，亦称Fisher线性判别。线性判别的思想非常朴素：给定训练样例集，设法将样例投影到一条直线上，使得同类样例的投影点尽可能接近，异类样例的投影点尽可能远离；在对新样本进行分类时，将其投影到同样的直线上，再根据投影点的位置来确定新样本的类别。

使用Python：

1. 导入线性判别分析函数或其它常用机器学习库，并自学线性判别分析函数的使用方法

2. 导入鸢尾花数据集或随机生成两个线性可分的数据集

3. 编写程序，使用线性判别分析对2个数据集进行分类

4. 以列表或图像的方式输出判别结果

代码如下：

# Step-by-step Linear Discriminant Analysis (LDA) on the iris data set:
# load the data, build the within-class and between-class scatter matrices,
# solve the eigenproblem of inv(S_W).S_B and project onto the top two axes.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

feature_dict = {i: label for i, label in zip(
    range(4),
    ("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",))}

# The column assignment below requires the CSV to have exactly six columns
# (a running number, the four features, the species).
df = pd.read_csv('iris_training1.csv', sep=',')
df.columns = ["Number"] + [l for i, l in sorted(feature_dict.items())] + ['Species']
# to drop the empty line at file-end
df.dropna(how='all', inplace=True)

# Split the frame into the feature matrix X and the label vector y.
X = df[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]].values
y = df['Species'].values
enc = LabelEncoder()
label_encoder = enc.fit(y)
# Shift the encoded labels from 0..2 to 1..3 so they match label_dict's keys.
y = label_encoder.transform(y) + 1
# NOTE(review): assumes the encoder's class order is setosa, versicolor,
# virginica -- confirm against the species strings in the CSV.
label_dict = {1: 'setosa', 2: 'versicolor', 3: 'virginica'}

# Mean vector of each of the three classes.
np.set_printoptions(precision=4)
mean_vectors = []
for c1 in range(1, 4):
    mean_vectors.append(np.mean(X[y == c1], axis=0))
    print('Mean Vector class %s : %s\n' % (c1, mean_vectors[c1 - 1]))

# Compute the two 4x4 matrices: within-class and between-class scatter.
# Within-class scatter matrix S_W.
S_W = np.zeros((4, 4))
for c1, mv in zip(range(1, 4), mean_vectors):
    # scatter matrix for every class
    class_sc_mat = np.zeros((4, 4))
    for row in X[y == c1]:
        # make column vectors
        row, mv = row.reshape(4, 1), mv.reshape(4, 1)
        class_sc_mat += (row - mv).dot((row - mv).T)
    # sum class scatter matrices
    S_W += class_sc_mat

# Between-class scatter matrix S_B.
# The overall mean is loop-invariant, so it is made a column vector once.
overall_mean = np.mean(X, axis=0).reshape(4, 1)
S_B = np.zeros((4, 4))
for i, mean_vec in enumerate(mean_vectors):
    # number of samples in class i + 1
    n = X[y == i + 1, :].shape[0]
    # make column vector
    mean_vec = mean_vec.reshape(4, 1)
    # Weight the outer product by the class size; the multiplication sign
    # is required here (without it `n (...)` tries to call an int).
    S_B += n * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T)

# Eigen-decomposition of inv(S_W).S_B.
eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
for i in range(len(eig_vals)):
    eigvec_sc = eig_vecs[:, i].reshape(4, 1)
    print('\n Eigenvector {}: \n {}'.format(i+1, eigvec_sc.real))
    print('Eigenvalue {: }: {:.2e}'.format(i+1, eig_vals[i].real))

# make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
# Visually confirm that the list is correctly sorted by decreasing
# eigenvalues (the values printed in the loop are the eigenvalue magnitudes).
print('特征向量: \n')
for i in eig_pairs:
    print(i[0])

# Share of each eigenvalue in the eigenvalue sum.
eigv_sum = sum(eig_vals)
for i, j in enumerate(eig_pairs):
    print('eigenvalue {0:}: {1:.2%}'.format(i + 1, (j[0] / eigv_sum).real))

# Keep the two leading eigenvectors as the 4x2 projection matrix W.
W = np.hstack((eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))
# Project the samples onto the new two-dimensional subspace.
X_lda = X.dot(W)
assert X_lda.shape == (150, 2), 'The matrix is not 150x2 dimensional.'
def plt_step_lda():
    """Scatter-plot the hand-computed projection, one marker/colour per class.

    Reads the module-level ``X_lda`` (projected samples), ``y`` (labels 1..3)
    and ``label_dict`` (label -> legend name), then shows the figure.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(range(1, 4),
                                    ('o', '^', 's'),
                                    ('green', 'orange', 'pink')):
        # .real drops the imaginary part of each projected column
        plt.scatter(x=X_lda[:, 0].real[y == label],
                    y=X_lda[:, 1].real[y == label],
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title('LDA: Iris projection onto the first 2 linear discriminants')

    # hide axis ticks (tick_params takes booleans; the string 'off' is truthy)
    plt.tick_params(axis='both', which='both', bottom=False, top=False,
                    labelbottom=True, left=False, labelleft=True)

    # remove axis spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()


plt_step_lda()

# The same projection computed with scikit-learn's LDA.
sklearn_lda = LDA(n_components=2)
X_lda_sklearn = sklearn_lda.fit_transform(X, y)


def plot_scikit_lda(X, title):
    """Scatter-plot a two-column projection ``X`` by class under ``title``.

    Args:
        X: array whose first two columns are the LD1 and LD2 coordinates.
        title: figure title.

    Reads the module-level ``y`` (labels 1..3) and ``label_dict``.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(range(1, 4),
                                    ('o', '^', 's'),
                                    ('green', 'orange', 'pink')):
        # Plot the array passed in (not the global X_lda).
        plt.scatter(x=X[:, 0][y == label],
                    y=X[:, 1][y == label] * -1,  # flip the figure
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title(title)

    # hide axis ticks (tick_params takes booleans; the string 'off' is truthy)
    plt.tick_params(axis='both', which='both', bottom=False, top=False,
                    labelbottom=True, left=False, labelleft=True)

    # remove axis spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()


# Show the scikit-learn projection rather than the raw 4-D feature matrix.
plot_scikit_lda(X_lda_sklearn, title='Default LDA via scikit-learn')

# Step-by-step Linear Discriminant Analysis (LDA) on the iris data set:
# load the data, build the within-class and between-class scatter matrices,
# solve the eigenproblem of inv(S_W).S_B, project onto the top two axes and
# compare the result with scikit-learn's LDA.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

feature_dict = {i: label for i, label in zip(
    range(4),
    ("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width",))}

# The column assignment below requires the CSV to have exactly six columns
# (a running number, the four features, the species).
df = pd.read_csv('iris_training1.csv', sep=',')
df.columns = ["Number"] + [l for i, l in sorted(feature_dict.items())] + ['Species']
# to drop the empty line at file-end
df.dropna(how='all', inplace=True)

# Split the frame into the feature matrix X and the label vector y.
X = df[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]].values
y = df['Species'].values
enc = LabelEncoder()
label_encoder = enc.fit(y)
# Shift the encoded labels from 0..2 to 1..3 so they match label_dict's keys.
y = label_encoder.transform(y) + 1
# NOTE(review): assumes the encoder's class order is setosa, versicolor,
# virginica -- confirm against the species strings in the CSV.
label_dict = {1: 'setosa', 2: 'versicolor', 3: 'virginica'}

# Mean vector of each of the three classes.
np.set_printoptions(precision=4)
mean_vectors = []
for c1 in range(1, 4):
    mean_vectors.append(np.mean(X[y == c1], axis=0))
    print('Mean Vector class %s : %s\n' % (c1, mean_vectors[c1 - 1]))

# Compute the two 4x4 matrices: within-class and between-class scatter.
# Within-class scatter matrix S_W.
S_W = np.zeros((4, 4))
for c1, mv in zip(range(1, 4), mean_vectors):
    # scatter matrix for every class
    class_sc_mat = np.zeros((4, 4))
    for row in X[y == c1]:
        # make column vectors
        row, mv = row.reshape(4, 1), mv.reshape(4, 1)
        class_sc_mat += (row - mv).dot((row - mv).T)
    # sum class scatter matrices
    S_W += class_sc_mat

# Between-class scatter matrix S_B.
# The overall mean is loop-invariant, so it is made a column vector once.
overall_mean = np.mean(X, axis=0).reshape(4, 1)
S_B = np.zeros((4, 4))
for i, mean_vec in enumerate(mean_vectors):
    # number of samples in class i + 1
    n = X[y == i + 1, :].shape[0]
    # make column vector
    mean_vec = mean_vec.reshape(4, 1)
    # Weight the outer product by the class size; the multiplication sign
    # is required here (without it `n (...)` tries to call an int).
    S_B += n * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T)

# Eigen-decomposition of inv(S_W).S_B.
eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
for i in range(len(eig_vals)):
    eigvec_sc = eig_vecs[:, i].reshape(4, 1)
    print('\n Eigenvector {}: \n {}'.format(i+1, eigvec_sc.real))
    print('Eigenvalue {: }: {:.2e}'.format(i+1, eig_vals[i].real))

# make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
# Visually confirm that the list is correctly sorted by decreasing
# eigenvalues (the values printed in the loop are the eigenvalue magnitudes).
print('特征向量: \n')
for i in eig_pairs:
    print(i[0])

# Share of each eigenvalue in the eigenvalue sum.
eigv_sum = sum(eig_vals)
for i, j in enumerate(eig_pairs):
    print('eigenvalue {0:}: {1:.2%}'.format(i + 1, (j[0] / eigv_sum).real))

# Keep the two leading eigenvectors as the 4x2 projection matrix W.
W = np.hstack((eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))
# Project the samples onto the new two-dimensional subspace.
X_lda = X.dot(W)
assert X_lda.shape == (150, 2), 'The matrix is not 150x2 dimensional.'


def plt_step_lda():
    """Scatter-plot the hand-computed projection, one marker/colour per class.

    Reads the module-level ``X_lda`` (projected samples), ``y`` (labels 1..3)
    and ``label_dict`` (label -> legend name), then shows the figure.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(range(1, 4),
                                    ('o', '^', 's'),
                                    ('green', 'orange', 'pink')):
        # .real drops the imaginary part of each projected column
        plt.scatter(x=X_lda[:, 0].real[y == label],
                    y=X_lda[:, 1].real[y == label],
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title('LDA: Iris projection onto the first 2 linear discriminants')

    # hide axis ticks (tick_params takes booleans; the string 'off' is truthy)
    plt.tick_params(axis='both', which='both', bottom=False, top=False,
                    labelbottom=True, left=False, labelleft=True)

    # remove axis spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()


plt_step_lda()

# The same projection computed with scikit-learn's LDA.
sklearn_lda = LDA(n_components=2)
X_lda_sklearn = sklearn_lda.fit_transform(X, y)


def plot_scikit_lda(X, title):
    """Scatter-plot a two-column projection ``X`` by class under ``title``.

    Args:
        X: array whose first two columns are the LD1 and LD2 coordinates.
        title: figure title.

    Reads the module-level ``y`` (labels 1..3) and ``label_dict``.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(range(1, 4),
                                    ('o', '^', 's'),
                                    ('green', 'orange', 'pink')):
        # Plot the array passed in (not the global X_lda).
        plt.scatter(x=X[:, 0][y == label],
                    y=X[:, 1][y == label] * -1,  # flip the figure
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title(title)

    # hide axis ticks (tick_params takes booleans; the string 'off' is truthy)
    plt.tick_params(axis='both', which='both', bottom=False, top=False,
                    labelbottom=True, left=False, labelleft=True)

    # remove axis spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)

    plt.grid()
    plt.tight_layout()
    plt.show()


# Show the scikit-learn projection rather than the raw 4-D feature matrix.
plot_scikit_lda(X_lda_sklearn, title='Default LDA via scikit-learn')

运行结果：

Python实现线性判别分析鸢尾花数据集或随机生成两个线性可分的数据集_数据集

Python实现线性判别分析鸢尾花数据集或随机生成两个线性可分的数据集_线性判别分析_02

1. 叙述线性判别分析的实现过程

答：给定训练样例集,设法将样例投影到一条直线上,使得同类样例的投影点尽可能接近、异类样例的投影点尽可能远离;在对新样本进行分类时,将其投影到同样的这条直线上，再根据投影点的位置来确定新样本的类别。

输入：数据集 D = {(x1, y1), (x2, y2), ..., (xm, ym)}，其中任意样本 xi 为 n 维向量，yi ∈ {C1, C2, ..., Ck}，降维到的维度 d。

输出：降维后的样本集 D'

计算类内散度矩阵Sw

$$S_w = \sum_{j=1}^{k} S_{wj}, \qquad S_{wj} = \sum_{x \in X_j} (x - \mu_j)(x - \mu_j)^T$$

其中 $X_j$ 为第 j 类样本的集合，$\mu_j$ 为第 j 类样本的均值向量。

计算类间散度矩阵Sb

$$S_b = \sum_{j=1}^{k} N_j (\mu_j - \mu)(\mu_j - \mu)^T$$

其中 $N_j$ 为第 j 类的样本个数，$\mu$ 为全体样本的均值向量。

计算矩阵 $S_w^{-1}S_b$

计算 $S_w^{-1}S_b$ 的最大的 d 个特征值和对应的 d 个特征向量（w1, w2, ..., wd），得到投影矩阵 W

对样本集中的每一个样本特征 xi，转化为新的样本 $z_i = W^T x_i$

得到输出样本集D' = {(z1, y1), (z2, y2), .... (zm, ym)}

2. 说明“Linear Discriminant Analysis”函数（Python）或“Classify”函数（Matlab）中各参数的意义

答： solver : 即求LDA超平面特征矩阵使用的方法。可以选择的方法有奇异值分解"svd"，最小二乘"lsqr"和特征分解"eigen"。一般来说特征数非常多的时候推荐使用svd，而特征数不多的时候推荐使用eigen。需要注意的是，如果使用svd，则不能指定正则化参数shrinkage进行正则化。默认值是svd。

shrinkage：正则化参数，可以增强LDA分类的泛化能力。如果仅仅只是为了降维，则一般可以忽略这个参数。默认是None，即不进行正则化。可以选择"auto",让算法自己决定是否正则化。当然我们也可以选择不同的[0,1]之间的值进行交叉验证调参。注意shrinkage只在solver为最小二乘"lsqr"和特征分解"eigen"时有效。

priors ：类别权重，可以在做分类模型时指定不同类别的权重，进而影响分类模型建立。降维时一般不需要关注这个参数。

n_components：即我们进行LDA降维时降到的维数。在降维时需要输入这个参数。注意只能为[1, 类别数-1]范围之间的整数（同时不能超过特征数）；例如上面的鸢尾花数据有3个类别，因此最多可取 n_components=2。如果我们不是用于降维，则这个值可以用默认的None。

从上面的描述可以看出，如果我们只是为了降维，则只需要输入n_components，注意这个值不能大于“类别数-1”。