Ex6_机器学习_吴恩达课程作业（Python）：SVM支持向量机（Support Vector Machines）

文章目录

Ex6_机器学习_吴恩达课程作业（Python）：SVM支持向量机（Support Vector Machines）
0. Pre-condition
00. Self-created Functions
1. Support Vector Machines
2. Spam Classification

0. Pre-condition

# This file includes self-created functions used in exercise 6
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt
from scipy.io import loadmat

00. Self-created Functions

loadData(path)：读取数据

def loadData(path):
    """Read a MATLAB ``.mat`` file and return its features and labels.

    Args:
        path: Location of the ``.mat`` file. The file must contain the
            entries ``'X'`` and ``'y'``.

    Returns:
        A tuple ``(X, y)`` holding the file's ``'X'`` and ``'y'`` entries.
    """
    contents = loadmat(path)
    features = contents['X']
    labels = contents['y']
    return features, labels

plotData(X, y)：可视化数据

def plotData(X, y):
    """Scatter-plot the first two feature columns, colored by label.

    A new 8x6 figure is opened on every call. The figure is not shown here;
    the caller is expected to call ``plt.show()`` when ready.

    Args:
        X: Training set; columns 0 and 1 are used as the plot axes.
        y: Label set; it is flattened and used as the per-point color.
    """
    plt.figure(figsize=[8, 6])
    point_colors = y.flatten()
    first_feature, second_feature = X[:, 0], X[:, 1]
    plt.scatter(first_feature, second_feature, c=point_colors)
    plt.title('Data Visualization')
    plt.xlabel('X1')
    plt.ylabel('X2')

plotBoundary(classifier, X)：绘制类别间的决策边界

def plotBoundary(classifier, X):
    """Draw a classifier's decision boundary on the current figure.

    The classifier is evaluated on a 500x500 grid spanning the range of the
    first two feature columns plus a margin, and the predictions are drawn
    as contour lines.

    Args:
        classifier: Fitted model exposing ``predict`` for two-column input.
        X: Training set; columns 0 and 1 define the plotted region.
    """
    margin = 0.1  # fraction of each feature's span added on both sides
    x_low, x_high = X[:, 0].min(), X[:, 0].max()
    y_low, y_high = X[:, 1].min(), X[:, 1].max()
    # Pad relative to the span rather than scaling the extremes: scaling a
    # positive minimum (or a negative maximum) moves the edge *into* the
    # data and clips the plotted region. Fall back to a unit span when a
    # feature is constant so the grid never collapses to a single line.
    x_pad = margin * ((x_high - x_low) or 1.0)
    y_pad = margin * ((y_high - y_low) or 1.0)
    xx, yy = np.meshgrid(np.linspace(x_low - x_pad, x_high + x_pad, 500),
                         np.linspace(y_low - y_pad, y_high + y_pad, 500))
    # Predict a class for every grid point, then restore the grid shape.
    Z = classifier.predict(np.c_[xx.flatten(), yy.flatten()])
    Z = Z.reshape(xx.shape)
    plt.contour(xx, yy, Z)

displayBoundaries(X, y)：绘制不同SVM参数C下的决策边界（线性核）

def displayBoundaries(X, y):
    """Plot linear-kernel SVM decision boundaries for C = 1 and C = 100.

    One figure is produced per value of C, each showing the data and the
    boundary of the corresponding fitted model. All figures are displayed
    together at the end.

    Args:
        X: Training set.
        y: Label set; flattened before fitting.
    """
    penalties = [1, 100]
    # Build every model first, then fit them all, then plot.
    estimators = []
    for penalty in penalties:
        estimators.append(svm.SVC(C=penalty, kernel='linear'))
    fitted = []
    for estimator in estimators:
        fitted.append(estimator.fit(X, y.flatten()))
    captions = []
    for penalty in penalties:
        captions.append('SVM Decision Boundary with C = {}'.format(penalty))
    # One figure per fitted classifier.
    for index in range(len(fitted)):
        plotData(X, y)
        plotBoundary(fitted[index], X)
        plt.title(captions[index])
    plt.show()

gaussianKernel(x1, x2, sigma)：实现高斯核函数

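高斯核函数定义为：

$$K_{gaussian}(x^{(i)}, x^{(j)}) = \exp\left(-\frac{\lVert x^{(i)} - x^{(j)} \rVert^2}{2\sigma^2}\right) = \exp\left(-\frac{\sum_{k=1}^{n}\left(x_k^{(i)} - x_k^{(j)}\right)^2}{2\sigma^2}\right)$$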

def gaussianKernel(x1, x2, sigma):
    """Compute the Gaussian (RBF) kernel between two samples.

    The value is ``exp(-sum((x1 - x2)^2) / (2 * sigma^2))``; it can be read
    as a similarity measure that equals 1 for identical samples.

    Args:
        x1: First sample.
        x2: Second sample.
        sigma: Kernel width parameter.

    Returns:
        The kernel value as a scalar.
    """
    difference = x1 - x2
    squared_distance = np.sum(np.square(difference))
    denominator = 2 * np.power(sigma, 2)
    return np.exp(-(squared_distance / denominator))

displayGaussKernelBoundary(X, y, C, sigma)：绘制高斯核SVM对某数据集的决策边界

def displayGaussKernelBoundary(X, y, C, sigma):
    """Fit an RBF-kernel SVM and plot its decision boundary over the data.

    Args:
        X: Training set.
        y: Label set; flattened before fitting.
        C: SVM regularization parameter passed to the model.
        sigma: Gaussian kernel width; converted to ``gamma = 1 / (2 * sigma^2)``.
    """
    gamma = np.power(sigma, -2.) / 2
    # 'rbf' selects the radial basis function (Gaussian) kernel.
    # Use the caller's C: it was previously hard-coded to 1, which silently
    # discarded the tuned value passed in.
    model = svm.SVC(C=C, kernel='rbf', gamma=gamma)
    classifier = model.fit(X, y.flatten())
    plotData(X, y)
    plotBoundary(classifier, X)
    plt.title('Decision boundary using SVM with a Gaussian Kernel')
    plt.show()

trainGaussParams(X, y, Xval, yval)：比较交叉验证集误差，训练最优参数C和sigma

def trainGaussParams(X, y, Xval, yval):
    """Grid-search C and sigma for an RBF-kernel SVM using a validation set.

    Every (C, sigma) pair from a fixed list of candidate values is fitted on
    the training data and scored on the validation data; the pair with the
    highest score is kept (the first one wins on ties).

    Args:
        X: Training set.
        y: Training labels.
        Xval: Validation set.
        yval: Validation labels.

    Returns:
        A tuple ``(C, sigma)`` with the best-scoring parameters.
    """
    C_values = (0.01, 0.03, 0.1, 0.3, 1., 3., 10., 30.)
    sigma_values = C_values
    # Flatten both label arrays once so fit and score get the same shape.
    train_labels = np.ravel(y)
    val_labels = np.ravel(yval)
    # Start below any reachable score so the first candidate is always taken;
    # a (0, 0) placeholder must never be returned, since sigma = 0 is not a
    # valid kernel width.
    best_pair, best_score = None, -np.inf
    for C in C_values:
        for sigma in sigma_values:
            gamma = np.power(sigma, -2.) / 2
            model = svm.SVC(C=C, kernel='rbf', gamma=gamma)
            model.fit(X, train_labels)
            this_score = model.score(Xval, val_labels)
            if this_score > best_score:
                best_score = this_score
                best_pair = (C, sigma)
    print('Best pair(C, sigma): {}, best score: {}'.format(best_pair, best_score))
    return best_pair[0], best_pair[1]

preprocessEmail(email)：预处理邮件

def preprocessEmail(email):
    """Normalize the raw text of an email.

    Performs every preprocessing step except word stemming and removal of
    non-word characters:

    * lower-case the whole text;
    * replace each HTML tag with a single space;
    * replace each URL with ``httpaddr``;
    * replace each email address with ``emailaddr``;
    * replace each run of dollar signs with ``dollar``;
    * replace each run of digits with ``number``.

    Args:
        email: Raw email text.

    Returns:
        The normalized email text.
    """
    email = email.lower()
    # Match '<', then one or more characters that are neither '<' nor '>',
    # then '>'. The '+' is required: without it only tags with exactly one
    # character inside (e.g. '<b>') were stripped.
    email = re.sub(r'<[^<>]+>', ' ', email)
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub(r'[$]+', 'dollar', email)
    email = re.sub(r'[\d]+', 'number', email)
    return email

email2TokenList(email)：词干提取及去除非字符内容，返回单词列表

def email2TokenList(email):
    """Convert a raw email into a list of stemmed word tokens.

    The email is first normalized with ``preprocessEmail``, then split on
    whitespace and punctuation. Each piece is stripped of non-alphanumeric
    characters, empty pieces are dropped, and the rest are stemmed with
    NLTK's Porter stemmer.

    Args:
        email: Raw email text.

    Returns:
        A list of stemmed tokens in order of appearance.
    """
    email = preprocessEmail(email)
    stemmer = nltk.stem.porter.PorterStemmer()
    # Whitespace (\s) is part of the delimiter set: splitting on the space
    # character alone left newlines and tabs inside tokens, and the cleanup
    # below then glued words from adjacent lines into one token.
    tokens = re.split(r"""[\s@$/#.\-:&*+=\[\]?!(){},'">_<;%]""", email)
    token_list = []
    for token in tokens:
        # Drop anything that is not a letter or a digit.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Skip empty pieces before stemming; there is nothing to stem.
        if not token:
            continue
        token_list.append(stemmer.stem(token))
    return token_list

email2VocabularyList(email, vocab_list)：获取在邮件和词汇表中同时出现的单词的索引

def email2VocabularyList(email, vocab_list):
    """Find the vocabulary positions whose word occurs in the email.

    Args:
        email: Raw email text; it is tokenized with ``email2TokenList``.
        vocab_list: Indexable vocabulary of words.

    Returns:
        A list of indices ``i``, in increasing order, for which
        ``vocab_list[i]`` is found among the email's tokens.
    """
    email_tokens = email2TokenList(email)
    matches = []
    for position in range(len(vocab_list)):
        if vocab_list[position] in email_tokens:
            matches.append(position)
    return matches

email2FeatureVector(email)：提取邮件的特征

def email2FeatureVector(email, vocab_path='../data/vocab.txt'):
    """Turn an email into a binary bag-of-words feature vector.

    The vector has one entry per vocabulary word: 1 if the word occurs in
    the processed email, 0 otherwise.

    Args:
        email: Raw email text.
        vocab_path: Path of the vocabulary file. Defaults to the location
            used by the exercise, so existing callers are unaffected.

    Returns:
        A 1-D NumPy array whose length equals the vocabulary size.
    """
    df = pd.read_table(vocab_path, names=['words'])
    # A plain list of words is sufficient here and avoids wrapping the
    # frame in np.matrix, which NumPy no longer recommends.
    vocab_list = df['words'].tolist()
    feature_vector = np.zeros(len(vocab_list))
    vocab_indices = email2VocabularyList(email, vocab_list)
    # Mark every vocabulary word found in the email in one assignment.
    feature_vector[np.asarray(vocab_indices, dtype=int)] = 1
    return feature_vector

1. Support Vector Machines

# 1. Support Vector Machines
# Load the first example dataset (X and y) through the helper module.
path = '../data/ex6data1.mat'
X, y = func.loadData(path)

1.1 Example dataset 1

# 1.1 Example dataset 1
# Visualize the data
func.plotData(X, y)

# Try different values of C and plot the decision boundary for each case
func.displayBoundaries(X, y)

数据可视化：
决策边界（线性核，C = 1）：
决策边界（线性核，C = 100）：

可以从上图看到：

当 $C$ 较大（即 $1/\lambda$ 较大， $\lambda$ 较小）时，模型对误分类的惩罚增大，较严格，误分类少，间隔较小。
当 $C$ 较小（即 $1/\lambda$ 较小， $\lambda$ 较大）时，模型对误分类的惩罚减小，较宽松，允许一定误分类存在，间隔较大。

1.2 SVM with Gaussian Kernels

为了用SVM找出非线性的决策边界，我们首先要实现高斯核函数。我们可以把高斯核函数想象成一个相似度函数，用来测量一对样本 $(x^{(i)}, x^{(j)})$ 之间的距离。

注意，大多数SVM库会自动帮你添加额外的特征 $x_0$ 以及 $\theta_0$ ，所以无需手动添加。

# 1.2 SVM with Gaussian Kernels
# Example dataset 2: training features and labels only.
path2 = '../data/ex6data2.mat'
X2, y2 = func.loadData(path2)

# Example dataset 3: read the .mat file directly because, besides X and y,
# it also provides the validation entries Xval and yval.
path3 = '../data/ex6data3.mat'
df3 = loadmat(path3)
X3, y3, Xval, yval = df3['X'], df3['y'], df3['Xval'], df3['yval']

1.2.1 Gaussian Kernel

$$K_{gaussian}(x^{(i)}, x^{(j)}) = \exp\left(-\frac{\lVert x^{(i)} - x^{(j)} \rVert^2}{2\sigma^2}\right)$$

# 1.2.1 Gaussian Kernel
# Sanity-check the kernel on two fixed vectors with sigma = 2.
res_gaussianKernel = func.gaussianKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), 2.)
print(res_gaussianKernel)  # 0.32465246735834974

1.2.2 Example dataset 2

# 1.2.2 Example dataset 2
# Visualize the data
func.plotData(X2, y2)

# Plot the decision boundary of the Gaussian-kernel SVM for this dataset
func.displayGaussKernelBoundary(X2, y2, C=1, sigma=0.1)

数据可视化：
决策边界（高斯核）：

1.2.3 Example dataset 3

# 1.2.3 Example dataset 3
# Visualize the data
func.plotData(X3, y3)

# Select the Gaussian-kernel SVM parameters C and sigma on the validation set
final_C, final_sigma = func.trainGaussParams(X3, y3, Xval, yval)

# Plot the decision boundary of the Gaussian-kernel SVM using the selected parameters
func.displayGaussKernelBoundary(X3, y3, C=final_C, sigma=final_sigma)

数据可视化：
决策边界（高斯核，最优参数）：

2. Spam Classification

在此部分，我们将利用SVM建立垃圾邮件分类器。你需要将每个邮件变成一个 $n$ 维的特征向量，分类器将判断给定的邮件 $x$ 是垃圾邮件 $(y = 1)$ 或不是垃圾邮件 $(y = 0)$ 。

# 2. Spam Classifier
# Read the sample email into `email` and print its raw contents
with open('../data/emailSample1.txt', 'r') as f:
    email = f.read()
    print(email)

2.1 Preprocess Emails

可以看到，邮件内容包含 URL，邮件地址，数字以及美元符号等。很多邮件都会包含这些元素，但是每封邮件的具体内容可能不一样。因此，处理邮件时经常采用的方法是标准化数据，即把所有URL当作一样，所有数字看作一样等。

例如，我们用唯一的一个字符串‘httpaddr’来替换所有的URL，来表示邮件包含URL，而不要求具体的URL内容。这通常会提高垃圾邮件分类器的性能，因为垃圾邮件发送者通常会随机化URL，因此在新的垃圾邮件中再次看到任何特定URL的几率非常小。

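我们可以做如下处理：

- 全文小写：将邮件中的所有字母转为小写；
- 去除HTML标签：将形如 <...> 的标签替换为空格；
- 统一化URL：将所有URL替换为“httpaddr”；
- 统一化邮件地址：将所有邮件地址替换为“emailaddr”；
- 统一化美元符号：将“$”替换为“dollar”；
- 统一化数字：将所有数字替换为“number”；
- 词干提取（Word Stemming）：将单词还原为词干；
- 去除非单词内容（Removal of non-words）：删除非字母数字的字符以及空字符串。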

具体代码见文章头部的自定义函数部分。

# 2.1 Preprocess emails
# 2.1.1 Vocabulary List
# 2.2 Extract Features from emails
# Convert the email into a feature vector, then report the vector's length
# and the sum of its entries.
feature_vector = func.email2FeatureVector(email)
print('Length of feature vector = {}\nNumber of occurred words = {}'
      .format(len(feature_vector), int(feature_vector.sum())))

2.1.1 Vocabulary List

在对邮件进行预处理之后，我们得到处理后的单词表。下一步我们选择我们在分类器中使用的单词，以及我们需要去除哪些词。

题目提供了词汇表vocab.txt，里面存有在实际中经常使用的单词，共1899个。

我们要算出处理后的邮件中含有多少vocab.txt中的单词，并返回该单词在vocab.txt中的索引，即我们想要的训练单词的索引。

2.2 Extract Features from Emails

提取邮件的特征，获取一个表示邮件的特征向量（长度为单词表长度，存在该单词则对应下标位置值为 $1$ ，反之为 $0$ ）。

2.3 Train SVM for Spam Classification

读取已经提取好的特征向量以及相应的标签，分为训练集和测试集。

# 2.3 Train SVM for Spam Classification
# Load the pre-extracted feature vectors and their labels for the training
# and test sets.
path_train = '../data/spamTrain.mat'
path_test = '../data/spamTest.mat'
mat_train = loadmat(path_train)
mat_test = loadmat(path_test)
# Flatten the labels to 1-D, as the helper functions do before fitting;
# scikit-learn expects a 1-D target and warns on a column vector.
X_train, y_train = mat_train['X'], mat_train['y'].ravel()
X_test, y_test = mat_test['Xtest'], mat_test['ytest'].ravel()
# Fit a linear-kernel SVM on the training set
model = svm.SVC(C=0.1, kernel='linear')
model.fit(X_train, y_train)

2.4 Top Predictors for Spam

# 2.4 Top predictors for Spam
# Score the fitted model on the training and test sets and print both values.
# NOTE(review): these are the values returned by model.score (mean accuracy
# per scikit-learn's documentation), not a list of the top predictor words.
prediction_train = model.score(X_train, y_train)
prediction_test = model.score(X_test, y_test)
print('Predictions of training: ', prediction_train)
print('Predictions of testing: ', prediction_test)