单变量线性回归

题目

在本部分的练习中，您将使用一个变量实现线性回归，以预测食品卡车的利润。假设你是一家连锁餐馆的首席执行官，正在考虑在不同的城市开设新的分店。该连锁店已经在多个城市拥有食品卡车，而且你有这些城市的利润和人口数据。
您希望使用这些数据来帮助您选择下一个要扩展的城市。

数据

先导入数据

# Univariate linear regression, step 1: load the dataset.
'''
单变量线性回归
1.Prepare datasets
'''
path = 'ex1data1.txt'
# header=None: do not take column labels from any row of the file;
# `names` supplies the column labels instead.
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
# The next two results are discarded when this runs as a plain script;
# enable the prints below to actually see them.
data.head()
data.describe()
# print(data.head())  # show the first five rows
# print(data.describe())

在开始任何任务之前，通过可视化来理解数据通常是有用的。对于这个数据集，您可以使用散点图来可视化数据，因为它只有两个属性(利润和人口)。

# Scatter plot of the training data, to get a visual feel for it
# data.plot(kind='scatter', x='Population', y='Profit', figsize=(8,5))
# plt.title("Scatter plot of training data")  # add a title
# plt.xlabel("population of city")
# plt.ylabel("profit")
# plt.show()

现在让我们使用梯度下降来实现线性回归，以最小化成本函数。以下代码示例中实现的方程在“练习”文件夹中的“ex1.pdf”中有详细说明。

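首先，我们将创建一个以参数 θ 为自变量的代价函数 J(θ)。
计算代价函数 $J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$，其中 $h_\theta(x) = \theta^T x$，$m$ 为样本数量。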

def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) of a linear model.

    Computes ``sum((X @ theta.T - y) ** 2) / (2 * m)``, where ``m`` is the
    number of rows of ``X``.

    :param X: input matrix, one row per sample; ``np.matrix`` or a 2-D
        array-like of shape (m, n).
    :param y: targets, shape (m, 1) or (m,).
    :param theta: parameters, shape (1, n) or (n,).
    :return: the cost as a scalar float.
    """
    # Work on plain 2-D arrays with explicit shapes so the result does not
    # depend on np.matrix overloading `*` as matrix multiplication; ndarray
    # inputs are therefore accepted as well.
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    weights = np.asarray(theta, dtype=float).reshape(1, -1)

    residuals = features @ weights.T - targets
    return np.sum(np.square(residuals)) / (2 * features.shape[0])

让我们在训练集中添加一列，以便我们可以使用向量化的解决方案来计算代价和梯度。

data.insert(0, 'Ones', 1)       # insert a new first column named 'Ones' whose values are all 1
# print(data)

现在我们来做一些变量初始化。
取最后一列为 y，其余为 X
观察下 X (训练集) and y (目标变量)是否正确.

# Variable initialisation: set X (training data) and y (target variable)
cols = data.shape[1]  # number of columns
X = data.iloc[:, 0:cols - 1]        # the first cols-1 columns: the inputs
y = data.iloc[:, cols - 1:cols]     # the last column only: the target
# print(X.head())  # check that X (training data) and y (target variable) look right
# print(y.head())

NumPy 中有 matrix 和 array 两种类型。matrix 的优势是运算符号相对简单，比如两个矩阵相乘，直接用符号 `*` 即可；而 array 相乘不能这么用，得用方法 .dot()。
array的优势就是不仅仅表示二维，还能表示3、4、5…维，而且在大部分Python程序里，array也是更常用的。

两者区别：

对应元素相乘：matrix 可以用 `np.multiply(X2, X1)`，array 直接 `X1 * X2`
矩阵乘法（点乘）：matrix 直接 `X1 * X2`，array 可以用 `X1 @ X2` 或 `X1.dot(X2)` 或 `np.dot(X1, X2)`
后面的计算使用的是 numpy 矩阵（np.matrix），所以我们需要先转换 X 和 y，然后才能使用它们。我们还需要初始化 theta。

# Convert the DataFrames to np.matrix and start from an all-zero theta
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix([0,0]) # theta is a (1, 2) matrix
# The return value is discarded here; enable the print below to see it.
computeCost(X, y, theta)
# print(X.shape, y.shape, theta.shape)  # inspect the row/column counts of each
# print(computeCost(X, y, theta)) # 32.072733877455676

batch gradient descent（批量梯度下降）

初始化一些附加变量 - 学习速率α和要执行的迭代次数。并开始计算合适的theta，写出梯度下降算法的函数。

alpha = 0.01  # learning rate
epoch = 1000  # number of gradient-descent iterations to run


def gradientDescent(X, y, theta, alpha, epoch):
    """Run batch gradient descent and record the optimisation history.

    Every iteration applies the vectorised update
    ``theta <- theta - (alpha / m) * (X * theta.T - y).T * X`` to all
    parameters at once, where ``m`` is the number of rows of ``X``.

    :param X: ``np.matrix`` of shape (m, n), one row per sample.
    :param y: ``np.matrix`` of shape (m, 1) holding the targets.
    :param theta: ``np.matrix`` of shape (1, n) holding the initial parameters.
    :param alpha: learning rate.
    :param epoch: number of iterations to run.
    :return: ``(counterTheta, theta, cost)`` -- an (epoch, n) array with
        theta after each iteration, the final (1, n) theta, and a
        length-``epoch`` array with the cost after each iteration.
    """
    m = X.shape[0]  # number of training samples
    # Size the history from theta itself instead of a hard-coded column
    # count, so the same function works for any number of features.
    counterTheta = np.zeros((epoch, theta.shape[1]))
    cost = np.zeros(epoch)

    for i in range(epoch):
        # `*` is matrix multiplication here because the inputs are np.matrix;
        # the whole right-hand side is evaluated from the previous theta, so
        # all parameters are updated simultaneously.
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        counterTheta[i] = theta
        cost[i] = computeCost(X, y, theta)
    return counterTheta, theta, cost

调用梯度下降函数

# Run gradient descent, then evaluate the cost at the fitted parameters
counterTheta, final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
# The return value is discarded here; enable the print below to see it.
computeCost(X, y, final_theta)
# print(computeCost(X, y, final_theta)) # 4.515955503078912

画图

# Evaluate the fitted line at 100 evenly spaced points between the smallest
# and largest population in the data.
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = final_theta[0, 0] + (final_theta[0, 1] * x)  # predicted profit for each x
print(final_theta)


# Figure 1: the fitted line drawn over the training data
fig1, ax = plt.subplots(figsize=(6, 4))  # figure size
ax.plot(x, f, 'r', label='Prediction')  # x values, y values, colour, legend label
ax.scatter(data.Population, data.Profit, label='Training Data')  # the raw data points
ax.legend(loc=2)  # loc=2 places the legend in the upper-left corner
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')

# Figure 2: the cost recorded after each iteration
fig2, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(epoch), cost, 'r')  # x values, y values, colour
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

完整代码

# Kyrie Irving
# !/9462...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) of a linear model.

    Computes ``sum((X @ theta.T - y) ** 2) / (2 * m)``, where ``m`` is the
    number of rows of ``X``.

    :param X: input matrix, one row per sample; ``np.matrix`` or a 2-D
        array-like of shape (m, n).
    :param y: targets, shape (m, 1) or (m,).
    :param theta: parameters, shape (1, n) or (n,).
    :return: the cost as a scalar float.
    """
    # Work on plain 2-D arrays with explicit shapes so the result does not
    # depend on np.matrix overloading `*` as matrix multiplication; ndarray
    # inputs are therefore accepted as well.
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    weights = np.asarray(theta, dtype=float).reshape(1, -1)

    residuals = features @ weights.T - targets
    return np.sum(np.square(residuals)) / (2 * features.shape[0])

def gradientDescent(X, y, theta, alpha, epoch):
    """Run batch gradient descent and record the optimisation history.

    Every iteration applies the vectorised update
    ``theta <- theta - (alpha / m) * (X * theta.T - y).T * X`` to all
    parameters at once, where ``m`` is the number of rows of ``X``.

    :param X: ``np.matrix`` of shape (m, n), one row per sample.
    :param y: ``np.matrix`` of shape (m, 1) holding the targets.
    :param theta: ``np.matrix`` of shape (1, n) holding the initial parameters.
    :param alpha: learning rate.
    :param epoch: number of iterations to run.
    :return: ``(counterTheta, theta, cost)`` -- an (epoch, n) array with
        theta after each iteration, the final (1, n) theta, and a
        length-``epoch`` array with the cost after each iteration.
    """
    m = X.shape[0]  # number of training samples
    # Size the history from theta itself instead of a hard-coded column
    # count, so the same function works for any number of features.
    counterTheta = np.zeros((epoch, theta.shape[1]))
    cost = np.zeros(epoch)

    for i in range(epoch):
        # `*` is matrix multiplication here because the inputs are np.matrix;
        # the whole right-hand side is evaluated from the previous theta, so
        # all parameters are updated simultaneously.
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        counterTheta[i] = theta
        cost[i] = computeCost(X, y, theta)
    return counterTheta, theta, cost







# Univariate linear regression, step 1: load the dataset.
'''
单变量线性回归
1.Prepare datasets
'''
path = 'ex1data1.txt'
# header=None: do not take column labels from any row of the file;
# `names` supplies the column labels instead.
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
# The next two results are discarded when this runs as a plain script;
# enable the prints below to actually see them.
data.head()
data.describe()
# print(data.head())  # show the first five rows
# print(data.describe())

# Scatter plot of the training data, to get a visual feel for it
# data.plot(kind='scatter', x='Population', y='Profit', figsize=(8,5))
# plt.title("Scatter plot of training data")  # add a title
# plt.xlabel("population of city")
# plt.ylabel("profit")
# plt.show()

data.insert(0, 'Ones', 1)       # insert a new first column named 'Ones' whose values are all 1
# print(data)

# Variable initialisation: set X (training data) and y (target variable)
cols = data.shape[1]  # number of columns
X = data.iloc[:, 0:cols - 1]        # the first cols-1 columns: the inputs
y = data.iloc[:, cols - 1:cols]     # the last column only: the target
# print(X.head())  # check that X (training data) and y (target variable) look right
# print(y.head())

# Convert the DataFrames to np.matrix and start from an all-zero theta
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix([0,0]) # theta is a (1, 2) matrix
# The return value is discarded here; enable the print below to see it.
computeCost(X, y, theta)
# print(X.shape, y.shape, theta.shape)  # inspect the row/column counts of each
# print(computeCost(X, y, theta)) # 32.072733877455676

alpha = 0.01 # learning rate
epoch = 1000 # number of iterations to run
# Run gradient descent, then evaluate the cost at the fitted parameters
counterTheta, final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
computeCost(X, y, final_theta)
# print(computeCost(X, y, final_theta)) # 4.515955503078912

# Evaluate the fitted line at 100 evenly spaced points between the smallest
# and largest population in the data.
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = final_theta[0, 0] + (final_theta[0, 1] * x)  # predicted profit for each x
print(final_theta)


# Figure 1: the fitted line drawn over the training data
fig1, ax = plt.subplots(figsize=(6, 4))  # figure size
ax.plot(x, f, 'r', label='Prediction')  # x values, y values, colour, legend label
ax.scatter(data.Population, data.Profit, label='Training Data')  # the raw data points
ax.legend(loc=2)  # loc=2 places the legend in the upper-left corner
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')

# Figure 2: the cost recorded after each iteration
fig2, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(epoch), cost, 'r')  # x values, y values, colour
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

多变量线性回归

练习1还包括一个房屋价格数据集，其中有2个变量（房子的大小，卧室的数量）和目标（房子的价格）。我们使用我们已经应用的技术来分析数据集。

数据

完整代码

# Kyrie Irving
# !/9462...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

path = 'ex1data2.txt'
data = pd.read_csv(path, names=['Size', 'Bedrooms', 'Price'])
# Feature scaling: subtract each column's mean and divide by its standard
# deviation, which puts every column (Price included) on a comparable scale.
# The author's note: after scaling, the search for the optimum becomes
# noticeably smoother and converges to it more reliably.
data = (data - data.mean()) / data.std() # data.std() is the per-column standard deviation
# The result is discarded when this runs as a plain script; see the print below.
data.head()
# print(data.head())
# add ones column
data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
cols = data.shape[1]
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:cols]

# convert to matrices and initialize theta
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0, 0, 0])) # theta needs three entries here: 'Ones', 'Size', 'Bedrooms'

def computeCost(X, y, theta):
    """Return the squared-error cost J(theta) of a linear model.

    Computes ``sum((X @ theta.T - y) ** 2) / (2 * m)``, where ``m`` is the
    number of rows of ``X``.

    :param X: input matrix, one row per sample; ``np.matrix`` or a 2-D
        array-like of shape (m, n).
    :param y: targets, shape (m, 1) or (m,).
    :param theta: parameters, shape (1, n) or (n,).
    :return: the cost as a scalar float.
    """
    # Work on plain 2-D arrays with explicit shapes so the result does not
    # depend on np.matrix overloading `*` as matrix multiplication; ndarray
    # inputs are therefore accepted as well.
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    weights = np.asarray(theta, dtype=float).reshape(1, -1)

    residuals = features @ weights.T - targets
    return np.sum(np.square(residuals)) / (2 * features.shape[0])
# print(computeCost(X, y, theta)) # 0.4893617021276595

def gradientDescent(X, y, theta, alpha, epoch):
    """Run batch gradient descent and record the optimisation history.

    Every iteration applies the vectorised update
    ``theta <- theta - (alpha / m) * (X * theta.T - y).T * X`` to all
    parameters at once, where ``m`` is the number of rows of ``X``.

    :param X: ``np.matrix`` of shape (m, n), one row per sample.
    :param y: ``np.matrix`` of shape (m, 1) holding the targets.
    :param theta: ``np.matrix`` of shape (1, n) holding the initial parameters.
    :param alpha: learning rate.
    :param epoch: number of iterations to run.
    :return: ``(counterTheta, theta, cost)`` -- an (epoch, n) array with
        theta after each iteration, the final (1, n) theta, and a
        length-``epoch`` array with the cost after each iteration.
    """
    m = X.shape[0]  # number of training samples
    # Size the history from theta itself instead of a hard-coded column
    # count, so the same function works for any number of features.
    counterTheta = np.zeros((epoch, theta.shape[1]))
    cost = np.zeros(epoch)

    for i in range(epoch):
        # `*` is matrix multiplication here because the inputs are np.matrix;
        # the whole right-hand side is evaluated from the previous theta, so
        # all parameters are updated simultaneously.
        theta = theta - (alpha / m) * (X * theta.T - y).T * X
        counterTheta[i] = theta
        cost[i] = computeCost(X, y, theta)  # record the cost after this iteration
    return counterTheta, theta, cost

# Step 3: train the model with gradient descent and plot the cost history.
'''
3.Run model and Plot
'''
alpha = 0.01  # learning rate
epoch = 3800  # number of iterations to run
counterTheta, final_theta, cost = gradientDescent(X, y, theta, alpha, epoch)
# The return value is discarded here; enable the print below to see it.
computeCost(X, y, final_theta)
# print(computeCost(X, y, final_theta)) #0.13068648053904253

# Cost recorded after each iteration
fig2, ax = plt.subplots(figsize=(8, 4))
ax.plot(np.arange(epoch), cost, 'r')
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()