1. The Neuron
This class represents a simple artificial neuron: it takes a vector of input values, combines them, and processes them before returning an activation value. In other words, it computes y = f(w · x + b), where w is the weight vector, b the bias, and f the activation function.
import numpy as np

class Neuron(object):
    """
    A simple artificial neuron, processing an input vector and returning the corresponding activation.
    Args:
        num_inputs (int): The input vector size / number of input values.
        activation_function (callable): The activation function defining this neuron.
    Attributes:
        W (ndarray): The weight values for each input.
        b (float): The bias value, added to the weighted sum.
        activation_function (callable): The activation function computing the neuron's output.
    """

    def __init__(self, num_inputs, activation_function):
        super().__init__()
        # Randomly initializing the weight vector and bias value
        # (e.g., using a simple uniform distribution between -1 and 1):
        self.W = np.random.uniform(size=num_inputs, low=-1., high=1.)
        self.b = np.random.uniform(size=1, low=-1., high=1.)
        self.activation_function = activation_function

    def forward(self, x):
        """
        Forward the input signal through the neuron, returning its activation value.
        Args:
            x (ndarray): The input vector, of shape `(1, num_inputs)`.
        Returns:
            activation (ndarray): The activation value, of shape `(1,)`.
        """
        z = np.dot(x, self.W) + self.b
        return self.activation_function(z)
We will now demonstrate how to use this model. First, we instantiate our neuron. Let's create a perceptron taking 3 input values and using the step function to compute its activation. Its weight and bias values are set randomly.
# Perceptron input size:
input_size = 3

# Step function (returns 0 if y <= 0, or 1 if y > 0):
step_function = lambda y: 0 if y <= 0 else 1

# Instantiating the perceptron:
perceptron = Neuron(num_inputs=input_size, activation_function=step_function)
print("Perceptron's random weights = {} , and random bias = {}".format(perceptron.W, perceptron.b))

# We randomly generate an input vector of 3 values (i.e., a row vector of shape `(1, 3)`) to feed our neuron:
x = np.random.rand(input_size).reshape(1, input_size)
print("Input vector : {}".format(x))

# We can now feed our perceptron with this input and display the corresponding activation.
# Feel free to try different inputs or to edit the weights:
y = perceptron.forward(x)
print("Perceptron's output value given `x` : {}".format(y))
With this Neuron class, we have implemented the mathematical neuron model proposed in the early days of artificial intelligence.
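For instance, with hand-picked weights and bias (set here for illustration, not learned), such a perceptron can reproduce a simple logic gate. A minimal sketch, reusing our Neuron class and step_function:

# A perceptron acting as a logical AND gate (weights/bias chosen by hand, purely illustrative):
and_neuron = Neuron(num_inputs=2, activation_function=step_function)
and_neuron.W = np.array([1., 1.])  # overwrite the random weights...
and_neuron.b = np.array([-1.5])    # ...and bias with hand-picked values
for a, b in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print("AND({}, {}) = {}".format(a, b, and_neuron.forward(np.array([[a, b]]))))
# Only the input (1, 1) yields a weighted sum above 0, hence an activation of 1.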
2. Organizing Neurons into Layers
class FullyConnectedLayer(object):
    """A simple fully-connected layer.
    Args:
        num_inputs (int): The input vector size / number of input values.
        layer_size (int): The output vector size / number of neurons in the layer.
        activation_function (callable): The activation function for this layer.
    Attributes:
        W (ndarray): The weight values for each input.
        b (ndarray): The bias values, added to the weighted sums.
        size (int): The layer size / number of neurons.
        activation_function (callable): The activation function computing the neurons' outputs.
        x (ndarray): The last provided input vector, stored for back-propagation.
        y (ndarray): The corresponding output, also stored for back-propagation.
        derivated_activation_function (callable): The corresponding derivated function, for back-propagation.
        dL_dW (ndarray): The derivative of the loss, with respect to the weights W.
        dL_db (ndarray): The derivative of the loss, with respect to the bias b.
    """

    def __init__(self, num_inputs, layer_size, activation_function, derivated_activation_function=None):
        super().__init__()
        # Randomly initializing the weight vector and bias values (this time, using a normal distribution):
        self.W = np.random.standard_normal((num_inputs, layer_size))
        self.b = np.random.standard_normal(layer_size)
        self.size = layer_size
        self.activation_function = activation_function
        self.derivated_activation_function = derivated_activation_function
        self.x, self.y = None, None
        self.dL_dW, self.dL_db = None, None

    def forward(self, x):
        """
        Forward the input vector through the layer, returning its activation vector.
        Args:
            x (ndarray): The input vector, of shape `(batch_size, num_inputs)`.
        Returns:
            activation (ndarray): The activation values, of shape `(batch_size, layer_size)`.
        """
        z = np.dot(x, self.W) + self.b
        self.y = self.activation_function(z)
        self.x = x  # (we store the input and output values for back-propagation)
        return self.y

    def backward(self, dL_dy):
        """
        Back-propagate the loss, computing all the derivatives, storing those w.r.t. the layer
        parameters, and returning the loss w.r.t. its inputs for further propagation.
        Args:
            dL_dy (ndarray): The loss derivative w.r.t. the layer's output (dL/dy = l'_{k+1}).
        Returns:
            dL_dx (ndarray): The loss derivative w.r.t. the layer's input (dL/dx).
        """
        dy_dz = self.derivated_activation_function(self.y)  # = f'
        dL_dz = (dL_dy * dy_dz)  # dL/dz = dL/dy * dy/dz = l'_{k+1} * f'
        dz_dw = self.x.T
        dz_dx = self.W.T
        dz_db = np.ones(dL_dy.shape[0])  # dz/db = d(W.x + b)/db = 0 + db/db = "ones"-vector

        # Computing the derivatives with respect to the layer's parameters, storing them for optimization:
        self.dL_dW = np.dot(dz_dw, dL_dz)
        self.dL_db = np.dot(dz_db, dL_dz)

        # Computing the derivative with respect to the input, to be passed to the previous layers (their `dL_dy`):
        dL_dx = np.dot(dL_dz, dz_dx)
        return dL_dx

    def optimize(self, epsilon):
        """
        Optimize the layer's parameters, using the stored derivative values.
        Args:
            epsilon (float): The learning rate.
        """
        self.W -= epsilon * self.dL_dW
        self.b -= epsilon * self.dL_db
Let's demonstrate how to use this layer to process input values, either one at a time or stacked together in batches. We instantiate a layer of 3 neurons (i.e., 3 output values) taking 2 input values each, and we apply the ReLU (Rectified Linear Unit) function for the activation:
input_size = 2
num_neurons = 3
relu_function = lambda y: np.maximum(y, 0)
layer = FullyConnectedLayer(num_inputs=input_size, layer_size=num_neurons, activation_function=relu_function)
# We randomly generate 2 input vectors:
x1 = np.random.uniform(-1, 1, 2).reshape(1, 2)
print("Input vector #1: {}".format(x1))
x2 = np.random.uniform(-1, 1, 2).reshape(1, 2)
print("Input vector #2: {}".format(x2))
y1 = layer.forward(x1)
print("Layer's output value given `x1` : {}".format(y1))
y2 = layer.forward(x2)
print("Layer's output value given `x2` : {}".format(y2))
x12 = np.concatenate((x1, x2)) # stack of input vectors, of shape `(2, 2)`
y12 = layer.forward(x12)
print("Layer's output value given `[x1, x2]` :\n{}".format(y12))
3. Implementing a Complete Neural Network
import numpy as np
# Importing the fully-connected layer implemented previously:
from fully_connected_layer import FullyConnectedLayer
#==============================================================================
# Defining the helper functions (activations, losses, and their derivatives)
#==============================================================================
def sigmoid(x):  # sigmoid function
    return 1 / (1 + np.exp(-x))  # y

def derivated_sigmoid(y):  # sigmoid derivative function, expressed w.r.t. the output y = sigmoid(x)
    return y * (1 - y)

def loss_L2(pred, target):  # L2 loss function
    return np.sum(np.square(pred - target)) / pred.shape[0]  # opt. we divide by the batch size

def derivated_loss_L2(pred, target):  # L2 derivative function
    return 2 * (pred - target)

def cross_entropy(pred, target):  # cross-entropy loss function
    return -np.mean(np.multiply(np.log(pred), target) + np.multiply(np.log(1 - pred), (1 - target)))

def derivated_cross_entropy(pred, target):  # cross-entropy derivative function
    return (pred - target) / (pred * (1 - pred))
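As a quick check of these functions, we can evaluate them on a toy prediction/target pair (the values below are picked arbitrarily for illustration):

pred = np.array([[0.8, 0.2]])
target = np.array([[1., 0.]])
print(loss_L2(pred, target))        # ((0.8-1)^2 + (0.2-0)^2) / 1 = 0.08
print(cross_entropy(pred, target))  # = -log(0.8) ≈ 0.2231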
#==============================================================================
# Defining the network
#==============================================================================
class SimpleNetwork(object):
    """A simple fully-connected NN.
    Args:
        num_inputs (int): The input vector size / number of input values.
        num_outputs (int): The output vector size.
        hidden_layers_sizes (list): A list of sizes for each hidden layer to add to the network.
        activation_function (callable): The activation function for all the layers.
        derivated_activation_function (callable): The derivated activation function.
        loss_function (callable): The loss function to train this network.
        derivated_loss_function (callable): The derivative of the loss function, for back-propagation.
    Attributes:
        layers (list): The list of layers forming this simple network.
        loss_function (callable): The loss function to train this network.
        derivated_loss_function (callable): The derivative of the loss function, for back-propagation.
    """

    def __init__(self, num_inputs, num_outputs, hidden_layers_sizes=(64, 32),
                 activation_function=sigmoid, derivated_activation_function=derivated_sigmoid,
                 loss_function=loss_L2, derivated_loss_function=derivated_loss_L2):
        super().__init__()
        # We build the list of layers composing the network, according to the provided arguments:
        layer_sizes = [num_inputs, *hidden_layers_sizes, num_outputs]
        self.layers = [
            FullyConnectedLayer(layer_sizes[i], layer_sizes[i + 1],
                                activation_function, derivated_activation_function)
            for i in range(len(layer_sizes) - 1)]
        self.loss_function = loss_function
        self.derivated_loss_function = derivated_loss_function

    def forward(self, x):
        """
        Forward the input vector through the layers, returning the output vector.
        Args:
            x (ndarray): The input vector, of shape `(batch_size, num_inputs)`.
        Returns:
            activation (ndarray): The output activation value, of shape `(batch_size, layer_size)`.
        """
        for layer in self.layers:  # from the input layer to the output one
            x = layer.forward(x)
        return x

    def predict(self, x):
        """
        Compute the output corresponding to input `x`, and return the index of the largest output value.
        Args:
            x (ndarray): The input vector, of shape `(1, num_inputs)`.
        Returns:
            best_class (int): The predicted class ID.
        """
        estimations = self.forward(x)
        best_class = np.argmax(estimations)
        return best_class

    def backward(self, dL_dy):
        """
        Back-propagate the loss through the layers (requires `forward()` to have been called beforehand).
        Args:
            dL_dy (ndarray): The loss derivative w.r.t. the network's output (dL/dy).
        Returns:
            dL_dx (ndarray): The loss derivative w.r.t. the network's input (dL/dx).
        """
        for layer in reversed(self.layers):  # from the output layer to the input one
            dL_dy = layer.backward(dL_dy)
        return dL_dy

    def optimize(self, epsilon):
        """
        Optimize the network parameters according to the stored gradients (requires `backward()` to have been called beforehand).
        Args:
            epsilon (float): The learning rate.
        """
        for layer in self.layers:  # the order doesn't matter here
            layer.optimize(epsilon)

    def evaluate_accuracy(self, X_val, y_val):
        """
        Given a dataset and its ground-truth labels, evaluate the current accuracy of the network.
        Args:
            X_val (ndarray): The input validation dataset.
            y_val (ndarray): The corresponding ground-truth validation dataset.
        Returns:
            accuracy (float): The accuracy of the network (= number of correct predictions / dataset size).
        """
        num_corrects = 0
        for i in range(len(X_val)):
            pred_class = self.predict(X_val[i])
            if pred_class == y_val[i]:
                num_corrects += 1
        return num_corrects / len(X_val)

    def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, num_epochs=5, learning_rate=1e-3):
        """
        Given a training dataset and its ground-truth labels, train the network batch by batch.
        Args:
            X_train (ndarray): The input training dataset.
            y_train (ndarray): The corresponding ground-truth training dataset.
            X_val (ndarray): The input validation dataset.
            y_val (ndarray): The corresponding ground-truth validation dataset.
            batch_size (int): The mini-batch size.
            num_epochs (int): The number of training epochs, i.e., iterations over the whole dataset.
            learning_rate (float): The learning rate to scale the derivatives.
        Returns:
            losses (list): The list of training losses for each epoch.
            accuracies (list): The list of validation accuracy values for each epoch.
        """
        num_batches_per_epoch = len(X_train) // batch_size
        do_validation = X_val is not None and y_val is not None
        losses, accuracies = [], []
        for i in range(num_epochs):  # for each training epoch
            epoch_loss = 0
            for b in range(num_batches_per_epoch):  # for each batch composing the dataset
                # Get the batch:
                batch_index_begin = b * batch_size
                batch_index_end = batch_index_begin + batch_size
                x = X_train[batch_index_begin: batch_index_end]
                targets = y_train[batch_index_begin: batch_index_end]
                # Optimize on the batch:
                predictions = y = self.forward(x)  # forward pass
                L = self.loss_function(predictions, targets)  # loss computation
                dL_dy = self.derivated_loss_function(predictions, targets)  # loss derivation
                self.backward(dL_dy)  # back-propagation pass
                self.optimize(learning_rate)  # optimization of the NN
                epoch_loss += L
            # Logging the training loss and validation accuracy, to follow the training:
            epoch_loss /= num_batches_per_epoch
            losses.append(epoch_loss)
            if do_validation:
                accuracy = self.evaluate_accuracy(X_val, y_val)
                accuracies.append(accuracy)
            else:
                accuracy = np.nan
            print("Epoch {:4d}: training loss = {:.6f} | val accuracy = {:.2f}%".format(i, epoch_loss, accuracy * 100))
        return losses, accuracies
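Before tackling MNIST, here is a minimal sketch of the full training loop on the XOR problem (the toy data, network size, and hyper-parameters below are arbitrary choices for illustration; convergence depends on the random initialization, so more epochs may be needed):

X_xor = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y_xor = np.array([0, 1, 1, 0])  # XOR class IDs
xor_net = SimpleNetwork(num_inputs=2, num_outputs=2, hidden_layers_sizes=[8])
# One-hot targets for training, raw class IDs for validation
# (note: this prints one log line per epoch):
xor_net.train(X_xor, np.eye(2)[y_xor], X_xor, y_xor,
              batch_size=4, num_epochs=300, learning_rate=0.5)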
4. Training the Network on Handwritten Digits
For this task, we use the MNIST dataset (Yann LeCun and Corinna Cortes hold all copyrights for this dataset).
Before implementing our solution, we should prepare the data, loading the MNIST images for training and testing our method. For simplicity, we will use the Python module `mnist`, developed by Marc Garcia.
%matplotlib inline
# !pip install matplotlib mnist  # Uncomment and run if these modules are not installed yet.
import matplotlib  # We use this package to visualize some data and results
import matplotlib.pyplot as plt
import mnist

np.random.seed(42)

# The mnist module makes it simple to load the training and testing data (images and their labels):
X_train, y_train = mnist.train_images(), mnist.train_labels()
X_test, y_test = mnist.test_images(), mnist.test_labels()
num_classes = 10 # classes are the digits from 0 to 9
# We can check the number and size of the training/testing samples:
X_train.shape
# outputs (60000, 28, 28)
X_test.shape
# outputs (10000, 28, 28)
# That is, we have 60,000 training samples and 10,000 testing samples, each one a 28×28-pixel image.
# We can have a look at the data, for instance using matplotlib:
img_idx = np.random.randint(0, X_test.shape[0])
plt.imshow(X_test[img_idx], cmap=matplotlib.cm.binary)
plt.axis("off")
plt.show()
y_test[img_idx]  # outputs 1, the label of the displayed image
# As we can see, our images match their ground-truth labels.
# Since our network processes its inputs as row vectors, we need to flatten each image into a 1D vector, i.e., a vector of shape (1, 784) (since 28 × 28 = 784):
X_train, X_test = X_train.reshape(-1, 28 * 28), X_test.reshape(-1, 28 * 28)
print("Pixel values between {} and {}".format(X_train.min(), X_train.max()))
# Pixel values between 0 and 255
X_train, X_test = X_train / 255., X_test / 255.
print("Normalized pixel values between {} and {}".format(X_train.min(), X_train.max()))
# Normalized pixel values between 0.0 and 1.0
# Finally, to compute the loss, we need to one-hot encode the labels, e.g., converting the label 4 into [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]:
y_train = np.eye(num_classes)[y_train]
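We can quickly verify the encoding by printing the first encoded label (in the standard MNIST ordering the first training sample is a 5, though this check simply prints whatever label comes first):

print(y_train[0])  # e.g. for the digit 5: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]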
# Classifier: using our SimpleNetwork class, we instantiate a network with 2 hidden layers, taking the flattened images as input and returning a 10-value vector representing the network's belief that the image belongs to each class (the higher the value, the more likely):
mnist_classifier = SimpleNetwork(num_inputs=X_train.shape[1],
num_outputs=num_classes, hidden_layers_sizes=[64, 32])
predictions = mnist_classifier.forward(X_train) # forward pass
loss_untrained = mnist_classifier.loss_function(predictions, y_train) # loss computation
accuracy_untrained = mnist_classifier.evaluate_accuracy(X_test, y_test) # Accuracy
print("Untrained : training loss = {:.6f} | val accuracy = {:.2f}%".format(
loss_untrained, accuracy_untrained * 100))
# Having checked the untrained performance above, we can now train our network (note: this may take a while):
losses, accuracies = mnist_classifier.train(X_train, y_train, X_test, y_test,
batch_size=30, num_epochs=500)
# Finally, we can plot the evolution of the training loss and validation accuracy to better visualize the training:
losses, accuracies = [loss_untrained] + losses, [accuracy_untrained] + accuracies
fig, ax_loss = plt.subplots()
color = 'red'
ax_loss.set_xlim([0, 510])
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Training Loss', color=color)
ax_loss.plot(losses, color=color)
ax_loss.tick_params(axis='y', labelcolor=color)
ax_acc = ax_loss.twinx() # instantiate a second axes that shares the same x-axis
color = 'blue'
ax_acc.set_xlim([0, 510])
ax_acc.set_ylim([0, 1])
ax_acc.set_ylabel('Val Accuracy', color=color)
ax_acc.plot(accuracies, color=color)
ax_acc.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()
As we can see, our network converges quite fast, though the validation accuracy keeps slowly increasing.
Let's verify how the trained network now performs on our random test image:
# We use `np.expand_dims(x, 0)` to simulate a batch (transforming the image shape
# from (784,) to (1, 784)):
predicted_class = mnist_classifier.predict(np.expand_dims(X_test[img_idx], 0))
print('Predicted class: {}; Correct class: {}'.format(predicted_class, y_test[img_idx]))
If training went well, the predicted class should now match the correct one.
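To check a few more samples at a glance, here is a small illustrative sketch (added here, not from the original text) that displays several random test images together with the network's predictions:

# Displaying some test images along with the network's predictions:
fig = plt.figure(figsize=(8, 2))
for i in range(5):
    idx = np.random.randint(0, X_test.shape[0])
    pred = mnist_classifier.predict(np.expand_dims(X_test[idx], 0))
    ax = fig.add_subplot(1, 5, i + 1)
    ax.imshow(X_test[idx].reshape(28, 28), cmap=matplotlib.cm.binary)  # un-flatten for display
    ax.set_title("pred: {} / true: {}".format(pred, y_test[idx]))
    ax.axis("off")
plt.show()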