1. The Neuron
This class represents a simple artificial neuron: it takes a vector of input values, combines them, and processes them before returning an activation value. In other words, it computes y = f(w · x + b), where w is the weight vector, b the bias, and f the activation function.
import numpy as np

class Neuron(object):
    """
    A simple artificial neuron, processing an input vector and returning the corresponding activation.
    Args:
        num_inputs (int): The input vector size / number of input values.
        activation_function (callable): The activation function defining this neuron.
    Attributes:
        W (ndarray): The weight values for each input.
        b (float): The bias value, added to the weighted sum.
        activation_function (callable): The activation function computing the neuron's output.
    """

    def __init__(self, num_inputs, activation_function):
        super().__init__()
        # Randomly initializing the weight vector and bias value
        # (e.g., using a simple uniform distribution between -1 and 1):
        self.W = np.random.uniform(size=num_inputs, low=-1., high=1.)
        self.b = np.random.uniform(size=1, low=-1., high=1.)
        self.activation_function = activation_function

    def forward(self, x):
        """
        Forward the input signal through the neuron, returning its activation value.
        Args:
            x (ndarray): The input vector, of shape `(1, num_inputs)`.
        Returns:
            activation (ndarray): The activation value, of shape `(1,)`.
        """
        z = np.dot(x, self.W) + self.b
        return self.activation_function(z)
We will now demonstrate how to use this model. First, we instantiate our neuron. Let's create a perceptron taking 3 input values and using the step function to compute its activation. Its weight and bias values are set randomly.
# Perceptron input size:
input_size = 3

# Step function (returns 0 if y <= 0, or 1 if y > 0):
step_function = lambda y: 0 if y <= 0 else 1

# Instantiating the perceptron:
perceptron = Neuron(num_inputs=input_size, activation_function=step_function)
print("Perceptron's random weights = {} , and random bias = {}".format(perceptron.W, perceptron.b))

# We randomly generate an input vector of 3 values (i.e., a row vector of shape `(1, 3)`) to feed our neuron:
x = np.random.rand(input_size).reshape(1, input_size)
print("Input vector : {}".format(x))

# We can now feed our perceptron with this input and display the corresponding activation.
# Feel free to try different inputs or to edit the weights:
y = perceptron.forward(x)
print("Perceptron's output value given `x` : {}".format(y))
With this Neuron class, we have implemented the mathematical neuron model proposed in the early days of artificial intelligence.
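For instance, with hand-picked weights and bias (set here for illustration, not learned), such a perceptron can reproduce a simple logic gate. A minimal sketch, reusing our Neuron class and step_function:

# A perceptron acting as a logical AND gate (weights/bias chosen by hand, purely illustrative):
and_neuron = Neuron(num_inputs=2, activation_function=step_function)
and_neuron.W = np.array([1., 1.])  # overwrite the random weights...
and_neuron.b = np.array([-1.5])    # ...and bias with hand-picked values
for a, b in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print("AND({}, {}) = {}".format(a, b, and_neuron.forward(np.array([[a, b]]))))
# Only the input (1, 1) yields a weighted sum above 0, hence an activation of 1.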
2. Organizing Neurons into Layers
class FullyConnectedLayer(object):
    """A simple fully-connected layer.
    Args:
        num_inputs (int): The input vector size / number of input values.
        layer_size (int): The output vector size / number of neurons in the layer.
        activation_function (callable): The activation function for this layer.
    Attributes:
        W (ndarray): The weight values for each input.
        b (ndarray): The bias values, added to the weighted sums.
        size (int): The layer size / number of neurons.
        activation_function (callable): The activation function computing the neurons' outputs.
        x (ndarray): The last provided input vector, stored for back-propagation.
        y (ndarray): The corresponding output, also stored for back-propagation.
        derivated_activation_function (callable): The corresponding derivated function, for back-propagation.
        dL_dW (ndarray): The derivative of the loss, with respect to the weights W.
        dL_db (ndarray): The derivative of the loss, with respect to the bias b.
    """

    def __init__(self, num_inputs, layer_size, activation_function, derivated_activation_function=None):
        super().__init__()
        # Randomly initializing the weight vector and bias values (this time, using a normal distribution):
        self.W = np.random.standard_normal((num_inputs, layer_size))
        self.b = np.random.standard_normal(layer_size)
        self.size = layer_size
        self.activation_function = activation_function
        self.derivated_activation_function = derivated_activation_function
        self.x, self.y = None, None
        self.dL_dW, self.dL_db = None, None

    def forward(self, x):
        """
        Forward the input vector through the layer, returning its activation vector.
        Args:
            x (ndarray): The input vector, of shape `(batch_size, num_inputs)`.
        Returns:
            activation (ndarray): The activation values, of shape `(batch_size, layer_size)`.
        """
        z = np.dot(x, self.W) + self.b
        self.y = self.activation_function(z)
        self.x = x  # (we store the input and output values for back-propagation)
        return self.y

    def backward(self, dL_dy):
        """
        Back-propagate the loss, computing all the derivatives, storing those w.r.t. the layer
        parameters, and returning the loss w.r.t. its inputs for further propagation.
        Args:
            dL_dy (ndarray): The loss derivative w.r.t. the layer's output (dL/dy = l'_{k+1}).
        Returns:
            dL_dx (ndarray): The loss derivative w.r.t. the layer's input (dL/dx).
        """
        dy_dz = self.derivated_activation_function(self.y)  # = f'
        dL_dz = (dL_dy * dy_dz)  # dL/dz = dL/dy * dy/dz = l'_{k+1} * f'
        dz_dw = self.x.T
        dz_dx = self.W.T
        dz_db = np.ones(dL_dy.shape[0])  # dz/db = d(W.x + b)/db = 0 + db/db = "ones"-vector

        # Computing the derivatives with respect to the layer's parameters, storing them for optimization:
        self.dL_dW = np.dot(dz_dw, dL_dz)
        self.dL_db = np.dot(dz_db, dL_dz)

        # Computing the derivative with respect to the input, to be passed to the previous layers (their `dL_dy`):
        dL_dx = np.dot(dL_dz, dz_dx)
        return dL_dx

    def optimize(self, epsilon):
        """
        Optimize the layer's parameters, using the stored derivative values.
        Args:
            epsilon (float): The learning rate.
        """
        self.W -= epsilon * self.dL_dW
        self.b -= epsilon * self.dL_db
Let's demonstrate how to use this layer to process input values, either one at a time or stacked together in batches. We instantiate a layer of 3 neurons (i.e., 3 output values) taking 2 input values each, and we apply the ReLU (Rectified Linear Unit) function for the activation:
input_size = 2
num_neurons = 3
relu_function = lambda y: np.maximum(y, 0)
layer = FullyConnectedLayer(num_inputs=input_size, layer_size=num_neurons, activation_function=relu_function)
# We randomly generate 2 input vectors:
x1 = np.random.uniform(-1, 1, 2).reshape(1, 2)
print("Input vector #1: {}".format(x1))
x2 = np.random.uniform(-1, 1, 2).reshape(1, 2)
print("Input vector #2: {}".format(x2))
y1 = layer.forward(x1)
print("Layer's output value given `x1` : {}".format(y1))
y2 = layer.forward(x2)
print("Layer's output value given `x2` : {}".format(y2))
x12 = np.concatenate((x1, x2)) # stack of input vectors, of shape `(2, 2)`
y12 = layer.forward(x12)
print("Layer's output value given `[x1, x2]` :\n{}".format(y12))
3. Implementing a Complete Neural Network
import numpy as np
# Importing the fully-connected layer implemented previously:
from fully_connected_layer import FullyConnectedLayer
#==============================================================================
# Defining the helper functions (activations, losses, and their derivatives)
#==============================================================================
def sigmoid(x):  # sigmoid function
    return 1 / (1 + np.exp(-x))  # y

def derivated_sigmoid(y):  # sigmoid derivative function, expressed w.r.t. the output y = sigmoid(x)
    return y * (1 - y)

def loss_L2(pred, target):  # L2 loss function
    return np.sum(np.square(pred - target)) / pred.shape[0]  # opt. we divide by the batch size

def derivated_loss_L2(pred, target):  # L2 derivative function
    return 2 * (pred - target)

def cross_entropy(pred, target):  # cross-entropy loss function
    return -np.mean(np.multiply(np.log(pred), target) + np.multiply(np.log(1 - pred), (1 - target)))

def derivated_cross_entropy(pred, target):  # cross-entropy derivative function
    return (pred - target) / (pred * (1 - pred))
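As a quick check of these functions, we can evaluate them on a toy prediction/target pair (the values below are picked arbitrarily for illustration):

pred = np.array([[0.8, 0.2]])
target = np.array([[1., 0.]])
print(loss_L2(pred, target))        # ((0.8-1)^2 + (0.2-0)^2) / 1 = 0.08
print(cross_entropy(pred, target))  # = -log(0.8) ≈ 0.2231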
#==============================================================================
# Defining the network
#==============================================================================
class SimpleNetwork(object):
    """A simple fully-connected NN.
    Args:
        num_inputs (int): The input vector size / number of input values.
        num_outputs (int): The output vector size.
        hidden_layers_sizes (list): A list of sizes for each hidden layer to add to the network.
        activation_function (callable): The activation function for all the layers.
        derivated_activation_function (callable): The derivated activation function.
        loss_function (callable): The loss function to train this network.
        derivated_loss_function (callable): The derivative of the loss function, for back-propagation.
    Attributes:
        layers (list): The list of layers forming this simple network.
        loss_function (callable): The loss function to train this network.
        derivated_loss_function (callable): The derivative of the loss function, for back-propagation.
    """

    def __init__(self, num_inputs, num_outputs, hidden_layers_sizes=(64, 32),
                 activation_function=sigmoid, derivated_activation_function=derivated_sigmoid,
                 loss_function=loss_L2, derivated_loss_function=derivated_loss_L2):
        super().__init__()
        # We build the list of layers composing the network, according to the provided arguments:
        layer_sizes = [num_inputs, *hidden_layers_sizes, num_outputs]
        self.layers = [
            FullyConnectedLayer(layer_sizes[i], layer_sizes[i + 1],
                                activation_function, derivated_activation_function)
            for i in range(len(layer_sizes) - 1)]
        self.loss_function = loss_function
        self.derivated_loss_function = derivated_loss_function

    def forward(self, x):
        """
        Forward the input vector through the layers, returning the output vector.
        Args:
            x (ndarray): The input vector, of shape `(batch_size, num_inputs)`.
        Returns:
            activation (ndarray): The output activation value, of shape `(batch_size, layer_size)`.
        """
        for layer in self.layers:  # from the input layer to the output one
            x = layer.forward(x)
        return x

    def predict(self, x):
        """
        Compute the output corresponding to input `x`, and return the index of the largest output value.
        Args:
            x (ndarray): The input vector, of shape `(1, num_inputs)`.
        Returns:
            best_class (int): The predicted class ID.
        """
        estimations = self.forward(x)
        best_class = np.argmax(estimations)
        return best_class

    def backward(self, dL_dy):
        """
        Back-propagate the loss through the layers (requires `forward()` to have been called beforehand).
        Args:
            dL_dy (ndarray): The loss derivative w.r.t. the network's output (dL/dy).
        Returns:
            dL_dx (ndarray): The loss derivative w.r.t. the network's input (dL/dx).
        """
        for layer in reversed(self.layers):  # from the output layer to the input one
            dL_dy = layer.backward(dL_dy)
        return dL_dy

    def optimize(self, epsilon):
        """
        Optimize the network parameters according to the stored gradients (requires `backward()` to have been called beforehand).
        Args:
            epsilon (float): The learning rate.
        """
        for layer in self.layers:  # the order doesn't matter here
            layer.optimize(epsilon)

    def evaluate_accuracy(self, X_val, y_val):
        """
        Given a dataset and its ground-truth labels, evaluate the current accuracy of the network.
        Args:
            X_val (ndarray): The input validation dataset.
            y_val (ndarray): The corresponding ground-truth validation dataset.
        Returns:
            accuracy (float): The accuracy of the network (= number of correct predictions / dataset size).
        """
        num_corrects = 0
        for i in range(len(X_val)):
            pred_class = self.predict(X_val[i])
            if pred_class == y_val[i]:
                num_corrects += 1
        return num_corrects / len(X_val)

    def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, num_epochs=5, learning_rate=1e-3):
        """
        Given a training dataset and its ground-truth labels, train the network batch by batch.
        Args:
            X_train (ndarray): The input training dataset.
            y_train (ndarray): The corresponding ground-truth training dataset.
            X_val (ndarray): The input validation dataset.
            y_val (ndarray): The corresponding ground-truth validation dataset.
            batch_size (int): The mini-batch size.
            num_epochs (int): The number of training epochs, i.e., iterations over the whole dataset.
            learning_rate (float): The learning rate to scale the derivatives.
        Returns:
            losses (list): The list of training losses for each epoch.
            accuracies (list): The list of validation accuracy values for each epoch.
        """
        num_batches_per_epoch = len(X_train) // batch_size
        do_validation = X_val is not None and y_val is not None
        losses, accuracies = [], []
        for i in range(num_epochs):  # for each training epoch
            epoch_loss = 0
            for b in range(num_batches_per_epoch):  # for each batch composing the dataset
                # Get the batch:
                batch_index_begin = b * batch_size
                batch_index_end = batch_index_begin + batch_size
                x = X_train[batch_index_begin: batch_index_end]
                targets = y_train[batch_index_begin: batch_index_end]
                # Optimize on the batch:
                predictions = y = self.forward(x)  # forward pass
                L = self.loss_function(predictions, targets)  # loss computation
                dL_dy = self.derivated_loss_function(predictions, targets)  # loss derivation
                self.backward(dL_dy)  # back-propagation pass
                self.optimize(learning_rate)  # optimization of the NN
                epoch_loss += L
            # Logging the training loss and validation accuracy, to follow the training:
            epoch_loss /= num_batches_per_epoch
            losses.append(epoch_loss)
            if do_validation:
                accuracy = self.evaluate_accuracy(X_val, y_val)
                accuracies.append(accuracy)
            else:
                accuracy = np.nan
            print("Epoch {:4d}: training loss = {:.6f} | val accuracy = {:.2f}%".format(i, epoch_loss, accuracy * 100))
        return losses, accuracies
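Before tackling MNIST, here is a minimal sketch of the full training loop on the XOR problem (the toy data, network size, and hyper-parameters below are arbitrary choices for illustration; convergence depends on the random initialization, so more epochs may be needed):

X_xor = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y_xor = np.array([0, 1, 1, 0])  # XOR class IDs
xor_net = SimpleNetwork(num_inputs=2, num_outputs=2, hidden_layers_sizes=[8])
# One-hot targets for training, raw class IDs for validation
# (note: this prints one log line per epoch):
xor_net.train(X_xor, np.eye(2)[y_xor], X_xor, y_xor,
              batch_size=4, num_epochs=300, learning_rate=0.5)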
4. Training the Network on Handwritten Digits
For this task, we use the MNIST dataset (Yann LeCun and Corinna Cortes hold all copyrights for this dataset).
Before implementing our solution, we should prepare the data, loading the MNIST images for training and testing our method. For simplicity, we will use the Python module `mnist`, developed by Marc Garcia.
%matplotlib inline
# !pip install matplotlib mnist  # Uncomment and run if these modules are not installed yet.
import matplotlib  # We use this package to visualize some data and results
import matplotlib.pyplot as plt
import mnist

np.random.seed(42)

# The mnist module makes it simple to load the training and testing data (images and their labels):
X_train, y_train = mnist.train_images(), mnist.train_labels()
X_test, y_test = mnist.test_images(), mnist.test_labels()
num_classes = 10 # classes are the digits from 0 to 9
# We can check the number and size of the training/testing samples:
X_train.shape
# outputs (60000, 28, 28)
X_test.shape
# outputs (10000, 28, 28)
# That is, we have 60,000 training samples and 10,000 testing samples, each one a 28×28-pixel image.
# We can have a look at the data, for instance using matplotlib:
img_idx = np.random.randint(0, X_test.shape[0])
plt.imshow(X_test[img_idx], cmap=matplotlib.cm.binary)
plt.axis("off")
plt.show()
y_test[img_idx]  # outputs 1, the label of the displayed image
# As we can see, our images match their ground-truth labels.
# Since our network processes its inputs as row vectors, we need to flatten each image into a 1D vector, i.e., a vector of shape (1, 784) (since 28 × 28 = 784):
X_train, X_test = X_train.reshape(-1, 28 * 28), X_test.reshape(-1, 28 * 28)
print("Pixel values between {} and {}".format(X_train.min(), X_train.max()))
# Pixel values between 0 and 255
X_train, X_test = X_train / 255., X_test / 255.
print("Normalized pixel values between {} and {}".format(X_train.min(), X_train.max()))
# Normalized pixel values between 0.0 and 1.0
# Finally, to compute the loss, we need to one-hot encode the labels, e.g., converting the label 4 into [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]:
y_train = np.eye(num_classes)[y_train]
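We can quickly verify the encoding by printing the first encoded label (in the standard MNIST ordering the first training sample is a 5, though this check simply prints whatever label comes first):

print(y_train[0])  # e.g. for the digit 5: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]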
# Classifier: using our SimpleNetwork class, we instantiate a network with 2 hidden layers, taking the flattened images as input and returning a 10-value vector representing the network's belief that the image belongs to each class (the higher the value, the more likely):
mnist_classifier = SimpleNetwork(num_inputs=X_train.shape[1],
num_outputs=num_classes, hidden_layers_sizes=[64, 32])
predictions = mnist_classifier.forward(X_train) # forward pass
loss_untrained = mnist_classifier.loss_function(predictions, y_train) # loss computation
accuracy_untrained = mnist_classifier.evaluate_accuracy(X_test, y_test) # Accuracy
print("Untrained : training loss = {:.6f} | val accuracy = {:.2f}%".format(
loss_untrained, accuracy_untrained * 100))
# Having checked the untrained performance above, we can now train our network (note: this may take a while):
losses, accuracies = mnist_classifier.train(X_train, y_train, X_test, y_test,
batch_size=30, num_epochs=500)
# Finally, we can plot the evolution of the training loss and validation accuracy to better visualize the training:
losses, accuracies = [loss_untrained] + losses, [accuracy_untrained] + accuracies
fig, ax_loss = plt.subplots()
color = 'red'
ax_loss.set_xlim([0, 510])
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Training Loss', color=color)
ax_loss.plot(losses, color=color)
ax_loss.tick_params(axis='y', labelcolor=color)
ax_acc = ax_loss.twinx() # instantiate a second axes that shares the same x-axis
color = 'blue'
ax_acc.set_xlim([0, 510])
ax_acc.set_ylim([0, 1])
ax_acc.set_ylabel('Val Accuracy', color=color)
ax_acc.plot(accuracies, color=color)
ax_acc.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()
As we can see, our network converges quite fast, though the validation accuracy keeps slowly increasing.
Let's verify how the trained network now performs on our random test image:
# We use `np.expand_dims(x, 0)` to simulate a batch (transforming the image shape
# from (784,) to (1, 784)):
predicted_class = mnist_classifier.predict(np.expand_dims(X_test[img_idx], 0))
print('Predicted class: {}; Correct class: {}'.format(predicted_class, y_test[img_idx]))
If training went well, the predicted class should now match the correct one.
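To check a few more samples at a glance, here is a small illustrative sketch (added here, not from the original text) that displays several random test images together with the network's predictions:

# Displaying some test images along with the network's predictions:
fig = plt.figure(figsize=(8, 2))
for i in range(5):
    idx = np.random.randint(0, X_test.shape[0])
    pred = mnist_classifier.predict(np.expand_dims(X_test[idx], 0))
    ax = fig.add_subplot(1, 5, i + 1)
    ax.imshow(X_test[idx].reshape(28, 28), cmap=matplotlib.cm.binary)  # un-flatten for display
    ax.set_title("pred: {} / true: {}".format(pred, y_test[idx]))
    ax.axis("off")
plt.show()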