# Learning curve
# Observe how the size of the training set affects the evaluation metric.
# Import the libraries.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
# Load the digits dataset that ships with scikit-learn.
digits = load_digits()

# Split it into the feature matrix and the target vector.
features = digits.data
target = digits.target

# Cross-validate the classifier on training subsets of increasing size and
# collect the training and test scores obtained for every size.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(),  # classifier to evaluate
    features,  # feature matrix
    target,  # target vector
    cv=10,  # number of cross-validation folds
    scoring="accuracy",  # performance metric
    n_jobs=-1,  # use all CPUs
    # 50 evenly spaced training-set sizes, given as values from 0.01 to 1.0
    train_sizes=np.linspace(0.01, 1.0, 50),
)
# Mean and standard deviation of the training scores, reduced along axis 1
# so that one value remains per training-set size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)

# The same two statistics for the test (cross-validation) scores.
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)
# Draw the mean scores: a dashed line for the training score and a solid
# line for the cross-validation score.
plt.plot(train_sizes, train_mean, "--", color="#111111", label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shade a band of one standard deviation on either side of each mean curve.
# fill_between fills the area between the lower and upper bounds; the band
# colour "#DDDDDD" is a light gray.
plt.fill_between(
    train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD"
)
plt.fill_between(
    train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD"
)

# Add the title, axis labels and legend, then render the figure.
plt.title("Learning Curve")
# One call per statement: the original packed both label calls into a
# discarded tuple expression with a dangling trailing comma.
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()