房价预测比赛
比赛链接
复现沐神的代码,做点笔记
下载和缓存数据集
import hashlib
import os
import tarfile
import zipfile
import requests
#@save
# Registry of downloadable datasets: name -> (url, sha1_hash).
DATA_HUB = {}
# Base URL under which the dataset files are hosted.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('.', 'data')):  #@save
    """Download a file registered in DATA_HUB and return the local file name.

    Args:
        name: Key in DATA_HUB; its value must be a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in; created if missing.

    Returns:
        Path of the local file (``cache_dir`` joined with the last URL segment).

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so the file is never held in memory whole.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the cache file: otherwise an HTTP error page
        # would be written to disk under the dataset's name.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body instead of loading it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive and extract it next to the downloaded file.

    Args:
        name: Dataset name passed on to ``download``.
        folder: Optional sub-directory name to return instead of the
            archive path without its extension.

    Returns:
        ``base_dir/folder`` when ``folder`` is given, otherwise the archive
        path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
    else:
        # Raised explicitly (not `assert False`) so the check survives
        # `python -O`; the exception type is unchanged for callers.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # The context manager closes the archive handle even if extraction fails.
    with opener(fname, 'r') as fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
def download_all():  #@save
    """Fetch every dataset currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB:
        download(dataset_name)
访问和读取数据
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
# Register the two Kaggle house-price CSV files as (url, sha1_hash) pairs.
DATA_HUB.update({  #@save
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce'),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90'),
})

# Download both files (or reuse the cached copies) and load them as DataFrames.
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
数据预处理
# Show the (rows, columns) shape of the raw training and test tables.
print(train_data.shape)
print(test_data.shape)
(1460, 81)
(1459, 80)
# Preview the first four training rows: first four and last three columns.
print(train_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice
0 1 60 RL 65.0 WD Normal 208500
1 2 20 RL 80.0 WD Normal 181500
2 3 60 RL 68.0 WD Normal 223500
3 4 70 RL 60.0 WD Abnorml 140000
# Preview the first four test rows: first four and last three columns.
print(test_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
Id MSSubClass MSZoning LotFrontage YrSold SaleType SaleCondition
0 1461 20 RH 80.0 2010 WD Normal
1 1462 20 RL 81.0 2010 WD Normal
2 1463 60 RL 74.0 2010 WD Normal
3 1464 60 RL 78.0 2010 WD Normal
# Stack training and test features row-wise. The first column (Id) is dropped
# from both frames; the last training column (SalePrice, the label) is dropped
# as well, so both parts have the same feature columns.
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# Notebook display: the training frame without its first column.
train_data.iloc[:,1:]
MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1455 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
1456 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
1457 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
1458 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
1459 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | Inside | ... | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
1460 rows × 80 columns
# Notebook display: shape of the combined train + test feature table.
all_features.shape
(2919, 79)
# Standardize the data: every non-object (numeric) column is rescaled to
# zero mean and unit standard deviation.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# Set missing values to zero; after standardization zero is the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# Handle discrete (categorical) values; inspect one such column as an example.
all_features['MSZoning']
0 RL
1 RL
2 RL
3 RL
4 RL
..
1454 RM
1455 RM
1456 RL
1457 RL
1458 RL
Name: MSZoning, Length: 2919, dtype: object
# `dummy_na=True` treats "na" (a missing value) as a valid feature value and
# creates an indicator feature for it.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Notebook display: shape after one-hot encoding.
all_features.shape
(2919, 331)
# Example illustrating dummy_na=True
s1 = ['a', 'b', np.nan]
s1 = pd.DataFrame(s1)
print(s1)
# Default: the NaN row gets no indicator column of its own.
s2 = pd.get_dummies(s1)
print(s2)
# dummy_na=True: an extra indicator column marks the NaN row.
s3 = pd.get_dummies(s1,dummy_na=True)
print(s3)
0
0 a
1 b
2 NaN
0_a 0_b
0 1 0
1 0 1
2 0 0
0_a 0_b 0_nan
0 1 0 0
1 0 1 0
2 0 0 1
# Number of training rows; all_features holds the training rows first.
n_train = train_data.shape[0]
# Convert to tensors.
# Cast to float32 in NumPy first: with pandas >= 2.0 get_dummies produces bool
# columns, so `.values` on the mixed float/bool frame is an object array that
# torch.tensor cannot convert. On older pandas the cast yields the same values.
train_features = torch.tensor(all_features[:n_train].values.astype(np.float32),
                              dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values.astype(np.float32),
                             dtype=torch.float32)
# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# The -1 is a placeholder meaning "infer this dimension": at most one
# dimension may be -1 and NumPy computes it from the number of elements.
# Other negative values (e.g. -2) are not allowed.
# For example, reshape(2, -1) fixes two rows and infers the column count;
# likewise, reshape(-1, 2) fixes two columns and infers the row count.
train_data.SalePrice.values.reshape(-1, 1)
array([[208500],
[181500],
[223500],
...,
[266500],
[142125],
[147500]], dtype=int64)
训练
首先训练一个带平方损失的线性模型,用以查看数据中是否存在有意义的信息,同时作为基线模型,让人直观知道最好的模型超出简单模型多少。
# MSE stands for mean squared error.
loss = nn.MSELoss()
# Number of input features: the column count of the training feature tensor.
in_features = train_features.shape[1]
def get_net():
    """Build the baseline model: a single linear layer with one output.

    The input width is read from the module-level ``in_features``.
    """
    return nn.Sequential(nn.Linear(in_features, 1))
使用相对误差 $\frac{y - \hat{y}}{y}$,而不是绝对误差 $y - \hat{y}$
(解决这个问题的一种方法是用价格预测的对数来衡量差异)。
事实上,这也是比赛中官方用来评价提交质量的误差指标。
即将满足 $|\log y - \log \hat{y}| \leq \delta$ 的 $\delta$ 转换为 $e^{-\delta} \leq \frac{\hat{y}}{y} \leq e^\delta$。
这使得预测价格的对数与真实标签价格的对数之间出现以下均方根误差:
$$\sqrt{\frac{1}{n}\sum_{i=1}^n\left(\log y_i -\log \hat{y}_i\right)^2}.$$
# Root mean squared error between log-predictions and log-labels.
def log_rmse(net, features, labels):
    """Return the RMSE between log(net(features)) and log(labels) as a float.

    Args:
        net: Callable model applied to ``features``.
        features: Input passed to ``net``.
        labels: Target values; their logarithm is taken, so they are
            presumably strictly positive prices (verify against the caller).

    Returns:
        The log-RMSE as a Python float.
    """
    # Pure evaluation: the result leaves as a plain float, so no autograd
    # graph is needed. Skipping it avoids tracking the whole dataset per call.
    with torch.no_grad():
        # Clamp predictions below 1 up to 1 to keep the logarithm stable.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training data.
        test_features, test_labels: Evaluation data; pass ``None`` as
            ``test_labels`` to skip evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate, weight_decay: Adam hyperparameters.
        batch_size: Mini-batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists; the second list
        stays empty when ``test_labels`` is ``None``.
    """
    batches = d2l.load_array((train_features, train_labels), batch_size)
    # Adam optimizer
    adam = torch.optim.Adam(net.parameters(), lr=learning_rate,
                            weight_decay=weight_decay)
    evaluate_test = test_labels is not None
    train_history, test_history = [], []
    for _ in range(num_epochs):
        for inputs, targets in batches:
            adam.zero_grad()
            batch_loss = loss(net(inputs), targets)
            batch_loss.backward()
            adam.step()
        # Metrics are computed once per epoch on the full tensors.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_test:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history
K折交叉验证
def get_k_fold_data(k, i, X, y):
    """Split ``(X, y)`` into training and validation parts for fold ``i`` of ``k``.

    Folds are contiguous row ranges of size ``X.shape[0] // k``. The last fold
    also takes the remainder rows, so no sample is dropped when the row count
    is not divisible by ``k``. When it is divisible, the result equals a plain
    equal-size split.

    Args:
        k: Number of folds, must be greater than 1.
        i: Zero-based index of the fold used for validation.
        X: Feature tensor with samples along dimension 0 (at least 2-D).
        y: Label tensor with samples along dimension 0.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k`` is not greater than 1.
        ValueError: If ``i`` is outside ``[0, k)``.
    """
    # Explicit raise so the check is not stripped under `python -O`.
    if not k > 1:
        raise AssertionError('K必须大于1')
    if not 0 <= i < k:
        raise ValueError(f'fold index i must be in [0, {k}), got {i}')
    n_rows = X.shape[0]
    fold_size = n_rows // k
    start = i * fold_size
    # The final fold extends to the end so remainder rows are not lost.
    stop = n_rows if i == k - 1 else start + fold_size
    X_valid, y_valid = X[start:stop, :], y[start:stop]
    # Training data is everything before and after the validation range.
    X_train = torch.cat([X[:start, :], X[stop:, :]], 0)
    y_train = torch.cat([y[:start], y[stop:]], 0)
    return X_train, y_train, X_valid, y_valid
当我们在 $K$ 折交叉验证中训练 $K$ 次后,返回训练和验证误差的平均值。
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run K-fold cross-validation and return the mean final losses.

    For each fold a fresh network from ``get_net`` is trained; the last-epoch
    training and validation log-RMSE are printed and accumulated. The learning
    curves of the first fold are plotted with ``d2l.plot``.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds.
    """
    train_total = valid_total = 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_curve, valid_curve = train(model, *fold_data, num_epochs,
                                         learning_rate, weight_decay,
                                         batch_size)
        final_train, final_valid = train_curve[-1], valid_curve[-1]
        train_total += final_train
        valid_total += final_valid
        if fold == 0:
            # Only the first fold's curves are drawn.
            d2l.plot(list(range(1, num_epochs + 1)),
                     [train_curve, valid_curve],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'第{fold + 1}折,训练log rmse{float(final_train):f},'
              f'验证log rmse{float(final_valid):f}')
    return train_total / k, valid_total / k
模型选择
# Hyperparameters for the cross-validation run: number of folds, epochs,
# learning rate, weight decay and mini-batch size.
k, num_epochs, lr, weight_decay, batch_size = 10, 200, 5, 0, 64
# Run K-fold cross-validation and report the mean final log-RMSE values.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, ' f'平均验证log rmse: {float(valid_l):f}')
第1折,训练log rmse0.139559,验证log rmse0.154900
第2折,训练log rmse0.139979,验证log rmse0.121036
第3折,训练log rmse0.140434,验证log rmse0.133896
第4折,训练log rmse0.135732,验证log rmse0.166204
第5折,训练log rmse0.134720,验证log rmse0.165762
第6折,训练log rmse0.139707,验证log rmse0.118329
第7折,训练log rmse0.140887,验证log rmse0.128191
第8折,训练log rmse0.139096,验证log rmse0.149429
第9折,训练log rmse0.133857,验证log rmse0.192688
第10折,训练log rmse0.139484,验证log rmse0.138164
10-折验证: 平均训练log rmse: 0.138346, 平均验证log rmse: 0.146860
提交到kaggle
def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set and write predictions to submission.csv.

    Args:
        train_features, train_labels: Training tensors.
        test_feature: Feature tensor to predict on.
        test_data: Test DataFrame with an ``Id`` column; a ``SalePrice``
            column is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Training hyperparameters
            forwarded to ``train``.

    Side effects:
        Plots the training curve, prints the final training log-RMSE,
        mutates ``test_data`` and writes ``submission.csv`` to the current
        working directory.
    """
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set. Use the `test_feature` argument:
    # reading a module-level `test_features` here would silently ignore
    # whatever the caller passed in.
    preds = net(test_feature).detach().numpy()
    # Reformat the predictions for export to Kaggle.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Train on all training data with the hyperparameters chosen above and
# write the Kaggle submission file.
train_and_pred(train_features, test_features, train_labels, test_data,
num_epochs, lr, weight_decay, batch_size)
训练log rmse:0.136236
- 登录Kaggle网站,访问房价预测竞赛页面。
- 点击“Submit Predictions”或“Late Submission”按钮(在撰写本文时,该按钮位于右侧)。
- 点击页面底部虚线框中的“Upload Submission File”按钮,选择你要上传的预测文件。
- 点击页面底部的“Make Submission”按钮,即可查看你的结果。