0
点赞
收藏
分享

微信扫一扫

蒻寂自救计划

凛冬已至夏日未远 2022-04-13 阅读 20
机器学习

房价预测比赛

比赛链接

复现沐神的代码,做点笔记

下载和缓存数据集

import hashlib
import os
import tarfile
import zipfile
import requests

#@save
# Registry of downloadable datasets: name -> (url, sha1 hex digest).
DATA_HUB = dict()
# Common URL prefix under which the datasets are hosted.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

def download(name, cache_dir=os.path.join('.', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory holding the downloaded file (created if
            missing).

    Returns:
        Path of the local file, named after the last component of the URL.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached copy in 1 MiB chunks so large files are never
        # loaded into memory at once.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    # Close the connection deterministically and refuse to store an HTTP
    # error page as if it were the dataset.
    with requests.get(url, stream=True, verify=True) as r:
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Write the body chunk by chunk; reading `r.content` would pull
            # the whole payload into memory despite stream=True.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that holds the downloaded
    file.

    Args:
        name: Key into DATA_HUB, forwarded to ``download``.
        folder: Optional sub-directory name; when given, the returned path
            is that folder inside the extraction directory.

    Returns:
        ``<extraction dir>/<folder>`` if ``folder`` is truthy, otherwise the
        archive path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
    else:
        # Raise explicitly: an `assert False` is removed under `python -O`,
        # which would leave the archive handle undefined below.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    with opener(fname, 'r') as fp:  # closes the archive handle afterwards
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset registered in DATA_HUB via ``download``."""
    for dataset_name in DATA_HUB:
        download(dataset_name)

访问和读取数据

import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
# Register the Kaggle house-price training set as a (url, sha1 hex digest)
# pair, the shape download() unpacks each DATA_HUB entry into.
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

# Same for the test set.
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
# Download (or reuse the cached copy of) both CSV files and load them.
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))

数据预处理

# Show (rows, columns) of the training and test tables.
print(train_data.shape)
print(test_data.shape)
(1460, 81)
(1459, 80)
# Preview the first four rows: the first four and the last three columns.
print(train_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
# Same preview for the test table.
print(test_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
     Id  MSSubClass MSZoning  LotFrontage  YrSold SaleType SaleCondition
0  1461          20       RH         80.0    2010       WD        Normal
1  1462          20       RL         81.0    2010       WD        Normal
2  1463          60       RL         74.0    2010       WD        Normal
3  1464          60       RL         78.0    2010       WD        Normal
# Stack train and test features into one table: skip the first column of
# both, and additionally the last column of the training table.
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# Display the training table without its first column.
train_data.iloc[:,1:]

MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
060RL65.08450PaveNaNRegLvlAllPubInside...0NaNNaNNaN022008WDNormal208500
120RL80.09600PaveNaNRegLvlAllPubFR2...0NaNNaNNaN052007WDNormal181500
260RL68.011250PaveNaNIR1LvlAllPubInside...0NaNNaNNaN092008WDNormal223500
370RL60.09550PaveNaNIR1LvlAllPubCorner...0NaNNaNNaN022006WDAbnorml140000
460RL84.014260PaveNaNIR1LvlAllPubFR2...0NaNNaNNaN0122008WDNormal250000
..................................................................
145560RL62.07917PaveNaNRegLvlAllPubInside...0NaNNaNNaN082007WDNormal175000
145620RL85.013175PaveNaNRegLvlAllPubInside...0NaNMnPrvNaN022010WDNormal210000
145770RL66.09042PaveNaNRegLvlAllPubInside...0NaNGdPrvShed250052010WDNormal266500
145820RL68.09717PaveNaNRegLvlAllPubInside...0NaNNaNNaN042010WDNormal142125
145920RL75.09937PaveNaNRegLvlAllPubInside...0NaNNaNNaN062008WDNormal147500

1460 rows × 80 columns

# Shape of the combined feature table.
all_features.shape
(2919, 79)
# Standardize the data: every non-object column has its mean subtracted and
# is divided by its standard deviation.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# Set missing values to zero (after the rescaling above, zero is the column mean).
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# Handle discrete values: look at one object-typed column.
all_features['MSZoning']
0       RL
1       RL
2       RL
3       RL
4       RL
        ..
1454    RM
1455    RM
1456    RL
1457    RL
1458    RL
Name: MSZoning, Length: 2919, dtype: object
# `dummy_na=True` treats "na" (a missing value) as a valid feature value and
# creates an indicator feature for it.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Shape after one-hot encoding.
all_features.shape
(2919, 331)
# Small example of what dummy_na=True changes.
s1 = ['a', 'b', np.nan]
s1 = pd.DataFrame(s1)
print(s1)
# Default call: no dummy_na argument.
s2 = pd.get_dummies(s1)
print(s2)
# Same call with dummy_na=True, for comparison with s2.
s3 = pd.get_dummies(s1,dummy_na=True)
print(s3)
     0
0    a
1    b
2  NaN
   0_a  0_b
0    1    0
1    0    1
2    0    0
   0_a  0_b  0_nan
0    1    0      0
1    0    1      0
2    0    0      1
n_train = train_data.shape[0]
# Convert to tensors. The frame is cast to float32 in NumPy first: when
# pd.get_dummies yields bool indicator columns (the default in newer pandas),
# `.values` on the mixed float/bool frame is an object array, which
# torch.tensor cannot convert.
train_features = torch.tensor(all_features[:n_train].values.astype(np.float32), dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values.astype(np.float32), dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# A -1 in reshape is a placeholder: that dimension is inferred from the
# number of elements and the other dimensions.
# E.g. reshape(2, -1) fixes two rows and computes the number of columns;
# likewise reshape(-1, 2) fixes two columns and computes the number of rows.
train_data.SalePrice.values.reshape(-1, 1)
array([[208500],
       [181500],
       [223500],
       ...,
       [266500],
       [142125],
       [147500]], dtype=int64)

训练

首先训练一个带平方损失的线性模型,用以查看数据中是否存在有意义的信息,同时作为基线模型,让人直观地知道最好的模型比简单模型好多少

# MSE is short for "mean squared error".
loss = nn.MSELoss()
# Input width of the model = number of columns of the training feature tensor.
in_features = train_features.shape[1]

def get_net():
    """Return a fresh model: one linear layer from in_features inputs to 1 output."""
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)

使用相对误差 $\frac{y - \hat{y}}{y}$,而不是绝对误差 $y - \hat{y}$
(解决这个问题的一种方法是用价格预测的对数来衡量差异)。
事实上,这也是比赛中官方用来评价提交质量的误差指标。
即对于 $\delta$,将 $|\log y - \log \hat{y}| \leq \delta$
转换为 $e^{-\delta} \leq \frac{\hat{y}}{y} \leq e^\delta$。
这使得预测价格的对数与真实标签价格的对数之间出现以下均方根误差:

$$\sqrt{\frac{1}{n}\sum_{i=1}^n\left(\log y_i -\log \hat{y}_i\right)^2}.$$

# Root mean square error, computed on logarithms
def log_rmse(net, features, labels):
    """Return the RMSE between log(predictions) and log(labels) as a float.

    Predictions below 1 are raised to 1 before taking the logarithm, which
    keeps the log well-behaved.
    """
    predictions = net(features)
    clipped_preds = predictions.clamp(1, float('inf'))
    mse_of_logs = loss(torch.log(clipped_preds), torch.log(labels))
    return torch.sqrt(mse_of_logs).item()
def train(net, train_features, train_labels, test_features, test_labels,
         num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on the MSE loss and record log RMSE per epoch.

    Returns:
        ``(train_ls, test_ls)``: one log-RMSE value per epoch for the
        training data and, if ``test_labels`` is not None, for the test
        data (otherwise ``test_ls`` stays empty).
    """
    train_ls = []
    test_ls = []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # Adam optimization algorithm
    optimizer = torch.optim.Adam(
        net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    track_test = test_labels is not None

    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        # Metrics are taken once per epoch, after all minibatch updates.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if track_test:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

K折交叉验证

def get_k_fold_data(k, i, X, y):
    """Split ``(X, y)`` into training and validation data for fold ``i`` of ``k``.

    Rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows; the
    last fold also takes the remainder, so no sample is dropped when the row
    count is not a multiple of ``k``.

    Args:
        k: Number of folds, must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        X: 2-D feature tensor, indexed as ``X[rows, :]``.
        y: Label tensor whose first dimension matches ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: If ``k <= 1`` or ``i`` is outside ``[0, k)``.
    """
    assert k > 1, 'K必须大于1'
    assert 0 <= i < k, f'i={i} out of range [0, {k})'
    n_rows = X.shape[0]
    fold_size = n_rows // k
    X_parts, y_parts = [], []
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end so the n_rows % k leftover rows
        # are kept.
        stop = n_rows if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    # Concatenate once instead of growing the tensors inside the loop.
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid

当我们在 $K$ 折交叉验证中训练 $K$ 次后,[返回训练和验证误差的平均值]。

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run K-fold cross-validation and average the final-epoch errors.

    For each fold a fresh network from ``get_net`` is trained; the learning
    curves of the first fold are plotted and every fold's final log RMSE is
    printed.

    Returns:
        ``(mean final training log RMSE, mean final validation log RMSE)``.
    """
    final_train_errors, final_valid_errors = [], []
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_curve, valid_curve = train(
            model, *fold_data, num_epochs, learning_rate, weight_decay,
            batch_size)
        final_train_errors.append(train_curve[-1])
        final_valid_errors.append(valid_curve[-1])
        if fold == 0:
            # Only the first fold's curves are plotted.
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(epochs, [train_curve, valid_curve],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'第{fold + 1}折,训练log rmse{float(train_curve[-1]):f},' f'验证log rmse{float(valid_curve[-1]):f}')
    return sum(final_train_errors) / k, sum(final_valid_errors) / k

模型选择

# Hyperparameters: number of folds, epochs, learning rate, weight decay, batch size.
k, num_epochs, lr, weight_decay, batch_size = 10, 200, 5, 0, 64
# Cross-validate and report the errors averaged over the k folds.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, ' f'平均验证log rmse: {float(valid_l):f}')
第1折,训练log rmse0.139559,验证log rmse0.154900
第2折,训练log rmse0.139979,验证log rmse0.121036
第3折,训练log rmse0.140434,验证log rmse0.133896
第4折,训练log rmse0.135732,验证log rmse0.166204
第5折,训练log rmse0.134720,验证log rmse0.165762
第6折,训练log rmse0.139707,验证log rmse0.118329
第7折,训练log rmse0.140887,验证log rmse0.128191
第8折,训练log rmse0.139096,验证log rmse0.149429
第9折,训练log rmse0.133857,验证log rmse0.192688
第10折,训练log rmse0.139484,验证log rmse0.138164
10-折验证: 平均训练log rmse: 0.138346, 平均验证log rmse: 0.146860

在这里插入图片描述

提交到kaggle

def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on all training data, predict the test set and write a submission.

    Args:
        train_features: Training feature tensor.
        test_feature: Test feature tensor the trained network is applied to.
        train_labels: Training label tensor.
        test_data: Test DataFrame with an 'Id' column; a 'SalePrice' column
            holding the predictions is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Plots the training curve, prints the final training log RMSE and
        writes 'submission.csv' (columns Id, SalePrice) to the current
        working directory.
    """
    net = get_net()
    # No held-out data here: every training sample is used for fitting.
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Apply the network to the test set. Use the `test_feature` argument, not
    # a module-level `test_features`, so the function predicts on what the
    # caller passed in.
    preds = net(test_feature).detach().numpy()
    # Reformat the predictions for export to Kaggle.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Final run with the hyperparameters chosen above; writes submission.csv.
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)
训练log rmse:0.136236

在这里插入图片描述

  • 登录Kaggle网站,访问房价预测竞赛页面。
  • 点击“Submit Predictions”或“Late Submission”按钮(在撰写本文时,该按钮位于右侧)。
  • 点击页面底部虚线框中的“Upload Submission File”按钮,选择你要上传的预测文件。
  • 点击页面底部的“Make Submission”按钮,即可查看你的结果。
举报

相关推荐

0 条评论