LightGBM 与 XGBoost
LightGBM 是目前几乎所有结构化数据竞赛中的基学习器(base learner)。这主要归功于 LightGBM 的实现方式:它不像 XGBoost 的默认设置那样精确搜索最佳分割点,而是通过直方图进行近似(XGBoost 现在也具备此功能,但速度仍不如 LightGBM)。
这会导致预测性能略有下降,但速度会大大提高。这意味着更多的特征工程/实验/模型调整机会,这不可避免地会产生更大的预测性能提升。 (特征工程是赢得大多数 Kaggle 比赛的关键)
# pandas is used by this section, which runs before the file's main import
# section further down; import it here so the name is bound when read
# top to bottom. The later duplicate import is harmless.
import pandas as pd

# Common prefix of every input file read below.
INPUT_DIR = "../input/jpx-tokyo-stock-exchange-prediction"

stock_list = pd.read_csv(f"{INPUT_DIR}/stock_list.csv")
prices = pd.read_csv(f"{INPUT_DIR}/train_files/stock_prices.csv")
supplemental_prices = pd.read_csv(f"{INPUT_DIR}/supplemental_files/stock_prices.csv")

# Stack the main and supplemental price rows, then attach each security's
# Name via a left join so every price row is kept.
df_train = pd.concat([prices, supplemental_prices])
df_train = pd.merge(
    df_train,
    stock_list[['SecuritiesCode', 'Name']],
    on='SecuritiesCode',
    how='left',
)

# Keep only the securities that appear in the main training prices.
stock_list = stock_list.loc[stock_list['SecuritiesCode'].isin(prices['SecuritiesCode'].unique())]

# Quick look at the result: column names, number of distinct securities.
print(list(df_train.columns))
print(len(df_train['SecuritiesCode'].unique()))
df_train.head()
import os
import traceback
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import jpx_tokyo_market_prediction
import warnings; warnings.filterwarnings("ignore")
训练
def upper_shadow(df):
    """Return High minus the larger of Close and Open.

    `df` may be anything indexable by 'High', 'Close' and 'Open'
    (the file passes both a DataFrame and a single row).
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top
def lower_shadow(df):
    """Return the smaller of Close and Open, minus Low.

    `df` may be anything indexable by 'Close', 'Open' and 'Low'
    (the file passes both a DataFrame and a single row).
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']
# Feature builder shared by training and inference.
# It accepts a whole frame or a single row, so the same code path is reused.
def get_features(df):
    """Return a copy of the OHLCV columns plus the two shadow features.

    The input is not modified: the selected columns are copied first and
    'Upper_Shadow' / 'Lower_Shadow' are added to that copy.
    """
    base_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    features = df[base_columns].copy()
    features['Upper_Shadow'] = upper_shadow(features)
    features['Lower_Shadow'] = lower_shadow(features)
    return features
主要训练函数
def get_Xy_and_model(df_train, device_type='gpu'):
    """Build the training matrix from `df_train` and fit a LightGBM regressor.

    Args:
        df_train: frame holding the columns read by `get_features` plus a
            'Target' column used as the regression label.
        device_type: forwarded to `lgb.LGBMRegressor`. Defaults to 'gpu',
            which is what this function always used; pass 'cpu' to train
            where no GPU-enabled LightGBM build is available.

    Returns:
        A tuple `(X, y, model)`: the feature frame, the label series (both
        after dropping incomplete rows) and the fitted regressor.
    """
    df_proc = get_features(df_train)
    df_proc['y'] = df_train['Target']
    # Drop every row with a missing feature or a missing label.
    df_proc = df_proc.dropna(how="any")

    X = df_proc.drop(columns="y")
    y = df_proc["y"]

    model = lgb.LGBMRegressor(device_type=device_type)
    model.fit(X, y)
    return X, y, model
在全部证券的数据上训练单个模型,并对一行样本做预测检查
# Fit a single model on the full training frame.
print("Training model")
X, y, model = get_Xy_and_model(df_train)

# Plural aliases of the same objects; the submission code reads `models`.
Xs = X
ys = y
models = model

# Sanity check: build features for the row at position 1 and predict on it.
x = get_features(df_train.iloc[1])
single_row_frame = pd.DataFrame([x])
y_pred = models.predict(single_row_frame)
y_pred[0]
0.0005335159727416169
提交
# Create the competition environment and walk its test batches, submitting
# one ranked prediction frame per batch.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# The same fitted model is used for every batch, so bind it once.
model = models

for (df_test, options, financials, trades, secondary_prices, df_pred) in iter_test:
    # row_id is "<Date>_<SecuritiesCode>" on both frames. It is not used
    # below; it is kept as an extra column for inspection.
    df_pred['row_id'] = (df_pred['Date'].astype(str) + '_' + df_pred['SecuritiesCode'].astype(str))
    df_test['row_id'] = (df_test['Date'].astype(str) + '_' + df_test['SecuritiesCode'].astype(str))

    x_test = get_features(df_test)
    y_pred = model.predict(x_test)
    # Positional assignment: assumes df_test and df_pred list the same
    # securities in the same order -- TODO confirm against the API.
    df_pred['Target'] = y_pred

    # Rank 0 goes to the highest predicted Target. The rank range follows
    # the batch size instead of a fixed 2000, so a batch with a different
    # number of rows no longer raises a length mismatch.
    df_pred = df_pred.sort_values(by="Target", ascending=False)
    df_pred['Rank'] = np.arange(len(df_pred))
    df_pred = df_pred.sort_values(by="SecuritiesCode", ascending=True)

    # Previously the result of drop() was discarded, leaving 'Target' in place.
    df_pred = df_pred.drop(columns=["Target"])

    submission = df_pred[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)

# NOTE(review): the source lost its indentation; this print is placed after
# the loop (shows the columns of the last batch) -- confirm that was intended.
print(df_pred.columns)