Kaggle Titanic (Part 1)
Problem:
On April 15, 1912, the Titanic sank. There were not enough lifeboats for everyone on board, and 1502 of the 2224 passengers and crew died. While surviving involved some luck, some groups of people were clearly more likely to survive than others.
Build a predictive model that answers the question "what sorts of people were more likely to survive?" using passenger data (name, age, sex, socio-economic class, etc.).
Available datasets:
- Training set (train.csv)
- Test set (test.csv)
Solution 1:
Score: 0.78468
Leaderboard: 1700/14296 (top 11.89%)
The full solution follows.
I. Data Overview
pandas is a tool built on top of NumPy, created for data analysis tasks; it does most of the data handling below.
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Training data
data_train = pd.read_csv("/kaggle/input/titanic/train.csv")
# Test data
data_test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Check each column's record count and missing values
print(data_train.info())
print(data_test.info())
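For a more compact view of just the missing counts, isnull().sum() works too (an optional extra check I'm adding; it is not part of the original notebook):

# Optional: per-column missing-value counts for both datasets
print(data_train.isnull().sum())
print(data_test.isnull().sum())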
II. Feature Engineering
The 7 features planned for now (a quick sanity check on them follows this list):
- Numerical: Pclass (passenger class), Age, SibSp (number of siblings/spouses aboard), Parch (number of parents/children aboard), Fare (ticket price)
- Categorical: Sex, Embarked (port of embarkation)
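As a quick sanity check on these features (an optional look I'm adding, not part of the original pipeline), survival rates by Sex and by Pclass can be eyeballed with a groupby:

# Optional: survival rate by Sex and by Pclass on the training data
print(data_train.groupby('Sex')['Survived'].mean())
print(data_train.groupby('Pclass')['Survived'].mean())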
1. Handling missing data
1) Impute the missing Age values with a random forest; code below:
from sklearn.ensemble import RandomForestRegressor

def set_missing_age(df):
    # Take the numerical features and feed them to a random forest
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Split the passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # Target y: the known ages
    y = known_age[:, 0]
    # Feature matrix x: the remaining columns
    x = known_age[:, 1:]
    # Fit a random forest regressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)
    # Predict the missing ages with the trained model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # Write the predictions back into the original DataFrame
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df

# Fill in the missing Age values
data_train = set_missing_age(data_train)
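For comparison, a much simpler baseline, my own assumption rather than anything this solution uses, is to fill missing ages with the training median; the random-forest imputation above tends to preserve more of the Age structure:

# Hypothetical simpler baseline (not used in this solution):
# impute missing ages with the training-set median
median_age = data_train['Age'].median()
data_train['Age'] = data_train['Age'].fillna(median_age)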
2) Only two rows are missing Embarked, so they are simply dropped:
# Drop the two rows with a missing port of embarkation
data = data_train.drop(data_train[data_train.Embarked.isnull()].index)
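If dropping rows were not acceptable (on the test set, for instance, every PassengerId needs a prediction), a common alternative, sketched here as an assumption rather than taken from the original, is to fill with the most frequent port:

# Hypothetical alternative (not used here): fill missing Embarked
# with the most frequent port instead of dropping the rows
most_common_port = data_train['Embarked'].mode()[0]
data_train['Embarked'] = data_train['Embarked'].fillna(most_common_port)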
2. Dummy-encode Embarked, Sex, and Pclass; standardize the numerical features Age and Fare
import sklearn.preprocessing as preprocessing

# Dummy-encode the categorical features
def set_numeralization(data):
    # One-hot encode the categorical attributes Embarked, Sex, and Pclass
    dummies_Embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(data['Pclass'], prefix='Pclass')
    # Append the new dummy columns
    df = pd.concat([data, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    # Drop the original columns
    df.drop(['Pclass', 'Sex', 'Embarked'], axis=1, inplace=True)
    return df

# Standardize the numerical features
def set_normalization(df):
    # Scale Age and Fare to zero mean and unit variance
    age_scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = age_scaler.fit_transform(df['Age'].values.reshape(-1, 1))
    fare_scaler = preprocessing.StandardScaler()
    df['Fare_scaled'] = fare_scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
    return df

# Apply the feature engineering to the training data
data = set_numeralization(data)
data = set_normalization(data)
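One caveat worth flagging (my observation, not a point from the original write-up): set_normalization refits its scaler on whichever DataFrame it receives, so the training and test sets end up standardized with different means and variances. A stricter pattern, sketched below as an assumed refinement, fits on the training data once and reuses those statistics:

# Sketch of a stricter scaling pattern (assumed refinement, not the original approach):
# fit the scaler on the training data, then reuse it for the test data
from sklearn.preprocessing import StandardScaler
age_scaler = StandardScaler().fit(data['Age'].values.reshape(-1, 1))
data['Age_scaled'] = age_scaler.transform(data['Age'].values.reshape(-1, 1))
# Later, once the test-set ages are imputed:
# data_test['Age_scaled'] = age_scaler.transform(data_test['Age'].values.reshape(-1, 1))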
Feature engineering for the test set, code below:
# The test set has one missing Fare value; fill it with the median
data_test['Fare'].fillna(data_test['Fare'].median(), inplace=True)
data_test = set_missing_age(data_test)
# Embarked is complete in the test set, so this drop is a safety no-op
data_test = data_test.drop(data_test[data_test.Embarked.isnull()].index)
data_test = set_numeralization(data_test)
data_test = set_normalization(data_test)
3. Verify that the training and test sets are now free of missing values
print(data_test.info())
print(data.info())
III. Model Building and Prediction
1) Predict with a support vector machine; score 0.78229
from sklearn import svm

y = data["Survived"]
features = ["Pclass_1", "Pclass_2", "Pclass_3", "Sex_male", "Sex_female", "SibSp",
            "Parch", "Age", "Fare", "Embarked_C", "Embarked_Q", "Embarked_S"]
# The columns are already numeric/dummy-encoded, so get_dummies is a no-op here
X = pd.get_dummies(data[features])
X_test = pd.get_dummies(data_test[features])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training features, then transform both sets with it
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
# Wrap the arrays back into DataFrames to inspect the standardized data
X_scaled = pd.DataFrame(X_scaled, columns=features)
# X_scaled.head()
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features)
# X_test_scaled.head()

model = svm.SVC(C=3, kernel='rbf', gamma=0.1)
model.fit(X_scaled, y)
predictions = model.predict(X_test_scaled)
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
2) Re-predict after tuning hyperparameters with a grid search; score 0.78468
# Tune the hyperparameters C, kernel, and gamma with a grid search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

model = SVC()
C = [1, 2, 5, 10, 20, 50]
kernel = ['rbf', 'sigmoid']
# gamma = [0.001, 0.01, 0.1, 1, 10, 100]
gamma = [0.0195, 0.039, 0.0783, 0.156, 0.313, 0.625, 1.25]
Hyperparameter = dict(C=C, kernel=kernel, gamma=gamma)  # pack the hyperparameter ranges into a dict
# Grid search; best CV score 0.8302
grs = GridSearchCV(model, param_grid=Hyperparameter, cv=10, n_jobs=1, return_train_score=False)
grs.fit(np.array(X_scaled), np.array(y))
# Print the best hyperparameters
print("Best parameters " + str(grs.best_params_))
# print(f'Best parameters: {grs.best_params_}')
# print(f'Best score: {grs.best_score_}')
gpd = pd.DataFrame(grs.cv_results_)
print("Estimated accuracy of this model for unseen data: {0:1.4f}".format(gpd['mean_test_score'][grs.best_index_]))
# Predict with the tuned SVM
predictions = grs.predict(X_test_scaled)
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
3) Re-predict after tuning hyperparameters with a random search
# Random search: with the same parameter grid as the grid search, the CV score
# is unstable; roughly one run in ten reaches a CV accuracy of 0.8302
# Random search can also be used to explore wider parameter ranges
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

model = SVC()
C = [1, 2, 5, 10, 20, 50]
kernel = ['rbf', 'sigmoid']
# gamma = [0.001, 0.01, 0.1, 1, 10, 100]
gamma = [0.0195, 0.039, 0.0783, 0.156, 0.313, 0.625, 1.25]
Hyperparameter = dict(C=C, kernel=kernel, gamma=gamma)  # pack the hyperparameter ranges into a dict
# Hyperparameter = {"C": stats.uniform(500, 1500), "gamma": stats.uniform(0, 1), 'kernel': ('linear', 'rbf')}
random = RandomizedSearchCV(estimator=model, param_distributions=Hyperparameter, cv=10, random_state=42, n_jobs=-1)
random.fit(np.array(X_scaled), np.array(y))
print(f'Best parameters: {random.best_params_}')
print(f'Best score: {random.best_score_}')
predictions = random.predict(X_test_scaled)
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
4) Hyperparameter tuning with a genetic algorithm (GA)
GA results are also stochastic. After several runs, the run with the highest held-out accuracy (0.848314606741573) was selected for prediction, for a final score of 0.78468.
# Tune hyperparameters with a genetic algorithm (GA) via TPOT
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Restrict TPOT's search space to an SVC with the same parameter grid as before
tpot_config = {
    'sklearn.svm.SVC': {
        'C': [1, 2, 5, 10, 20, 50],
        'kernel': ['rbf', 'sigmoid'],
        'gamma': [0.0195, 0.039, 0.0783, 0.156, 0.313, 0.625, 1.25]
    }
}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
# Without restricting the search space, TPOT ends up picking a random forest classifier
# tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1)
# Restrict the search to SVM
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1, config_dict=tpot_config)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Repeat the optimization until the following result appears:
# Best pipeline: SVC(SVC(input_matrix, C=20, gamma=0.625, kernel=sigmoid), C=2, gamma=0.039, kernel=rbf)
# 0.848314606741573
predictions = tpot.predict(X_test_scaled)
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
5) Hyperparameter tuning with Bayesian optimization
# Tune the hyperparameters with Bayesian optimization
# (BayesSearchCV lives in scikit-optimize, not in sklearn)
from skopt import BayesSearchCV
from sklearn.svm import SVC

model = SVC()
C = [1, 2, 5, 10, 20, 50]
kernel = ['rbf', 'sigmoid']
# gamma = [0.001, 0.01, 0.1, 1, 10, 100]
gamma = [0.0195, 0.039, 0.0783, 0.156, 0.313, 0.625, 1.25]
Hyperparameter = dict(C=C, kernel=kernel, gamma=gamma)
# Hyperparameter = {"C": Real(1e-6, 1e+6, prior='log-uniform'), "gamma": Real(1e-6, 1e+1, prior='log-uniform'), "kernel": Categorical(['linear', 'rbf'])}
bayesian = BayesSearchCV(estimator=SVC(), search_spaces=Hyperparameter, cv=10, random_state=42, n_jobs=-1)
bayesian.fit(np.array(X_scaled), np.array(y))
print(f'Best parameters: {bayesian.best_params_}')
print(f'Best score: {bayesian.best_score_}')
predictions = bayesian.predict(X_test_scaled)
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
Summary
Starting from feature engineering, a direct SVM prediction scored 0.78229 and hyperparameter tuning raised it to 0.78468, an improvement of roughly 0.2 percentage points. Only a single model was explored here; the next post will explore several other models.