目录
1. 决策树的重要参数
2. 数据预处理
3. 建立模型
4. 网格搜索优化
5. 图形化展示
一、决策树的重要参数介绍
图片引用自:https://blog.csdn.net/qq_41577045/article/details/79844709
二、数据预处理
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# Load the Titanic training set (the path is machine-specific).
data = pd.read_csv(r"D:\数据集\泰坦尼克号\train.csv")

# Feature selection: drop columns judged unhelpful or with too many
# missing values.
data = data.drop(columns=["Name", "Ticket", "Cabin"])

# Fill missing ages with the mean age.
data["Age"] = data["Age"].fillna(data["Age"].mean())
data.info()

# Drop the rows that still contain missing values (default axis=0, rows).
data = data.dropna()

# Encode the Embarked strings as integer codes, numbered in order of first
# appearance. `labels` is kept so a code can be mapped back to its string.
labels = data["Embarked"].unique().tolist()
embarked_codes = {label: code for code, label in enumerate(labels)}
data["Embarked"] = data["Embarked"].map(embarked_codes)

# Encode Sex as 1 for "male" and 0 otherwise.
# Plain column assignment replaces the column (and its dtype); assigning
# through `data.loc[:, "Sex"]` writes into the existing object-dtype column
# on recent pandas versions and would leave the dtype as `object`.
data["Sex"] = (data["Sex"] == "male").astype("int")
# Split into features (every column except the label) and the label column.
# `y` stays a one-column DataFrame, as in the original code.
x = data.drop(columns="Survived")
y = data[["Survived"]]
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is seeded with the same
# value the classifiers use, so scores and the grid-search result are
# reproducible from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=25
)

# The split shuffles rows, so give each part a fresh 0..n-1 index.
for part in (xtrain, xtest, ytrain, ytest):
    part.index = range(part.shape[0])
三、建立模型
# Baseline: fit a tree with default hyperparameters on the training split
# and measure accuracy on the held-out split.
clf = DecisionTreeClassifier(random_state=25).fit(xtrain, ytrain)
score = clf.score(xtest, ytest)

# Cross-validation: mean accuracy over 10 folds of the full data set.
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=25)
fold_scores = cross_val_score(clf, x, y, cv=10)
score = fold_scores.mean()
# Learning curve: training accuracy versus 10-fold cross-validated accuracy
# for max_depth = 1..10, using the entropy criterion.
tr = []
te = []
depths = range(1, 11)
for depth in depths:
    clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=depth,
        random_state=25,
    )
    clf = clf.fit(xtrain, ytrain)
    # Accuracy on the data the tree was fitted on.
    score_tr = clf.score(xtrain, ytrain)
    # Mean cross-validated accuracy on the full data set.
    score_te = cross_val_score(clf, x, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)

# Best cross-validated score over all depths.
print(max(te))

plt.plot(depths, tr, color="red", label="train")
plt.plot(depths, te, color="blue", label="test")
plt.xticks(depths)
plt.legend()
plt.show()
四、网格搜索优化
剪枝与调参:使用网格搜索(GridSearchCV)同时调整多个参数。
import numpy as np
# NOTE(review): this array is not referenced by the grid below, which builds
# its own linspace with 20 points. Kept because later code may use the name.
gini_threholds = np.linspace(0, 0.5, 50)

# Hyperparameter grid: every combination is evaluated by GridSearchCV.
parameters = {
    "criterion": ("gini", "entropy"),
    "splitter": ("best", "random"),
    "max_depth": list(range(1, 10)),
    "min_samples_leaf": list(range(1, 50, 5)),
    "min_impurity_decrease": np.linspace(0, 0.5, 20),
}

clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(estimator=clf, param_grid=parameters, cv=10)
GS = GS.fit(xtrain, ytrain)

# Best parameter combination found by the search.
GS.best_params_

# Best mean cross-validated score.
GS.best_score_
# Final model with fixed hyperparameters.
# NOTE(review): these values are presumably copied from an earlier
# GS.best_params_ output; confirm they still match the current search.
clf = DecisionTreeClassifier(
    random_state=25,
    criterion="gini",
    max_depth=3,
    min_impurity_decrease=0.0,
    min_samples_leaf=41,
    splitter="random",
)
clf = clf.fit(xtrain, ytrain)
score_te = clf.score(xtest, ytest)
score_tr = clf.score(xtrain, ytrain)

# Feature importances, paired with the feature column names.
# This is read only after `fit`: the estimator handed to GridSearchCV is
# never fitted itself (the search fits clones), so reading
# `feature_importances_` from it before this point raises.
[*zip(x, clf.feature_importances_)]

# Write the test-set predictions to disk.
y_predict = clf.predict(xtest)
pd.Series(y_predict).to_csv(r"D:\数据集\决策树-泰坦尼克号代码\y_predict.csv")
五、图形化展示
# Bare expression: shows the feature columns when it ends a notebook cell.
x.columns
from sklearn import tree
import graphviz

# Take the feature names from the training frame so they cannot drift out
# of sync with the columns the tree was fitted on.
feature_name = list(x.columns)

# class_names must follow ascending class order: Survived == 0 is
# "死去" (died) and Survived == 1 is "活着" (survived).
dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=["死去", "活着"],
    filled=True,
    rounded=True,
    out_file=None,
)
graph = graphviz.Source(dot_data)
graph
总结说明
本文代码来源:https://zhuanlan.zhihu.com/p/139684525
仅供学习参考使用。