机器学习实战：鲍鱼回归分析（回归方法总结）-CFANZ编程社区

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the tab-separated abalone table. The file has no header row, so
# pandas labels the columns with integers.
data = pd.read_csv('./data/abalone.txt', sep='\t', header=None)
data.head()

	0	1	2	3	4	5	6	7	8
0	1	0.455	0.365	0.095	0.5140	0.2245	0.1010	0.150	15
1	1	0.350	0.265	0.090	0.2255	0.0995	0.0485	0.070	7
2	-1	0.530	0.420	0.135	0.6770	0.2565	0.1415	0.210	9
3	1	0.440	0.365	0.125	0.5160	0.2155	0.1140	0.155	10
4	0	0.330	0.255	0.080	0.2050	0.0895	0.0395	0.055	7

# Summarise the frame: index range, per-column non-null counts and dtypes,
# and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4177 non-null   int64  
 1   1       4177 non-null   float64
 2   2       4177 non-null   float64
 3   3       4177 non-null   float64
 4   4       4177 non-null   float64
 5   5       4177 non-null   float64
 6   6       4177 non-null   float64
 7   7       4177 non-null   float64
 8   8       4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB

# Descriptive statistics with the 1% and 99% percentiles (useful for spotting
# extreme values), transposed so each original column becomes one row.
data.describe(percentiles=[0.01, 0.99]).T

	count	mean	std	min	1%	50%	99%	max
0	4177.0	0.052909	0.822240	-1.0000	-1.00000	0.0000	1.00000	1.0000
1	4177.0	0.523992	0.120093	0.0750	0.19500	0.5450	0.73500	0.8150
2	4177.0	0.407881	0.099240	0.0550	0.14000	0.4250	0.58000	0.6500
3	4177.0	0.139516	0.041827	0.0000	0.04500	0.1400	0.22000	1.1300
4	4177.0	0.828742	0.490389	0.0020	0.03576	0.7995	2.14442	2.8255
5	4177.0	0.359367	0.221963	0.0010	0.01350	0.3360	0.99778	1.4880
6	4177.0	0.180594	0.109614	0.0005	0.00788	0.1710	0.47610	0.7600
7	4177.0	0.238831	0.139203	0.0015	0.01038	0.2340	0.62000	1.0050
8	4177.0	9.933684	3.224169	1.0000	4.00000	9.0000	20.00000	29.0000

# Frequency of each distinct value in the last column (the column used as the
# regression target).
data.iloc[:,-1].value_counts()

9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
27      2
24      2
1       1
26      1
29      1
2       1
25      1
Name: 8, dtype: int64

# The last column is the regression target; every other column is a feature.
# Both are copied so later changes do not write back into `data`.
y = data.iloc[:, -1].copy()
X = data.iloc[:, :-1].copy()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

C:\Anaconda\lib\site-packages\xgboost\compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index

from sklearn.preprocessing import StandardScaler

# Standardise every feature column and wrap the resulting array back into a
# DataFrame so the original column labels are kept.
s_X = pd.DataFrame(
    data=StandardScaler().fit_transform(X),
    columns=X.columns,
)
s_X

	0	1	2	3	4	5	6	7
0	1.151980	-0.574558	-0.432149	-1.064424	-0.641898	-0.607685	-0.726212	-0.638217
1	1.151980	-1.448986	-1.439929	-1.183978	-1.230277	-1.170910	-1.205221	-1.212987
2	-1.280690	0.050033	0.122130	-0.107991	-0.309469	-0.463500	-0.356690	-0.207139
3	1.151980	-0.699476	-0.432149	-0.347099	-0.637819	-0.648238	-0.607600	-0.602294
4	-0.064355	-1.615544	-1.540707	-1.423087	-1.272086	-1.215968	-1.287337	-1.320757
...	...	...	...	...	...	...	...	...
4172	-1.280690	0.341509	0.424464	0.609334	0.118813	0.047908	0.532900	0.073062
4173	1.151980	0.549706	0.323686	-0.107991	0.279929	0.358808	0.309362	0.155685
4174	1.151980	0.632985	0.676409	1.565767	0.708212	0.748559	0.975413	0.496955
4175	-1.280690	0.841182	0.777187	0.250672	0.541998	0.773341	0.733627	0.410739
4176	1.151980	1.549052	1.482634	1.326659	2.283681	2.640993	1.787449	1.840481

4177 rows × 8 columns

# Split the raw features first and fit the scaler on the training rows only,
# so that test-set statistics never leak into the preprocessing step.
# test_size and random_state are unchanged, so the same rows land in each
# partition as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19
)
scaler = StandardScaler().fit(X_train)
# Re-wrap as DataFrames so the row index and column labels are preserved.
X_train = pd.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X.columns
)
X_test = pd.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X.columns
)

# Random forest with 500 trees; `fit` returns the estimator itself, so
# construction and training are chained. The last line reports the test R^2.
RF = RandomForestRegressor(n_estimators=500).fit(X_train, y_train)
RF.score(X_test, y_test)

0.5040997932157716

# Ordinary least-squares baseline, scored by R^2 on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.46853356891462006

# k-nearest-neighbours regressor with default hyper-parameters, scored by
# R^2 on the held-out split.
knn = KNeighborsRegressor().fit(X_train, y_train)
knn.score(X_test, y_test)

0.4416450520111813

# Single regression tree with default (unrestricted) settings, scored by
# R^2 on the held-out split.
dt = DecisionTreeRegressor().fit(X_train, y_train)
dt.score(X_test, y_test)

0.054795460519916794

# AdaBoost over depth-3 regression trees (50 rounds, learning rate 1.0).
# The weak learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4; the positional form works on both sides of that rename.
ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=1.0,
)
ada.fit(X_train, y_train)
# Test-set R^2.
ada.score(X_test, y_test)

0.2854128443586734

# Gradient-boosted trees with default hyper-parameters, scored by R^2 on the
# held-out split.
gbdt = GradientBoostingRegressor().fit(X_train, y_train)
gbdt.score(X_test, y_test)

0.49482544308232923

# XGBoost regressor with default hyper-parameters, scored by R^2 on the
# held-out split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

C:\Anaconda\lib\site-packages\xgboost\data.py:208: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index





0.44712420076610004