0
点赞
收藏
分享

微信扫一扫

机器学习实战:鲍鱼回归分析(回归方法总结)

萧萧雨潇潇 2022-04-21 阅读 31
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the tab-separated abalone file. header=None tells pandas the file has
# no header row, so the columns get integer labels.
data = pd.read_csv('./data/abalone.txt', sep='\t', header=None)
data.head()

   0      1      2      3       4       5       6      7   8
0  1  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.150  15
1  1  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.070   7
2 -1  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.210   9
3  1  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.155  10
4  0  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.055   7
# Print each column's dtype and non-null count (a quick missing-value check).
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4177 non-null   int64  
 1   1       4177 non-null   float64
 2   2       4177 non-null   float64
 3   3       4177 non-null   float64
 4   4       4177 non-null   float64
 5   5       4177 non-null   float64
 6   6       4177 non-null   float64
 7   7       4177 non-null   float64
 8   8       4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB
# Summary statistics with the 1% and 99% percentiles added; transposed so
# that each data column becomes one row of the report.
data.describe(percentiles=[0.01, 0.99]).T

    count      mean       std     min       1%     50%       99%      max
0  4177.0  0.052909  0.822240 -1.0000 -1.00000  0.0000   1.00000   1.0000
1  4177.0  0.523992  0.120093  0.0750  0.19500  0.5450   0.73500   0.8150
2  4177.0  0.407881  0.099240  0.0550  0.14000  0.4250   0.58000   0.6500
3  4177.0  0.139516  0.041827  0.0000  0.04500  0.1400   0.22000   1.1300
4  4177.0  0.828742  0.490389  0.0020  0.03576  0.7995   2.14442   2.8255
5  4177.0  0.359367  0.221963  0.0010  0.01350  0.3360   0.99778   1.4880
6  4177.0  0.180594  0.109614  0.0005  0.00788  0.1710   0.47610   0.7600
7  4177.0  0.238831  0.139203  0.0015  0.01038  0.2340   0.62000   1.0050
8  4177.0  9.933684  3.224169  1.0000  4.00000  9.0000  20.00000  29.0000
# Count how often each distinct value occurs in the last column.
data[data.columns[-1]].value_counts()
9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
27      2
24      2
1       1
26      1
29      1
2       1
25      1
Name: 8, dtype: int64
# Features are every column except the last; the last column is the target.
# Both are copied so later changes do not write back into `data`.
X, y = data.iloc[:, :-1].copy(), data.iloc[:, -1].copy()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
C:\Anaconda\lib\site-packages\xgboost\compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns and wrap the resulting array back into a
# DataFrame that carries the original column labels.
# NOTE(review): the scaler is fitted on all rows here, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
s_X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
s_X

             0          1          2          3          4          5          6          7
0     1.151980  -0.574558  -0.432149  -1.064424  -0.641898  -0.607685  -0.726212  -0.638217
1     1.151980  -1.448986  -1.439929  -1.183978  -1.230277  -1.170910  -1.205221  -1.212987
2    -1.280690   0.050033   0.122130  -0.107991  -0.309469  -0.463500  -0.356690  -0.207139
3     1.151980  -0.699476  -0.432149  -0.347099  -0.637819  -0.648238  -0.607600  -0.602294
4    -0.064355  -1.615544  -1.540707  -1.423087  -1.272086  -1.215968  -1.287337  -1.320757
...        ...        ...        ...        ...        ...        ...        ...        ...
4172 -1.280690   0.341509   0.424464   0.609334   0.118813   0.047908   0.532900   0.073062
4173  1.151980   0.549706   0.323686  -0.107991   0.279929   0.358808   0.309362   0.155685
4174  1.151980   0.632985   0.676409   1.565767   0.708212   0.748559   0.975413   0.496955
4175 -1.280690   0.841182   0.777187   0.250672   0.541998   0.773341   0.733627   0.410739
4176  1.151980   1.549052   1.482634   1.326659   2.283681   2.640993   1.787449   1.840481

4177 rows × 8 columns

# Split the raw features first, then fit the scaler on the training rows only.
# Scaling the whole data set before splitting lets the test rows influence the
# preprocessing statistics, which leaks information into evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19
)
train_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    train_scaler.transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    train_scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Random forest with 500 trees. It is seeded so that the reported test score
# is reproducible from run to run.
RF = RandomForestRegressor(n_estimators=500, random_state=19)
RF.fit(X_train, y_train)
RF.score(X_test, y_test)
0.5040997932157716
# Linear-regression baseline: fit on the training split, then report the
# score on the test split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
0.46853356891462006
# k-nearest-neighbours regressor with library-default settings.
knn = KNeighborsRegressor().fit(X_train, y_train)
knn.score(X_test, y_test)
0.4416450520111813
# Single decision tree with default (unrestricted) growth settings.
# Seeded so that the reported test score is reproducible; without a seed the
# tree can differ between runs.
dt = DecisionTreeRegressor(random_state=19)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
0.054795460519916794
# AdaBoost over depth-3 regression trees.
# The weak learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases (the old
# name was later removed), while the first positional slot works in both.
ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=1.0,
)
ada.fit(X_train, y_train)
ada.score(X_test, y_test)
0.2854128443586734
# Gradient-boosted trees with library-default hyper-parameters.
gbdt = GradientBoostingRegressor().fit(X_train, y_train)
gbdt.score(X_test, y_test)
0.49482544308232923
# XGBoost regressor with default hyper-parameters; fit on the training split
# and score on the test split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)
C:\Anaconda\lib\site-packages\xgboost\data.py:208: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import MultiIndex, Int64Index





0.44712420076610004

举报

相关推荐

0 条评论