import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the tab-separated abalone table. header=None tells pandas not to treat
# the first line as column names, so the columns get integer labels from 0.
data = pd.read_csv('./data/abalone.txt', sep='\t', header=None)
# Preview the first five rows.
data.head(5)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15 |
1 | 1 | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7 |
2 | -1 | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9 |
3 | 1 | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10 |
4 | 0 | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7 |
# Print the column dtypes, non-null counts and memory footprint of the table.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 4177 non-null int64
1 1 4177 non-null float64
2 2 4177 non-null float64
3 3 4177 non-null float64
4 4 4177 non-null float64
5 5 4177 non-null float64
6 6 4177 non-null float64
7 7 4177 non-null float64
8 8 4177 non-null int64
dtypes: float64(7), int64(2)
memory usage: 293.8 KB
# Summary statistics with the 1st and 99th percentiles requested explicitly;
# transposed so that each original column becomes one row of the report.
data.describe(percentiles=[0.01, 0.99]).transpose()
 | count | mean | std | min | 1% | 50% | 99% | max |
---|---|---|---|---|---|---|---|---|
0 | 4177.0 | 0.052909 | 0.822240 | -1.0000 | -1.00000 | 0.0000 | 1.00000 | 1.0000 |
1 | 4177.0 | 0.523992 | 0.120093 | 0.0750 | 0.19500 | 0.5450 | 0.73500 | 0.8150 |
2 | 4177.0 | 0.407881 | 0.099240 | 0.0550 | 0.14000 | 0.4250 | 0.58000 | 0.6500 |
3 | 4177.0 | 0.139516 | 0.041827 | 0.0000 | 0.04500 | 0.1400 | 0.22000 | 1.1300 |
4 | 4177.0 | 0.828742 | 0.490389 | 0.0020 | 0.03576 | 0.7995 | 2.14442 | 2.8255 |
5 | 4177.0 | 0.359367 | 0.221963 | 0.0010 | 0.01350 | 0.3360 | 0.99778 | 1.4880 |
6 | 4177.0 | 0.180594 | 0.109614 | 0.0005 | 0.00788 | 0.1710 | 0.47610 | 0.7600 |
7 | 4177.0 | 0.238831 | 0.139203 | 0.0015 | 0.01038 | 0.2340 | 0.62000 | 1.0050 |
8 | 4177.0 | 9.933684 | 3.224169 | 1.0000 | 4.00000 | 9.0000 | 20.00000 | 29.0000 |
# Frequency of each distinct value in the last column, most frequent first.
data[data.columns[-1]].value_counts()
9 689
10 634
8 568
11 487
7 391
12 267
6 259
13 203
14 126
5 115
15 103
16 67
17 58
4 57
18 42
19 32
20 26
3 15
21 14
23 9
22 6
27 2
24 2
1 1
26 1
29 1
2 1
25 1
Name: 8, dtype: int64
# Features: every column except the last one. Target: the last column.
# Both are new objects, separate from `data`.
X = data.drop(columns=data.columns[-1])
y = data[data.columns[-1]].copy()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
C:\Anaconda\lib\site-packages\xgboost\compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
from sklearn.preprocessing import StandardScaler
# Standardise every feature column (zero mean, unit variance) and wrap the
# resulting array back into a DataFrame that keeps X's column labels.
feature_scaler = StandardScaler()
s_X = pd.DataFrame(feature_scaler.fit_transform(X), columns=X.columns)
s_X
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|---|---|---|---|---|---|---|---|
0 | 1.151980 | -0.574558 | -0.432149 | -1.064424 | -0.641898 | -0.607685 | -0.726212 | -0.638217 |
1 | 1.151980 | -1.448986 | -1.439929 | -1.183978 | -1.230277 | -1.170910 | -1.205221 | -1.212987 |
2 | -1.280690 | 0.050033 | 0.122130 | -0.107991 | -0.309469 | -0.463500 | -0.356690 | -0.207139 |
3 | 1.151980 | -0.699476 | -0.432149 | -0.347099 | -0.637819 | -0.648238 | -0.607600 | -0.602294 |
4 | -0.064355 | -1.615544 | -1.540707 | -1.423087 | -1.272086 | -1.215968 | -1.287337 | -1.320757 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
4172 | -1.280690 | 0.341509 | 0.424464 | 0.609334 | 0.118813 | 0.047908 | 0.532900 | 0.073062 |
4173 | 1.151980 | 0.549706 | 0.323686 | -0.107991 | 0.279929 | 0.358808 | 0.309362 | 0.155685 |
4174 | 1.151980 | 0.632985 | 0.676409 | 1.565767 | 0.708212 | 0.748559 | 0.975413 | 0.496955 |
4175 | -1.280690 | 0.841182 | 0.777187 | 0.250672 | 0.541998 | 0.773341 | 0.733627 | 0.410739 |
4176 | 1.151980 | 1.549052 | 1.482634 | 1.326659 | 2.283681 | 2.640993 | 1.787449 | 1.840481 |
4177 rows × 8 columns
# Split the raw features first and fit the scaler on the training rows only.
# Fitting it on the full table (as was done for `s_X`) lets test-set means and
# variances leak into preprocessing, which can bias the hold-out scores.
# The seed and the number of rows are unchanged, so the train/test partition
# is the same one that splitting `s_X` would produce.
from sklearn.preprocessing import StandardScaler

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19
)
split_scaler = StandardScaler().fit(X_train_raw)
# Re-wrap as DataFrames so the models still receive labelled columns and the
# original row index.
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)
# Random forest baseline with 500 trees. random_state pins the bootstrap
# samples and feature draws so the reported score can be reproduced; 19 is the
# same seed used for the train/test split.
RF = RandomForestRegressor(n_estimators=500, random_state=19)
RF.fit(X_train, y_train)
# R^2 on the held-out test set.
RF.score(X_test, y_test)
0.5040997932157716
# Ordinary least-squares baseline; fit() returns the estimator itself, so the
# construction and training are chained. score() reports R^2 on the test set.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)
0.46853356891462006
# k-nearest-neighbours regressor with library-default settings; fit() returns
# the estimator itself. score() reports R^2 on the test set.
knn = KNeighborsRegressor().fit(X_train, y_train)
knn.score(X_test, y_test)
0.4416450520111813
# Single unpruned regression tree. The tree builder permutes features at each
# split, so random_state is fixed to make the score repeatable between runs;
# 19 is the same seed used for the train/test split.
dt = DecisionTreeRegressor(random_state=19)
dt.fit(X_train, y_train)
# R^2 on the held-out test set.
dt.score(X_test, y_test)
0.054795460519916794
# AdaBoost over depth-3 regression trees.
# The weak learner is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot has stayed the same, so this form runs on both.
# random_state is set on the ensemble and on the weak learner so the score is
# repeatable; 19 is the same seed used for the train/test split.
ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3, random_state=19),
    n_estimators=50,
    learning_rate=1.0,
    random_state=19,
)
ada.fit(X_train, y_train)
# R^2 on the held-out test set.
ada.score(X_test, y_test)
0.2854128443586734
# Gradient-boosted trees with library-default hyper-parameters. random_state
# is fixed so the individual trees are built the same way on every run; 19 is
# the same seed used for the train/test split.
gbdt = GradientBoostingRegressor(random_state=19)
gbdt.fit(X_train, y_train)
# R^2 on the held-out test set.
gbdt.score(X_test, y_test)
0.49482544308232923
# XGBoost regressor with library-default hyper-parameters: build it, train it
# on the training split, then report its score on the held-out test set.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)
C:\Anaconda\lib\site-packages\xgboost\data.py:208: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
0.44712420076610004