# 共享单车项目 (bike-sharing rental demand project)
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.linear_model import Lasso,Ridge,LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,Binarizer,OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import calendar
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
def get_month(mdt):
    """Return the month name of a timestamp.

    Args:
        mdt: An ISO-format string such as ``'2011-01-01 00:00:00'``, or an
            already-parsed ``datetime.datetime`` (subclasses such as
            ``pandas.Timestamp`` are accepted as well).

    Returns:
        The entry of ``calendar.month_name`` for the timestamp's month,
        e.g. ``'January'`` under the default locale.

    Raises:
        ValueError: If a string argument is not valid ISO format.
    """
    # Already-parsed values are used as-is so the helper also works on
    # columns loaded with ``parse_dates``.
    if isinstance(mdt, datetime.datetime):
        moment = mdt
    else:
        moment = datetime.datetime.fromisoformat(mdt)
    return calendar.month_name[moment.month]
def get_day(ddt):
    """Return the day of the month of a timestamp.

    Args:
        ddt: An ISO-format string such as ``'2011-01-19 05:00:00'``, or an
            already-parsed ``datetime.datetime`` (subclasses such as
            ``pandas.Timestamp`` are accepted as well).

    Returns:
        The day of the month as an ``int`` (1-31).

    Raises:
        ValueError: If a string argument is not valid ISO format.
    """
    if isinstance(ddt, datetime.datetime):
        moment = ddt
    else:
        moment = datetime.datetime.fromisoformat(ddt)
    return moment.day
def get_weekday(wdt):
    """Return the weekday name of a timestamp.

    Args:
        wdt: An ISO-format string such as ``'2011-01-01 00:00:00'``, or an
            already-parsed ``datetime.datetime`` (subclasses such as
            ``pandas.Timestamp`` are accepted as well).

    Returns:
        The entry of ``calendar.day_name`` for the timestamp's weekday,
        e.g. ``'Saturday'`` under the default locale.

    Raises:
        ValueError: If a string argument is not valid ISO format.
    """
    if isinstance(wdt, datetime.datetime):
        moment = wdt
    else:
        moment = datetime.datetime.fromisoformat(wdt)
    # weekday() counts Monday as 0, matching the order of calendar.day_name.
    return calendar.day_name[moment.weekday()]
def get_hour(hdt):
    """Return the hour of day of a timestamp.

    Args:
        hdt: An ISO-format string such as ``'2011-01-19 05:00:00'``, or an
            already-parsed ``datetime.datetime`` (subclasses such as
            ``pandas.Timestamp`` are accepted as well).

    Returns:
        The hour as an ``int`` (0-23).

    Raises:
        ValueError: If a string argument is not valid ISO format.
    """
    if isinstance(hdt, datetime.datetime):
        moment = hdt
    else:
        moment = datetime.datetime.fromisoformat(hdt)
    return moment.hour
def change_to_int(x):
    """Convert ``x`` to an ``int`` using the built-in ``int`` constructor.

    For floats this truncates toward zero (``9.84 -> 9``, ``-1.7 -> -1``);
    numeric strings such as ``'12'`` are parsed.

    Args:
        x: Any value accepted by ``int()``.

    Returns:
        The converted integer.

    Raises:
        ValueError: If ``x`` cannot be interpreted as an integer
            (for example a NaN float or a non-numeric string).
    """
    return int(x)
def _add_time_features(data):
    """Add calendar columns and integer-truncated weather columns to ``data`` in place."""
    data['month_section'] = data['datetime'].apply(func=get_month)
    data['day_section'] = data['datetime'].apply(func=get_day)
    data['weekday_section'] = data['datetime'].apply(func=get_weekday)
    data['hour_section'] = data['datetime'].apply(func=get_hour)
    for source in ('temp', 'atemp', 'windspeed'):
        data[source + '_int'] = data[source].apply(func=change_to_int)


def _drop_count_outliers(data, n_std=2):
    """Return a copy of the rows whose ``count`` is within ``n_std`` standard deviations of the mean."""
    counts = data['count']
    # Use the absolute deviation so the filter is symmetric around the mean;
    # comparing the signed difference would only ever reject high values.
    within = (counts - counts.mean()).abs() <= n_std * counts.std()
    # copy() gives an independent frame: columns are added to it later and
    # that must not operate on a slice of ``data``.
    return data.loc[within, :].copy()


def _plot_season_overview(data, clean):
    """Show a 2x2 figure: raw/filtered box plots, correlation heat map, seasonal mean bar plot."""
    _, axes = plt.subplots(nrows=2, ncols=2)
    sns.boxplot(data=data, x='season', y='count', ax=axes[0, 0])
    sns.boxplot(data=clean, x='season', y='count', ax=axes[0, 1])

    heat_columns = [
        "season", "month_section", "day_section",
        "weekday_section", "hour_section", "count",
    ]
    # month_section / weekday_section hold names (strings); corr() on a frame
    # with non-numeric columns raises on pandas >= 2.0, so keep numeric ones.
    correlations = clean.loc[:, heat_columns].select_dtypes(include='number').corr()
    sns.heatmap(data=correlations, annot=True, ax=axes[1, 0])

    # Average only the plotted column; a frame-wide mean would also hit the
    # string columns and fail on pandas >= 2.0.
    season_mean = clean.groupby(by='season', as_index=False)['count'].mean()
    sns.barplot(data=season_mean, x='season', y='count', ax=axes[1, 1])
    plt.show()


def _plot_hourly_profiles(clean):
    """Show mean ``count`` per hour, split by weekday and by month."""
    _, axes = plt.subplots(nrows=2, ncols=2)
    for hue_column, axis in (('weekday_section', axes[0, 0]),
                             ('month_section', axes[0, 1])):
        hourly_mean = clean.groupby(
            by=['hour_section', hue_column], as_index=False)['count'].mean()
        sns.pointplot(
            data=hourly_mean,
            x='hour_section',
            y='count',
            hue=hue_column,
            ax=axis,
        )
    plt.show()


def _add_hour_interactions(clean):
    """Add ``<weekday|month|season>_<hour>`` string columns to ``clean`` in place."""
    hour_text = clean['hour_section'].astype(str)
    clean['weekday_hour'] = clean['weekday_section'] + '_' + hour_text
    clean['month_hour'] = clean['month_section'].astype(str) + '_' + hour_text
    clean['season_hour'] = clean['season'].astype(str) + '_' + hour_text


def _fit_and_score(x, y):
    """Split, scale, reduce with PCA, tune a Ridge model and print its test R^2."""
    trainx, testx, trainy, testy = train_test_split(
        x, y, train_size=0.7, shuffle=True)

    # Fit the preprocessing on the training split only and merely apply it to
    # the test split, so no test-set statistics leak into the model.
    scaler = StandardScaler()
    trainx = scaler.fit_transform(trainx)
    testx = scaler.transform(testx)
    reducer = PCA(n_components=4)
    trainx = reducer.fit_transform(trainx)
    testx = reducer.transform(testx)

    params = {'alpha': list(range(5, 10))}
    search = GridSearchCV(estimator=Ridge(), param_grid=params, cv=5)
    # Tune on the training split only; the test split stays unseen.
    search.fit(trainx, trainy)
    print(search.best_params_)

    model = Ridge(alpha=search.best_params_['alpha'])
    model.fit(trainx, trainy)
    print(model.score(testx, testy))
    predicted = model.predict(testx)
    print(r2_score(testy, predicted))


def main():
    """Run the bike-sharing analysis end to end.

    Reads ``train.csv`` from the working directory, derives time features,
    removes ``count`` outliers, shows two exploratory figures, one-hot encodes
    the categorical columns and finally fits and scores a Ridge regression on
    four PCA components. Progress information is printed to stdout.
    """
    data = pd.read_csv('train.csv')
    print(data)

    _add_time_features(data)
    clean = _drop_count_outliers(data)

    _plot_season_overview(data, clean)
    _plot_hourly_profiles(clean)

    _add_hour_interactions(clean)
    print(clean.columns)

    # ------ drop redundant fields ------
    clean = clean.drop(columns=['datetime', 'temp', 'atemp', 'windspeed'])
    print(clean.columns)
    print(len(clean.columns))

    # month_section / weekday_section are string columns too; they have to be
    # encoded together with the rest or StandardScaler cannot convert them.
    categorical = [
        'season_hour', 'month_hour', 'weekday_hour', 'season', 'weather',
        'month_section', 'weekday_section',
    ]
    # get_dummies returns a new frame, so the result must be kept.
    clean = pd.get_dummies(data=clean, columns=categorical, dtype=float)
    print(clean.columns)
    print(len(clean.columns))

    # NOTE(review): presumably train.csv is the Kaggle bike-sharing set, whose
    # 'casual' and 'registered' columns add up to 'count'. If so they leak the
    # target and should be dropped from the features - confirm.
    y = clean.pop('count')
    x = clean
    print(x.shape)
    print(y.shape)

    _fit_and_score(x, y)
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()