0
点赞
收藏
分享

微信扫一扫

商品销售预测

得一道人 2022-04-29 阅读 71
python
import numpy as np
import matplotlib.pylab as plt
import pandas as pd 
import os 

导入数据

def load_csv_tables(data_dir):
    """Read every CSV file found under *data_dir* in a reproducible order.

    Args:
        data_dir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A ``(names, frames)`` pair of parallel lists: the CSV file names and
        the DataFrame ``pd.read_csv`` produced for each of them.
    """
    names = []
    frames = []
    for root, subdirs, files in os.walk(data_dir):
        # os.walk makes no ordering promise. Sorting the sub-directories in
        # place and the file names keeps the list positions stable, which
        # matters because later cells index filename_data by position.
        subdirs.sort()
        for name in sorted(files):
            # Anything that is not a CSV would either fail to parse or shift
            # the positions of the real tables.
            if not name.lower().endswith('.csv'):
                continue
            names.append(name)
            frames.append(pd.read_csv(os.path.join(root, name)))
    return names, frames


filename_, filename_data = load_csv_tables(r'D:\机器学习\kaggle预测\商店销售')
list(filename_)
['holidays_events.csv',
 'oil.csv',
 'sample_submission.csv',
 'stores.csv',
 'test.csv',
 'train.csv',
 'transactions.csv']
# Merge the individual tables into one training frame.
# The tables are looked up by file name: positional indexes into
# filename_data pick the wrong table as soon as the directory listing
# order (or the set of files) changes.
tables = dict(zip(filename_, filename_data))
df_train1 = tables['train.csv'].merge(tables['holidays_events.csv'], on='date', how='left')
# NOTE(review): if holidays_events.csv holds several rows for one date, the
# left join above duplicates the matching train rows - confirm this is intended.
df_train1 = df_train1.merge(tables['oil.csv'], on='date', how='left')
df_train1 = df_train1.merge(tables['stores.csv'], on='store_nbr', how='left')
df_train1 = df_train1.merge(tables['transactions.csv'], on=['date', 'store_nbr'], how='left')
# Both the holiday table and the store table carry a "type" column, so the
# merges suffix them; give the two columns descriptive names.
df_train1 = df_train1.rename(columns={'type_x': 'holiday_type', 'type_y': 'store_type'})
df_train1.head()

id date store_nbr family sales onpromotion holiday_type locale locale_name description transferred dcoilwtico city state store_type cluster transactions
0 0 2013-01-01 1 AUTOMOTIVE 0.000 0 Holiday National Ecuador Primer dia del ano False NaN Quito Pichincha D 13 NaN
1 1 2013-01-01 1 BABY CARE 0.000 0 Holiday National Ecuador Primer dia del ano False NaN Quito Pichincha D 13 NaN
2 2 2013-01-01 1 BEAUTY 0.000 0 Holiday National Ecuador Primer dia del ano False NaN Quito Pichincha D 13 NaN
3 3 2013-01-01 1 BEVERAGES 0.000 0 Holiday National Ecuador Primer dia del ano False NaN Quito Pichincha D 13 NaN
4 4 2013-01-01 1 BOOKS 0.000 0 Holiday National Ecuador Primer dia del ano False NaN Quito Pichincha D 13 NaN
# Parse the date column, then derive calendar features from it.
df_train1['date'] = pd.to_datetime(df_train1['date'])
date_parts = df_train1['date'].dt
df_train1['year'] = date_parts.year
df_train1['month'] = date_parts.month
df_train1['week'] = date_parts.isocalendar().week  # ISO week number
df_train1['quarter'] = date_parts.quarter
df_train1['day_of_week'] = date_parts.day_name()
df_train1.head(5)

id date store_nbr family sales onpromotion holiday_type locale locale_name description ... city state store_type cluster transactions year month week quarter day_of_week
0 0 2013-01-01 1 AUTOMOTIVE 0.0 0 Holiday National Ecuador Primer dia del ano ... Quito Pichincha D 13 NaN 2013 1 1 1 Tuesday
1 1 2013-01-01 1 BABY CARE 0.0 0 Holiday National Ecuador Primer dia del ano ... Quito Pichincha D 13 NaN 2013 1 1 1 Tuesday
2 2 2013-01-01 1 BEAUTY 0.0 0 Holiday National Ecuador Primer dia del ano ... Quito Pichincha D 13 NaN 2013 1 1 1 Tuesday
3 3 2013-01-01 1 BEVERAGES 0.0 0 Holiday National Ecuador Primer dia del ano ... Quito Pichincha D 13 NaN 2013 1 1 1 Tuesday
4 4 2013-01-01 1 BOOKS 0.0 0 Holiday National Ecuador Primer dia del ano ... Quito Pichincha D 13 NaN 2013 1 1 1 Tuesday

5 rows × 22 columns

store_type、family、cluster汇总

# Mean sales per store_type, largest first.
store_type_means = df_train1.groupby('store_type')['sales'].mean().reset_index()
df_st_sa = store_type_means.sort_values(by='sales', ascending=False)
df_st_sa

store_type sales
0 A 708.378165
3 D 352.084510
1 B 328.275233
4 E 270.285490
2 C 197.790647
# Mean sales per product family; keep the ten best-selling families.
family_means = df_train1.groupby('family')['sales'].mean().reset_index()
df_fa_sa = family_means.sort_values(by='sales', ascending=False).head(10)
df_fa_sa

family sales
12 GROCERY I 3790.432797
3 BEVERAGES 2394.912701
30 PRODUCE 1355.373698
7 CLEANING 1074.171518
8 DAIRY 711.175991
5 BREAD/BAKERY 464.150612
28 POULTRY 351.078816
24 MEATS 341.965905
25 PERSONAL CARE 271.192381
9 DELI 265.629746
# Mean sales per store cluster (rows stay in cluster order; no sorting by sales).
df_cl_sa = df_train1.groupby('cluster')['sales'].mean().reset_index()
df_cl_sa.head(5)

cluster sales
0 1 327.022808
1 2 261.025731
2 3 194.926534
3 4 297.537877
4 5 1120.118405
from matplotlib.gridspec import GridSpec

# 2x2 grid: family ranking (top-left), store-type share (top-right) and the
# per-cluster means across the whole bottom row.
plt.figure(figsize=(12, 8))
gs = GridSpec(2, 2)

ax = plt.subplot(gs[1:, :2])
# Plot against the real cluster ids. Using range(n) as x positions labelled
# the bars 0..n-1, i.e. every tick named the wrong cluster.
cluster_ids = df_cl_sa['cluster'].tolist()
ax.bar(cluster_ids, df_cl_sa['sales'], width=0.5)
ax.set_xticks(cluster_ids)
ax.set_title('Clusters VS s Sales')

ax = plt.subplot(gs[:1, :1])
ax.barh(df_fa_sa['family'], df_fa_sa['sales'])
ax.set_title('Average Sales Familys')

ax = plt.subplot(gs[:1, 1])
# wedgeprops width < 1 turns the pie into a ring (donut) chart.
ax.pie(df_st_sa['sales'], wedgeprops={'width': 0.3}, labels=df_st_sa['store_type'])
ax.set_title('Highest Sales Stores')

plt.show()

请添加图片描述

月销售量汇总

def monthly_mean_sales(frame, year, column):
    """Return the mean of ``sales`` per month for one year.

    Args:
        frame: DataFrame with ``year``, ``month`` and ``sales`` columns.
        year: Value of ``year`` to keep.
        column: Name given to the averaged ``sales`` column in the result.

    Returns:
        A DataFrame with the columns ``month`` and *column*, one row per
        month present in *year*.
    """
    rows = frame.loc[frame['year'] == year, ['month', 'sales']]
    means = rows.groupby('month').agg({'sales': 'mean'}).reset_index()
    return means.rename(columns={'sales': column})


# One frame of monthly means per year.
df_2013 = monthly_mean_sales(df_train1, 2013, 's13')
df_2014 = monthly_mean_sales(df_train1, 2014, 's14')
df_2015 = monthly_mean_sales(df_train1, 2015, 's15')
df_2016 = monthly_mean_sales(df_train1, 2016, 's16')
df_2017 = monthly_mean_sales(df_train1, 2017, 's17')

# 2017 is incomplete; pad whichever months are absent with zeros so the inner
# merges below keep all twelve months.
missing_months = sorted(set(range(1, 13)) - set(df_2017['month']))
df_2017_no = pd.DataFrame({'month': missing_months, 's17': 0.0})
if not df_2017_no.empty:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_2017 = pd.concat([df_2017, df_2017_no])

# Join the yearly frames side by side on the month number.
df_year = df_2013
for yearly in (df_2014, df_2015, df_2016, df_2017):
    df_year = df_year.merge(yearly, on='month')
df_year

month s13 s14 s15 s16 s17
0 1 186.952405 342.341709 269.666595 434.050268 476.596791
1 2 193.581846 241.268892 275.420792 424.695398 465.971468
2 3 206.880581 368.661236 282.368624 418.735398 483.400632
3 4 205.639071 240.577087 279.743138 488.108774 482.172948
4 5 210.184563 242.203129 320.958116 457.671398 487.162797
5 6 215.691343 244.634652 397.249619 419.644575 488.707278
6 7 203.983455 350.830102 403.030170 432.562218 489.909880
7 8 212.479434 251.351805 415.692304 406.437390 465.144891
8 9 220.593588 374.530792 434.734053 419.331240 0.000000
9 10 213.164266 369.213666 432.248428 435.002169 0.000000
10 11 231.136537 384.056027 426.579749 462.916675 0.000000
11 12 298.675144 459.818606 513.845328 557.114822 0.000000
# Keep only the per-year sales columns (drop "month") and turn NaN into 0.
year_columns = ['s13', 's14', 's15', 's16', 's17']
df_year = df_year.loc[:, year_columns].fillna(0)
df_year

s13 s14 s15 s16 s17
Jan 186.952405 342.341709 269.666595 434.050268 476.596791
Feb 193.581846 241.268892 275.420792 424.695398 465.971468
Mar 206.880581 368.661236 282.368624 418.735398 483.400632
Apr 205.639071 240.577087 279.743138 488.108774 482.172948
May 210.184563 242.203129 320.958116 457.671398 487.162797
Jun 215.691343 244.634652 397.249619 419.644575 488.707278
Ju1 203.983455 350.830102 403.030170 432.562218 489.909880
Aug 212.479434 251.351805 415.692304 406.437390 465.144891
Sep 220.593588 374.530792 434.734053 419.331240 0.000000
Oct 213.164266 369.213666 432.248428 435.002169 0.000000
Nov 231.136537 384.056027 426.579749 462.916675 0.000000
Dec 298.675144 459.818606 513.845328 557.114822 0.000000
# Label the twelve rows with month abbreviations; these become the bar labels
# of the stacked chart. The seventh label used to be 'Ju1' (digit one).
df_year.index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
y_data = df_2013['month'].tolist()  # month numbers as a plain list
df_year


s13 s14 s15 s16 s17
Jan 186.952405 342.341709 269.666595 434.050268 476.596791
Feb 193.581846 241.268892 275.420792 424.695398 465.971468
Mar 206.880581 368.661236 282.368624 418.735398 483.400632
Apr 205.639071 240.577087 279.743138 488.108774 482.172948
May 210.184563 242.203129 320.958116 457.671398 487.162797
Jun 215.691343 244.634652 397.249619 419.644575 488.707278
Ju1 203.983455 350.830102 403.030170 432.562218 489.909880
Aug 212.479434 251.351805 415.692304 406.437390 465.144891
Sep 220.593588 374.530792 434.734053 419.331240 0.000000
Oct 213.164266 369.213666 432.248428 435.002169 0.000000
Nov 231.136537 384.056027 426.579749 462.916675 0.000000
Dec 298.675144 459.818606 513.845328 557.114822 0.000000
# Stacked horizontal bars: one segment per year for every month.
plt.figure(figsize=(12, 8))
year_labels = ['2013', '2014', '2015', '2016', '2017']
caption_x = [100, 500, 1000, 1300, 1700]  # x positions of the year captions
stack_left = None  # running sum of the segments already drawn
for col, (year, text_x) in enumerate(zip(year_labels, caption_x)):
    widths = df_year.iloc[:, col]
    if stack_left is None:
        # First segment starts at the axis origin.
        plt.barh(df_year.index, widths, label=year)
        stack_left = widths
    else:
        plt.barh(df_year.index, widths, left=stack_left, label=year)
        stack_left = stack_left + widths
    plt.text(text_x, 12.5, year)
plt.title('Avg Sales for Each Year', loc='left', y=1.08, fontsize=15)
plt.legend()
plt.show()

在这里插入图片描述

月、季度、周、星期几汇总

# Mean sales per month, per ISO week and per quarter.
import calendar

df_m_sa = df_train1.groupby('month').agg({'sales': 'mean'}).reset_index()
df_m_sa['sales'] = df_m_sa['sales'].round(2)  # two decimals for the bar labels
# Month number -> abbreviated month name.
df_m_sa['month_text'] = df_m_sa['month'].apply(lambda x: calendar.month_abbr[x])
# Label drawn next to each bar, e.g. "Jan-341.92".
df_m_sa['text'] = df_m_sa['month_text'] + '-' + df_m_sa['sales'].astype(str)
df_w_sa = df_train1.groupby('week').agg({'sales': 'mean'}).reset_index()
df_q_sa = df_train1.groupby('quarter').agg({'sales': 'mean'}).reset_index()

# head must be called on all three frames; the last one was missing its
# parentheses and therefore showed the bound method instead of the rows.
df_m_sa.head(), df_w_sa.head(), df_q_sa.head()
(    month   sales month_text        text
 0       1  341.92        Jan  Jan-341.92
 1       2  320.93        Feb  Feb-320.93
 2       3  352.01        Mar  Mar-352.01
 3       4  341.17        Apr  Apr-341.17
 4       5  345.65        May  May-345.65,

     week       sales
 0      1  409.099519
 1      2  347.534643
 2      3  338.142199
 3      4  329.186258
 4      5  344.195233,

    quarter       sales
 0        1  338.825392
 1        2  346.546038
 2        3  359.334098
 3        4  399.229622)
from matplotlib.gridspec import GridSpec

# 2x2 grid: monthly bars (top-left), quarterly ring chart (top-right) and the
# weekly curve across the whole bottom row.
plt.figure(figsize=(12, 8))
gs = GridSpec(2, 2)
print(gs[:2, :2])

# Monthly means; every bar needs its own text call, hence the loop.
ax_month = plt.subplot(gs[:1, :1])
ax_month.barh(df_m_sa['month_text'], df_m_sa['sales'])
for row, (sales, label) in enumerate(zip(df_m_sa['sales'], df_m_sa['text'])):
    ax_month.text(sales - 100, row, label)
ax_month.set_title('month wise avg sales analysis')

# Quarterly share of the mean sales.
ax_quarter = plt.subplot(gs[:1, 1])
ax_quarter.pie(
    df_q_sa['sales'],
    wedgeprops={'width': 0.3},
    labels=df_q_sa['quarter'],
    autopct='%1.2f%%',
    pctdistance=1.25,
)
ax_quarter.set_title('Quarter wise Avg Sales Analy')

# Weekly means as a filled line chart.
ax_week = plt.subplot(gs[1:, :2])
weeks = df_w_sa['week']
weekly_sales = df_w_sa['sales']
ax_week.fill_between(weeks, weekly_sales, alpha=0.6)
ax_week.plot(weeks, weekly_sales, marker='o')
ax_week.set_title('Week wise Avg Sales Analysis')
plt.show()
GridSpec(2, 2)[0:2, 0:2]

在这里插入图片描述

# Mean sales per weekday name, rounded to two decimals.
df_dw_sa = df_train1.groupby('day_of_week')['sales'].mean().reset_index()
df_dw_sa['sales'] = df_dw_sa['sales'].round(2)

df_dw_sa

day_of_week sales
0 Friday 326.73
1 Monday 348.16
2 Saturday 434.79
3 Sunday 464.74
4 Thursday 286.57
5 Tuesday 319.92
6 Wednesday 330.77
# Horizontal bars of the mean sales per weekday.
# groupby returns the day names alphabetically; reorder them Monday..Sunday so
# the chart reads in calendar order. df_dw_sa itself is left untouched.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
present_days = set(df_dw_sa['day_of_week'])
ordered_days = (
    df_dw_sa.set_index('day_of_week')
    .reindex([day for day in weekday_order if day in present_days])
    .reset_index()
)
plt.barh(ordered_days['day_of_week'], ordered_days['sales'])
# Every bar needs its own text call, hence the loop.
for row, sales in enumerate(ordered_days['sales']):
    plt.text(sales - 50, row, sales)
plt.title('Avg Sales VS Day of Week')
plt.show()

在这里插入图片描述

store_type与holiday_type关联

# Mean sales for every (store_type, holiday_type) pair, two decimals.
df_st_ht = df_train1.groupby(['store_type', 'holiday_type'])['sales'].mean().reset_index()
df_st_ht['sales'] = df_st_ht['sales'].round(2)
df_st_ht.head(5)

store_type holiday_type sales
0 A Additional 957.70
1 A Bridge 969.82
2 A Event 813.56
3 A Holiday 723.28
4 A Transfer 984.63
# Bubble chart: x = store_type, y = holiday_type; bubble area and colour both
# encode the mean sales.
mean_sales = df_st_ht['sales']
plt.scatter(
    df_st_ht['store_type'],
    df_st_ht['holiday_type'],
    s=mean_sales,
    c=mean_sales,
    cmap='plasma',
)
plt.colorbar()
plt.text(4.7, 5.5, 'sales')  # caption placed above the colorbar
plt.xlim(-0.5, 4.5)
plt.ylim(-0.5, 5.5)
plt.title('Average Sales:Store Type vs holiday type ')
plt.show()

在这里插入图片描述

# Mean sales per (year, month, store_type); months become abbreviated names.
df_y_m_st = df_train1.groupby(['year', 'month', 'store_type'])['sales'].mean().reset_index()
df_y_m_st['sales'] = df_y_m_st['sales'].round(2)
df_y_m_st['month'] = df_y_m_st['month'].map(lambda m: calendar.month_abbr[m])
df_y_m_st.head(5)

year month store_type sales
0 2013 Jan A 392.85
1 2013 Jan B 155.11
2 2013 Jan C 109.06
3 2013 Jan D 191.16
4 2013 Jan E 60.52

280 rows × 4 columns

# a=df_y_m_st['year']==2013
# df_y_m_st.loc[a,'month']

store_type与月份汇总(按年份)

# One bubble-chart row per year: x = month, y = store_type, bubble area and
# colour = mean sales.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Months are plotted at fixed numeric positions so every row shares the same
# x axis even when a year does not contain all twelve months.
month_pos = {label: pos for pos, label in enumerate(month_labels)}
# x position of the rotated year caption in each row.
caption_x = {2013: 12, 2014: 12, 2015: 12, 2016: 12, 2017: 11.5}
# A single colorbar serves all rows, so all rows must share one colour scale.
sales_min = df_y_m_st['sales'].min()
sales_max = df_y_m_st['sales'].max()

fig, ax = plt.subplots(5, 1, figsize=(24, 20))

mappable = None
for row_ax, year in zip(ax, caption_x):
    # Select x, y and sales from the SAME year (the 2014-2016 rows used to
    # take their coordinates from the 2013 selection).
    rows = df_y_m_st[df_y_m_st['year'] == year]
    mappable = row_ax.scatter(
        rows['month'].map(month_pos),
        rows['store_type'],
        s=rows['sales'],
        c=rows['sales'],
        cmap='plasma',
        vmin=sales_min,
        vmax=sales_max,
    )
    # Numeric rotation: the string '270' is not a documented rotation value.
    row_ax.text(caption_x[year], 0.01, f'year={year}', size=20, rotation=270)
    if row_ax is ax[-1]:
        # Only the bottom row shows the month labels.
        row_ax.set_xticks(range(len(month_labels)))
        row_ax.set_xticklabels(month_labels)
    else:
        row_ax.set_xticks([])
    for side in ('top', 'right', 'bottom', 'left'):
        row_ax.spines[side].set_visible(False)
    row_ax.set_ylim([-0.5, 5])
    row_ax.set_xlim([-0.5, 15])
    row_ax.tick_params(axis='both', which='major', labelsize=25)  # tick label size

# Reuse a scatter that is already drawn instead of plotting 2013 a second time.
fig.colorbar(mappable, ax=list(ax), shrink=0.9)

plt.text(16, 32, 'sales', size=20)
plt.show()

在这里插入图片描述

month与holiday_type汇总

# Mean sales per (month, holiday_type); months become abbreviated names.
df_m_ht = df_train1.groupby(['month', 'holiday_type'])['sales'].mean().reset_index()
df_m_ht['sales'] = df_m_ht['sales'].round(2)
df_m_ht['month'] = df_m_ht['month'].map(lambda m: calendar.month_abbr[m])

# Bubble chart: x = month, y = holiday_type; bubble area and colour both
# encode the mean sales.
monthly_holiday_sales = df_m_ht['sales']
plt.scatter(
    df_m_ht['month'],
    df_m_ht['holiday_type'],
    s=monthly_holiday_sales,
    c=monthly_holiday_sales,
    cmap='plasma',
)
plt.colorbar()
plt.text(12.5, 6, 'sales')  # caption placed above the colorbar
plt.xlim(-1, 12)
plt.ylim(-0.5, 5.5)
plt.title('Average Sales:Month vs holiday type ')
plt.show()

在这里插入图片描述

# Mean sales per (year, month, holiday_type); months become abbreviated names.
df_y_m_ht = df_train1.groupby(['year', 'month', 'holiday_type'])['sales'].mean().reset_index()
df_y_m_ht['sales'] = df_y_m_ht['sales'].round(2)
df_y_m_ht['month'] = df_y_m_ht['month'].map(lambda m: calendar.month_abbr[m])
df_y_m_ht.head(5)

year month holiday_type sales
0 2013 Jan Holiday 1.41
1 2013 Jan Work Day 247.08
2 2013 Feb Holiday 164.82
3 2013 Mar Holiday 307.44
4 2013 Apr Holiday 228.52

96 rows × 4 columns

month与holiday_type汇总(按年份)

# One bubble-chart row per year: x = month, y = holiday_type, bubble area and
# colour = mean sales.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Months are plotted at fixed numeric positions so every row shares the same
# x axis even when a year has no rows for some month.
month_pos = {label: pos for pos, label in enumerate(month_labels)}
# x position of the rotated year caption in each row.
caption_x = {2013: 12, 2014: 12, 2015: 12, 2016: 12, 2017: 11.5}
# A single colorbar serves all rows, so all rows must share one colour scale.
sales_min = df_y_m_ht['sales'].min()
sales_max = df_y_m_ht['sales'].max()

fig, ax = plt.subplots(5, 1, figsize=(24, 20))

mappable = None
for row_ax, year in zip(ax, caption_x):
    rows = df_y_m_ht[df_y_m_ht['year'] == year]
    mappable = row_ax.scatter(
        rows['month'].map(month_pos),
        rows['holiday_type'],
        s=rows['sales'],
        c=rows['sales'],
        cmap='plasma',
        vmin=sales_min,
        vmax=sales_max,
    )
    # Numeric rotation: the string '270' is not a documented rotation value.
    row_ax.text(caption_x[year], 0.01, f'year={year}', size=20, rotation=270)
    if row_ax is ax[-1]:
        # Only the bottom row shows the month labels. The old tick list had
        # 'Oct' before 'Sep' and 'Nve' instead of 'Nov'.
        row_ax.set_xticks(range(len(month_labels)))
        row_ax.set_xticklabels(month_labels)
    else:
        row_ax.set_xticks([])
    for side in ('top', 'right', 'bottom', 'left'):
        row_ax.spines[side].set_visible(False)
    row_ax.set_ylim([-0.5, 6])
    row_ax.set_xlim([-0.5, 12])
    row_ax.tick_params(axis='both', which='major', labelsize=25)  # tick label size

# Reuse a scatter that is already drawn instead of plotting 2013 a second time.
fig.colorbar(mappable, ax=list(ax), shrink=0.9)

plt.text(16, 32, 'sales', size=20)
plt.show()

在这里插入图片描述

举报

相关推荐

0 条评论