一、基于groupby的数据分组

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  #针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
print(df7)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  #针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region)

1.1 groupby是什么

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  #针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(len(group_region))

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
for i in group_region:
    print(i)

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
for i in group_region:
    print(type(i))
    print(len(i))

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
for i in group_region:
    print(i[0])
print('\n')
for i in group_region:
    print(i[1])

groupby功能生成的是多个tuple对象，每个tuple对象有两个元素，第一个元素是分类的名字，第二个元素是该分类的所有行构成的DataFrame

1.2 groupby的功能

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region.mean())

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region.describe())

2. 数据聚合方法

2.1 使用agg方法聚合数据

import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(df7)

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(df7[['2018年_pop','2018年_gdp']].agg(np.sum))  #agg可以对一列或多列执行一个函数

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(df7[['2018年_pop','2018年_gdp']].agg([np.sum,np.max]))#agg可以对一列或多列执行多个函数)

使用groupby功能分类后，也可以使用此类聚合方法

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region[['2018年_pop','2018年_gdp']].agg([np.mean,np.max]))  #可以让一列或多列执行一个或多个函数

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region.agg({'2018年_pop':[np.sum,np.max],'2018年_gdp':np.mean}))
#可以通过字典的形式给每一列指定一个函数，字典格式为{行名：该行执行的函数}
#执行的也可以是函数组

2.2 使用apply方法聚合数据

apply不具有执行多种函数的功能

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region['2018年_pop'].apply(np.sum))

2.3 使用transform方法聚合数据

transform和agg，apply等方法不同，看以下例子：

求得各个地区的人口和gdp总值

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
print(group_region[['2018年_pop','2018年_gdp']].apply(np.sum))

思考：如果我们想要求每个省占其所在地区的人口和GDP的占比，怎么办？

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
print(groupsum)

transform方法可以对groupby之后的每个group作用函数，但返回的结果是和原dataframe等长的

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
print(groupsum['2018年_pop'])

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
print(df7)

import numpy as np
import pandas as pd
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
df7['人口占区域比例']=df7['2018年_pop']/ groupsum['2018年_pop']
print(df7)

3. groupby分组后的数据可视化

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
df7['人口占区域比例']=df7['2018年_pop']/ groupsum['2018年_pop']
fig1,ax1=plt.subplots(figsize=(8,8))
drawdf=group_region[['2018年_pop','2018年_gdp']].apply(np.sum)
print(drawdf)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  ##针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
df7['人口占区域比例']=df7['2018年_pop']/ groupsum['2018年_pop']
fig1,ax1=plt.subplots(figsize=(8,8))
drawdf=group_region[['2018年_pop','2018年_gdp']].apply(np.sum)
drawdf.plot(ax=ax1)
fig1
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  ##针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
df7['人口占区域比例']=df7['2018年_pop']/ groupsum['2018年_pop']
fig1,ax1=plt.subplots(figsize=(8,8))
drawdf=group_region[['2018年_pop','2018年_gdp']].apply(np.sum)
drawdf.plot(ax=ax1)
fig2,ax2=plt.subplots(1,2,figsize=(12,6))
group_region[['2018年_pop','2018年_gdp']].agg(np.sum).plot.bar(ax=ax2,subplots=True)
ax2[0].set_ylabel('人口（万人）')
ax2[1].set_ylabel('GDP（亿元）')
fig2
plt.show()
fig2.savefig('pop and gdp.jpg',bbox_inches = 'tight')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False  ##针对坐标轴中负号显示不出来的问题
pop=pd.read_excel('GDPandPopulation.xlsx',sheet_name='Population',index_col=0)
gdp=pd.read_excel('GDPandPopulation.xlsx',sheet_name='GDP',index_col=0)
df5=pd.concat([pop,gdp],axis=0,keys=['pop','gdp'])  #不用改名字即可合并
df6=pop.join(gdp,lsuffix='_pop', rsuffix='_gdp') #两个dataframe列名相同，需要加lsuffix或rsuffix在左边/右边的列名上加后缀来区分
region=pd.read_excel('GDPandPopulation.xlsx',sheet_name='region',index_col=0)
df7=df6.join(region) #没有重名列可以直接使用
group_region=df7.groupby(by='区域')  #以'区域'这一列进行分类，值相同的行被并为一类
groupsum=group_region.transform(np.sum)
df7['gdp占区域比例']=df7['2018年_gdp']/ groupsum['2018年_gdp']
df7['人口占区域比例']=df7['2018年_pop']/ groupsum['2018年_pop']
fig1,ax1=plt.subplots(figsize=(8,8))
drawdf=group_region[['2018年_pop','2018年_gdp']].apply(np.sum)
drawdf.plot(ax=ax1)
fig2,ax2=plt.subplots(1,2,figsize=(12,6))
group_region[['2018年_pop','2018年_gdp']].agg(np.sum).plot.bar(ax=ax2,subplots=True)
ax2[0].set_ylabel('人口（万人）')
ax2[1].set_ylabel('GDP（亿元）')
fig2
#plt.show()
fig2.savefig('pop and gdp.jpg',bbox_inches = 'tight')
df_sum=group_region[['2018年_pop','2018年_gdp']].agg(np.sum)
df_sum['人均GDP']=df_sum['2018年_gdp']/df_sum['2018年_pop']
print(df_sum)
df_sum.plot.bar(subplots=True)
plt.show()