Data Aggregation and Group Operations
GroupBy Mechanics
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df
 | key1 | key2 | data1 | data2 |
---|---|---|---|---|
0 | a | one | -0.074122 | -0.571432 |
1 | a | two | 0.347874 | -0.794645 |
2 | b | one | 0.399766 | -0.596056 |
3 | b | two | 1.209857 | -0.266257 |
4 | a | one | -0.001175 | 0.180895 |
# Group by key1 and compute the mean of the data1 column
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped.mean()
key1
a 0.090859
b 0.804812
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1 key2
a one -0.037649
two 0.347874
b one 0.399766
two 1.209857
Name: data1, dtype: float64
means.unstack()
key2 | one | two |
---|---|---|
key1 | ||
a | -0.037649 | 0.347874 |
b | 0.399766 | 1.209857 |
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California 2005 0.347874
2006 0.399766
Ohio 2005 0.567867
2006 -0.001175
Name: data1, dtype: float64
df.groupby('key1').mean()
 | data1 | data2 |
---|---|---|
key1 | | |
a | 0.090859 | -0.395061 |
b | 0.804812 | -0.431157 |
df.groupby(['key1','key2']).mean()
 | | data1 | data2 |
---|---|---|---|
key1 | key2 | | |
a | one | -0.037649 | -0.195268 |
 | two | 0.347874 | -0.794645 |
b | one | 0.399766 | -0.596056 |
 | two | 1.209857 | -0.266257 |
# GroupBy's size method returns a Series containing the group sizes. At present, any missing values in a group key are excluded from the result (see the sketch after the output below).
df.groupby(['key1','key2']).size()
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
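A minimal sketch of that behavior, assuming pandas >= 1.1 (which added groupby's dropna keyword); df_na is a hypothetical frame for illustration:
df_na = DataFrame({'key':['a','b',np.nan,'a'],'val':[1,2,3,4]})
df_na.groupby('key').size() # the NaN-keyed row is silently excluded
df_na.groupby('key',dropna=False).size() # keeps NaN as its own group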
Iterating over Groups
for name,group in df.groupby('key1'):
    print(name)
    print(group)
a
key1 key2 data1 data2
0 a one -0.074122 -0.571432
1 a two 0.347874 -0.794645
4 a one -0.001175 0.180895
b
key1 key2 data1 data2
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)
a one
key1 key2 data1 data2
0 a one -0.074122 -0.571432
4 a one -0.001175 0.180895
a two
key1 key2 data1 data2
1 a two 0.347874 -0.794645
b one
key1 key2 data1 data2
2 b one 0.399766 -0.596056
b two
key1 key2 data1 data2
3 b two 1.209857 -0.266257
pieces = dict(list(df.groupby('key1')))
pieces['b']
 | key1 | key2 | data1 | data2 |
---|---|---|---|---|
2 | b | one | 0.399766 | -0.596056 |
3 | b | two | 1.209857 | -0.266257 |
df.dtypes
key1 object
key2 object
data1 float64
data2 float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'): data1 data2
0 -0.074122 -0.571432
1 0.347874 -0.794645
2 0.399766 -0.596056
3 1.209857 -0.266257
4 -0.001175 0.180895,
dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
Selecting a Column or Subset of Columns
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df.groupby(['key1','key2'])[['data2']].mean()
 | | data2 |
---|---|---|
key1 | key2 | |
a | one | -0.195268 |
 | two | -0.794645 |
b | one | -0.596056 |
 | two | -0.266257 |
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped.mean()
key1 key2
a one -0.195268
two -0.794645
b one -0.596056
two -0.266257
Name: data2, dtype: float64
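Indexing the GroupBy object with a column name, as above, is syntactic sugar for grouping that column directly; both lines below compute the same result:
df.groupby('key1')['data1'].mean() # sugared form
df['data1'].groupby(df['key1']).mean() # equivalent long form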
Grouping with Dicts and Series
people = DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['Joe','Steve','Wes','Jim','Travis'])
people.iloc[2:3,[1,2]] = np.nan # add a few NA values (iloc replaces the positional .loc slice, which now raises a FutureWarning)
people
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | 0.309327 | 1.658107 | 1.146959 | -0.123471 | 0.159285 |
Steve | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
Wes | -0.766580 | NaN | NaN | 0.074462 | 1.430541 |
Jim | -0.615666 | 2.578830 | -0.002766 | 0.885567 | -0.375239 |
Travis | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
 | blue | red |
---|---|---|
Joe | 1.023488 | 2.126719 |
Steve | -1.444824 | 2.133263 |
Wes | 0.074462 | 0.663960 |
Jim | 0.882800 | 1.587925 |
Travis | 2.110874 | 1.497794 |
map_series = Series(mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
 | blue | red |
---|---|---|
Joe | 2 | 3 |
Steve | 2 | 3 |
Wes | 1 | 2 |
Jim | 2 | 3 |
Travis | 2 | 3 |
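Note that count excludes the NaN cells: Wes shows 1 under blue because column c is NaN, and 2 under red because column b is NaN.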
Grouping with Functions
people.groupby(len).sum()
 | a | b | c | d | e |
---|---|---|---|---|---|
3 | -1.072920 | 4.236937 | 1.144193 | 0.836558 | 1.214587 |
5 | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
6 | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
 | | a | b | c | d | e |
---|---|---|---|---|---|---|
3 | one | -0.766580 | 1.658107 | 1.146959 | -0.123471 | 0.159285 |
 | two | -0.615666 | 2.578830 | -0.002766 | 0.885567 | -0.375239 |
5 | one | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
6 | two | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
Grouping by Index Levels
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5),columns=columns)
hier_df
cty | US | | | JP | |
---|---|---|---|---|---|
tenor | 1 | 3 | 5 | 1 | 3 |
0 | 0.971689 | -0.207027 | 0.641528 | 1.197729 | -0.800907 |
1 | 0.906871 | -0.087288 | 0.204273 | -0.009374 | 0.637842 |
2 | 0.649755 | -0.800055 | -0.057130 | -1.087200 | 0.435762 |
3 | -0.618737 | 0.325816 | -0.702310 | -0.519860 | -0.101653 |
hier_df.groupby(level='cty',axis=1).count()
cty | JP | US |
---|---|---|
0 | 2 | 3 |
1 | 2 | 3 |
2 | 2 | 3 |
3 | 2 | 3 |
Data Aggregation
grouped = df.groupby('key1')
# If there is no value exactly at the requested percentile, quantile performs linear interpolation
grouped['data1'].quantile(0.9)
key1
a 0.278064
b 1.128848
Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
FutureWarning: ['key2'] did not aggregate successfully; the non-numeric key2 column is dropped from the result (select the numeric columns explicitly to avoid this warning).
 | data1 | data2 |
---|---|---|
key1 | | |
a | 0.421996 | 0.975541 |
b | 0.810090 | 0.329799 |
grouped.describe()
 | data1 | | | | | | | | data2 | | | | | | | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
 | count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max |
key1 | | | | | | | | | | | | | | | | |
a | 3.0 | 0.090859 | 0.22555 | -0.074122 | -0.037649 | -0.001175 | 0.173349 | 0.347874 | 3.0 | -0.395061 | 0.511126 | -0.794645 | -0.683039 | -0.571432 | -0.195268 | 0.180895 |
b | 2.0 | 0.804812 | 0.57282 | 0.399766 | 0.602289 | 0.804812 | 1.007334 | 1.209857 | 2.0 | -0.431157 | 0.233203 | -0.596056 | -0.513606 | -0.431157 | -0.348707 | -0.266257 |
# Side note: display the book's table of optimized GroupBy methods (image)
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('经过优化的GroupBy的方法.png')
imshow(img)
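In case the image does not display: per the book's table, the optimized GroupBy aggregation methods are count, sum, mean, median, std, var, min, max, prod, first and last.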
tips = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\tips.csv")
# Add a tip_pct column: the tip as a percentage of the total bill
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | No | Sun | Dinner | 4 | 0.146808 |
5 | 25.29 | 4.71 | No | Sun | Dinner | 4 | 0.186240 |
Column-Wise and Multiple Function Application
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
 | | mean | std | peak_to_peak |
---|---|---|---|---|
day | smoker | | | |
Fri | No | 0.151650 | 0.028123 | 0.067349 |
 | Yes | 0.174783 | 0.051293 | 0.159925 |
Sat | No | 0.158048 | 0.039767 | 0.235193 |
 | Yes | 0.147906 | 0.061375 | 0.290095 |
Sun | No | 0.160113 | 0.042347 | 0.193226 |
 | Yes | 0.187250 | 0.154134 | 0.644685 |
Thur | No | 0.160298 | 0.038774 | 0.193350 |
 | Yes | 0.163863 | 0.039389 | 0.151240 |
# With a list of (name, function) tuples, the first element of each tuple is used as the column name
grouped_pct.agg([('foo','mean'),('bar',np.std)])
 | | foo | bar |
---|---|---|---|
day | smoker | | |
Fri | No | 0.151650 | 0.028123 |
 | Yes | 0.174783 | 0.051293 |
Sat | No | 0.158048 | 0.039767 |
 | Yes | 0.147906 | 0.061375 |
Sun | No | 0.160113 | 0.042347 |
 | Yes | 0.187250 | 0.154134 |
Thur | No | 0.160298 | 0.038774 |
 | Yes | 0.163863 | 0.039389 |
functions = ['count','mean','max']
result = grouped[['tip_pct','total_bill']].agg(functions) # pass a list of columns; grouped['tip_pct','total_bill'] raises a FutureWarning about tuple indexing
result
 | | tip_pct | | | total_bill | | |
---|---|---|---|---|---|---|---|
 | | count | mean | max | count | mean | max |
day | smoker | | | | | | |
Fri | No | 4 | 0.151650 | 0.187735 | 4 | 18.420000 | 22.75 |
 | Yes | 15 | 0.174783 | 0.263480 | 15 | 16.813333 | 40.17 |
Sat | No | 45 | 0.158048 | 0.291990 | 45 | 19.661778 | 48.33 |
 | Yes | 42 | 0.147906 | 0.325733 | 42 | 21.276667 | 50.81 |
Sun | No | 57 | 0.160113 | 0.252672 | 57 | 20.506667 | 48.17 |
 | Yes | 19 | 0.187250 | 0.710345 | 19 | 24.120000 | 45.35 |
Thur | No | 45 | 0.160298 | 0.266312 | 45 | 17.113111 | 41.19 |
 | Yes | 17 | 0.163863 | 0.241255 | 17 | 19.190588 | 43.11 |
result['tip_pct']
 | | count | mean | max |
---|---|---|---|---|
day | smoker | | | |
Fri | No | 4 | 0.151650 | 0.187735 |
 | Yes | 15 | 0.174783 | 0.263480 |
Sat | No | 45 | 0.158048 | 0.291990 |
 | Yes | 42 | 0.147906 | 0.325733 |
Sun | No | 57 | 0.160113 | 0.252672 |
 | Yes | 19 | 0.187250 | 0.710345 |
Thur | No | 45 | 0.160298 | 0.266312 |
 | Yes | 17 | 0.163863 | 0.241255 |
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
grouped[['tip_pct','total_bill']].agg(ftuples)
 | | tip_pct | | total_bill | |
---|---|---|---|---|---|
 | | Durchschnitt | Abweichung | Durchschnitt | Abweichung |
day | smoker | | | | |
Fri | No | 0.151650 | 0.000791 | 18.420000 | 25.596333 |
 | Yes | 0.174783 | 0.002631 | 16.813333 | 82.562438 |
Sat | No | 0.158048 | 0.001581 | 19.661778 | 79.908965 |
 | Yes | 0.147906 | 0.003767 | 21.276667 | 101.387535 |
Sun | No | 0.160113 | 0.001793 | 20.506667 | 66.099980 |
 | Yes | 0.187250 | 0.023757 | 24.120000 | 109.046044 |
Thur | No | 0.160298 | 0.001503 | 17.113111 | 59.625081 |
 | Yes | 0.163863 | 0.001551 | 19.190588 | 69.808518 |
# Apply different functions to different columns
grouped.agg({'tip':np.max,'size':'sum'})
 | | tip | size |
---|---|---|---|
day | smoker | | |
Fri | No | 3.50 | 9 |
 | Yes | 4.73 | 31 |
Sat | No | 9.00 | 115 |
 | Yes | 10.00 | 104 |
Sun | No | 6.00 | 167 |
 | Yes | 6.50 | 49 |
Thur | No | 6.70 | 112 |
 | Yes | 5.00 | 40 |
grouped.agg({'tip_pct':['min','max','mean','std'],
'size':'sum'})
 | | tip_pct | | | | size |
---|---|---|---|---|---|---|
 | | min | max | mean | std | sum |
day | smoker | | | | | |
Fri | No | 0.120385 | 0.187735 | 0.151650 | 0.028123 | 9 |
 | Yes | 0.103555 | 0.263480 | 0.174783 | 0.051293 | 31 |
Sat | No | 0.056797 | 0.291990 | 0.158048 | 0.039767 | 115 |
 | Yes | 0.035638 | 0.325733 | 0.147906 | 0.061375 | 104 |
Sun | No | 0.059447 | 0.252672 | 0.160113 | 0.042347 | 167 |
 | Yes | 0.065660 | 0.710345 | 0.187250 | 0.154134 | 49 |
Thur | No | 0.072961 | 0.266312 | 0.160298 | 0.038774 | 112 |
 | Yes | 0.090014 | 0.241255 | 0.163863 | 0.039389 | 40 |
Returning Aggregated Data Without Row Indexes
tips.groupby(['day','smoker'],as_index=False).mean()
 | day | smoker | total_bill | tip | size | tip_pct |
---|---|---|---|---|---|---|
0 | Fri | No | 18.420000 | 2.812500 | 2.250000 | 0.151650 |
1 | Fri | Yes | 16.813333 | 2.714000 | 2.066667 | 0.174783 |
2 | Sat | No | 19.661778 | 3.102889 | 2.555556 | 0.158048 |
3 | Sat | Yes | 21.276667 | 2.875476 | 2.476190 | 0.147906 |
4 | Sun | No | 20.506667 | 3.167895 | 2.929825 | 0.160113 |
5 | Sun | Yes | 24.120000 | 3.516842 | 2.578947 | 0.187250 |
6 | Thur | No | 17.113111 | 2.673778 | 2.488889 | 0.160298 |
7 | Thur | Yes | 19.190588 | 3.030000 | 2.352941 | 0.163863 |
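The same frame can be produced by aggregating with the default hierarchical index and then resetting it; a quick sketch:
tips.groupby(['day','smoker']).mean().reset_index() # equivalent to as_index=False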
Group-Wise Operations and Transformations
# Add columns containing each row's group mean
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
 | mean_data1 | mean_data2 |
---|---|---|
key1 | | |
a | 0.090859 | -0.395061 |
b | 0.804812 | -0.431157 |
pd.merge(df,k1_means,left_on='key1',right_index=True)
 | key1 | key2 | data1 | data2 | mean_data1 | mean_data2 |
---|---|---|---|---|---|---|
0 | a | one | -0.074122 | -0.571432 | 0.090859 | -0.395061 |
1 | a | two | 0.347874 | -0.794645 | 0.090859 | -0.395061 |
4 | a | one | -0.001175 | 0.180895 | 0.090859 | -0.395061 |
2 | b | one | 0.399766 | -0.596056 | 0.804812 | -0.431157 |
3 | b | two | 1.209857 | -0.266257 | 0.804812 | -0.431157 |
key = ['one','two','one','two','one']
people.groupby(key).mean()
 | a | b | c | d | e |
---|---|---|---|---|---|
one | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
two | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
# transform applies a function to each group, then broadcasts the result to every row of the group
people.groupby(key).transform(np.mean)
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
Steve | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
Wes | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
Jim | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
Travis | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
# Subtract the group mean from each value
def demean(arr):
    return arr-arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | 0.472923 | 0.249997 | 0.254816 | -0.598317 | -0.495062 |
Steve | 0.998201 | -1.641038 | 0.080450 | -1.244262 | 0.915506 |
Wes | -0.602985 | NaN | NaN | -0.400384 | 0.776194 |
Jim | -0.998201 | 1.641038 | -0.080450 | 1.244262 | -0.915506 |
Travis | 0.130062 | -0.249997 | -0.254816 | 0.998701 | -0.281132 |
demeaned.groupby(key).mean()
 | a | b | c | d | e |
---|---|---|---|---|---|
one | 2.775558e-17 | 0.000000e+00 | -5.551115e-17 | 7.401487e-17 | -1.110223e-16 |
two | 0.000000e+00 | 1.110223e-16 | -6.938894e-18 | 0.000000e+00 | -5.551115e-17 |
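The group means of the demeaned data are now zero, up to floating-point rounding error.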
apply: General split-apply-combine
# Select the rows with the largest values in a given column
# (the book's sort_index() has been replaced with sort_values() here)
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips,n=6)
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby('smoker').apply(top)
 | | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|---|
smoker | | | | | | | | |
No | 88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
 | 185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 |
 | 51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 |
 | 149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 |
 | 232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
Yes | 109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
 | 183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
 | 67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
 | 178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
 | 172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
 | | | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|---|---|
smoker | day | | | | | | | | |
No | Fri | 94 | 22.75 | 3.25 | No | Fri | Dinner | 2 | 0.142857 |
 | Sat | 212 | 48.33 | 9.00 | No | Sat | Dinner | 4 | 0.186220 |
 | Sun | 156 | 48.17 | 5.00 | No | Sun | Dinner | 6 | 0.103799 |
 | Thur | 142 | 41.19 | 5.00 | No | Thur | Lunch | 5 | 0.121389 |
Yes | Fri | 95 | 40.17 | 4.73 | Yes | Fri | Dinner | 4 | 0.117750 |
 | Sat | 170 | 50.81 | 10.00 | Yes | Sat | Dinner | 3 | 0.196812 |
 | Sun | 182 | 45.35 | 3.50 | Yes | Sun | Dinner | 3 | 0.077178 |
 | Thur | 197 | 43.11 | 5.00 | Yes | Thur | Lunch | 4 | 0.115982 |
result = tips.groupby('smoker')['tip_pct'].describe()
result
 | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
smoker | | | | | | | | |
No | 151.0 | 0.159328 | 0.039910 | 0.056797 | 0.136906 | 0.155625 | 0.185014 | 0.291990 |
Yes | 93.0 | 0.163196 | 0.085119 | 0.035638 | 0.106771 | 0.153846 | 0.195059 | 0.710345 |
result.unstack('smoker')
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
Calling a method like describe on a GroupBy object is really just a shortcut for the following two lines:
f = lambda x: x.describe()
grouped.apply(f)
Suppressing the Group Keys
tips.groupby('smoker',group_keys=False).apply(top)
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 |
51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 |
149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
Quantile and Bucket Analysis
frame = DataFrame({'data1':np.random.randn(1000),
'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)
factor[:10]
0 (-1.448, 0.107]
1 (-1.448, 0.107]
2 (-1.448, 0.107]
3 (-1.448, 0.107]
4 (0.107, 1.663]
5 (0.107, 1.663]
6 (0.107, 1.663]
7 (-1.448, 0.107]
8 (-1.448, 0.107]
9 (0.107, 1.663]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.01, -1.448] < (-1.448, 0.107] < (0.107, 1.663] < (1.663, 3.218]]
def get_stats(group):
    return {'min':group.min(),'max':group.max(),
            'count':group.count(),'mean':group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
# cut produces buckets of equal length
 | min | max | count | mean |
---|---|---|---|---|
data1 | | | | |
(-3.01, -1.448] | -2.614910 | 2.368046 | 70.0 | -0.092146 |
(-1.448, 0.107] | -2.534962 | 2.783160 | 479.0 | 0.009041 |
(0.107, 1.663] | -3.073771 | 2.513553 | 398.0 | -0.091291 |
(1.663, 3.218] | -2.699080 | 2.373634 | 53.0 | -0.099021 |
# For buckets containing equal numbers of data points, use qcut
# labels=False returns just the quantile numbers
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
 | min | max | count | mean |
---|---|---|---|---|
data1 | | | | |
0 | -2.614910 | 2.783160 | 100.0 | 0.006906 |
1 | -2.534962 | 2.490249 | 100.0 | -0.101695 |
2 | -2.015862 | 2.261854 | 100.0 | 0.084059 |
3 | -2.250966 | 2.509572 | 100.0 | -0.000924 |
4 | -2.068747 | 2.425219 | 100.0 | 0.119523 |
5 | -2.913492 | 2.032037 | 100.0 | -0.233505 |
6 | -2.432055 | 1.983781 | 100.0 | -0.038541 |
7 | -2.339164 | 2.046824 | 100.0 | -0.096358 |
8 | -3.073771 | 2.235941 | 100.0 | -0.091584 |
9 | -2.699080 | 2.513553 | 100.0 | -0.084895 |
Example: Filling Missing Values with Group-Specific Values
# Fill NA values with the mean
s = Series(np.random.randn(6))
s[::2] = np.nan
s
0 NaN
1 0.209858
2 NaN
3 1.379023
4 NaN
5 -0.743300
dtype: float64
s.fillna(s.mean())
0 0.281860
1 0.209858
2 0.281860
3 1.379023
4 0.281860
5 -0.743300
dtype: float64
states = ['Ohio','New York','Vermont','Florida',
'Oregon','Nevada','California','Idaho']
group_key = ['East']*4+['West']*4
data = Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']] = np.nan
data
Ohio 0.155978
New York -0.133767
Vermont NaN
Florida -0.765162
Oregon 0.682524
Nevada NaN
California 0.730390
Idaho NaN
dtype: float64
data.groupby(group_key).mean()
East -0.247650
West 0.706457
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio 0.155978
New York -0.133767
Vermont -0.247650
Florida -0.765162
Oregon 0.682524
Nevada 0.706457
California 0.730390
Idaho 0.706457
dtype: float64
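A sketch of an alternative that avoids apply: transform('mean') broadcasts each group's mean back onto the original index, and fillna then only touches the NA slots.
data.fillna(data.groupby(group_key).transform('mean')) # same result as fill_mean above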
fill_values = {'East':0.5,'West':-1}
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Ohio 0.155978
New York -0.133767
Vermont 0.500000
Florida -0.765162
Oregon 0.682524
Nevada -1.000000
California 0.730390
Idaho -1.000000
dtype: float64
Example: Random Sampling and Permutation
One way to draw a random sample without replacement: take the first K elements of np.random.permutation(N), where N is the size of the full dataset and K is the desired sample size.
# Hearts (H), Spades (S), Clubs (C), Diamonds (D)
suits = ['H','S','C','D']
# In Python 2, range() returned a list, so two ranges could be added directly, e.g. range(5)+range(10); in Python 3, range() is a class, hence the list(...) conversion
card_val = (list(range(1,11))+ [10] * 3) * 4
base_names = ['A'] + list(range(2,11)) + ['J','K','Q']
cards = []
for suit in ['H','S','C','D']:
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val,index=cards)
deck[:13]
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
7H 7
4D 4
8H 8
QC 10
4S 4
dtype: int64
# Draw two random cards from each suit
get_suit = lambda card:card[-1] # the last character is the suit
deck.groupby(get_suit).apply(draw,n=2)
C AC 1
JC 10
D 5D 5
8D 8
H 10H 10
JH 10
S 9S 9
5S 5
dtype: int64
# An alternative that omits the suit keys from the result index
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
10C 10
AC 1
KD 10
10D 10
3H 3
9H 9
5S 5
8S 8
dtype: int64
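A sketch of the same draws using the built-in sample method available in modern pandas, which handles the permutation internally:
deck.sample(5) # five random cards
deck.groupby(get_suit,group_keys=False).apply(lambda s: s.sample(2)) # two per suit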
Example: Group Weighted Average and Correlation
df = DataFrame({'category':['a','a','a','a','b','b','b','b'],
'data':np.random.randn(8),
'weights':np.random.randn(8)}) # note: the book draws weights with np.random.rand, so they are non-negative
df
 | category | data | weights |
---|---|---|---|
0 | a | 0.591317 | -1.032939 |
1 | a | -0.589692 | 0.436704 |
2 | a | -0.128848 | 2.257153 |
3 | a | -0.774626 | 0.811910 |
4 | b | -2.050679 | 1.144802 |
5 | b | 1.216111 | 0.736471 |
6 | b | -0.801366 | 0.139008 |
7 | b | -1.577430 | -0.576198 |
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a -0.723088
b -0.453212
dtype: float64
close_px = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv",parse_dates=True,index_col=0) # raw string keeps the Windows backslashes from being read as escape sequences
close_px
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
1990-02-01 | 4.98 | 7.86 | 2.87 | 16.79 | 4.27 | 0.51 | 6.04 | 328.79 | 6.12 |
1990-02-02 | 5.04 | 8.00 | 2.87 | 16.89 | 4.37 | 0.51 | 6.09 | 330.92 | 6.24 |
1990-02-05 | 5.07 | 8.18 | 2.87 | 17.32 | 4.34 | 0.51 | 6.05 | 331.85 | 6.25 |
1990-02-06 | 5.01 | 8.12 | 2.88 | 17.56 | 4.32 | 0.51 | 6.15 | 329.66 | 6.23 |
1990-02-07 | 5.04 | 7.77 | 2.91 | 17.93 | 4.38 | 0.51 | 6.17 | 333.75 | 6.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2011-10-10 | 10.09 | 388.81 | 16.14 | 186.62 | 64.43 | 26.94 | 61.87 | 1194.89 | 76.28 |
2011-10-11 | 10.30 | 400.29 | 16.14 | 185.00 | 63.96 | 27.00 | 60.95 | 1195.54 | 76.27 |
2011-10-12 | 10.05 | 402.19 | 16.40 | 186.12 | 64.33 | 26.96 | 62.70 | 1207.25 | 77.16 |
2011-10-13 | 10.10 | 408.43 | 16.22 | 186.82 | 64.23 | 27.18 | 62.36 | 1203.66 | 76.37 |
2011-10-14 | 10.26 | 422.00 | 16.60 | 190.53 | 64.72 | 27.27 | 62.24 | 1224.58 | 78.11 |
5472 rows × 9 columns
close_px[-4:]
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
2011-10-11 | 10.30 | 400.29 | 16.14 | 185.00 | 63.96 | 27.00 | 60.95 | 1195.54 | 76.27 |
2011-10-12 | 10.05 | 402.19 | 16.40 | 186.12 | 64.33 | 26.96 | 62.70 | 1207.25 | 77.16 |
2011-10-13 | 10.10 | 408.43 | 16.22 | 186.82 | 64.23 | 27.18 | 62.36 | 1203.66 | 76.37 |
2011-10-14 | 10.26 | 422.00 | 16.60 | 190.53 | 64.72 | 27.27 | 62.24 | 1224.58 | 78.11 |
# Compute the yearly correlation of daily returns with SPX
rets = close_px.pct_change().dropna()
spx_corr = lambda x:x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
1990 | 0.595024 | 0.545067 | 0.752187 | 0.738361 | 0.801145 | 0.586691 | 0.783168 | 1.0 | 0.517586 |
1991 | 0.453574 | 0.365315 | 0.759607 | 0.557046 | 0.646401 | 0.524225 | 0.641775 | 1.0 | 0.569335 |
1992 | 0.398180 | 0.498732 | 0.632685 | 0.262232 | 0.515740 | 0.492345 | 0.473871 | 1.0 | 0.318408 |
1993 | 0.259069 | 0.238578 | 0.447257 | 0.211269 | 0.451503 | 0.425377 | 0.385089 | 1.0 | 0.318952 |
1994 | 0.428549 | 0.268420 | 0.572996 | 0.385162 | 0.372962 | 0.436585 | 0.450516 | 1.0 | 0.395078 |
1995 | 0.291532 | 0.161829 | 0.519126 | 0.416390 | 0.315733 | 0.453660 | 0.413144 | 1.0 | 0.368752 |
1996 | 0.292344 | 0.191482 | 0.750724 | 0.388497 | 0.569232 | 0.564015 | 0.421477 | 1.0 | 0.538736 |
1997 | 0.564427 | 0.211435 | 0.827512 | 0.646823 | 0.703538 | 0.606171 | 0.509344 | 1.0 | 0.695653 |
1998 | 0.533802 | 0.379883 | 0.815243 | 0.623982 | 0.591988 | 0.698773 | 0.494213 | 1.0 | 0.369264 |
1999 | 0.099033 | 0.425584 | 0.710928 | 0.486167 | 0.517061 | 0.631315 | 0.336593 | 1.0 | 0.315383 |
2000 | 0.265359 | 0.440161 | 0.610362 | 0.445114 | 0.189765 | 0.538005 | 0.077525 | 1.0 | 0.084163 |
2001 | 0.624069 | 0.577152 | 0.794632 | 0.696038 | 0.111493 | 0.696447 | 0.133975 | 1.0 | 0.336869 |
2002 | 0.748021 | 0.580548 | 0.822373 | 0.716490 | 0.584758 | 0.784728 | 0.487211 | 1.0 | 0.759933 |
2003 | 0.690466 | 0.545582 | 0.777643 | 0.741775 | 0.562399 | 0.750534 | 0.541487 | 1.0 | 0.662775 |
2004 | 0.591485 | 0.374283 | 0.728626 | 0.601740 | 0.354690 | 0.588531 | 0.466854 | 1.0 | 0.557742 |
2005 | 0.564267 | 0.467540 | 0.675637 | 0.516846 | 0.444728 | 0.562374 | 0.489559 | 1.0 | 0.631010 |
2006 | 0.487638 | 0.428267 | 0.612388 | 0.598636 | 0.394026 | 0.406126 | 0.335054 | 1.0 | 0.518514 |
2007 | 0.642427 | 0.508118 | 0.796945 | 0.603906 | 0.568423 | 0.658770 | 0.651911 | 1.0 | 0.786264 |
2008 | 0.781057 | 0.681434 | 0.777337 | 0.833074 | 0.801005 | 0.804626 | 0.709264 | 1.0 | 0.828303 |
2009 | 0.735642 | 0.707103 | 0.713086 | 0.684513 | 0.603146 | 0.654902 | 0.541474 | 1.0 | 0.797921 |
2010 | 0.745700 | 0.710105 | 0.822285 | 0.783638 | 0.689896 | 0.730118 | 0.626655 | 1.0 | 0.839057 |
2011 | 0.882045 | 0.691931 | 0.864595 | 0.802730 | 0.752379 | 0.800996 | 0.592029 | 1.0 | 0.859975 |
# Yearly correlation between Apple and Microsoft
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
1990 0.408271
1991 0.266807
1992 0.450592
1993 0.236917
1994 0.361638
1995 0.258642
1996 0.147539
1997 0.196144
1998 0.364106
1999 0.329484
2000 0.275298
2001 0.563156
2002 0.571435
2003 0.486262
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
Example: Group-Wise Linear Regression
import statsmodels.api as sm
def regress(data,yvar,xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(Y,X).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])
 | SPX | intercept |
---|---|---|
1990 | 1.512772 | 0.001395 |
1991 | 1.187351 | 0.000396 |
1992 | 1.832427 | 0.000164 |
1993 | 1.390470 | -0.002657 |
1994 | 1.190277 | 0.001617 |
1995 | 0.858818 | -0.001423 |
1996 | 0.829389 | -0.001791 |
1997 | 0.749928 | -0.001901 |
1998 | 1.164582 | 0.004075 |
1999 | 1.384989 | 0.003273 |
2000 | 1.733802 | -0.002523 |
2001 | 1.676128 | 0.003122 |
2002 | 1.080795 | -0.000219 |
2003 | 1.187770 | 0.000690 |
2004 | 1.363463 | 0.004201 |
2005 | 1.766415 | 0.003246 |
2006 | 1.645496 | 0.000080 |
2007 | 1.198761 | 0.003438 |
2008 | 0.968016 | -0.001110 |
2009 | 0.879103 | 0.002954 |
2010 | 1.052608 | 0.001261 |
2011 | 0.806605 | 0.001514 |
Pivot Tables and Cross-Tabulation
tips.pivot_table(index=['day','smoker'])
 | | size | tip | tip_pct | total_bill |
---|---|---|---|---|---|
day | smoker | | | | |
Fri | No | 2.250000 | 2.812500 | 0.151650 | 18.420000 |
 | Yes | 2.066667 | 2.714000 | 0.174783 | 16.813333 |
Sat | No | 2.555556 | 3.102889 | 0.158048 | 19.661778 |
 | Yes | 2.476190 | 2.875476 | 0.147906 | 21.276667 |
Sun | No | 2.929825 | 3.167895 | 0.160113 | 20.506667 |
 | Yes | 2.578947 | 3.516842 | 0.187250 | 24.120000 |
Thur | No | 2.488889 | 2.673778 | 0.160298 | 17.113111 |
 | Yes | 2.352941 | 3.030000 | 0.163863 | 19.190588 |
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')
 | | size | | tip_pct | |
---|---|---|---|---|---|
smoker | | No | Yes | No | Yes |
time | day | | | | |
Dinner | Fri | 2.000000 | 2.222222 | 0.139622 | 0.165347 |
 | Sat | 2.555556 | 2.476190 | 0.158048 | 0.147906 |
 | Sun | 2.929825 | 2.578947 | 0.160113 | 0.187250 |
 | Thur | 2.000000 | NaN | 0.159744 | NaN |
Lunch | Fri | 3.000000 | 1.833333 | 0.187735 | 0.188937 |
 | Thur | 2.500000 | 2.352941 | 0.160311 | 0.163863 |
tips.pivot_table(['tip_pct','size'],index=['time','day'],
columns='smoker',margins=True)
 | | size | | | tip_pct | | |
---|---|---|---|---|---|---|---|
smoker | | No | Yes | All | No | Yes | All |
time | day | | | | | | |
Dinner | Fri | 2.000000 | 2.222222 | 2.166667 | 0.139622 | 0.165347 | 0.158916 |
 | Sat | 2.555556 | 2.476190 | 2.517241 | 0.158048 | 0.147906 | 0.153152 |
 | Sun | 2.929825 | 2.578947 | 2.842105 | 0.160113 | 0.187250 | 0.166897 |
 | Thur | 2.000000 | NaN | 2.000000 | 0.159744 | NaN | 0.159744 |
Lunch | Fri | 3.000000 | 1.833333 | 2.000000 | 0.187735 | 0.188937 | 0.188765 |
 | Thur | 2.500000 | 2.352941 | 2.459016 | 0.160311 | 0.163863 | 0.161301 |
All | | 2.668874 | 2.408602 | 2.569672 | 0.159328 | 0.163196 | 0.160803 |
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',
aggfunc=len,margins=True)
day | | Fri | Sat | Sun | Thur | All |
---|---|---|---|---|---|---|
time | smoker | | | | | |
Dinner | No | 3.0 | 45.0 | 57.0 | 1.0 | 106 |
 | Yes | 9.0 | 42.0 | 19.0 | NaN | 70 |
Lunch | No | 1.0 | NaN | NaN | 44.0 | 45 |
 | Yes | 6.0 | NaN | NaN | 17.0 | 23 |
All | | 19.0 | 87.0 | 76.0 | 62.0 | 244 |
tips.pivot_table('size',index=['time','smoker'],
columns='day',aggfunc='sum',fill_value=0)
day | | Fri | Sat | Sun | Thur |
---|---|---|---|---|---|
time | smoker | | | | |
Dinner | No | 6 | 115 | 167 | 2 |
 | Yes | 20 | 104 | 49 | 0 |
Lunch | No | 3 | 0 | 0 | 110 |
 | Yes | 11 | 0 | 0 | 40 |
# Side note: display the book's table of pivot_table parameters (image)
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('pivot_table的参数.png')
imshow(img)
In current pandas, the rows argument has been renamed to index and cols to columns.
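For reference, the parameters listed in that figure are values (the column or columns to aggregate), index (row grouping keys), columns (column grouping keys), aggfunc (the aggregation function, mean by default), fill_value (replacement for NAs in the result), dropna and margins (add "All" subtotals).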
Cross-Tabulations: crosstab
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
smoker | | No | Yes | All |
---|---|---|---|---|
time | day | | | |
Dinner | Fri | 3 | 9 | 12 |
 | Sat | 45 | 42 | 87 |
 | Sun | 57 | 19 | 76 |
 | Thur | 1 | 0 | 1 |
Lunch | Fri | 1 | 6 | 7 |
 | Thur | 44 | 17 | 61 |
All | | 151 | 93 | 244 |
Example: 2012 Federal Election Commission Database
fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
fec
DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False. (Column 6 is contbr_zip.)
 | cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1001726 | C00500587 | P20003281 | Perry, Rick | GORMAN, CHRIS D. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 5000.0 | 29-SEP-11 | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | NaN | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | SA17A | 751678 |
1001727 | C00500587 | P20003281 | Perry, Rick | DUFFY, DAVID A. MR. | INFO REQUESTED | XX | 99999 | DUFFY EQUIPMENT COMPANY INC. | BUSINESS OWNER | 2500.0 | 30-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001728 | C00500587 | P20003281 | Perry, Rick | GRANE, BRYAN F. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 500.0 | 29-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001729 | C00500587 | P20003281 | Perry, Rick | TOLBERT, DARYL MR. | INFO REQUESTED | XX | 99999 | T.A.C.C. | LONGWALL MAINTENANCE FOREMAN | 500.0 | 30-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001730 | C00500587 | P20003281 | Perry, Rick | ANDERSON, MARILEE MRS. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 2500.0 | 31-AUG-11 | NaN | NaN | NaN | SA17A | 751678 |
1001731 rows × 16 columns
fec.loc[123456]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties = {'Bachmann, Michelle':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'Huntsman, Jon':'Republican',
'Johnson, Gary Earl':'Republican',
'McCotter, Thaddeus G':'Republican',
'Obama, Barack':'Democrat',
'Paul, Ron':'Republican',
'Pawlenty, Timothy':'Republican',
'Perry, Rick':'Republican',
"Roemer, Charles E. 'Buddy' Ⅲ":'Republican',
'Romney, Mitt':'Republican',
'Santorum, Rick':'Republican'}
fec.cand_nm[123456:123461]
123456 Obama, Barack
123457 Obama, Barack
123458 Obama, Barack
123459 Obama, Barack
123460 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
Name: cand_nm, dtype: object
# Add a column with the party derived from the candidate name
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat 589127
Republican 396504
Name: party, dtype: int64
(fec.contb_receipt_amt>0).value_counts()
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt>0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
Donation Statistics by Occupation and Employer
fec.contbr_occupation.value_counts()[:10]
RETIRED 233990
NOT PROVIDED 56245
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
NOT EMPLOYED 9828
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATIO REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'}
# If no mapping is provided for x, return x unchanged
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1)>2000000]
over_2mm
party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
ATTORNEY | 11141982.97 | 7462058.31 |
C.E.O. | 1690.00 | 2592983.11 |
CEO | 2074284.79 | 1638668.41 |
CONSULTANT | 2459912.71 | 2538990.45 |
ENGINEER | 951525.55 | 1811937.30 |
EXECUTIVE | 1355161.05 | 4136400.09 |
HOMEMAKER | 4248875.80 | 13625600.78 |
INVESTOR | 884133.00 | 2431258.92 |
LAWYER | 3160478.87 | 391124.32 |
MANAGER | 762883.22 | 1441092.37 |
NOT PROVIDED | 4866973.96 | 20216287.01 |
OWNER | 1001567.36 | 2406081.92 |
PHYSICIAN | 3735124.94 | 3587195.24 |
PRESIDENT | 1878509.95 | 4717413.76 |
PROFESSOR | 2165071.08 | 294032.73 |
REAL ESTATE | 528902.09 | 1624507.25 |
RETIRED | 25305116.38 | 23481023.18 |
SELF-EMPLOYED | 672393.40 | 1636774.54 |
over_2mm.plot(kind='barh')
def get_top_amounts(group,key,n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Sort totals in descending order; as written, [n:] returns everything except the top n (use [:n] to keep only the top n)
    return totals.sort_values(ascending=False)[n:]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm contbr_occupation
Obama, Barack PROFESSOR 2165071.08
CEO 2074284.79
PRESIDENT 1878509.95
NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
...
Romney, Mitt INDEPENDENT PROFESSIONAL 3.00
IFC CONTRACTING SOLUTIONS 3.00
REMODELER & SEMI RETIRED 3.00
AFFORDABLE REAL ESTATE DEVELOPER 3.00
3RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, Length: 35973, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm contbr_employer
Obama, Barack REFUSED 149516.07
DLA PIPER 148235.00
HARVARD UNIVERSITY 131368.94
IBM 128490.93
GOOGLE 125302.88
...
Romney, Mitt UN 3.00
UPTOWN CHEAPSKATE 3.00
WILL MERRIFIELD 3.00
INDEPENDENT PROFESSIONAL 3.00
HONOLD COMMUNICTAIONS 3.00
Name: contb_receipt_amt, Length: 95890, dtype: float64
Bucketing Donation Amounts
bins = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 493 | 77 |
(1, 10] | 40070 | 3681 |
(10, 100] | 372280 | 31853 |
(100, 1000] | 153991 | 43357 |
(1000, 10000] | 22284 | 26186 |
(10000, 100000] | 2 | 1 |
(100000, 1000000] | 3 | 0 |
(1000000, 10000000] | 4 | 0 |
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 318.24 | 77.00 |
(1, 10] | 337267.62 | 29819.66 |
(10, 100] | 20288981.41 | 1987783.76 |
(100, 1000] | 54798531.46 | 22363381.69 |
(1000, 10000] | 51753705.67 | 63942145.42 |
(10000, 100000] | 59100.00 | 12700.00 |
(100000, 1000000] | 1490683.08 | 0.00 |
(1000000, 10000000] | 7148839.76 | 0.00 |
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 0.805182 | 0.194818 |
(1, 10] | 0.918767 | 0.081233 |
(10, 100] | 0.910769 | 0.089231 |
(100, 1000] | 0.710176 | 0.289824 |
(1000, 10000] | 0.447326 | 0.552674 |
(10000, 100000] | 0.823120 | 0.176880 |
(100000, 1000000] | 1.000000 | 0.000000 |
(1000000, 10000000] | 1.000000 | 0.000000 |
normed_sums[:2].plot(kind='barh',stacked=True)
Donation Statistics by State
grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1)>100000]
totals[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
AR | 359247.28 | 105556.00 |
AZ | 1506476.98 | 1888436.23 |
CA | 23824984.24 | 11237636.60 |
CO | 2132429.49 | 1506714.12 |
CT | 2068291.26 | 3499475.45 |
DC | 4373538.80 | 1025137.50 |
DE | 336669.14 | 82712.00 |
FL | 7318178.58 | 8338458.81 |
percent = totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 0.765778 | 0.234222 |
AL | 0.507390 | 0.492610 |
AR | 0.772902 | 0.227098 |
AZ | 0.443745 | 0.556255 |
CA | 0.679498 | 0.320502 |
CO | 0.585970 | 0.414030 |
CT | 0.371476 | 0.628524 |
DC | 0.810113 | 0.189887 |
DE | 0.802776 | 0.197224 |
FL | 0.467417 | 0.532583 |
from mpl_toolkits.basemap import Basemap, cm
import numpy as np
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
#from shapelib import ShapeFile
import pyshp
import dbflib
ModuleNotFoundError: No module named 'pyshp'

The import fails because the pyshp package installs under the module name shapefile (use import shapefile); shapelib and dbflib are Python 2-era bindings with no Python 3 release, so this Basemap example cannot run as-is.