Data Aggregation and Group Operations
GroupBy Mechanics
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df
 | key1 | key2 | data1 | data2 |
---|---|---|---|---|
0 | a | one | -0.074122 | -0.571432 |
1 | a | two | 0.347874 | -0.794645 |
2 | b | one | 0.399766 | -0.596056 |
3 | b | two | 1.209857 | -0.266257 |
4 | a | one | -0.001175 | 0.180895 |
# Group by key1 and compute the mean of the data1 column
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped.mean()
key1
a 0.090859
b 0.804812
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1 key2
a one -0.037649
two 0.347874
b one 0.399766
two 1.209857
Name: data1, dtype: float64
means.unstack()
key2 | one | two |
---|---|---|
key1 | ||
a | -0.037649 | 0.347874 |
b | 0.399766 | 1.209857 |
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California 2005 0.347874
2006 0.399766
Ohio 2005 0.567867
2006 -0.001175
Name: data1, dtype: float64
df.groupby('key1').mean()
 | data1 | data2 |
---|---|---|
key1 | | |
a | 0.090859 | -0.395061 |
b | 0.804812 | -0.431157 |
df.groupby(['key1','key2']).mean()
 | | data1 | data2 |
---|---|---|---|
key1 | key2 | | |
a | one | -0.037649 | -0.195268 |
 | two | 0.347874 | -0.794645 |
b | one | 0.399766 | -0.596056 |
 | two | 1.209857 | -0.266257 |
# GroupBy's size method returns a Series containing the group sizes. At present, any missing values in a group key are excluded from the result (see the sketch after the output below).
df.groupby(['key1','key2']).size()
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
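A minimal sketch of that behavior, assuming pandas >= 1.1 (which added groupby's dropna keyword); df_na is a hypothetical frame for illustration:
df_na = DataFrame({'key':['a','b',np.nan,'a'],'val':[1,2,3,4]})
df_na.groupby('key').size() # the NaN-keyed row is silently excluded
df_na.groupby('key',dropna=False).size() # keeps NaN as its own group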
Iterating over Groups
for name,group in df.groupby('key1'):
    print(name)
    print(group)
a
key1 key2 data1 data2
0 a one -0.074122 -0.571432
1 a two 0.347874 -0.794645
4 a one -0.001175 0.180895
b
key1 key2 data1 data2
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)
a one
key1 key2 data1 data2
0 a one -0.074122 -0.571432
4 a one -0.001175 0.180895
a two
key1 key2 data1 data2
1 a two 0.347874 -0.794645
b one
key1 key2 data1 data2
2 b one 0.399766 -0.596056
b two
key1 key2 data1 data2
3 b two 1.209857 -0.266257
pieces = dict(list(df.groupby('key1')))
pieces['b']
 | key1 | key2 | data1 | data2 |
---|---|---|---|---|
2 | b | one | 0.399766 | -0.596056 |
3 | b | two | 1.209857 | -0.266257 |
df.dtypes
key1 object
key2 object
data1 float64
data2 float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'): data1 data2
0 -0.074122 -0.571432
1 0.347874 -0.794645
2 0.399766 -0.596056
3 1.209857 -0.266257
4 -0.001175 0.180895,
dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
Selecting a Column or Subset of Columns
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df.groupby(['key1','key2'])[['data2']].mean()
 | | data2 |
---|---|---|
key1 | key2 | |
a | one | -0.195268 |
 | two | -0.794645 |
b | one | -0.596056 |
 | two | -0.266257 |
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped.mean()
key1 key2
a one -0.195268
two -0.794645
b one -0.596056
two -0.266257
Name: data2, dtype: float64
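Indexing the GroupBy object with a column name, as above, is syntactic sugar for grouping that column directly; both lines below compute the same result:
df.groupby('key1')['data1'].mean() # sugared form
df['data1'].groupby(df['key1']).mean() # equivalent long form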
Grouping with Dicts and Series
people = DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['Joe','Steve','Wes','Jim','Travis'])
people.iloc[2:3,[1,2]] = np.nan # add a few NA values (iloc replaces the positional .loc slice, which now raises a FutureWarning)
people
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | 0.309327 | 1.658107 | 1.146959 | -0.123471 | 0.159285 |
Steve | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
Wes | -0.766580 | NaN | NaN | 0.074462 | 1.430541 |
Jim | -0.615666 | 2.578830 | -0.002766 | 0.885567 | -0.375239 |
Travis | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
 | blue | red |
---|---|---|
Joe | 1.023488 | 2.126719 |
Steve | -1.444824 | 2.133263 |
Wes | 0.074462 | 0.663960 |
Jim | 0.882800 | 1.587925 |
Travis | 2.110874 | 1.497794 |
map_series = Series(mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
 | blue | red |
---|---|---|
Joe | 2 | 3 |
Steve | 2 | 3 |
Wes | 1 | 2 |
Jim | 2 | 3 |
Travis | 2 | 3 |
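Note that count excludes the NaN cells: Wes shows 1 under blue because column c is NaN, and 2 under red because column b is NaN.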
Grouping with Functions
people.groupby(len).sum()
 | a | b | c | d | e |
---|---|---|---|---|---|
3 | -1.072920 | 4.236937 | 1.144193 | 0.836558 | 1.214587 |
5 | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
6 | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
 | | a | b | c | d | e |
---|---|---|---|---|---|---|
3 | one | -0.766580 | 1.658107 | 1.146959 | -0.123471 | 0.159285 |
 | two | -0.615666 | 2.578830 | -0.002766 | 0.885567 | -0.375239 |
5 | one | 1.380735 | -0.703245 | 0.158134 | -1.602958 | 1.455772 |
6 | two | -0.033534 | 1.158113 | 0.637327 | 1.473547 | 0.373215 |
Grouping by Index Levels
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5),columns=columns)
hier_df
cty | US | | | JP | |
---|---|---|---|---|---|
tenor | 1 | 3 | 5 | 1 | 3 |
0 | 0.971689 | -0.207027 | 0.641528 | 1.197729 | -0.800907 |
1 | 0.906871 | -0.087288 | 0.204273 | -0.009374 | 0.637842 |
2 | 0.649755 | -0.800055 | -0.057130 | -1.087200 | 0.435762 |
3 | -0.618737 | 0.325816 | -0.702310 | -0.519860 | -0.101653 |
hier_df.groupby(level='cty',axis=1).count()
cty | JP | US |
---|---|---|
0 | 2 | 3 |
1 | 2 | 3 |
2 | 2 | 3 |
3 | 2 | 3 |
Data Aggregation
grouped = df.groupby('key1')
# If there is no value exactly at the requested percentile, quantile performs linear interpolation
grouped['data1'].quantile(0.9)
key1
a 0.278064
b 1.128848
Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
FutureWarning: ['key2'] did not aggregate successfully; the non-numeric key2 column is dropped from the result (select the numeric columns explicitly to avoid this warning).
 | data1 | data2 |
---|---|---|
key1 | | |
a | 0.421996 | 0.975541 |
b | 0.810090 | 0.329799 |
grouped.describe()
 | data1 | | | | | | | | data2 | | | | | | | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
 | count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max |
key1 | | | | | | | | | | | | | | | | |
a | 3.0 | 0.090859 | 0.22555 | -0.074122 | -0.037649 | -0.001175 | 0.173349 | 0.347874 | 3.0 | -0.395061 | 0.511126 | -0.794645 | -0.683039 | -0.571432 | -0.195268 | 0.180895 |
b | 2.0 | 0.804812 | 0.57282 | 0.399766 | 0.602289 | 0.804812 | 1.007334 | 1.209857 | 2.0 | -0.431157 | 0.233203 | -0.596056 | -0.513606 | -0.431157 | -0.348707 | -0.266257 |
# Side note: display the book's table of optimized GroupBy methods (image)
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('经过优化的GroupBy的方法.png')
imshow(img)
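In case the image does not display: per the book's table, the optimized GroupBy aggregation methods are count, sum, mean, median, std, var, min, max, prod, first and last.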
tips = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\tips.csv")
# Add a tip_pct column: the tip as a percentage of the total bill
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | No | Sun | Dinner | 4 | 0.146808 |
5 | 25.29 | 4.71 | No | Sun | Dinner | 4 | 0.186240 |
Column-Wise and Multiple Function Application
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
 | | mean | std | peak_to_peak |
---|---|---|---|---|
day | smoker | | | |
Fri | No | 0.151650 | 0.028123 | 0.067349 |
 | Yes | 0.174783 | 0.051293 | 0.159925 |
Sat | No | 0.158048 | 0.039767 | 0.235193 |
 | Yes | 0.147906 | 0.061375 | 0.290095 |
Sun | No | 0.160113 | 0.042347 | 0.193226 |
 | Yes | 0.187250 | 0.154134 | 0.644685 |
Thur | No | 0.160298 | 0.038774 | 0.193350 |
 | Yes | 0.163863 | 0.039389 | 0.151240 |
# With a list of (name, function) tuples, the first element of each tuple is used as the column name
grouped_pct.agg([('foo','mean'),('bar',np.std)])
 | | foo | bar |
---|---|---|---|
day | smoker | | |
Fri | No | 0.151650 | 0.028123 |
 | Yes | 0.174783 | 0.051293 |
Sat | No | 0.158048 | 0.039767 |
 | Yes | 0.147906 | 0.061375 |
Sun | No | 0.160113 | 0.042347 |
 | Yes | 0.187250 | 0.154134 |
Thur | No | 0.160298 | 0.038774 |
 | Yes | 0.163863 | 0.039389 |
functions = ['count','mean','max']
result = grouped[['tip_pct','total_bill']].agg(functions) # pass a list of columns; grouped['tip_pct','total_bill'] raises a FutureWarning about tuple indexing
result
 | | tip_pct | | | total_bill | | |
---|---|---|---|---|---|---|---|
 | | count | mean | max | count | mean | max |
day | smoker | | | | | | |
Fri | No | 4 | 0.151650 | 0.187735 | 4 | 18.420000 | 22.75 |
 | Yes | 15 | 0.174783 | 0.263480 | 15 | 16.813333 | 40.17 |
Sat | No | 45 | 0.158048 | 0.291990 | 45 | 19.661778 | 48.33 |
 | Yes | 42 | 0.147906 | 0.325733 | 42 | 21.276667 | 50.81 |
Sun | No | 57 | 0.160113 | 0.252672 | 57 | 20.506667 | 48.17 |
 | Yes | 19 | 0.187250 | 0.710345 | 19 | 24.120000 | 45.35 |
Thur | No | 45 | 0.160298 | 0.266312 | 45 | 17.113111 | 41.19 |
 | Yes | 17 | 0.163863 | 0.241255 | 17 | 19.190588 | 43.11 |
result['tip_pct']
 | | count | mean | max |
---|---|---|---|---|
day | smoker | | | |
Fri | No | 4 | 0.151650 | 0.187735 |
 | Yes | 15 | 0.174783 | 0.263480 |
Sat | No | 45 | 0.158048 | 0.291990 |
 | Yes | 42 | 0.147906 | 0.325733 |
Sun | No | 57 | 0.160113 | 0.252672 |
 | Yes | 19 | 0.187250 | 0.710345 |
Thur | No | 45 | 0.160298 | 0.266312 |
 | Yes | 17 | 0.163863 | 0.241255 |
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
grouped[['tip_pct','total_bill']].agg(ftuples)
 | | tip_pct | | total_bill | |
---|---|---|---|---|---|
 | | Durchschnitt | Abweichung | Durchschnitt | Abweichung |
day | smoker | | | | |
Fri | No | 0.151650 | 0.000791 | 18.420000 | 25.596333 |
 | Yes | 0.174783 | 0.002631 | 16.813333 | 82.562438 |
Sat | No | 0.158048 | 0.001581 | 19.661778 | 79.908965 |
 | Yes | 0.147906 | 0.003767 | 21.276667 | 101.387535 |
Sun | No | 0.160113 | 0.001793 | 20.506667 | 66.099980 |
 | Yes | 0.187250 | 0.023757 | 24.120000 | 109.046044 |
Thur | No | 0.160298 | 0.001503 | 17.113111 | 59.625081 |
 | Yes | 0.163863 | 0.001551 | 19.190588 | 69.808518 |
# Apply different functions to different columns
grouped.agg({'tip':np.max,'size':'sum'})
 | | tip | size |
---|---|---|---|
day | smoker | | |
Fri | No | 3.50 | 9 |
 | Yes | 4.73 | 31 |
Sat | No | 9.00 | 115 |
 | Yes | 10.00 | 104 |
Sun | No | 6.00 | 167 |
 | Yes | 6.50 | 49 |
Thur | No | 6.70 | 112 |
 | Yes | 5.00 | 40 |
grouped.agg({'tip_pct':['min','max','mean','std'],
'size':'sum'})
 | | tip_pct | | | | size |
---|---|---|---|---|---|---|
 | | min | max | mean | std | sum |
day | smoker | | | | | |
Fri | No | 0.120385 | 0.187735 | 0.151650 | 0.028123 | 9 |
 | Yes | 0.103555 | 0.263480 | 0.174783 | 0.051293 | 31 |
Sat | No | 0.056797 | 0.291990 | 0.158048 | 0.039767 | 115 |
 | Yes | 0.035638 | 0.325733 | 0.147906 | 0.061375 | 104 |
Sun | No | 0.059447 | 0.252672 | 0.160113 | 0.042347 | 167 |
 | Yes | 0.065660 | 0.710345 | 0.187250 | 0.154134 | 49 |
Thur | No | 0.072961 | 0.266312 | 0.160298 | 0.038774 | 112 |
 | Yes | 0.090014 | 0.241255 | 0.163863 | 0.039389 | 40 |
Returning Aggregated Data Without Row Indexes
tips.groupby(['day','smoker'],as_index=False).mean()
 | day | smoker | total_bill | tip | size | tip_pct |
---|---|---|---|---|---|---|
0 | Fri | No | 18.420000 | 2.812500 | 2.250000 | 0.151650 |
1 | Fri | Yes | 16.813333 | 2.714000 | 2.066667 | 0.174783 |
2 | Sat | No | 19.661778 | 3.102889 | 2.555556 | 0.158048 |
3 | Sat | Yes | 21.276667 | 2.875476 | 2.476190 | 0.147906 |
4 | Sun | No | 20.506667 | 3.167895 | 2.929825 | 0.160113 |
5 | Sun | Yes | 24.120000 | 3.516842 | 2.578947 | 0.187250 |
6 | Thur | No | 17.113111 | 2.673778 | 2.488889 | 0.160298 |
7 | Thur | Yes | 19.190588 | 3.030000 | 2.352941 | 0.163863 |
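The same frame can be produced by aggregating with the default hierarchical index and then resetting it; a quick sketch:
tips.groupby(['day','smoker']).mean().reset_index() # equivalent to as_index=False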
Group-Wise Operations and Transformations
# Add columns containing each row's group mean
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
 | mean_data1 | mean_data2 |
---|---|---|
key1 | | |
a | 0.090859 | -0.395061 |
b | 0.804812 | -0.431157 |
pd.merge(df,k1_means,left_on='key1',right_index=True)
 | key1 | key2 | data1 | data2 | mean_data1 | mean_data2 |
---|---|---|---|---|---|---|
0 | a | one | -0.074122 | -0.571432 | 0.090859 | -0.395061 |
1 | a | two | 0.347874 | -0.794645 | 0.090859 | -0.395061 |
4 | a | one | -0.001175 | 0.180895 | 0.090859 | -0.395061 |
2 | b | one | 0.399766 | -0.596056 | 0.804812 | -0.431157 |
3 | b | two | 1.209857 | -0.266257 | 0.804812 | -0.431157 |
key = ['one','two','one','two','one']
people.groupby(key).mean()
 | a | b | c | d | e |
---|---|---|---|---|---|
one | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
two | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
# transform applies a function to each group, then broadcasts the result to every row of the group
people.groupby(key).transform(np.mean)
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
Steve | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
Wes | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
Jim | 0.382534 | 0.937792 | 0.077684 | -0.358695 | 0.540267 |
Travis | -0.163596 | 1.408110 | 0.892143 | 0.474846 | 0.654347 |
# Subtract the group mean from each value
def demean(arr):
    return arr-arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned
 | a | b | c | d | e |
---|---|---|---|---|---|
Joe | 0.472923 | 0.249997 | 0.254816 | -0.598317 | -0.495062 |
Steve | 0.998201 | -1.641038 | 0.080450 | -1.244262 | 0.915506 |
Wes | -0.602985 | NaN | NaN | -0.400384 | 0.776194 |
Jim | -0.998201 | 1.641038 | -0.080450 | 1.244262 | -0.915506 |
Travis | 0.130062 | -0.249997 | -0.254816 | 0.998701 | -0.281132 |
demeaned.groupby(key).mean()
 | a | b | c | d | e |
---|---|---|---|---|---|
one | 2.775558e-17 | 0.000000e+00 | -5.551115e-17 | 7.401487e-17 | -1.110223e-16 |
two | 0.000000e+00 | 1.110223e-16 | -6.938894e-18 | 0.000000e+00 | -5.551115e-17 |
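The group means of the demeaned data are now zero, up to floating-point rounding error.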
apply: General split-apply-combine
# Select the rows with the largest values in a given column
# (the book's sort_index() has been replaced with sort_values() here)
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips,n=6)
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby('smoker').apply(top)
 | | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|---|
smoker | | | | | | | | |
No | 88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
 | 185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 |
 | 51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 |
 | 149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 |
 | 232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
Yes | 109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
 | 183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
 | 67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
 | 178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
 | 172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
 | | | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|---|---|
smoker | day | | | | | | | | |
No | Fri | 94 | 22.75 | 3.25 | No | Fri | Dinner | 2 | 0.142857 |
 | Sat | 212 | 48.33 | 9.00 | No | Sat | Dinner | 4 | 0.186220 |
 | Sun | 156 | 48.17 | 5.00 | No | Sun | Dinner | 6 | 0.103799 |
 | Thur | 142 | 41.19 | 5.00 | No | Thur | Lunch | 5 | 0.121389 |
Yes | Fri | 95 | 40.17 | 4.73 | Yes | Fri | Dinner | 4 | 0.117750 |
 | Sat | 170 | 50.81 | 10.00 | Yes | Sat | Dinner | 3 | 0.196812 |
 | Sun | 182 | 45.35 | 3.50 | Yes | Sun | Dinner | 3 | 0.077178 |
 | Thur | 197 | 43.11 | 5.00 | Yes | Thur | Lunch | 4 | 0.115982 |
result = tips.groupby('smoker')['tip_pct'].describe()
result
 | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
smoker | | | | | | | | |
No | 151.0 | 0.159328 | 0.039910 | 0.056797 | 0.136906 | 0.155625 | 0.185014 | 0.291990 |
Yes | 93.0 | 0.163196 | 0.085119 | 0.035638 | 0.106771 | 0.153846 | 0.195059 | 0.710345 |
result.unstack('smoker')
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
Calling a method like describe on a GroupBy object is really just a shortcut for the following two lines:
f = lambda x: x.describe()
grouped.apply(f)
Suppressing the Group Keys
tips.groupby('smoker',group_keys=False).apply(top)
 | total_bill | tip | smoker | day | time | size | tip_pct |
---|---|---|---|---|---|---|---|
88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 |
51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 |
149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
Quantile and Bucket Analysis
frame = DataFrame({'data1':np.random.randn(1000),
'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)
factor[:10]
0 (-1.448, 0.107]
1 (-1.448, 0.107]
2 (-1.448, 0.107]
3 (-1.448, 0.107]
4 (0.107, 1.663]
5 (0.107, 1.663]
6 (0.107, 1.663]
7 (-1.448, 0.107]
8 (-1.448, 0.107]
9 (0.107, 1.663]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.01, -1.448] < (-1.448, 0.107] < (0.107, 1.663] < (1.663, 3.218]]
def get_stats(group):
    return {'min':group.min(),'max':group.max(),
            'count':group.count(),'mean':group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
# cut produces buckets of equal length
 | min | max | count | mean |
---|---|---|---|---|
data1 | | | | |
(-3.01, -1.448] | -2.614910 | 2.368046 | 70.0 | -0.092146 |
(-1.448, 0.107] | -2.534962 | 2.783160 | 479.0 | 0.009041 |
(0.107, 1.663] | -3.073771 | 2.513553 | 398.0 | -0.091291 |
(1.663, 3.218] | -2.699080 | 2.373634 | 53.0 | -0.099021 |
# For buckets containing equal numbers of data points, use qcut
# labels=False returns just the quantile numbers
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
 | min | max | count | mean |
---|---|---|---|---|
data1 | | | | |
0 | -2.614910 | 2.783160 | 100.0 | 0.006906 |
1 | -2.534962 | 2.490249 | 100.0 | -0.101695 |
2 | -2.015862 | 2.261854 | 100.0 | 0.084059 |
3 | -2.250966 | 2.509572 | 100.0 | -0.000924 |
4 | -2.068747 | 2.425219 | 100.0 | 0.119523 |
5 | -2.913492 | 2.032037 | 100.0 | -0.233505 |
6 | -2.432055 | 1.983781 | 100.0 | -0.038541 |
7 | -2.339164 | 2.046824 | 100.0 | -0.096358 |
8 | -3.073771 | 2.235941 | 100.0 | -0.091584 |
9 | -2.699080 | 2.513553 | 100.0 | -0.084895 |
Example: Filling Missing Values with Group-Specific Values
# Fill NA values with the mean
s = Series(np.random.randn(6))
s[::2] = np.nan
s
0 NaN
1 0.209858
2 NaN
3 1.379023
4 NaN
5 -0.743300
dtype: float64
s.fillna(s.mean())
0 0.281860
1 0.209858
2 0.281860
3 1.379023
4 0.281860
5 -0.743300
dtype: float64
states = ['Ohio','New York','Vermont','Florida',
'Oregon','Nevada','California','Idaho']
group_key = ['East']*4+['West']*4
data = Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']] = np.nan
data
Ohio 0.155978
New York -0.133767
Vermont NaN
Florida -0.765162
Oregon 0.682524
Nevada NaN
California 0.730390
Idaho NaN
dtype: float64
data.groupby(group_key).mean()
East -0.247650
West 0.706457
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio 0.155978
New York -0.133767
Vermont -0.247650
Florida -0.765162
Oregon 0.682524
Nevada 0.706457
California 0.730390
Idaho 0.706457
dtype: float64
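A sketch of an alternative that avoids apply: transform('mean') broadcasts each group's mean back onto the original index, and fillna then only touches the NA slots.
data.fillna(data.groupby(group_key).transform('mean')) # same result as fill_mean above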
fill_values = {'East':0.5,'West':-1}
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Ohio 0.155978
New York -0.133767
Vermont 0.500000
Florida -0.765162
Oregon 0.682524
Nevada -1.000000
California 0.730390
Idaho -1.000000
dtype: float64
Example: Random Sampling and Permutation
One way to draw a random sample without replacement: take the first K elements of np.random.permutation(N), where N is the size of the full dataset and K is the desired sample size.
# Hearts (H), Spades (S), Clubs (C), Diamonds (D)
suits = ['H','S','C','D']
# In Python 2, range() returned a list, so two ranges could be added directly, e.g. range(5)+range(10); in Python 3, range() is a class, hence the list(...) conversion
card_val = (list(range(1,11))+ [10] * 3) * 4
base_names = ['A'] + list(range(2,11)) + ['J','K','Q']
cards = []
for suit in ['H','S','C','D']:
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val,index=cards)
deck[:13]
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
7H 7
4D 4
8H 8
QC 10
4S 4
dtype: int64
# Draw two random cards from each suit
get_suit = lambda card:card[-1] # the last character is the suit
deck.groupby(get_suit).apply(draw,n=2)
C AC 1
JC 10
D 5D 5
8D 8
H 10H 10
JH 10
S 9S 9
5S 5
dtype: int64
# An alternative that omits the suit keys from the result index
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
10C 10
AC 1
KD 10
10D 10
3H 3
9H 9
5S 5
8S 8
dtype: int64
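A sketch of the same draws using the built-in sample method available in modern pandas, which handles the permutation internally:
deck.sample(5) # five random cards
deck.groupby(get_suit,group_keys=False).apply(lambda s: s.sample(2)) # two per suit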
Example: Group Weighted Average and Correlation
df = DataFrame({'category':['a','a','a','a','b','b','b','b'],
'data':np.random.randn(8),
'weights':np.random.randn(8)}) # note: the book draws weights with np.random.rand, so they are non-negative
df
 | category | data | weights |
---|---|---|---|
0 | a | 0.591317 | -1.032939 |
1 | a | -0.589692 | 0.436704 |
2 | a | -0.128848 | 2.257153 |
3 | a | -0.774626 | 0.811910 |
4 | b | -2.050679 | 1.144802 |
5 | b | 1.216111 | 0.736471 |
6 | b | -0.801366 | 0.139008 |
7 | b | -1.577430 | -0.576198 |
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a -0.723088
b -0.453212
dtype: float64
close_px = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv",parse_dates=True,index_col=0) # raw string keeps the Windows backslashes from being read as escape sequences
close_px
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
1990-02-01 | 4.98 | 7.86 | 2.87 | 16.79 | 4.27 | 0.51 | 6.04 | 328.79 | 6.12 |
1990-02-02 | 5.04 | 8.00 | 2.87 | 16.89 | 4.37 | 0.51 | 6.09 | 330.92 | 6.24 |
1990-02-05 | 5.07 | 8.18 | 2.87 | 17.32 | 4.34 | 0.51 | 6.05 | 331.85 | 6.25 |
1990-02-06 | 5.01 | 8.12 | 2.88 | 17.56 | 4.32 | 0.51 | 6.15 | 329.66 | 6.23 |
1990-02-07 | 5.04 | 7.77 | 2.91 | 17.93 | 4.38 | 0.51 | 6.17 | 333.75 | 6.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2011-10-10 | 10.09 | 388.81 | 16.14 | 186.62 | 64.43 | 26.94 | 61.87 | 1194.89 | 76.28 |
2011-10-11 | 10.30 | 400.29 | 16.14 | 185.00 | 63.96 | 27.00 | 60.95 | 1195.54 | 76.27 |
2011-10-12 | 10.05 | 402.19 | 16.40 | 186.12 | 64.33 | 26.96 | 62.70 | 1207.25 | 77.16 |
2011-10-13 | 10.10 | 408.43 | 16.22 | 186.82 | 64.23 | 27.18 | 62.36 | 1203.66 | 76.37 |
2011-10-14 | 10.26 | 422.00 | 16.60 | 190.53 | 64.72 | 27.27 | 62.24 | 1224.58 | 78.11 |
5472 rows × 9 columns
close_px[-4:]
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
2011-10-11 | 10.30 | 400.29 | 16.14 | 185.00 | 63.96 | 27.00 | 60.95 | 1195.54 | 76.27 |
2011-10-12 | 10.05 | 402.19 | 16.40 | 186.12 | 64.33 | 26.96 | 62.70 | 1207.25 | 77.16 |
2011-10-13 | 10.10 | 408.43 | 16.22 | 186.82 | 64.23 | 27.18 | 62.36 | 1203.66 | 76.37 |
2011-10-14 | 10.26 | 422.00 | 16.60 | 190.53 | 64.72 | 27.27 | 62.24 | 1224.58 | 78.11 |
# Compute the yearly correlation of daily returns with SPX
rets = close_px.pct_change().dropna()
spx_corr = lambda x:x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
 | AA | AAPL | GE | IBM | JNJ | MSFT | PEP | SPX | XOM |
---|---|---|---|---|---|---|---|---|---|
1990 | 0.595024 | 0.545067 | 0.752187 | 0.738361 | 0.801145 | 0.586691 | 0.783168 | 1.0 | 0.517586 |
1991 | 0.453574 | 0.365315 | 0.759607 | 0.557046 | 0.646401 | 0.524225 | 0.641775 | 1.0 | 0.569335 |
1992 | 0.398180 | 0.498732 | 0.632685 | 0.262232 | 0.515740 | 0.492345 | 0.473871 | 1.0 | 0.318408 |
1993 | 0.259069 | 0.238578 | 0.447257 | 0.211269 | 0.451503 | 0.425377 | 0.385089 | 1.0 | 0.318952 |
1994 | 0.428549 | 0.268420 | 0.572996 | 0.385162 | 0.372962 | 0.436585 | 0.450516 | 1.0 | 0.395078 |
1995 | 0.291532 | 0.161829 | 0.519126 | 0.416390 | 0.315733 | 0.453660 | 0.413144 | 1.0 | 0.368752 |
1996 | 0.292344 | 0.191482 | 0.750724 | 0.388497 | 0.569232 | 0.564015 | 0.421477 | 1.0 | 0.538736 |
1997 | 0.564427 | 0.211435 | 0.827512 | 0.646823 | 0.703538 | 0.606171 | 0.509344 | 1.0 | 0.695653 |
1998 | 0.533802 | 0.379883 | 0.815243 | 0.623982 | 0.591988 | 0.698773 | 0.494213 | 1.0 | 0.369264 |
1999 | 0.099033 | 0.425584 | 0.710928 | 0.486167 | 0.517061 | 0.631315 | 0.336593 | 1.0 | 0.315383 |
2000 | 0.265359 | 0.440161 | 0.610362 | 0.445114 | 0.189765 | 0.538005 | 0.077525 | 1.0 | 0.084163 |
2001 | 0.624069 | 0.577152 | 0.794632 | 0.696038 | 0.111493 | 0.696447 | 0.133975 | 1.0 | 0.336869 |
2002 | 0.748021 | 0.580548 | 0.822373 | 0.716490 | 0.584758 | 0.784728 | 0.487211 | 1.0 | 0.759933 |
2003 | 0.690466 | 0.545582 | 0.777643 | 0.741775 | 0.562399 | 0.750534 | 0.541487 | 1.0 | 0.662775 |
2004 | 0.591485 | 0.374283 | 0.728626 | 0.601740 | 0.354690 | 0.588531 | 0.466854 | 1.0 | 0.557742 |
2005 | 0.564267 | 0.467540 | 0.675637 | 0.516846 | 0.444728 | 0.562374 | 0.489559 | 1.0 | 0.631010 |
2006 | 0.487638 | 0.428267 | 0.612388 | 0.598636 | 0.394026 | 0.406126 | 0.335054 | 1.0 | 0.518514 |
2007 | 0.642427 | 0.508118 | 0.796945 | 0.603906 | 0.568423 | 0.658770 | 0.651911 | 1.0 | 0.786264 |
2008 | 0.781057 | 0.681434 | 0.777337 | 0.833074 | 0.801005 | 0.804626 | 0.709264 | 1.0 | 0.828303 |
2009 | 0.735642 | 0.707103 | 0.713086 | 0.684513 | 0.603146 | 0.654902 | 0.541474 | 1.0 | 0.797921 |
2010 | 0.745700 | 0.710105 | 0.822285 | 0.783638 | 0.689896 | 0.730118 | 0.626655 | 1.0 | 0.839057 |
2011 | 0.882045 | 0.691931 | 0.864595 | 0.802730 | 0.752379 | 0.800996 | 0.592029 | 1.0 | 0.859975 |
# Yearly correlation between Apple and Microsoft
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
1990 0.408271
1991 0.266807
1992 0.450592
1993 0.236917
1994 0.361638
1995 0.258642
1996 0.147539
1997 0.196144
1998 0.364106
1999 0.329484
2000 0.275298
2001 0.563156
2002 0.571435
2003 0.486262
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
Example: Group-Wise Linear Regression
import statsmodels.api as sm
def regress(data,yvar,xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(Y,X).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])
 | SPX | intercept |
---|---|---|
1990 | 1.512772 | 0.001395 |
1991 | 1.187351 | 0.000396 |
1992 | 1.832427 | 0.000164 |
1993 | 1.390470 | -0.002657 |
1994 | 1.190277 | 0.001617 |
1995 | 0.858818 | -0.001423 |
1996 | 0.829389 | -0.001791 |
1997 | 0.749928 | -0.001901 |
1998 | 1.164582 | 0.004075 |
1999 | 1.384989 | 0.003273 |
2000 | 1.733802 | -0.002523 |
2001 | 1.676128 | 0.003122 |
2002 | 1.080795 | -0.000219 |
2003 | 1.187770 | 0.000690 |
2004 | 1.363463 | 0.004201 |
2005 | 1.766415 | 0.003246 |
2006 | 1.645496 | 0.000080 |
2007 | 1.198761 | 0.003438 |
2008 | 0.968016 | -0.001110 |
2009 | 0.879103 | 0.002954 |
2010 | 1.052608 | 0.001261 |
2011 | 0.806605 | 0.001514 |
Pivot Tables and Cross-Tabulation
tips.pivot_table(index=['day','smoker'])
 | | size | tip | tip_pct | total_bill |
---|---|---|---|---|---|
day | smoker | | | | |
Fri | No | 2.250000 | 2.812500 | 0.151650 | 18.420000 |
 | Yes | 2.066667 | 2.714000 | 0.174783 | 16.813333 |
Sat | No | 2.555556 | 3.102889 | 0.158048 | 19.661778 |
 | Yes | 2.476190 | 2.875476 | 0.147906 | 21.276667 |
Sun | No | 2.929825 | 3.167895 | 0.160113 | 20.506667 |
 | Yes | 2.578947 | 3.516842 | 0.187250 | 24.120000 |
Thur | No | 2.488889 | 2.673778 | 0.160298 | 17.113111 |
 | Yes | 2.352941 | 3.030000 | 0.163863 | 19.190588 |
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')
 | | size | | tip_pct | |
---|---|---|---|---|---|
smoker | | No | Yes | No | Yes |
time | day | | | | |
Dinner | Fri | 2.000000 | 2.222222 | 0.139622 | 0.165347 |
 | Sat | 2.555556 | 2.476190 | 0.158048 | 0.147906 |
 | Sun | 2.929825 | 2.578947 | 0.160113 | 0.187250 |
 | Thur | 2.000000 | NaN | 0.159744 | NaN |
Lunch | Fri | 3.000000 | 1.833333 | 0.187735 | 0.188937 |
 | Thur | 2.500000 | 2.352941 | 0.160311 | 0.163863 |
tips.pivot_table(['tip_pct','size'],index=['time','day'],
columns='smoker',margins=True)
 | | size | | | tip_pct | | |
---|---|---|---|---|---|---|---|
smoker | | No | Yes | All | No | Yes | All |
time | day | | | | | | |
Dinner | Fri | 2.000000 | 2.222222 | 2.166667 | 0.139622 | 0.165347 | 0.158916 |
 | Sat | 2.555556 | 2.476190 | 2.517241 | 0.158048 | 0.147906 | 0.153152 |
 | Sun | 2.929825 | 2.578947 | 2.842105 | 0.160113 | 0.187250 | 0.166897 |
 | Thur | 2.000000 | NaN | 2.000000 | 0.159744 | NaN | 0.159744 |
Lunch | Fri | 3.000000 | 1.833333 | 2.000000 | 0.187735 | 0.188937 | 0.188765 |
 | Thur | 2.500000 | 2.352941 | 2.459016 | 0.160311 | 0.163863 | 0.161301 |
All | | 2.668874 | 2.408602 | 2.569672 | 0.159328 | 0.163196 | 0.160803 |
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',
aggfunc=len,margins=True)
day | | Fri | Sat | Sun | Thur | All |
---|---|---|---|---|---|---|
time | smoker | | | | | |
Dinner | No | 3.0 | 45.0 | 57.0 | 1.0 | 106 |
 | Yes | 9.0 | 42.0 | 19.0 | NaN | 70 |
Lunch | No | 1.0 | NaN | NaN | 44.0 | 45 |
 | Yes | 6.0 | NaN | NaN | 17.0 | 23 |
All | | 19.0 | 87.0 | 76.0 | 62.0 | 244 |
tips.pivot_table('size',index=['time','smoker'],
columns='day',aggfunc='sum',fill_value=0)
day | | Fri | Sat | Sun | Thur |
---|---|---|---|---|---|
time | smoker | | | | |
Dinner | No | 6 | 115 | 167 | 2 |
 | Yes | 20 | 104 | 49 | 0 |
Lunch | No | 3 | 0 | 0 | 110 |
 | Yes | 11 | 0 | 0 | 40 |
# Side note: display the book's table of pivot_table parameters (image)
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('pivot_table的参数.png')
imshow(img)
In current pandas, the rows argument has been renamed to index and cols to columns.
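For reference, the parameters listed in that figure are values (the column or columns to aggregate), index (row grouping keys), columns (column grouping keys), aggfunc (the aggregation function, mean by default), fill_value (replacement for NAs in the result), dropna and margins (add "All" subtotals).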
Cross-Tabulations: crosstab
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
smoker | | No | Yes | All |
---|---|---|---|---|
time | day | | | |
Dinner | Fri | 3 | 9 | 12 |
 | Sat | 45 | 42 | 87 |
 | Sun | 57 | 19 | 76 |
 | Thur | 1 | 0 | 1 |
Lunch | Fri | 1 | 6 | 7 |
 | Thur | 44 | 17 | 61 |
All | | 151 | 93 | 244 |
Example: 2012 Federal Election Commission Database
fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
fec
DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False. (Column 6 is contbr_zip.)
 | cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1001726 | C00500587 | P20003281 | Perry, Rick | GORMAN, CHRIS D. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 5000.0 | 29-SEP-11 | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | NaN | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | SA17A | 751678 |
1001727 | C00500587 | P20003281 | Perry, Rick | DUFFY, DAVID A. MR. | INFO REQUESTED | XX | 99999 | DUFFY EQUIPMENT COMPANY INC. | BUSINESS OWNER | 2500.0 | 30-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001728 | C00500587 | P20003281 | Perry, Rick | GRANE, BRYAN F. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 500.0 | 29-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001729 | C00500587 | P20003281 | Perry, Rick | TOLBERT, DARYL MR. | INFO REQUESTED | XX | 99999 | T.A.C.C. | LONGWALL MAINTENANCE FOREMAN | 500.0 | 30-SEP-11 | NaN | NaN | NaN | SA17A | 751678 |
1001730 | C00500587 | P20003281 | Perry, Rick | ANDERSON, MARILEE MRS. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 2500.0 | 31-AUG-11 | NaN | NaN | NaN | SA17A | 751678 |
1001731 rows × 16 columns
fec.loc[123456]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50.0
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties = {'Bachmann, Michelle':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'Huntsman, Jon':'Republican',
'Johnson, Gary Earl':'Republican',
'McCotter, Thaddeus G':'Republican',
'Obama, Barack':'Democrat',
'Paul, Ron':'Republican',
'Pawlenty, Timothy':'Republican',
'Perry, Rick':'Republican',
"Roemer, Charles E. 'Buddy' Ⅲ":'Republican',
'Romney, Mitt':'Republican',
'Santorum, Rick':'Republican'}
fec.cand_nm[123456:123461]
123456 Obama, Barack
123457 Obama, Barack
123458 Obama, Barack
123459 Obama, Barack
123460 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
Name: cand_nm, dtype: object
# Add a column with the party derived from the candidate name
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat 589127
Republican 396504
Name: party, dtype: int64
(fec.contb_receipt_amt>0).value_counts()
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt>0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
Donation Statistics by Occupation and Employer
fec.contbr_occupation.value_counts()[:10]
RETIRED 233990
NOT PROVIDED 56245
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
NOT EMPLOYED 9828
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATIO REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'}
# If no mapping is provided for x, return x unchanged
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1)>2000000]
over_2mm
party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
ATTORNEY | 11141982.97 | 7462058.31 |
C.E.O. | 1690.00 | 2592983.11 |
CEO | 2074284.79 | 1638668.41 |
CONSULTANT | 2459912.71 | 2538990.45 |
ENGINEER | 951525.55 | 1811937.30 |
EXECUTIVE | 1355161.05 | 4136400.09 |
HOMEMAKER | 4248875.80 | 13625600.78 |
INVESTOR | 884133.00 | 2431258.92 |
LAWYER | 3160478.87 | 391124.32 |
MANAGER | 762883.22 | 1441092.37 |
NOT PROVIDED | 4866973.96 | 20216287.01 |
OWNER | 1001567.36 | 2406081.92 |
PHYSICIAN | 3735124.94 | 3587195.24 |
PRESIDENT | 1878509.95 | 4717413.76 |
PROFESSOR | 2165071.08 | 294032.73 |
REAL ESTATE | 528902.09 | 1624507.25 |
RETIRED | 25305116.38 | 23481023.18 |
SELF-EMPLOYED | 672393.40 | 1636774.54 |
over_2mm.plot(kind='barh')
def get_top_amounts(group,key,n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Sort totals in descending order; as written, [n:] returns everything except the top n (use [:n] to keep only the top n)
    return totals.sort_values(ascending=False)[n:]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm contbr_occupation
Obama, Barack PROFESSOR 2165071.08
CEO 2074284.79
PRESIDENT 1878509.95
NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
...
Romney, Mitt INDEPENDENT PROFESSIONAL 3.00
IFC CONTRACTING SOLUTIONS 3.00
REMODELER & SEMI RETIRED 3.00
AFFORDABLE REAL ESTATE DEVELOPER 3.00
3RD GENERATION FAMILY BUSINESS OWNER 3.00
Name: contb_receipt_amt, Length: 35973, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm contbr_employer
Obama, Barack REFUSED 149516.07
DLA PIPER 148235.00
HARVARD UNIVERSITY 131368.94
IBM 128490.93
GOOGLE 125302.88
...
Romney, Mitt UN 3.00
UPTOWN CHEAPSKATE 3.00
WILL MERRIFIELD 3.00
INDEPENDENT PROFESSIONAL 3.00
HONOLD COMMUNICTAIONS 3.00
Name: contb_receipt_amt, Length: 95890, dtype: float64
Bucketing Donation Amounts
bins = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 493 | 77 |
(1, 10] | 40070 | 3681 |
(10, 100] | 372280 | 31853 |
(100, 1000] | 153991 | 43357 |
(1000, 10000] | 22284 | 26186 |
(10000, 100000] | 2 | 1 |
(100000, 1000000] | 3 | 0 |
(1000000, 10000000] | 4 | 0 |
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 318.24 | 77.00 |
(1, 10] | 337267.62 | 29819.66 |
(10, 100] | 20288981.41 | 1987783.76 |
(100, 1000] | 54798531.46 | 22363381.69 |
(1000, 10000] | 51753705.67 | 63942145.42 |
(10000, 100000] | 59100.00 | 12700.00 |
(100000, 1000000] | 1490683.08 | 0.00 |
(1000000, 10000000] | 7148839.76 | 0.00 |
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 0.805182 | 0.194818 |
(1, 10] | 0.918767 | 0.081233 |
(10, 100] | 0.910769 | 0.089231 |
(100, 1000] | 0.710176 | 0.289824 |
(1000, 10000] | 0.447326 | 0.552674 |
(10000, 100000] | 0.823120 | 0.176880 |
(100000, 1000000] | 1.000000 | 0.000000 |
(1000000, 10000000] | 1.000000 | 0.000000 |
normed_sums[:2].plot(kind='barh',stacked=True)
Donation Statistics by State
grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1)>100000]
totals[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
AR | 359247.28 | 105556.00 |
AZ | 1506476.98 | 1888436.23 |
CA | 23824984.24 | 11237636.60 |
CO | 2132429.49 | 1506714.12 |
CT | 2068291.26 | 3499475.45 |
DC | 4373538.80 | 1025137.50 |
DE | 336669.14 | 82712.00 |
FL | 7318178.58 | 8338458.81 |
percent = totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 0.765778 | 0.234222 |
AL | 0.507390 | 0.492610 |
AR | 0.772902 | 0.227098 |
AZ | 0.443745 | 0.556255 |
CA | 0.679498 | 0.320502 |
CO | 0.585970 | 0.414030 |
CT | 0.371476 | 0.628524 |
DC | 0.810113 | 0.189887 |
DE | 0.802776 | 0.197224 |
FL | 0.467417 | 0.532583 |
from mpl_toolkits.basemap import Basemap, cm
import numpy as np
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
#from shapelib import ShapeFile
import pyshp
import dbflib
ModuleNotFoundError: No module named 'pyshp'

The import fails because the pyshp package installs under the module name shapefile (use import shapefile); shapelib and dbflib are Python 2-era bindings with no Python 3 release, so this Basemap example cannot run as-is.