GroupBy机制
import numpy as np
import pandas as pd
df = pd. DataFrame( { 'key1' : [ 'a' , 'a' , 'd' , 'd' , 'a' ] ,
'key2' : [ 'one' , 'two' , 'one' , 'two' , 'one' ] ,
'data1' : np. random. randn( 5 ) ,
'data2' : np. random. randn( 5 ) } )
df
data1 data2 key1 key2 0 0.398171 0.618838 a one 1 1.406440 0.007411 a two 2 0.842236 0.090966 d one 3 -0.377231 0.431523 d two 4 -0.525386 -1.980548 a one
grouped = df[ 'data1' ] . groupby( df[ 'key1' ] )
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f3f19fbc438>
grouped. mean( )
key1
a 0.426408
d 0.232502
Name: data1, dtype: float64
means = df[ 'data1' ] . groupby( [ df[ 'key1' ] , df[ 'key2' ] ] ) . mean( )
means
key1 key2
a one -0.063608
two 1.406440
d one 0.842236
two -0.377231
Name: data1, dtype: float64
means. unstack( )
key2 one two key1 a -0.063608 1.406440 d 0.842236 -0.377231
states = np. array( [ 'Ohio' , 'California' , 'California' , 'Ohio' , 'Ohio' ] )
years = np. array( [ 2005 , 2005 , 2006 , 2005 , 2006 ] )
df[ 'data1' ] . groupby( [ states, years] ) . mean( )
California 2005 1.406440
2006 0.842236
Ohio 2005 0.010470
2006 -0.525386
Name: data1, dtype: float64
df. groupby( 'key1' ) . mean( )
data1 data2 key1 a 0.426408 -0.451433 d 0.232502 0.261245
df. groupby( [ 'key1' , 'key2' ] ) . mean( )
data1 data2 key1 key2 a one -0.063608 -0.680855 two 1.406440 0.007411 d one 0.842236 0.090966 two -0.377231 0.431523
df. groupby( [ 'key1' , 'key2' ] ) . size( )
key1 key2
a one 2
two 1
d one 1
two 1
dtype: int64
遍历各个分组
for name, group in df. groupby( 'key1' ) :
print ( name)
print ( group)
a
data1 data2 key1 key2
0 0.398171 0.618838 a one
1 1.406440 0.007411 a two
4 -0.525386 -1.980548 a one
d
data1 data2 key1 key2
2 0.842236 0.090966 d one
3 -0.377231 0.431523 d two
for name, group in df. groupby( [ 'key1' , 'key2' ] ) :
print ( name)
print ( group)
('a', 'one')
data1 data2 key1 key2
0 0.398171 0.618838 a one
4 -0.525386 -1.980548 a one
('a', 'two')
data1 data2 key1 key2
1 1.40644 0.007411 a two
('d', 'one')
data1 data2 key1 key2
2 0.842236 0.090966 d one
('d', 'two')
data1 data2 key1 key2
3 -0.377231 0.431523 d two
pieces = dict ( list ( df. groupby( 'key1' ) ) )
pieces[ 'd' ]
data1 data2 key1 key2 2 0.842236 0.090966 d one 3 -0.377231 0.431523 d two
df. dtypes
data1 float64
data2 float64
key1 object
key2 object
dtype: object
grouped = df. groupby( df. dtypes, axis = 1 )
for name, group in grouped:
print ( name)
print ( group)
float64
data1 data2
0 0.398171 0.618838
1 1.406440 0.007411
2 0.842236 0.090966
3 -0.377231 0.431523
4 -0.525386 -1.980548
object
key1 key2
0 a one
1 a two
2 d one
3 d two
4 a one
选择一列或者所有列的子集
df. groupby( [ 'key1' , 'key2' ] ) [ [ 'data2' ] ] . mean( )
data2 key1 key2 a one -0.680855 two 0.007411 d one 0.090966 two 0.431523
s_grouped = df. groupby( [ 'key1' , 'key2' ] ) [ 'data2' ]
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f3f1974de10>
s_grouped. mean( )
key1 key2
a one -0.680855
two 0.007411
d one 0.090966
two 0.431523
Name: data2, dtype: float64
使用字典和Series分组
people = pd. DataFrame( np. random. randn( 5 , 5 ) ,
columns = [ 'a' , 'b' , 'c' , 'd' , 'e' ] ,
index = [ 'Joe' , 'Steve' , 'Wes' , 'Jim' , 'Travis' ] )
people
a b c d e Joe -1.650753 -1.182232 -0.534644 0.344981 -0.747273 Steve 1.481961 1.266301 -0.758866 0.931459 -0.512757 Wes -1.755591 -0.003535 0.910192 -0.187150 -0.603618 Jim 1.520320 -1.055722 -1.221894 0.741607 1.282918 Travis -0.271283 0.343674 -0.210378 -0.503580 -0.816606
people. iloc[ 2 : 3 , [ 1 , 2 ] ] = np. nan
people
a b c d e Joe -1.650753 -1.182232 -0.534644 0.344981 -0.747273 Steve 1.481961 1.266301 -0.758866 0.931459 -0.512757 Wes -1.755591 NaN NaN -0.187150 -0.603618 Jim 1.520320 -1.055722 -1.221894 0.741607 1.282918 Travis -0.271283 0.343674 -0.210378 -0.503580 -0.816606
mapping = { 'a' : 'red' , 'b' : 'red' , 'c' : 'blue' , 'd' : 'blue' , 'e' : 'red' , 'f' : 'orange' }
by_columns = people. groupby( mapping, axis = 1 )
by_columns. sum ( )
blue red Joe -0.189663 -3.580258 Steve 0.172593 2.235505 Wes -0.187150 -2.359209 Jim -0.480287 1.747516 Travis -0.713958 -0.744215
map_series = pd. Series( mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people. groupby( map_series, axis = 1 ) . count( )
blue red Joe 2 3 Steve 2 3 Wes 1 2 Jim 2 3 Travis 2 3
使用函数分组
people. groupby( len ) . sum ( )
a b c d e 3 -1.886025 -2.237954 -1.756538 0.899438 -0.067973 5 1.481961 1.266301 -0.758866 0.931459 -0.512757 6 -0.271283 0.343674 -0.210378 -0.503580 -0.816606
key_list = [ 'one' , 'one' , 'one' , 'two' , 'two' ]
people. groupby( [ len , key_list] ) . min ( )
a b c d e 3 one -1.755591 -1.182232 -0.534644 -0.187150 -0.747273 two 1.520320 -1.055722 -1.221894 0.741607 1.282918 5 one 1.481961 1.266301 -0.758866 0.931459 -0.512757 6 two -0.271283 0.343674 -0.210378 -0.503580 -0.816606
根据索引层级分组
columns = pd. MultiIndex. from_arrays( [ [ 'US' , 'US' , 'US' , 'JP' , 'JP' ] ,
[ 1 , 3 , 5 , 1 , 3 ] ] ,
names = [ 'cty' , 'tenor' ] )
hier_df = pd. DataFrame( np. random. randn( 4 , 5 ) , columns= columns)
hier_df
cty US JP tenor 1 3 5 1 3 0 -0.548593 -0.488964 -0.480414 0.929712 -0.096871 1 1.550861 0.652521 0.231158 0.717516 -2.594271 2 0.014603 -0.308289 0.161634 0.398446 0.437358 3 0.191229 -1.140538 -0.713786 0.549353 0.838565
hier_df. groupby( level = 'cty' , axis = 1 ) . count( )
数据聚合
df
data1 data2 key1 key2 0 0.398171 0.618838 a one 1 1.406440 0.007411 a two 2 0.842236 0.090966 d one 3 -0.377231 0.431523 d two 4 -0.525386 -1.980548 a one
grouped = df. groupby( 'key1' )
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3f19767198>
grouped[ 'data1' ] . quantile( 0.9 )
key1
a 1.204786
d 0.720289
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped. agg( peak_to_peak)
data1 data2 key1 a 1.931826 2.599386 d 1.219467 0.340557
grouped. describe( )
data1 data2 count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max key1 a 3.0 0.426408 0.966223 -0.525386 -0.063608 0.398171 0.902305 1.406440 3.0 -0.451433 1.359082 -1.980548 -0.986568 0.007411 0.313125 0.618838 d 2.0 0.232502 0.862294 -0.377231 -0.072364 0.232502 0.537369 0.842236 2.0 0.261245 0.240810 0.090966 0.176106 0.261245 0.346384 0.431523
逐列及多函数应用
tips = pd. read_csv( 'examples/tips.csv' )
tips[ 'tip_pct' ] = tips[ 'tip' ] / tips[ 'total_bill' ]
tips[ : 6 ]
total_bill tip smoker day time size tip_pct 0 16.99 1.01 No Sun Dinner 2 0.059447 1 10.34 1.66 No Sun Dinner 3 0.160542 2 21.01 3.50 No Sun Dinner 3 0.166587 3 23.68 3.31 No Sun Dinner 2 0.139780 4 24.59 3.61 No Sun Dinner 4 0.146808 5 25.29 4.71 No Sun Dinner 4 0.186240
grouped = tips. groupby( [ 'day' , 'smoker' ] )
grouped_pct = grouped[ 'tip_pct' ]
grouped_pct. agg( 'mean' )
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct. agg( [ 'mean' , 'std' , peak_to_peak] )
mean std peak_to_peak day smoker Fri No 0.151650 0.028123 0.067349 Yes 0.174783 0.051293 0.159925 Sat No 0.158048 0.039767 0.235193 Yes 0.147906 0.061375 0.290095 Sun No 0.160113 0.042347 0.193226 Yes 0.187250 0.154134 0.644685 Thur No 0.160298 0.038774 0.193350 Yes 0.163863 0.039389 0.151240
grouped_pct. agg( [ ( 'foo' , 'mean' ) , ( 'bar' , np. std) ] )
foo bar day smoker Fri No 0.151650 0.028123 Yes 0.174783 0.051293 Sat No 0.158048 0.039767 Yes 0.147906 0.061375 Sun No 0.160113 0.042347 Yes 0.187250 0.154134 Thur No 0.160298 0.038774 Yes 0.163863 0.039389
functions = [ 'count' , 'mean' , 'max' ]
result = grouped[ 'tip_pct' , 'total_bill' ] . agg( functions)
result
tip_pct total_bill count mean max count mean max day smoker Fri No 4 0.151650 0.187735 4 18.420000 22.75 Yes 15 0.174783 0.263480 15 16.813333 40.17 Sat No 45 0.158048 0.291990 45 19.661778 48.33 Yes 42 0.147906 0.325733 42 21.276667 50.81 Sun No 57 0.160113 0.252672 57 20.506667 48.17 Yes 19 0.187250 0.710345 19 24.120000 45.35 Thur No 45 0.160298 0.266312 45 17.113111 41.19 Yes 17 0.163863 0.241255 17 19.190588 43.11
result[ 'tip_pct' ]
count mean max day smoker Fri No 4 0.151650 0.187735 Yes 15 0.174783 0.263480 Sat No 45 0.158048 0.291990 Yes 42 0.147906 0.325733 Sun No 57 0.160113 0.252672 Yes 19 0.187250 0.710345 Thur No 45 0.160298 0.266312 Yes 17 0.163863 0.241255
ftuples = [ ( 'Durchschnitt' , 'mean' ) , ( 'Abweichung' , np. var) ]
grouped[ 'tip_pct' , 'total_bill' ] . agg( ftuples)
tip_pct total_bill Durchschnitt Abweichung Durchschnitt Abweichung day smoker Fri No 0.151650 0.000791 18.420000 25.596333 Yes 0.174783 0.002631 16.813333 82.562438 Sat No 0.158048 0.001581 19.661778 79.908965 Yes 0.147906 0.003767 21.276667 101.387535 Sun No 0.160113 0.001793 20.506667 66.099980 Yes 0.187250 0.023757 24.120000 109.046044 Thur No 0.160298 0.001503 17.113111 59.625081 Yes 0.163863 0.001551 19.190588 69.808518
grouped. agg( { 'tip' : np. max , 'size' : 'sum' } )
tip size day smoker Fri No 3.50 9 Yes 4.73 31 Sat No 9.00 115 Yes 10.00 104 Sun No 6.00 167 Yes 6.50 49 Thur No 6.70 112 Yes 5.00 40
grouped. agg( { 'tip_pct' : [ 'min' , 'max' , 'mean' , 'std' ] ,
'size' : 'sum' } )
size tip_pct sum min max mean std day smoker Fri No 9 0.120385 0.187735 0.151650 0.028123 Yes 31 0.103555 0.263480 0.174783 0.051293 Sat No 115 0.056797 0.291990 0.158048 0.039767 Yes 104 0.035638 0.325733 0.147906 0.061375 Sun No 167 0.059447 0.252672 0.160113 0.042347 Yes 49 0.065660 0.710345 0.187250 0.154134 Thur No 112 0.072961 0.266312 0.160298 0.038774 Yes 40 0.090014 0.241255 0.163863 0.039389
返回不含索引的聚合数据
tips. groupby( [ 'day' , 'smoker' ] , as_index = False ) . mean( )
day smoker total_bill tip size tip_pct 0 Fri No 18.420000 2.812500 2.250000 0.151650 1 Fri Yes 16.813333 2.714000 2.066667 0.174783 2 Sat No 19.661778 3.102889 2.555556 0.158048 3 Sat Yes 21.276667 2.875476 2.476190 0.147906 4 Sun No 20.506667 3.167895 2.929825 0.160113 5 Sun Yes 24.120000 3.516842 2.578947 0.187250 6 Thur No 17.113111 2.673778 2.488889 0.160298 7 Thur Yes 19.190588 3.030000 2.352941 0.163863
应用: 通用拆分——应用——联合
def top(df, n=5, column='tip_pct'):
    """Return the n rows of df with the largest values in *column*,
    sorted ascending (largest value last)."""
    ordered = df.sort_values(by=column)
    return ordered[-n:]
top( tips, n = 6 )
total_bill tip smoker day time size tip_pct 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 232 11.61 3.39 No Sat Dinner 2 0.291990 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips. groupby( 'smoker' ) . apply ( top)
total_bill tip smoker day time size tip_pct smoker No 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
tips. groupby( [ 'smoker' , 'day' ] ) . apply ( top, n= 1 , column = 'total_bill' )
total_bill tip smoker day time size tip_pct smoker day No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857 Sat 212 48.33 9.00 No Sat Dinner 4 0.186220 Sun 156 48.17 5.00 No Sun Dinner 6 0.103799 Thur 142 41.19 5.00 No Thur Lunch 5 0.121389 Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750 Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812 Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178 Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
result = tips. groupby( 'smoker' ) [ 'tip_pct' ] . describe( )
result
count mean std min 25% 50% 75% max smoker No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014 0.291990 Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 0.710345
result. unstack( )
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
压缩分组键
tips. groupby( 'smoker' , group_keys = False ) . apply ( top)
total_bill tip smoker day time size tip_pct 88 24.71 5.85 No Thur Lunch 2 0.236746 185 20.69 5.00 No Sun Dinner 5 0.241663 51 10.29 2.60 No Sun Dinner 2 0.252672 149 7.51 2.00 No Thur Lunch 2 0.266312 232 11.61 3.39 No Sat Dinner 2 0.291990 109 14.31 4.00 Yes Sat Dinner 2 0.279525 183 23.17 6.50 Yes Sun Dinner 4 0.280535 67 3.07 1.00 Yes Sat Dinner 1 0.325733 178 9.60 4.00 Yes Sun Dinner 2 0.416667 172 7.25 5.15 Yes Sun Dinner 2 0.710345
分位数和桶分析
frame = pd. DataFrame( { 'data1' : np. random. randn( 1000 ) ,
'data2' : np. random. randn( 1000 ) } )
quartiles = pd. cut( frame. data1, 4 )
quartiles[ : 10 ]
0 (0.134, 1.778]
1 (-1.509, 0.134]
2 (-1.509, 0.134]
3 (-1.509, 0.134]
4 (-1.509, 0.134]
5 (-1.509, 0.134]
6 (-1.509, 0.134]
7 (0.134, 1.778]
8 (-1.509, 0.134]
9 (-1.509, 0.134]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.16, -1.509] < (-1.509, 0.134] < (0.134, 1.778] < (1.778, 3.422]]
def get_stats(group):
    """Summarize *group* as a dict with its min, max, count and mean."""
    stats = {}
    stats['min'] = group.min()
    stats['max'] = group.max()
    stats['count'] = group.count()
    stats['mean'] = group.mean()
    return stats
grouped = frame. data2. groupby( quartiles)
grouped. apply ( get_stats) . unstack( )
count max mean min data1 (-3.16, -1.509] 56.0 2.734228 0.053983 -2.932092 (-1.509, 0.134] 492.0 2.719045 -0.027057 -2.955542 (0.134, 1.778] 409.0 3.140240 0.027807 -2.777564 (1.778, 3.422] 43.0 1.889577 -0.137910 -1.716137
grouping = pd. qcut( frame. data1, 10 , labels = False )
grouped = frame. data2. groupby( grouping)
grouped. apply ( get_stats) . unstack( )
count max mean min data1 0 100.0 2.734228 0.090541 -2.932092 1 100.0 2.496448 0.066131 -2.085507 2 100.0 2.719045 -0.172793 -2.885545 3 100.0 2.550963 -0.040588 -2.152490 4 100.0 2.660218 -0.090275 -2.739500 5 100.0 2.599636 -0.062196 -2.955542 6 100.0 2.739102 -0.090731 -2.090401 7 100.0 3.140240 0.151551 -2.273333 8 100.0 3.080427 0.212375 -1.990595 9 100.0 1.889577 -0.112474 -2.777564
使用指定的分组值填充缺失值
s = pd. Series( np. random. randn( 6 ) )
s[ : : 2 ] = np. nan
s
0 NaN
1 1.010049
2 NaN
3 -1.347714
4 NaN
5 -0.419845
dtype: float64
s. fillna( s. mean( ) )
0 -0.252503
1 1.010049
2 -0.252503
3 -1.347714
4 -0.252503
5 -0.419845
dtype: float64
states = [ 'Ohio' , 'New York' , 'Vermont' , 'Florida' , 'Oregon' , 'Nevada' , 'California' , 'Idaho' ]
group_key = [ 'East' ] * 4 + [ 'West' ] * 4
data = pd. Series( np. random. randn( 8 ) , index = states)
data
Ohio -2.481099
New York 0.259243
Vermont -0.327158
Florida -1.354184
Oregon 1.049335
Nevada -0.173168
California -0.202508
Idaho 0.405706
dtype: float64
data[ [ 'Vermont' , 'Nevada' , 'Idaho' ] ] = np. nan
data
Ohio -2.481099
New York 0.259243
Vermont NaN
Florida -1.354184
Oregon 1.049335
Nevada NaN
California -0.202508
Idaho NaN
dtype: float64
data. groupby( group_key) . mean( )
East -1.192014
West 0.423414
dtype: float64
fill_mean = lambda g : g. fillna( g. mean( ) )
data. groupby( group_key) . apply ( fill_mean)
Ohio -2.481099
New York 0.259243
Vermont -1.192014
Florida -1.354184
Oregon 1.049335
Nevada 0.423414
California -0.202508
Idaho 0.423414
dtype: float64
随机采样与排列
suits = [ 'H' , 'S' , 'C' , 'D' ]
card_val = ( list ( range ( 1 , 11 ) ) + [ 10 ] * 3 ) * 4
base_names = [ 'A' ] + list ( range ( 2 , 11 ) ) + [ 'J' , 'K' , 'Q' ]
cards = [ ]
for suit in [ 'H' , 'S' , 'C' , 'D' ] :
cards. extend( str ( num) + suit for num in base_names)
deck = pd. Series( card_val, index = cards)
deck[ : 13 ]
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
def draw(deck, n=5):
    """Return a random sample of *n* entries from the Series *deck*."""
    hand = deck.sample(n=n)
    return hand
draw( deck)
9S 9
QC 10
7H 7
5H 5
6S 6
dtype: int64
get_suit = lambda card : card[ - 1 ]
deck. groupby( get_suit) . apply ( draw, n = 2 )
C AC 1
4C 4
D AD 1
QD 10
H QH 10
8H 8
S 8S 8
3S 3
dtype: int64
deck. groupby( get_suit, group_keys = False ) . apply ( draw, n= 2 )
10C 10
3C 3
9D 9
6D 6
9H 9
4H 4
7S 7
QS 10
dtype: int64
分组加权平均和相关性
df = pd. DataFrame( { 'category' : [ 'a' , 'a' , 'a' , 'a' , 'b' , 'b' , 'b' , 'b' ] ,
'data' : np. random. randn( 8 ) ,
'weights' : np. random. rand( 8 ) } )
df
category data weights 0 a -0.890148 0.524090 1 a 0.267026 0.055750 2 a 0.790869 0.465993 3 a 1.667837 0.085079 4 b 2.038612 0.772350 5 b 0.515091 0.681560 6 b -0.334257 0.869390 7 b -0.353495 0.517881
grouped = df. groupby( 'category' )
get_wavg = lambda g : np. average( g[ 'data' ] , weights = g[ 'weights' ] )
grouped. apply ( get_wavg)
category
a 0.051999
b 0.511026
dtype: float64
ls
CH10--数据聚合与分组操作.ipynb  pydata-book-2nd-edition/
CH6.ipynb                       Seaborn.ipynb*
examples/                       绘图可视化——pandas,Seaborn.ipynb
figpath.png                     绘图与可视化.ipynb
mydata.sqlite
close_px = pd. read_csv( 'examples/stock_px_2.csv' , parse_dates = True , index_col = 0 )
close_px. info( )
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL 2214 non-null float64
MSFT 2214 non-null float64
XOM 2214 non-null float64
SPX 2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
close_px[ - 4 : ]
AAPL MSFT XOM SPX 2011-10-11 400.29 27.00 76.27 1195.54 2011-10-12 402.19 26.96 77.16 1207.25 2011-10-13 408.43 27.18 76.37 1203.66 2011-10-14 422.00 27.27 78.11 1224.58
spx_corr = lambda x : x. corrwith( x[ 'SPX' ] )
rets = close_px. pct_change( ) . dropna( )
get_year = lambda x : x. year
by_year = rets. groupby( get_year)
by_year. apply ( spx_corr)
AAPL MSFT XOM SPX 2003 0.541124 0.745174 0.661265 1.0 2004 0.374283 0.588531 0.557742 1.0 2005 0.467540 0.562374 0.631010 1.0 2006 0.428267 0.406126 0.518514 1.0 2007 0.508118 0.658770 0.786264 1.0 2008 0.681434 0.804626 0.828303 1.0 2009 0.707103 0.654902 0.797921 1.0 2010 0.710105 0.730118 0.839057 1.0 2011 0.691931 0.800996 0.859975 1.0
by_year. apply ( lambda g: g[ 'AAPL' ] . corr( g[ 'MSFT' ] ) )
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
逐组线性回归
import statsmodels. api as sm
def regress(data, yvar, xvars):
    """OLS-regress column *yvar* on columns *xvars* within *data*.

    Returns the fitted parameters (a Series indexed by the regressor
    names plus 'intercept').

    A constant intercept column is appended to a *copy* of the design
    matrix: assigning into `data[xvars]` directly would write through a
    selection of the caller's frame (SettingWithCopyWarning, and can
    mutate the grouped DataFrame handed in by `apply`).
    """
    Y = data[yvar]
    X = data[xvars].copy()  # copy so the caller's frame is untouched
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
by_year. apply ( regress, 'AAPL' , [ 'SPX' ] )
SPX intercept 2003 1.195406 0.000710 2004 1.363463 0.004201 2005 1.766415 0.003246 2006 1.645496 0.000080 2007 1.198761 0.003438 2008 0.968016 -0.001110 2009 0.879103 0.002954 2010 1.052608 0.001261 2011 0.806605 0.001514
数据透视表与交叉表
tips. pivot_table( index = [ 'day' , 'smoker' ] )
size tip tip_pct total_bill day smoker Fri No 2.250000 2.812500 0.151650 18.420000 Yes 2.066667 2.714000 0.174783 16.813333 Sat No 2.555556 3.102889 0.158048 19.661778 Yes 2.476190 2.875476 0.147906 21.276667 Sun No 2.929825 3.167895 0.160113 20.506667 Yes 2.578947 3.516842 0.187250 24.120000 Thur No 2.488889 2.673778 0.160298 17.113111 Yes 2.352941 3.030000 0.163863 19.190588
tips. pivot_table( [ 'tip_pct' , 'size' ] , index = [ 'time' , 'day' ] , columns = 'smoker' )
size tip_pct smoker No Yes No Yes time day Dinner Fri 2.000000 2.222222 0.139622 0.165347 Sat 2.555556 2.476190 0.158048 0.147906 Sun 2.929825 2.578947 0.160113 0.187250 Thur 2.000000 NaN 0.159744 NaN Lunch Fri 3.000000 1.833333 0.187735 0.188937 Thur 2.500000 2.352941 0.160311 0.163863
tips. pivot_table( [ 'tip_pct' , 'size' ] , index = [ 'time' , 'day' ] , columns = 'smoker' , margins = True )
size tip_pct smoker No Yes All No Yes All time day Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916 Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152 Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897 Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744 Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765 Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301 All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
tips. pivot_table( 'tip_pct' , index = [ 'time' , 'smoker' ] , columns = 'day' , aggfunc= len , margins = True )
day Fri Sat Sun Thur All time smoker Dinner No 3.0 45.0 57.0 1.0 106.0 Yes 9.0 42.0 19.0 NaN 70.0 Lunch No 1.0 NaN NaN 44.0 45.0 Yes 6.0 NaN NaN 17.0 23.0 All 19.0 87.0 76.0 62.0 244.0
tips. pivot_table( 'tip_pct' , index = [ 'time' , 'size' , 'smoker' ] , columns = 'day' ,
aggfunc= 'mean' , fill_value = 0 )
day Fri Sat Sun Thur time size smoker Dinner 1 No 0.000000 0.137931 0.000000 0.000000 Yes 0.000000 0.325733 0.000000 0.000000 2 No 0.139622 0.162705 0.168859 0.159744 Yes 0.171297 0.148668 0.207893 0.000000 3 No 0.000000 0.154661 0.152663 0.000000 Yes 0.000000 0.144995 0.152660 0.000000 4 No 0.000000 0.150096 0.148143 0.000000 Yes 0.117750 0.124515 0.193370 0.000000 5 No 0.000000 0.000000 0.206928 0.000000 Yes 0.000000 0.106572 0.065660 0.000000 6 No 0.000000 0.000000 0.103799 0.000000 Lunch 1 No 0.000000 0.000000 0.000000 0.181728 Yes 0.223776 0.000000 0.000000 0.000000 2 No 0.000000 0.000000 0.000000 0.166005 Yes 0.181969 0.000000 0.000000 0.158843 3 No 0.187735 0.000000 0.000000 0.084246 Yes 0.000000 0.000000 0.000000 0.204952 4 No 0.000000 0.000000 0.000000 0.138919 Yes 0.000000 0.000000 0.000000 0.155410 5 No 0.000000 0.000000 0.000000 0.121389 6 No 0.000000 0.000000 0.000000 0.173706
交叉表
pd. crosstab( [ tips. time, tips. day] , tips. smoker, margins= True )
smoker No Yes All time day Dinner Fri 3 9 12 Sat 45 42 87 Sun 57 19 76 Thur 1 0 1 Lunch Fri 1 6 7 Thur 44 17 61 All 151 93 244