
In [1]:
In [31]:
# Build a 10-row demo frame: two categorical columns, a weekly date column and
# two random numeric features. The feature values are drawn from NumPy's global
# RNG without a seed, so they differ on every fresh run.
data_dict = {
    'color': ['black', 'white', 'black', 'white', 'black',
              'white', 'black', 'white', 'black', 'white'],
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    'date': pd.date_range('1/1/2019', periods=10, freq='W'),
    'feature_1': np.random.randn(10),
    'feature_2': np.random.normal(0.5, 2, 10),
}

# Two parallel label lists that become the (class, country) MultiIndex.
# Note that some (class, country) pairs repeat, so the index is not unique.
array = [
    ['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C'],
    ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA'],
]
index = pd.MultiIndex.from_arrays(array, names=['class', 'country'])

data_df = pd.DataFrame(data_dict, index=index)

color date feature_1 feature_2 size
class country
A JP black 2019-01-06 -1.234449 -0.133232 S
B CN white 2019-01-13 1.308935 -0.493569 M
US black 2019-01-20 0.041672 1.014697 L
US white 2019-01-27 -0.203778 1.742654 M
C US black 2019-02-03 0.419852 -2.964561 L
A CN white 2019-02-10 2.350862 -1.895651 S
B CN black 2019-02-17 -0.649887 -0.187894 S
A CA white 2019-02-24 0.912200 0.782471 XL
C JP black 2019-03-03 -1.295436 0.416840 XL
CA white 2019-03-10 0.500633 2.827345 M



In [5]:
# Split the frame on the 'size' column, then print each (key, sub-frame) pair
# exactly as the GroupBy iterator yields it.
group_1 = data_df.groupby('size')
for size_key, size_frame in group_1:
    print((size_key, size_frame))

('L',                color       date  feature_1  feature_2 size
class country
B     US       black 2019-01-20  -1.204530   2.331003    L
C     US       black 2019-02-03  -0.475149   2.455877    L)
('M',                color       date  feature_1  feature_2 size
class country
B     CN       white 2019-01-13   0.354512  -0.106245    M
      US       white 2019-01-27   0.640886   3.105454    M
C     CA       white 2019-03-10   0.471399   1.102412    M)
('S',                color       date  feature_1  feature_2 size
class country
A     JP       black 2019-01-06   0.599631   1.029602    S
      CN       white 2019-02-10   0.024186   2.412876    S
B     CN       black 2019-02-17   3.110097   0.678240    S)
('XL',                color       date  feature_1  feature_2 size
class country
A     CA       white 2019-02-24   0.890249   1.522595   XL
C     JP       black 2019-03-03  -1.216877   2.321393   XL)


In [6]:

sum_feature_1 sum_feature_2
L -1.679679 4.786880
M 1.466797 4.101621
S 3.733914 4.120718
XL -0.326628 3.843988


In [5]:

color date feature_1 feature_2
class country
B CN white 2019-01-13 1.735991 0.383047
US white 2019-01-27 -0.847715 -2.327769
C CA white 2019-03-10 -0.818303 1.317979


In [34]:
# Group on the combination of two columns; each key is a (size, color) tuple.
# Print every (key, sub-frame) pair as yielded by the GroupBy iterator.
group_2 = data_df.groupby(['size', 'color'])
for size_color_key, size_color_frame in group_2:
    print((size_color_key, size_color_frame))

(('L', 'black'),                color       date  feature_1  feature_2 size
class country
B     US       black 2019-01-20   0.041672   1.014697    L
C     US       black 2019-02-03   0.419852  -2.964561    L)
(('M', 'white'),                color       date  feature_1  feature_2 size
class country
B     CN       white 2019-01-13   1.308935  -0.493569    M
      US       white 2019-01-27  -0.203778   1.742654    M
C     CA       white 2019-03-10   0.500633   2.827345    M)
(('S', 'black'),                color       date  feature_1  feature_2 size
class country
A     JP       black 2019-01-06  -1.234449  -0.133232    S
B     CN       black 2019-02-17  -0.649887  -0.187894    S)
(('S', 'white'),                color       date  feature_1  feature_2 size
class country
A     CN       white 2019-02-10   2.350862  -1.895651    S)
(('XL', 'black'),                color       date  feature_1  feature_2 size
class country
C     JP       black 2019-03-03  -1.295436    0.41684   XL)
(('XL', 'white'),                color       date  feature_1  feature_2 size
class country
A     CA       white 2019-02-24     0.9122   0.782471   XL)


In [9]:

L     2
M     3
S     3
XL    2
dtype: int64
size  color
L     black    2
M     white    3
S     black    2
      white    1
XL    black    1
      white    1
dtype: int64


In [10]:
def get_letter_type(letter):
    """Classify a column label.

    Returns 'feature' when the label contains the substring 'feature',
    otherwise 'other'.
    """
    return 'feature' if 'feature' in letter else 'other'


# Group the *columns* of data_df by the classifier above and print each
# (kind, sub-frame) pair. DataFrame.groupby(..., axis=1) is deprecated in
# pandas 2.1, so the column labels are grouped instead and each group of
# labels is used to select the matching columns (dtypes are preserved).
column_kinds = data_df.columns.to_series().groupby(get_letter_type)
for kind, labels in column_kinds:
    print((kind, data_df[list(labels.index)]))

('feature',                feature_1  feature_2
class country
A     JP        0.599631   1.029602
B     CN        0.354512  -0.106245
      US       -1.204530   2.331003
      US        0.640886   3.105454
C     US       -0.475149   2.455877
A     CN        0.024186   2.412876
B     CN        3.110097   0.678240
A     CA        0.890249   1.522595
C     JP       -1.216877   2.321393
      CA        0.471399   1.102412)
('other',                color       date size
class country
A     JP       black 2019-01-06    S
B     CN       white 2019-01-13    M
      US       black 2019-01-20    L
      US       white 2019-01-27    M
C     US       black 2019-02-03    L
A     CN       white 2019-02-10    S
B     CN       black 2019-02-17    S
A     CA       white 2019-02-24   XL
C     JP       black 2019-03-03   XL
      CA       white 2019-03-10    M)


In [16]:
# Group on both index levels (0 = class, 1 = country) and print each
# (key, sub-frame) pair as yielded by the GroupBy iterator.
for level_key, level_frame in data_df.groupby(level=[0, 1]):
    print((level_key, level_frame))

(('A', 'CA'),                color       date  feature_1  feature_2 size
class country
A     CA       white 2019-02-24   0.890249   1.522595   XL)
(('A', 'CN'),                color       date  feature_1  feature_2 size
class country
A     CN       white 2019-02-10   0.024186   2.412876    S)
(('A', 'JP'),                color       date  feature_1  feature_2 size
class country
A     JP       black 2019-01-06   0.599631   1.029602    S)
(('B', 'CN'),                color       date  feature_1  feature_2 size
class country
B     CN       white 2019-01-13   0.354512  -0.106245    M
      CN       black 2019-02-17   3.110097   0.678240    S)
(('B', 'US'),                color       date  feature_1  feature_2 size
class country
B     US       black 2019-01-20  -1.204530   2.331003    L
      US       white 2019-01-27   0.640886   3.105454    M)
(('C', 'CA'),                color       date  feature_1  feature_2 size
class country
C     CA       white 2019-03-10   0.471399   1.102412    M)
(('C', 'JP'),                color       date  feature_1  feature_2 size
class country
C     JP       black 2019-03-03  -1.216877   2.321393   XL)
(('C', 'US'),                color       date  feature_1  feature_2 size
class country
C     US       black 2019-02-03  -0.475149   2.455877    L)


In [10]:
# Print the key and the sub-frame of every group on separate lines.
# NOTE(review): group_3 is not defined in the visible cells; the printed keys
# look like (country, color) pairs, so it is presumably
# data_df.groupby(['country', 'color']) -- confirm against the defining cell.
for name, group in group_3:
    print(name)
    print(group)

('CA', 'white')
               color size       date  feature_1  feature_2
class country
A     CA       white   XL 2019-02-24   0.412967   1.196859
C     CA       white    M 2019-03-10  -0.818303   1.317979
('CN', 'black')
               color size       date  feature_1  feature_2
class country
B     CN       black    S 2019-02-17  -0.058021  -2.420962
('CN', 'white')
               color size       date  feature_1  feature_2
class country
B     CN       white    M 2019-01-13   1.735991   0.383047
A     CN       white    S 2019-02-10   0.282515   3.156525
('JP', 'black')
               color size       date  feature_1  feature_2
class country
A     JP       black    S 2019-01-06   0.997065  -1.018255
C     JP       black   XL 2019-03-03   0.513201  -3.266357
('US', 'black')
               color size       date  feature_1  feature_2
class country
B     US       black    L 2019-01-20  -0.547211   0.693104
C     US       black    L 2019-02-03  -0.245918   4.444044
('US', 'white')
               color size       date  feature_1  feature_2
class country
B     US       white    M 2019-01-27  -0.847715  -2.327769


In [17]:
# Per (size, color) group: minimum of feature_1 and mean of feature_2.
# The aggregations are given by name rather than as np.min / np.mean:
# passing the NumPy callables emits a FutureWarning on pandas >= 2.1, while
# the string names select the same built-in groupby reductions.
group_2.agg({'feature_1': 'min', 'feature_2': 'mean'})

feature_1 feature_2
size color
L black -1.204530 2.393440
M white 0.354512 1.367207
S black 0.599631 0.853921
white 0.024186 2.412876
XL black -1.216877 2.321393
white 0.890249 1.522595


In [18]:
def data_range(x):
    """Return the spread of ``x``, i.e. ``x.max() - x.min()``.

    ``x`` may be any object exposing ``max()`` and ``min()`` methods whose
    results can be subtracted (for example a pandas Series or NumPy array).
    Defined with ``def`` rather than as a named lambda so the function
    reports its own name instead of ``<lambda>``.
    """
    return x.max() - x.min()

date feature_1 feature_2
class country
A JP 42 days 3.085912 1.734636
B CN 56 days 0.286375 3.211699
US 14 days 0.729382 0.124874
US 56 days 0.286375 3.211699
C US 14 days 0.729382 0.124874
A CN 42 days 3.085912 1.734636
B CN 42 days 3.085912 1.734636
A CA 7 days 2.107125 0.798798
C JP 7 days 2.107125 0.798798
CA 56 days 0.286375 3.211699


In [29]:
# Blank out both feature values of the second row (position 1) so there is
# something to impute. The row is addressed by position because the
# (class, country) index contains duplicate labels; the columns are addressed
# by name because their positions depend on how pandas ordered the dict keys.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# Note: this mutates data_df in place.
feature_cols = ['feature_1', 'feature_2']
data_df.iloc[1, data_df.columns.get_indexer(feature_cols)] = np.nan

group_4 = data_df.groupby('size')


def f(x):
    """Return ``x`` with its missing values replaced by the mean of ``x``."""
    return x.fillna(x.mean())


# Impute per 'size' group. Only the numeric feature columns are transformed:
# taking the mean of the string column would raise on current pandas, and the
# output printed for this cell contains just these two columns.
df_trans = group_4[feature_cols].transform(f)

feature_1 feature_2
class country
A JP -0.023671 -0.409491
B CN -0.091596 -1.399647
US 1.085396 2.245660
US -0.127399 -1.747656
C US -2.046202 3.475487
A CN -1.076002 2.705517
B CN 0.184117 2.913971
A CA 0.601222 -2.098025
C JP -0.009375 -3.623235
CA -0.055794 -1.051638

In [30]:

color date feature_1 feature_2 size
class country
A JP black 2019-01-06 -0.023671 -0.409491 S
B CN white 2019-01-13 NaN NaN M
US black 2019-01-20 1.085396 2.245660 L
US white 2019-01-27 -0.127399 -1.747656 M
C US black 2019-02-03 -2.046202 3.475487 L
A CN white 2019-02-10 -1.076002 2.705517 S
B CN black 2019-02-17 0.184117 2.913971 S
A CA white 2019-02-24 0.601222 -2.098025 XL
C JP black 2019-03-03 -0.009375 -3.623235 XL
CA white 2019-03-10 -0.055794 -1.051638 M


In [32]:

color  class  country
black  A      JP              NaN
       B      US              NaN
       C      US        -0.257642
       B      CN        -0.062787
       C      JP        -0.508490
white  B      CN              NaN
              US              NaN
       A      CN         1.152006
              CA         1.019761
       C      CA         1.254565
Name: feature_1, dtype: float64


In [47]:

color  class  country
black  A      JP              NaN
       B      US              NaN
       C      US        -0.772925
       B      CN        -1.422812
       C      JP        -2.718247
white  B      CN              NaN
              US              NaN
       A      CN         3.456018
              CA         4.368218
       C      CA         4.868851
Name: feature_1, dtype: float64


In [36]:
# Keep only the rows that belong to a class with more than three members.
(
    data_df
    .groupby('class')
    .filter(lambda class_rows: len(class_rows) > 3)
)

color date feature_1 feature_2 size
class country
B CN white 2019-01-13 1.308935 -0.493569 M
US black 2019-01-20 0.041672 1.014697 L
US white 2019-01-27 -0.203778 1.742654 M
CN black 2019-02-17 -0.649887 -0.187894 S


In [37]:
# Summary statistics of feature_1 within each class, stacked into one Series.
(
    data_df
    .groupby('class')['feature_1']
    .apply(lambda class_values: class_values.describe())
)

A      count    3.000000
       mean     0.676204
       std      1.804268
       min     -1.234449
       25%     -0.161125
       50%      0.912200
       75%      1.631531
       max      2.350862
B      count    4.000000
       mean     0.124235
       std      0.840077
       min     -0.649887
       25%     -0.315306
       50%     -0.081053
       75%      0.358488
       max      1.308935
C      count    3.000000
       mean    -0.124984
       std      1.014446
       min     -1.295436
       25%     -0.437792
       50%      0.419852
       75%      0.460243
       max      0.500633
Name: feature_1, dtype: float64

In [38]:
def f(group):
    """Pair each value of ``group`` with its deviation from the group mean.

    Returns a DataFrame with an 'original' column holding ``group`` itself and
    a 'demeaned' column holding ``group - group.mean()``.
    """
    centered = group - group.mean()
    columns = {'original': group, 'demeaned': centered}
    return pd.DataFrame(columns)

demeaned original
class country
A JP -1.910653 -1.234449
B CN 1.184700 1.308935
CN -0.774122 -0.649887
US -0.082563 0.041672
US -0.328014 -0.203778
C US 0.544836 0.419852
A CN 1.674658 2.350862
CA 0.235996 0.912200
C JP -1.170452 -1.295436
CA 0.625616 0.500633


