第三章 索引(参考答案)

Ex1:公司员工数据集

In [51]: df = pd.read_csv('data/company.csv')
In [52]: dpt = ['Dairy', 'Bakery']
In [53]: df.query("(age <= 40)&(department == @dpt)&(gender=='M')").head(3)
Out[53]: 
      EmployeeID birthdate_key  age city_name department     job_title gender
3611        5791     1/14/1975   40   Kelowna      Dairy  Dairy Person      M
3613        5793     1/22/1975   40  Richmond     Bakery         Baker      M
3615        5795     1/30/1975   40   Nanaimo      Dairy  Dairy Person      M
In [54]: df.loc[(df.age<=40)&df.department.isin(dpt)&(df.gender=='M')].head(3)
Out[54]: 
      EmployeeID birthdate_key  age city_name department     job_title gender
3611        5791     1/14/1975   40   Kelowna      Dairy  Dairy Person      M
3613        5793     1/22/1975   40  Richmond     Bakery         Baker      M
3615        5795     1/30/1975   40   Nanaimo      Dairy  Dairy Person      M
In [55]: df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()
Out[55]: 
   EmployeeID  age                      job_title
1        1319   58                      VP Stores
3        1321   56             VP Human Resources
5        1323   53      Exec Assistant, VP Stores
6        1325   51  Exec Assistant, Legal Counsel
8        1329   48                  Store Manager
In [56]: df_op = df.copy()
In [57]: df_op = df_op.set_index(df_op.columns[-3:].tolist()).swaplevel(0,2,axis=0)
In [58]: df_op = df_op.reset_index(level=1)
In [59]: df_op = df_op.rename_axis(index={'gender':'Gender'})
In [60]: df_op.index = df_op.index.map(lambda x:'_'.join(x))
In [61]: df_op.index = df_op.index.map(lambda x:tuple(x.split('_')))
In [62]: df_op = df_op.rename_axis(index=['gender', 'department'])
In [63]: df_op = df_op.reset_index().reindex(df.columns, axis=1)
In [64]: df_op.equals(df)
Out[64]: True

Ex2:巧克力数据集

In [65]: df = pd.read_csv('data/chocolate.csv')
In [66]: df.columns = [' '.join(i.split('\n')) for i in df.columns]
In [67]: df.head(3)
Out[67]: 
    Company  Review Date Cocoa Percent Company Location  Rating
0  A. Morin         2016           63%           France    3.75
1  A. Morin         2015           70%           France    2.75
2  A. Morin         2015           70%           France    3.00
In [68]: df['Cocoa Percent'] = df['Cocoa Percent'].apply(lambda x:float(x[:-1])/100)
In [69]: df.query('(Rating<3)&(`Cocoa Percent`>`Cocoa Percent`.median())').head(3)
Out[69]: 
               Company  Review Date  Cocoa Percent Company Location  Rating
33  Akesson's (Pralus)         2010           0.75      Switzerland    2.75
34  Akesson's (Pralus)         2010           0.75      Switzerland    2.75
36       Alain Ducasse         2014           0.75           France    2.75
In [70]: idx = pd.IndexSlice
In [71]: exclude = ['France', 'Canada', 'Amsterdam', 'Belgium']
In [72]: res = df.set_index(['Review Date', 'Company Location']).sort_index(level=0)
In [73]: res.loc[idx[2012:,~res.index.get_level_values(1).isin(exclude)],:].head(3)
Out[73]: 
                                  Company  Cocoa Percent  Rating
Review Date Company Location                            
2012        Australia         Bahen & Co.            0.7     3.0
            Australia         Bahen & Co.            0.7     2.5
            Australia         Bahen & Co.            0.7     2.5