第六章 连接(参考答案)

Ex1:美国疫情数据集

In [109]: date = pd.date_range('20200412', '20201116').to_series()
In [110]: date = date.dt.month.astype('string').str.zfill(2
   .....:        ) +'-'+ date.dt.day.astype('string'
   .....:        ).str.zfill(2) +'-'+ '2020'
   .....: 
In [111]: date = date.tolist()
In [112]: L = []
In [113]: for d in date:
   .....:     df = pd.read_csv('data/us_report/' + d + '.csv', index_col='Province_State')
   .....:     data = df.loc['New York', ['Confirmed','Deaths',
   .....:                   'Recovered','Active']]
   .....:     L.append(data.to_frame().T)
   .....: 
In [114]: res = pd.concat(L)
In [115]: res.index = date
In [116]: res.head()
Out[116]: 
           Confirmed Deaths Recovered    Active
04-12-2020    189033   9385   23887.0    179648
04-13-2020    195749  10058   23887.0  185691.0
04-14-2020    203020  10842   23887.0  192178.0
04-15-2020    214454  11617   23887.0  202837.0
04-16-2020    223691  14832   23887.0  208859.0

Ex2:实现join函数

In [117]: def join(df1, df2, how='left'):
   .....:     res_col = df1.columns.tolist() +  df2.columns.tolist()
   .....:     dup = df1.index.unique().intersection(df2.index.unique())
   .....:     res_df = pd.DataFrame(columns = res_col)
   .....:     for label in dup:
   .....:         cartesian = [list(i)+list(j) for i in df1.loc[label
   .....:                     ].values.reshape(-1,1) for j in df2.loc[
   .....:                       label].values.reshape(-1,1)]
   .....:         dup_df = pd.DataFrame(cartesian, index = [label]*len(
   .....:                  cartesian), columns = res_col)
   .....:         res_df = pd.concat([res_df,dup_df])
   .....:     if how in ['left', 'outer']:
   .....:         for label in df1.index.unique().difference(dup):
   .....:             if isinstance(df1.loc[label], pd.DataFrame):
   .....:                 cat = [list(i)+[np.nan]*df2.shape[1
   .....:                       ] for i in df1.loc[label].values]
   .....:             else: cat = [list(i)+[np.nan]*df2.shape[1
   .....:                       ] for i in df1.loc[label].to_frame().values]
   .....:             dup_df = pd.DataFrame(cat, index = [label
   .....:                       ]*len(cat), columns = res_col)
   .....:             res_df = pd.concat([res_df,dup_df])
   .....:     if how in ['right', 'outer']:
   .....:         for label in df2.index.unique().difference(dup):
   .....:             if isinstance(df2.loc[label], pd.DataFrame):
   .....:                 cat = [[np.nan]+list(i)*df1.shape[1
   .....:                       ] for i in df2.loc[label].values]
   .....:             else: cat = [[np.nan]+list(i)*df1.shape[1
   .....:                       ] for i in df2.loc[label].to_frame().values]
   .....:             dup_df = pd.DataFrame(cat, index = [label
   .....:                       ]*len(cat), columns = res_col)
   .....:             res_df = pd.concat([res_df,dup_df])
   .....:     return res_df
   .....: 
In [118]: df1 = pd.DataFrame({'col1':[1,2,3,4,5]}, index=list('AABCD'))
In [119]: df1
Out[119]: 
   col1
A     1
A     2
B     3
C     4
D     5
In [120]: df2 = pd.DataFrame({'col2':list('opqrst')}, index=list('ABBCEE'))
In [121]: df2
Out[121]: 
  col2
A    o
B    p
B    q
C    r
E    s
E    t
In [122]: join(df1, df2, how='outer')
Out[122]: 
  col1 col2
A    1    o
A    2    o
B    3    p
B    3    q
C    4    r
D    5  NaN
E  NaN    s
E  NaN    t