Ex1:美国疫情数据集
In [109]: date = pd.date_range('20200412', '20201116').to_series()
In [110]: date = date.dt.month.astype('string').str.zfill(2
.....: ) +'-'+ date.dt.day.astype('string'
.....: ).str.zfill(2) +'-'+ '2020'
.....:
In [111]: date = date.tolist()
In [112]: L = []
In [113]: for d in date:
.....: df = pd.read_csv('data/us_report/' + d + '.csv', index_col='Province_State')
.....: data = df.loc['New York', ['Confirmed','Deaths',
.....: 'Recovered','Active']]
.....: L.append(data.to_frame().T)
.....:
In [114]: res = pd.concat(L)
In [115]: res.index = date
In [116]: res.head()
Out[116]:
Confirmed Deaths Recovered Active
04-12-2020 189033 9385 23887.0 179648
04-13-2020 195749 10058 23887.0 185691.0
04-14-2020 203020 10842 23887.0 192178.0
04-15-2020 214454 11617 23887.0 202837.0
04-16-2020 223691 14832 23887.0 208859.0
Ex2:实现join函数
In [117]: def join(df1, df2, how='left'):
.....: res_col = df1.columns.tolist() + df2.columns.tolist()
.....: dup = df1.index.unique().intersection(df2.index.unique())
.....: res_df = pd.DataFrame(columns = res_col)
.....: for label in dup:
.....: cartesian = [list(i)+list(j) for i in df1.loc[label
.....: ].values.reshape(-1,1) for j in df2.loc[
.....: label].values.reshape(-1,1)]
.....: dup_df = pd.DataFrame(cartesian, index = [label]*len(
.....: cartesian), columns = res_col)
.....: res_df = pd.concat([res_df,dup_df])
.....: if how in ['left', 'outer']:
.....: for label in df1.index.unique().difference(dup):
.....: if isinstance(df1.loc[label], pd.DataFrame):
.....: cat = [list(i)+[np.nan]*df2.shape[1
.....: ] for i in df1.loc[label].values]
.....: else: cat = [list(i)+[np.nan]*df2.shape[1
.....: ] for i in df1.loc[label].to_frame().values]
.....: dup_df = pd.DataFrame(cat, index = [label
.....: ]*len(cat), columns = res_col)
.....: res_df = pd.concat([res_df,dup_df])
.....: if how in ['right', 'outer']:
.....: for label in df2.index.unique().difference(dup):
.....: if isinstance(df2.loc[label], pd.DataFrame):
.....: cat = [[np.nan]+list(i)*df1.shape[1
.....: ] for i in df2.loc[label].values]
.....: else: cat = [[np.nan]+list(i)*df1.shape[1
.....: ] for i in df2.loc[label].to_frame().values]
.....: dup_df = pd.DataFrame(cat, index = [label
.....: ]*len(cat), columns = res_col)
.....: res_df = pd.concat([res_df,dup_df])
.....: return res_df
.....:
In [118]: df1 = pd.DataFrame({'col1':[1,2,3,4,5]}, index=list('AABCD'))
In [119]: df1
Out[119]:
col1
A 1
A 2
B 3
C 4
D 5
In [120]: df2 = pd.DataFrame({'col2':list('opqrst')}, index=list('ABBCEE'))
In [121]: df2
Out[121]:
col2
A o
B p
B q
C r
E s
E t
In [122]: join(df1, df2, how='outer')
Out[122]:
col1 col2
A 1 o
A 2 o
B 3 p
B 3 q
C 4 r
D 5 NaN
E NaN s
E NaN t