第十章 时序数据(参考答案)

Ex1:太阳辐射数据集

In [208]: df = pd.read_csv('data/solar.csv', usecols=['Data','Time',
   .....:                  'Radiation','Temperature'])
   .....: 
In [209]: solar_date = df.Data.str.extract('([/|\w]+\s).+')[0]
In [210]: df['Data'] = pd.to_datetime(solar_date + df.Time)
In [211]: df = df.drop(columns='Time').rename(columns={'Data':'Datetime'}
   .....:             ).set_index('Datetime').sort_index()
   .....: 
In [212]: df.head(3)
Out[212]: 
                     Radiation  Temperature
Datetime                           
2016-09-01 00:00:08       2.58           51
2016-09-01 00:05:10       2.83           51
2016-09-01 00:20:06       2.16           51
In [213]: s = df.index.to_series().reset_index(drop=True).diff().dt.total_seconds()
In [214]: max_3 = s.nlargest(3).index
In [215]: df.index[max_3.union(max_3-1)]
Out[215]: 
DatetimeIndex(['2016-09-29 23:55:26', '2016-10-01 00:00:19',
               '2016-11-29 19:05:02', '2016-12-01 00:00:02',
               '2016-12-05 20:45:53', '2016-12-08 11:10:42'],
              dtype='datetime64[ns]', name='Datetime', freq=None)
In [216]: res = s.mask((s>s.quantile(0.99))|(s<s.quantile(0.01)))
In [217]: _ = plt.hist(res, bins=50)

../_images/ch10_ex1.png

In [218]: res = df.Radiation.rolling('6H').corr(df.Temperature)
In [219]: res.tail(3)
Out[219]: 
Datetime
2016-12-31 23:45:04    0.328574
2016-12-31 23:50:03    0.261883
2016-12-31 23:55:01    0.262406
dtype: float64
In [220]: res = df.Temperature.resample('6H', origin='03:00:00').mean()
In [221]: res.head(3)
Out[221]: 
Datetime
2016-08-31 21:00:00    51.218750
2016-09-01 03:00:00    50.033333
2016-09-01 09:00:00    59.379310
Freq: 6H, Name: Temperature, dtype: float64
In [222]: my_dt = df.index.shift(freq='-6H')
In [223]: int_loc = [df.index.get_loc(i, method='nearest') for i in my_dt]
In [224]: res = df.Radiation.iloc[int_loc]
In [225]: res.tail(3)
Out[225]: 
Datetime
2016-12-31 17:45:02    9.33
2016-12-31 17:50:01    8.49
2016-12-31 17:55:02    5.84
Name: Radiation, dtype: float64

Ex2:水果销量数据集

In [226]: df = pd.read_csv('data/fruit.csv')
In [227]: df.Date = pd.to_datetime(df.Date)
In [228]: df_grape = df.query("Fruit == 'Grape'")
In [229]: res = df_grape.groupby([np.where(df_grape.Date.dt.day<=15,
   .....:                         'First', 'Second'),df_grape.Date.dt.month]
   .....:                         )['Sale'].mean().to_frame().unstack(0
   .....:                         ).droplevel(0,axis=1)
   .....: 
In [230]: res = (res.First/res.Second).rename_axis('Month')
In [231]: res.head()
Out[231]: 
Month
1    1.174998
2    0.968890
3    0.951351
4    1.020797
5    0.931061
dtype: float64
In [232]: df[df.Date.dt.is_month_end].query("Fruit == 'Pear'"
   .....:                           ).groupby('Date').Sale.sum().head()
   .....: 
Out[232]: 
Date
2019-01-31    847
2019-02-28    774
2019-03-31    761
2019-04-30    648
2019-05-31    616
Name: Sale, dtype: int64
In [233]: df[df.Date.isin(pd.date_range('20190101', '20191231',
   .....:                 freq='BM'))].query("Fruit == 'Pear'"
   .....:                 ).groupby('Date').Sale.mean().head()
   .....: 
Out[233]: 
Date
2019-01-31    60.500000
2019-02-28    59.538462
2019-03-29    56.666667
2019-04-30    64.800000
2019-05-31    61.600000
Name: Sale, dtype: float64
In [234]: target_dt = df.drop_duplicates().groupby(df.Date.drop_duplicates(
   .....:             ).dt.month)['Date'].nlargest(5).reset_index(drop=True)
   .....: 
In [235]: res = df.set_index('Date').loc[target_dt].reset_index(
   .....:             ).query("Fruit == 'Apple'")
   .....: 
In [236]: res = res.groupby(res.Date.dt.month)['Sale'].mean(
   .....:             ).rename_axis('Month')
   .....: 
In [237]: res.head()
Out[237]: 
Month
1    65.313725
2    54.061538
3    59.325581
4    65.795455
5    57.465116
Name: Sale, dtype: float64
In [238]: month_order = ['January','February','March','April',
   .....:                 'May','June','July','August','September',
   .....:                 'October','November','December']
   .....: 
In [239]: week_order = ['Mon','Tue','Wed','Thu','Fri','Sat','Sum']
In [240]: group1 = df.Date.dt.month_name().astype('category').cat.reorder_categories(
   .....:         month_order, ordered=True)
   .....: 
In [241]: group2 = df.Fruit
In [242]: group3 = df.Date.dt.dayofweek.replace(dict(zip(range(7),week_order))
   .....:          ).astype('category').cat.reorder_categories(
   .....:          week_order, ordered=True)
   .....: 
In [243]: res = df.groupby([group1, group2,group3])['Sale'].count().to_frame(
   .....:          ).unstack(0).droplevel(0,axis=1)
   .....: 
In [244]: res.head()
Out[244]: 
Date        January  February  March  April  May  June  July  August  September  October  November  December
Fruit Date                                                                                            
Apple Mon        46        43     43     47   43    40    41      38         59       42        39        45
      Tue        50        40     44     52   46    39    50      42         40       57        47        47
      Wed        50        47     37     43   39    39    58      43         35       46        47        38
      Thu        45        35     31     47   58    33    52      44         36       63        37        40
      Fri        32        33     52     31   46    38    37      48         34       37        46        41
In [245]: df_apple = df[(df.Fruit=='Apple')&(
   .....:               ~df.Date.dt.dayofweek.isin([5,6]))]
   .....: 
In [246]: s = pd.Series(df_apple.Sale.values,
   .....:               index=df_apple.Date).groupby('Date').sum()
   .....: 
In [247]: res = s.rolling('10D').mean().reindex(
   .....:               pd.date_range('20190101','20191231')).fillna(method='ffill')
   .....: 
In [248]: res.head()
Out[248]: 
2019-01-01    189.000000
2019-01-02    335.500000
2019-01-03    520.333333
2019-01-04    527.750000
2019-01-05    527.750000
Freq: D, dtype: float64