第九章 分类数据(参考答案)

Ex1:统计未出现的类别

In [174]: def my_crosstab(s1, s2, dropna=True):
   .....:     """Count co-occurrences of the labels in ``s1`` (rows) and ``s2`` (columns).
   .....:     With ``dropna=False`` a categorical input contributes every declared
   .....:     category, so unobserved categories show up as all-zero rows/columns;
   .....:     otherwise only the labels that actually occur are used. Pairs in which
   .....:     either value is missing are not counted. Returns an integer DataFrame
   .....:     whose axes are named after ``s1.name`` and ``s2.name``.
   .....:     """
   .....:     def labels(s):
   .....:         # Missing values never become a row/column label.
   .....:         if s.dtype.name == 'category' and not dropna:
   .....:             return s.cat.categories
   .....:         return s.dropna().unique()
   .....:     idx1, idx2 = labels(s1), labels(s2)
   .....:     # Allocate integer counts directly instead of casting floats afterwards.
   .....:     res = pd.DataFrame(np.zeros((len(idx1), len(idx2)), dtype='int'),
   .....:                     index=idx1, columns=idx2)
   .....:     for i, j, na_i, na_j in zip(s1, s2, s1.isna(), s2.isna()):
   .....:         if na_i or na_j:
   .....:             continue  # NaN is not a label of the table; .at would raise KeyError
   .....:         res.at[i, j] += 1
   .....:     return res.rename_axis(index=s1.name, columns=s2.name)
   .....: 
In [175]: df = pd.DataFrame({'A':['a','b','c','a'],
   .....:                    'B':['cat','cat','dog','cat']})
   .....: 
In [176]: df.B = df.B.astype('category').cat.add_categories('sheep') # declare a category that never occurs in the data
In [177]: my_crosstab(df.A, df.B) # default dropna=True: only observed labels are tabulated
Out[177]: 
B  cat  dog
A  
a    2    0
b    1    0
c    0    1
In [178]: my_crosstab(df.A, df.B, dropna=False) # dropna=False keeps the unobserved 'sheep' category as an all-zero column
Out[178]: 
B  cat  dog  sheep
A         
a    2    0      0
b    1    0      0
c    0    1      0

Ex2:钻石数据集

In [179]: df = pd.read_csv('data/diamonds.csv')
In [180]: s_obj, s_cat = df.cut, df.cut.astype('category') # same column twice: as loaded vs. converted to category dtype; the timings below compare nunique() on each
In [181]: %timeit -n 30 s_obj.nunique()
5.27 ms +- 519 us per loop (mean +- std. dev. of 7 runs, 30 loops each)
In [182]: %timeit -n 30 s_cat.nunique()
1.09 ms +- 54.7 us per loop (mean +- std. dev. of 7 runs, 30 loops each)
In [183]: df.cut = df.cut.astype('category').cat.reorder_categories([
   .....:         'Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],ordered=True) # ordered category: 'Fair' is the smallest, 'Ideal' the largest
   .....: 
In [184]: df.clarity = df.clarity.astype('category').cat.reorder_categories([
   .....:         'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],ordered=True) # ordered category: 'I1' is the smallest, 'IF' the largest
   .....: 
In [185]: res = df.sort_values(['cut', 'clarity'], ascending=[False, True]) # cut descending, then clarity ascending, both following the category order
In [186]: res.head(3)
Out[186]: 
     carat    cut clarity  price
315   0.96  Ideal      I1   2801
535   0.96  Ideal      I1   2826
551   0.97  Ideal      I1   2830
In [187]: res.tail(3) # last rows: smallest cut ('Fair') combined with the largest clarity ('IF')
Out[187]: 
       carat   cut clarity  price
47407   0.52  Fair      IF   1849
49683   0.52  Fair      IF   2144
50126   0.47  Fair      IF   2211
In [188]: df.cut = df.cut.cat.reorder_categories(
   .....:         df.cut.cat.categories[::-1]) # reverse the category order so that 'Ideal' comes first
   .....: 
In [189]: df.clarity = df.clarity.cat.reorder_categories(
   .....:             df.clarity.cat.categories[::-1]) # reverse the category order so that 'IF' comes first
   .....: 
In [190]: df.cut = df.cut.cat.codes # Method 1: use cat.codes
In [191]: clarity_cat = df.clarity.cat.categories
In [192]: df.clarity = df.clarity.cat.rename_categories(dict(zip(
   .....:             clarity_cat, np.arange(
   .....:                 len(clarity_cat))))) # Method 2: relabel each category with its position via a mapping (Series.replace on categorical data is deprecated since pandas 2.2)
   .....: 
In [193]: df.head(3)
Out[193]: 
   carat  cut  clarity  price
0   0.23    0        6    326
1   0.21    1        5    326
2   0.23    3        3    327
In [194]: q = [0, 0.2, 0.4, 0.6, 0.8, 1] # quantile edges for five equal-frequency bins
In [195]: point = [-np.inf, 1000, 3500, 5500, 18000, np.inf] # fixed bin edges; np.inf is used because the np.infty alias was removed in NumPy 2.0
In [196]: avg = df.price / df.carat # price per carat
In [197]: df['avg_cut'] = pd.cut(avg, bins=point, labels=[
   .....:                 'Very Low', 'Low', 'Mid', 'High', 'Very High']) # bin by the fixed edges in point
   .....: 
In [198]: df['avg_qcut'] = pd.qcut(avg, q=q, labels=[
   .....:                 'Very Low', 'Low', 'Mid', 'High', 'Very High']) # bin by the quantiles in q
   .....: 
In [199]: df.head()
Out[199]: 
   carat  cut  clarity  price avg_cut  avg_qcut
0   0.23    0        6    326     Low  Very Low
1   0.21    1        5    326     Low  Very Low
2   0.23    3        3    327     Low  Very Low
3   0.29    1        4    334     Low  Very Low
4   0.31    3        6    335     Low  Very Low
In [200]: df.avg_cut.unique() # labels that actually occur in avg_cut
Out[200]: 
['Low', 'Mid', 'High']
Categories (3, object): ['Low' < 'Mid' < 'High']
In [201]: df.avg_cut.cat.categories # every declared label, including the ones that never occur
Out[201]: Index(['Very Low', 'Low', 'Mid', 'High', 'Very High'], dtype='object')
In [202]: df.avg_cut = df.avg_cut.cat.remove_categories([
   .....:             'Very Low', 'Very High']) # drop the two labels that never occur in avg_cut
   .....: 
In [203]: df.avg_cut.head(3)
Out[203]: 
0    Low
1    Low
2    Low
Name: avg_cut, dtype: category
Categories (3, object): ['Low' < 'Mid' < 'High']
In [204]: interval_avg = pd.IntervalIndex(pd.qcut(avg, q=q)) # qcut without labels yields one interval per row
In [205]: interval_avg.right.to_series().reset_index(drop=True).head(3) # right endpoint of each row's interval
Out[205]: 
0    2295.0
1    2295.0
2    2295.0
dtype: float64
In [206]: interval_avg.left.to_series().reset_index(drop=True).head(3) # left endpoint of each row's interval
Out[206]: 
0    1051.162
1    1051.162
2    1051.162
dtype: float64
In [207]: interval_avg.length.to_series().reset_index(drop=True).head(3) # interval length (right endpoint minus left endpoint)
Out[207]: 
0    1243.838
1    1243.838
2    1243.838
dtype: float64