Ex1:统计未出现的类别
def _crosstab_labels(s, dropna):
    """Return the labels that ``s`` contributes to one axis of the table.

    When ``s`` is categorical and ``dropna`` is False, every declared
    category is returned (including ones that never occur in the data).
    Otherwise only the values actually present in ``s`` are returned, in
    order of first appearance.
    """
    if isinstance(s.dtype, pd.CategoricalDtype) and not dropna:
        return s.cat.categories
    return s.unique()


def my_crosstab(s1, s2, dropna=True):
    """Count co-occurrences of the values of two equally long Series.

    Parameters
    ----------
    s1, s2 : pandas.Series
        Paired observations; ``s1`` supplies the row labels and ``s2`` the
        column labels. Their ``name`` attributes become the axis names.
    dropna : bool, default True
        If True, only values that occur in the data appear as labels.
        If False, a categorical input contributes all of its declared
        categories, so unobserved categories show up as all-zero rows or
        columns.

    Returns
    -------
    pandas.DataFrame
        Integer table whose cell ``[i, j]`` is the number of positions at
        which ``s1`` equals ``i`` and ``s2`` equals ``j``.

    Raises
    ------
    ValueError
        If ``s1`` and ``s2`` differ in length. Pairing them with ``zip``
        would otherwise drop the surplus elements without any warning.
    """
    if len(s1) != len(s2):
        raise ValueError(
            's1 and s2 must have the same length, got %d and %d'
            % (len(s1), len(s2)))
    idx1 = _crosstab_labels(s1, dropna)
    idx2 = _crosstab_labels(s2, dropna)
    # Allocate integer counts directly instead of float zeros cast at the end.
    res = pd.DataFrame(np.zeros((len(idx1), len(idx2)), dtype=int),
                       index=idx1, columns=idx2)
    # Values are paired by position, not by index label.
    for i, j in zip(s1, s2):
        res.at[i, j] += 1
    return res.rename_axis(index=s1.name, columns=s2.name)
In [175]: df = pd.DataFrame({'A':['a','b','c','a'],
.....: 'B':['cat','cat','dog','cat']})
.....:
In [176]: df.B = df.B.astype('category').cat.add_categories('sheep')  # declare 'sheep' as a category that never occurs in the data
In [177]: my_crosstab(df.A, df.B)  # default dropna=True: only observed values become columns
Out[177]:
B cat dog
A
a 2 0
b 1 0
c 0 1
In [178]: my_crosstab(df.A, df.B, dropna=False)  # dropna=False: the unobserved 'sheep' category is kept as an all-zero column
Out[178]:
B cat dog sheep
A
a 2 0 0
b 1 0 0
c 0 1 0
Ex2:钻石数据集
In [179]: df = pd.read_csv('data/diamonds.csv')  # load the diamonds dataset from a path relative to the working directory
In [180]: s_obj, s_cat = df.cut, df.cut.astype('category')  # the 'cut' column as read from the CSV and as category dtype, for the nunique timings that follow
In [181]: %timeit -n 30 s_obj.nunique()
5.27 ms +- 519 us per loop (mean +- std. dev. of 7 runs, 30 loops each)
In [182]: %timeit -n 30 s_cat.nunique()
1.09 ms +- 54.7 us per loop (mean +- std. dev. of 7 runs, 30 loops each)
In [183]: df.cut = df.cut.astype('category').cat.reorder_categories([
.....: 'Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],ordered=True)  # ordered categorical: 'Fair' lowest, 'Ideal' highest
.....:
In [184]: df.clarity = df.clarity.astype('category').cat.reorder_categories([
.....: 'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],ordered=True)  # ordered categorical: 'I1' lowest, 'IF' highest
.....:
In [185]: res = df.sort_values(['cut', 'clarity'], ascending=[False, True])  # cut descending, then clarity ascending, both following the category order set above
In [186]: res.head(3)
Out[186]:
carat cut clarity price
315 0.96 Ideal I1 2801
535 0.96 Ideal I1 2826
551 0.97 Ideal I1 2830
In [187]: res.tail(3)
Out[187]:
carat cut clarity price
47407 0.52 Fair IF 1849
49683 0.52 Fair IF 2144
50126 0.47 Fair IF 2211
In [188]: df.cut = df.cut.cat.reorder_categories(
.....: df.cut.cat.categories[::-1])  # reverse the category order of 'cut'
.....:
In [189]: df.clarity = df.clarity.cat.reorder_categories(
.....: df.clarity.cat.categories[::-1])  # reverse the category order of 'clarity'
.....:
In [190]: df.cut = df.cut.cat.codes # Method 1: use cat.codes
In [191]: clarity_cat = df.clarity.cat.categories
In [192]: df.clarity = df.clarity.replace(dict(zip(
.....: clarity_cat, np.arange(
.....: len(clarity_cat))))) # Method 2: map each category to its position with replace; NOTE(review): Series.replace on a categorical column appears to be deprecated in pandas 2.2+ - confirm against the pandas version in use
.....:
In [193]: df.head(3)
Out[193]:
carat cut clarity price
0 0.23 0 6 326
1 0.21 1 5 326
2 0.23 3 3 327
In [194]: q = [0, 0.2, 0.4, 0.6, 0.8, 1]
In [195]: point = [-np.infty, 1000, 3500, 5500, 18000, np.infty]
In [196]: avg = df.price / df.carat
In [197]: df['avg_cut'] = pd.cut(avg, bins=point, labels=[
.....: 'Very Low', 'Low', 'Mid', 'High', 'Very High'])
.....:
In [198]: df['avg_qcut'] = pd.qcut(avg, q=q, labels=[
.....: 'Very Low', 'Low', 'Mid', 'High', 'Very High'])
.....:
In [199]: df.head()
Out[199]:
carat cut clarity price avg_cut avg_qcut
0 0.23 0 6 326 Low Very Low
1 0.21 1 5 326 Low Very Low
2 0.23 3 3 327 Low Very Low
3 0.29 1 4 334 Low Very Low
4 0.31 3 6 335 Low Very Low
In [200]: df.avg_cut.unique()
Out[200]:
['Low', 'Mid', 'High']
Categories (3, object): ['Low' < 'Mid' < 'High']
In [201]: df.avg_cut.cat.categories
Out[201]: Index(['Very Low', 'Low', 'Mid', 'High', 'Very High'], dtype='object')
In [202]: df.avg_cut = df.avg_cut.cat.remove_categories([
.....: 'Very Low', 'Very High'])  # drop the two labels that df.avg_cut.unique() showed as never occurring
.....:
In [203]: df.avg_cut.head(3)
Out[203]:
0 Low
1 Low
2 Low
Name: avg_cut, dtype: category
Categories (3, object): ['Low' < 'Mid' < 'High']
In [204]: interval_avg = pd.IntervalIndex(pd.qcut(avg, q=q))  # qcut without labels, wrapped in an IntervalIndex so the endpoints can be read off
In [205]: interval_avg.right.to_series().reset_index(drop=True).head(3)  # right endpoint of each row's interval
Out[205]:
0 2295.0
1 2295.0
2 2295.0
dtype: float64
In [206]: interval_avg.left.to_series().reset_index(drop=True).head(3)  # left endpoint of each row's interval
Out[206]:
0 1051.162
1 1051.162
2 1051.162
dtype: float64
In [207]: interval_avg.length.to_series().reset_index(drop=True).head(3)  # length of each row's interval
Out[207]:
0 1243.838
1 1243.838
2 1243.838
dtype: float64