第05章布爾索引 · Pandas Cookbook 帶注釋源碼

# 第05章布爾索引 ```py In[1]: import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline ``` ## 1\. 計算布爾值統計信息 ```py # 讀取movie，設定行索引是movie_title In[2]: pd.options.display.max_columns = 50 In[3]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') movie.head() Out[3]: ``` ![](https://img.kancloud.cn/46/f2/46f210d0aef57cae2b79580406456133_1202x609.png) ```py # 判斷電影時長是否超過兩小時 In[4]: movie_2_hours = movie['duration'] > 120 movie_2_hours.head(10) Out[4]: movie_title Avatar True Pirates of the Caribbean: At World's End True Spectre True The Dark Knight Rises True Star Wars: Episode VII - The Force Awakens False John Carter True Spider-Man 3 True Tangled False Avengers: Age of Ultron True Harry Potter and the Half-Blood Prince True Name: duration, dtype: bool ``` ```py # 有多少時長超過兩小時的電影 In[5]: movie_2_hours.sum() Out[5]: 1039 ``` ```py # 超過兩小時的電影的比例 In[6]: movie_2_hours.mean() Out[6]: 0.21135069161920261 ``` ```py # 用describe()輸出一些該布爾Series信息 In[7]: movie_2_hours.describe() Out[7]: count 4916 unique 2 top False freq 3877 Name: duration, dtype: object ``` ```py # 實際上，dureation這列是有缺失值的，要想獲得真正的超過兩小時的電影的比例，需要先刪掉缺失值 In[8]: movie['duration'].dropna().gt(120).mean() Out[8]: 0.21199755152009794 ``` ### 原理 ```py # 統計False和True值的比例 In[9]: movie_2_hours.value_counts(normalize=True) Out[9]: False 0.788649 True 0.211351 Name: duration, dtype: float64 ``` ### 更多 ```py # 比較同一個DataFrame中的兩列 In[10]: actors = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna() (actors['actor_1_facebook_likes'] > actors['actor_2_facebook_likes']).mean() Out[10]: 0.97776871303283708 ``` ## 2\. 構建多個布爾條件 ```py In[11]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') movie.head() Out[11]: ``` ![](https://img.kancloud.cn/22/48/2248b128785514a68f5f9f7b6feee1f3_1204x612.png) ```py # 創建多個布爾條件 In[12]: criteria1 = movie.imdb_score > 8 criteria2 = movie.content_rating == 'PG-13' criteria3 = (movie.title_year < 2000) | (movie.title_year >= 2010) criteria2.head() Out[12]: movie_title Avatar True Pirates of the Caribbean: At World's End True Spectre True The Dark Knight Rises True Star Wars: Episode VII - The Force Awakens False Name: content_rating, dtype: bool ``` ```py # 將這些布爾條件合并成一個 In[13]: criteria_final = criteria1 & criteria2 & criteria3 criteria_final.head() Out[13]: movie_title Avatar False Pirates of the Caribbean: At World's End False Spectre False The Dark Knight Rises True Star Wars: Episode VII - The Force Awakens False Name: content_rating, dtype: bool ``` ### 更多 ```py # 在Pandas中，位運算符（&, |, ~）的優先級高于比較運算符，因此如過前面的條件3不加括號，就會報錯 In[14]: movie.title_year < 2000 | movie.title_year > 2009 --------------------------------------------------------------------------- TypeError Traceback (most recent call last) /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y) 882 try: --> 883 result = op(x, y) 884 except TypeError: /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in <lambda>(x, y) 130 names('rand_'), op('&')), --> 131 ror_=bool_method(lambda x, y: operator.or_(y, x), 132 names('ror_'), op('|')), TypeError: ufunc 'bitwise_or' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'' During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y) 900 y = bool(y) --> 901 result = lib.scalar_binop(x, y, op) 902 except: pandas/_libs/lib.pyx in pandas._libs.lib.scalar_binop (pandas/_libs/lib.c:15035)() ValueError: Buffer dtype mismatch, expected 'Python object' but got 'double' During handling of the above exception, another exception occurred: TypeError Traceback (most recent call last) <ipython-input-14-1e7ee3f1401c> in <module>() ----> 1 movie.title_year < 2000 | movie.title_year > 2009 /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(self, other) 933 is_integer_dtype(np.asarray(other)) else fill_bool) 934 return filler(self._constructor( --> 935 na_op(self.values, other), 936 index=self.index)).__finalize__(self) 937 /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y) 903 raise TypeError("cannot compare a dtyped [{0}] array with " 904 "a scalar of type [{1}]".format( --> 905 x.dtype, type(y).__name__)) 906 907 return result TypeError: cannot compare a dtyped [float64] array with a scalar of type [bool] ``` ## 3\. 用布爾索引過濾 ```py # 讀取movie數據集，創建布爾條件 In[15]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') crit_a1 = movie.imdb_score > 8 crit_a2 = movie.content_rating == 'PG-13' crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009) final_crit_a = crit_a1 & crit_a2 & crit_a3 # 創建第二個布爾條件 In[16]: crit_b1 = movie.imdb_score < 5 crit_b2 = movie.content_rating == 'R' crit_b3 = (movie.title_year >= 2000) & (movie.title_year <= 2010) final_crit_b = crit_b1 & crit_b2 & crit_b3 # 將這兩個條件用或運算合并起來 In[17]: final_crit_all = final_crit_a | final_crit_b final_crit_all.head() Out[17]: movie_title Avatar False Pirates of the Caribbean: At World's End False Spectre False The Dark Knight Rises True Star Wars: Episode VII - The Force Awakens False dtype: bool ``` ```py # 用最終的布爾條件過濾數據 In[18]: movie[final_crit_all].head() Out[18]: ``` ![](https://img.kancloud.cn/d5/f7/d5f75e56d82c1add52f65af456ef9c6d_1202x528.png) ```py # 使用loc，對指定的列做過濾操作，可以清楚地看到過濾是否起作用 In[19]: cols = ['imdb_score', 'content_rating', 'title_year'] movie_filtered = movie.loc[final_crit_all, cols] movie_filtered.head(10) Out[19]: ``` ![](https://img.kancloud.cn/72/00/7200c5d7a429a5afa5fe2d1c50c35bae_621x478.png) ### 更多 ```py # 用一個長布爾表達式代替前面由短表達式生成的布爾條件 In[21]: final_crit_a2 = (movie.imdb_score > 8) & \ (movie.content_rating == 'PG-13') & \ ((movie.title_year < 2000) | (movie.title_year > 2009)) final_crit_a2.equals(final_crit_a) Out[21]: ``` ![](https://img.kancloud.cn/72/54/7254576652737cfdcd362515590e9c62_1215x448.png) ## 4\. 用標簽索引代替布爾索引 ```py # 用布爾索引選取所有得克薩斯州的學校 >>> college = pd.read_csv('data/college.csv') >>> college[college['STABBR'] == 'TX'].head() ``` ```py # 用STABBR作為行索引，然后用loc選取 In[22]: college2 = college.set_index('STABBR') college2.loc['TX'].head() Out[22]: ``` ![](https://img.kancloud.cn/87/9e/879e08666cfe3dc3d0c6d469816f48ce_1205x487.png) ```py # 比較二者的速度 In[23]: %timeit college[college['STABBR'] == 'TX'] 1.51 ms ± 51.4 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) In[24]: %timeit college2.loc['TX'] 604 μs ± 23.9 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) ``` ```py # 使用STABBR作為行索引所用的時間 In[25]: %timeit college2 = college.set_index('STABBR') 1.28 ms ± 47.5 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) ``` ### 更多 ```py # 使用布爾索引和標簽選取多列 In[26]: states =['TX', 'CA', 'NY'] college[college['STABBR'].isin(states)] college2.loc[states].head() Out[26]: ``` ![](https://img.kancloud.cn/81/7b/817bfe84c031ef77f4e3e0d91071417e_1204x487.png) ## 5\. 用唯一和有序索引選取 ```py # 讀取college數據集，使用STABBR作為行索引，檢查行索引是否有序 In[27]: college = pd.read_csv('data/college.csv') college2 = college.set_index('STABBR') In[28]: college2.index.is_monotonic Out[28]: False ``` ```py # 將college2排序，存儲成另一個對象，查看其是否有序 In[29]: college3 = college2.sort_index() college3.index.is_monotonic Out[29]: True ``` ```py # 從這三個DataFrame選取得克薩斯州，比較速度 In[30]: %timeit college[college['STABBR'] == 'TX'] 1.58 ms ± 63.8 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) In[31]: %timeit college2.loc['TX'] 622 μs ± 18.1 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) In[32]: %timeit college3.loc['TX'] 198 μs ± 5.8 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) ``` ```py # 使用INSTNM作為行索引，檢測行索引是否唯一 In[33]: college_unique = college.set_index('INSTNM') college_unique.index.is_unique Out[33]: True ``` ```py # 用布爾索引選取斯坦福大學 In[34]: college[college['INSTNM'] == 'Stanford University'] Out[34]: ``` ![](https://img.kancloud.cn/09/01/0901d5732947417d6bd769daf3155412_1205x142.png) ```py # 用行索引標簽選取斯坦福大學 In[35]: college_unique.loc['Stanford University'] Out[35]: CITY Stanford STABBR CA HBCU 0 MENONLY 0 WOMENONLY 0 RELAFFIL 0 SATVRMID 730 SATMTMID 745 DISTANCEONLY 0 UGDS 7018 UGDS_WHITE 0.3752 UGDS_BLACK 0.0591 UGDS_HISP 0.1607 UGDS_ASIAN 0.1979 UGDS_AIAN 0.0114 UGDS_NHPI 0.0038 UGDS_2MOR 0.1067 UGDS_NRA 0.0819 UGDS_UNKN 0.0031 PPTUG_EF 0 CURROPER 1 PCTPELL 0.1556 PCTFLOAN 0.1256 UG25ABV 0.0401 MD_EARN_WNE_P10 86000 GRAD_DEBT_MDN_SUPP 12782 Name: Stanford University, dtype: object ``` ```py # 比較兩種方法的速度 In[36]: %timeit college[college['INSTNM'] == 'Stanford University'] 1.44 ms ± 66 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) In[37]: %timeit college_unique.loc['Stanford University'] 191 μs ± 5.31 μs per loop (mean ± std. dev. of 7 runs, 10000 loops each) ``` ### 更多 ```py # 使用CITY和STABBR兩列作為行索引，并進行排序 In[38]: college.index = college['CITY'] + ', ' + college['STABBR'] college = college.sort_index() college.head() Out[38]: ``` ![](https://img.kancloud.cn/bf/2a/bf2ae83357bcde6b061040417f2c239a_1202x448.png) ```py # 選取所有Miami, FL的大學 In[39]: college.loc['Miami, FL'].head() Out[39]: ``` ![](https://img.kancloud.cn/0c/63/0c63eceb1326624d7e3cf879a2bbb20d_1202x553.png) ```py # 速度比較 In[40]: %%timeit crit1 = college['CITY'] == 'Miami' crit2 = college['STABBR'] == 'FL' college[crit1 & crit2] 2.83 ms ± 82.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each) In[41]: %timeit college.loc['Miami, FL'] 226 μs ± 17.3 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) ``` ```py # 判斷這兩個條件是否相同 In[42]: college[(college['CITY'] == 'Miami') & (college['STABBR'] == 'FL')].equals(college.loc['Miami, FL']) Out[42]: True ``` ## 6\. 觀察股價 ```py # 讀取Schlumberger stock數據集，行索引設為Date列，并將其轉變為DatetimeIndex In[43]: slb = pd.read_csv('data/slb_stock.csv', index_col='Date', parse_dates=['Date']) slb.head() Out[43]: ``` ![](https://img.kancloud.cn/0e/40/0e40fceb0e1a2eedf5791818c158b216_481x277.png) ```py # 選取Close這列，用describe返回統計信息 In[44]: slb_close = slb['Close'] slb_summary = slb_close.describe(percentiles=[.1, .9]) slb_summary Out[44]: count 1895.000000 mean 79.121905 std 11.767802 min 51.750000 10% 64.892000 50% 78.000000 90% 93.248000 max 117.950000 Name: Close, dtype: float64 ``` ```py # 用布爾索引選取最高和最低10%的收盤價 In[45]: upper_10 = slb_summary.loc['90%'] lower_10 = slb_summary.loc['10%'] criteria = (slb_close < lower_10) | (slb_close > upper_10) slb_top_bottom_10 = slb_close[criteria] # 過濾出的數據使用灰色，所有的收盤價使用黑色，用matplotlib在十分之一和十分之九分位數位置畫橫線 In[46]: slb_close.plot(color='black', figsize=(12,6)) slb_top_bottom_10.plot(marker='o', style=' ', ms=4, color='lightgray') xmin = criteria.index[0] xmax = criteria.index[-1] plt.hlines(y=[lower_10, upper_10], xmin=xmin, xmax=xmax,color='black') Out[46]: <matplotlib.collections.LineCollection at 0x1174b3278> ``` ![](https://img.kancloud.cn/4c/93/4c93db535e215ffe39e4beca106bd5cc_1079x532.png) ### 更多 ```py # 使用fill_between可以在兩條線之間填充顏色 In[47]: slb_close.plot(color='black', figsize=(12,6)) plt.hlines(y=[lower_10, upper_10], xmin=xmin, xmax=xmax,color='lightgray') plt.fill_between(x=criteria.index, y1=lower_10, y2=slb_close.values, color='black') plt.fill_between(x=criteria.index,y1=lower_10, y2=slb_close.values, where=slb_close < lower_10, color='lightgray') plt.fill_between(x=criteria.index, y1=upper_10, y2=slb_close.values, where=slb_close > upper_10, color='lightgray') Out[47]: ``` ![](https://img.kancloud.cn/f7/c2/f7c2a07c87f6f4596e8db4a815ebd7d2_1068x535.png) ## 7\. 翻譯SQL的WHERE語句 ```py # 讀取employee數據集 In[48]: employee = pd.read_csv('data/employee.csv') # 對各項做下了解 In[49]: employee.DEPARTMENT.value_counts().head() Out[49]: Houston Police Department-HPD 638 Houston Fire Department (HFD) 384 Public Works & Engineering-PWE 343 Health & Human Services 110 Houston Airport System (HAS) 106 Name: DEPARTMENT, dtype: int64 In[50]: employee.GENDER.value_counts() Out[50]: Male 1397 Female 603 Name: GENDER, dtype: int64 In[51]: employee.BASE_SALARY.describe().astype(int) Out[51]: count 1886 mean 55767 std 21693 min 24960 25% 40170 50% 54461 75% 66614 max 275000 Name: BASE_SALARY, dtype: int64 ``` ```py # 創建布爾條件，并從'UNIQUE_ID', 'DEPARTMENT', 'GENDER', 'BASE_SALARY'四列選取 In[52]: depts = ['Houston Police Department-HPD', 'Houston Fire Department (HFD)'] criteria_dept = employee.DEPARTMENT.isin(depts) criteria_gender = employee.GENDER == 'Female' criteria_sal = (employee.BASE_SALARY >= 80000) & \ (employee.BASE_SALARY <= 120000) In[53]: criteria_final = criteria_dept & criteria_gender & criteria_sal In[54]: select_columns = ['UNIQUE_ID', 'DEPARTMENT', 'GENDER', 'BASE_SALARY'] employee.loc[criteria_final, select_columns].head() Out[54]: ``` ![](https://img.kancloud.cn/c1/68/c168f569e04b4f308189244d06549291_707x243.png) ### 更多 ```py # 使用between選取80000到120000之間的薪水 In[55]: criteria_sal = employee.BASE_SALARY.between(80000, 120000) # 排除最常出現的5家單位 In[56]: top_5_depts = employee.DEPARTMENT.value_counts().index[:5] criteria = ~employee.DEPARTMENT.isin(top_5_depts) employee[criteria].head() Out[56]: ``` ![](https://img.kancloud.cn/ae/8d/ae8d110cb8de8fb9c29d21ade16a01a9_1204x465.png) 功能一樣的SQL語句是： ```py SELECT * FROM EMPLOYEE WHERE DEPARTMENT not in ( SELECT DEPARTMENT FROM ( SELECT DEPARTMENT, COUNT(1) as CT FROM EMPLOYEE GROUP BY DEPARTMENT ORDER BY CT DESC LIMIT 5 ) ); ``` ## 8\. 確定股票收益的正態值 ```py # 加載亞馬遜的股票數據，使用Data作為行索引 In[57]: amzn = pd.read_csv('data/amzn_stock.csv', index_col='Date', parse_dates=['Date']) amzn.head() Out[57]: ``` ![](https://img.kancloud.cn/1a/2d/1a2db6f25f9f6a72c8f41d71319f0ae4_511x286.png) ```py # 選取Close收盤價，用pct_change()計算每日回報率 In[58]: amzn_daily_return = amzn.Close.pct_change() amzn_daily_return.head() Out[58]: Date 2010-01-04 NaN 2010-01-05 0.005900 2010-01-06 -0.018116 2010-01-07 -0.017013 2010-01-08 0.027077 Name: Close, dtype: float64 ``` ```py # 去掉缺失值，畫一張柱狀圖，查看分布情況 In[59]: amzn_daily_return = amzn_daily_return.dropna() amzn_daily_return.hist(bins=20) Out[59]: <matplotlib.axes._subplots.AxesSubplot at 0x1174b3128> ``` ![](https://img.kancloud.cn/1f/09/1f09e8a652b8855eb955f015c694916d_566x385.png) ```py # 計算平均值和標準差 In[60]: mean = amzn_daily_return.mean() std = amzn_daily_return.std() # 計算每個數據的z-score的絕對值：z-score是遠離平均值的標準差值得個數 In[61]: abs_z_score = amzn_daily_return.sub(mean).abs().div(std) # 計算位于1，2，3個標準差之內的收益率的比例 In[62]: pcts = [abs_z_score.lt(i).mean() for i in range(1,4)] print('{:.3f} fall within 1 standard deviation. ' '{:.3f} within 2 and {:.3f} within 3'.format(*pcts)) 0.787 fall within 1 standard deviation. 0.956 within 2 and 0.985 within 3 ``` ### 更多 ```py # 將上面的方法整合成一個函數 In[63]: def test_return_normality(stock_data): close = stock_data['Close'] daily_return = close.pct_change().dropna() daily_return.hist(bins=20) mean = daily_return.mean() std = daily_return.std() abs_z_score = abs(daily_return - mean) / std pcts = [abs_z_score.lt(i).mean() for i in range(1,4)] print('{:.3f} fall within 1 standard deviation. ' '{:.3f} within 2 and {:.3f} within 3'.format(*pcts)) In[64]: slb = pd.read_csv('data/slb_stock.csv', index_col='Date', parse_dates=['Date']) test_return_normality(slb) 0.742 fall within 1 standard deviation. 0.946 within 2 and 0.986 within 3 ``` ![](https://img.kancloud.cn/f9/54/f954a424e6069c5e82862443e98aa857_565x380.png) ## 9\. 使用查詢方法提高布爾索引的可讀性 ```py # 讀取employee數據，確定選取的部門和列 In[65]: employee = pd.read_csv('data/employee.csv') depts = ['Houston Police Department-HPD', 'Houston Fire Department (HFD)'] select_columns = ['UNIQUE_ID', 'DEPARTMENT', 'GENDER', 'BASE_SALARY'] # 創建查詢字符串，并執行query方法 In[66]: qs = "DEPARTMENT in @depts " \ "and GENDER == 'Female' " \ "and 80000 <= BASE_SALARY <= 120000" emp_filtered = employee.query(qs) emp_filtered[select_columns].head() Out[66]: ``` ![](https://img.kancloud.cn/fd/f7/fdf7fbfe78d58eb61d49ca4ff541a1e6_703x246.png) ### 更多 ```py # 若要不使用部門列表，也可以使用下面的方法 In[67]: top10_depts = employee.DEPARTMENT.value_counts().index[:10].tolist() qs = "DEPARTMENT not in @top10_depts and GENDER == 'Female'" employee_filtered2 = employee.query(qs) employee_filtered2[['DEPARTMENT', 'GENDER']].head() Out[67]: ``` ![](https://img.kancloud.cn/72/ca/72cafee38f3de3047994aef7c79f51ea_417x249.png) ## 10\. 用where方法保留Series ```py # 讀取movie數據集，movie_title作為行索引，actor_1_facebook_likes列刪除缺失值 In[68]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') fb_likes = movie['actor_1_facebook_likes'].dropna() fb_likes.head() Out[68]: movie_title Avatar 1000.0 Pirates of the Caribbean: At World's End 40000.0 Spectre 11000.0 The Dark Knight Rises 27000.0 Star Wars: Episode VII - The Force Awakens 131.0 Name: actor_1_facebook_likes, dtype: float64 ``` ```py # 使用describe獲得對數據的認知 In[69]: fb_likes.describe(percentiles=[.1, .25, .5, .75, .9]).astype(int) Out[69]: count 4909 mean 6494 std 15106 min 0 10% 240 25% 607 50% 982 75% 11000 90% 18000 max 640000 Name: actor_1_facebook_likes, dtype: int64 ``` ```py # 作用和前面相同（這里是作者代碼弄亂了） In[70]: fb_likes.describe(percentiles=[.1,.25,.5,.75,.9]) Out[70]: count 4909.000000 mean 6494.488491 std 15106.986884 min 0.000000 10% 240.000000 25% 607.000000 50% 982.000000 75% 11000.000000 90% 18000.000000 max 640000.000000 Name: actor_1_facebook_likes, dtype: float64 ``` ```py # 畫一張柱狀圖 In[71]: fb_likes.hist() Out[71]: <matplotlib.axes._subplots.AxesSubplot at 0x10f9fbe80> ``` ![](https://img.kancloud.cn/b4/f6/b4f6e472d2ea4aeedc8de167c0532491_583x383.png) ```py # 檢測小于20000個喜歡的的比例 In[72]: criteria_high = fb_likes < 20000 criteria_high.mean().round(2) Out[71]: 0.91000000000000003 ``` ```py # where條件可以返回一個同樣大小的Series，但是所有False會被替換成缺失值 In[73]: fb_likes.where(criteria_high).head() Out[73]: movie_title Avatar 1000.0 Pirates of the Caribbean: At World's End NaN Spectre 11000.0 The Dark Knight Rises NaN Star Wars: Episode VII - The Force Awakens 131.0 Name: actor_1_facebook_likes, dtype: float64 ``` ```py # 第二個參數other，可以讓你控制替換值 In[74]: fb_likes.where(criteria_high, other=20000).head() Out[74]: movie_title Avatar 1000.0 Pirates of the Caribbean: At World's End 20000.0 Spectre 11000.0 The Dark Knight Rises 20000.0 Star Wars: Episode VII - The Force Awakens 131.0 Name: actor_1_facebook_likes, dtype: float64 ``` ```py # 通過where條件，設定上下限的值 In[75]: criteria_low = fb_likes > 300 fb_likes_cap = fb_likes.where(criteria_high, other=20000)\ .where(criteria_low, 300) fb_likes_cap.head() Out[75]: movie_title Avatar 1000.0 Pirates of the Caribbean: At World's End 20000.0 Spectre 11000.0 The Dark Knight Rises 20000.0 Star Wars: Episode VII - The Force Awakens 300.0 Name: actor_1_facebook_likes, dtype: float64 ``` ```py # 原始Series和修改過的Series的長度是一樣的 In[76]: len(fb_likes), len(fb_likes_cap) Out[76]: (4909, 4909) ``` ```py # 再做一張柱狀圖，效果好多了 In[77]: fb_likes_cap.hist() Out[77]: <matplotlib.axes._subplots.AxesSubplot at 0x10eeea8d0> ``` ![](https://img.kancloud.cn/dc/8e/dc8e9c2ec7623c0ef8de940214598e97_582x380.png) ```py In[78]: fb_likes_cap2 = fb_likes.clip(lower=300, upper=20000) fb_likes_cap2.equals(fb_likes_cap) Out[78]: True ``` ## 11\. 對DataFrame的行做mask ```py # 讀取movie，根據條件進行篩選 In[79]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') c1 = movie['title_year'] >= 2010 c2 = movie['title_year'].isnull() criteria = c1 | c2 # 使用mask方法，使所有滿足條件的數據消失 In[80]: movie.mask(criteria).head() Out[80]: ``` ![](https://img.kancloud.cn/a0/86/a0860549dab8d8c9b36f8f3e15690c86_1586x580.png) ```py # 去除缺失值 In[81]: movie_mask = movie.mask(criteria).dropna(how='all') movie_mask.head() Out[81]: ``` ![](https://img.kancloud.cn/09/37/093737843881775038c2dbfcc3708dc9_1198x611.png) ```py # 用布爾索引選取title_year小于2010的電影 In[82]: movie_boolean = movie[movie['title_year'] < 2010] movie_boolean.head() Out[82]: ``` ![](https://img.kancloud.cn/ad/db/addb8727a35fa3e3a702a69dc02ecf0b_1204x607.png) ```py # 判斷這兩種方法是否相同 In[83]: movie_mask.equals(movie_boolean) Out[83]: False ``` ```py # 判斷二者的形狀是否相同 In[84]: movie_mask.shape == movie_boolean.shape Out[84]: True ``` ```py # mask方法產生了許多缺失值，缺失值是float類型，所以之前是整數型的列都變成了浮點型 In[85]: movie_mask.dtypes == movie_boolean.dtypes Out[85]: color True director_name True num_critic_for_reviews True duration True director_facebook_likes True actor_3_facebook_likes True actor_2_name True actor_1_facebook_likes True gross True genres True actor_1_name True num_voted_users False cast_total_facebook_likes False actor_3_name True facenumber_in_poster True plot_keywords True movie_imdb_link True num_user_for_reviews True language True country True content_rating True budget True title_year True actor_2_facebook_likes True imdb_score True aspect_ratio True movie_facebook_likes False dtype: bool ``` ```py # Pandas有一個assert_frame_equal方法，可以判斷兩個Pandas對象是否一樣，而不檢測其數據類型 In[86]: from pandas.testing import assert_frame_equal assert_frame_equal(movie_boolean, movie_mask, check_dtype=False) ``` ### 更多 ```py # 比較mask和布爾索引的速度，兩者相差了一個數量級 In[87]: %timeit movie.mask(criteria).dropna(how='all') 11.1 ms ± 48.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each) In[88]: %timeit movie[movie['title_year'] < 2010] 1.12 ms ± 36.7 μs per loop (mean ± std. dev. of 7 runs, 1000 loops each) ``` ## 12\. 使用布爾值、整數、標簽進行選取 ```py # 讀取movie，根據布爾條件選取 In[89]: movie = pd.read_csv('data/movie.csv', index_col='movie_title') c1 = movie['content_rating'] == 'G' c2 = movie['imdb_score'] < 4 criteria = c1 & c2 In[90]: movie_loc = movie.loc[criteria] movie_loc.head() Out[90]: ``` ![](https://img.kancloud.cn/20/f1/20f171e232d6c5cf6b3ceb9639b80517_1205x579.png) ```py # 檢查loc條件和布爾條件創建出來的兩個DataFrame是否一樣 In[91]: movie_loc.equals(movie[criteria]) Out[91]: True ``` ```py # 嘗試用.iloc使用布爾索引 In[92]: movie_iloc = movie.iloc[criteria] --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-92-24a12062c6c3> in <module>() ----> 1 movie_iloc = movie.iloc[criteria] /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key) 1326 else: 1327 key = com._apply_if_callable(key, self.obj) -> 1328 return self._getitem_axis(key, axis=0) 1329 1330 def _is_scalar_access(self, key): /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1731 1732 if is_bool_indexer(key): -> 1733 self._has_valid_type(key, axis) 1734 return self._getbool_axis(key, axis=axis) 1735 /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_type(self, key, axis) 1588 "indexing on an integer type " 1589 "is not available") -> 1590 raise ValueError("iLocation based boolean indexing cannot use " 1591 "an indexable as a mask") 1592 return True ValueError: iLocation based boolean indexing cannot use an indexable as a mask ``` ```py # 但是，卻可以使用布爾值得ndarray，用values可以取出array In[93]: movie_iloc = movie.iloc[criteria.values] In[94]: movie_iloc.equals(movie_loc) Out[94]: True In[95]: movie.loc[criteria.values] Out[95]: ``` ![](https://img.kancloud.cn/f4/be/f4bec7bd6bf93277b6fb9620fc5daded_1209x670.png) ```py # 布爾索引也可以用來選取列 In[96]: criteria_col = movie.dtypes == np.int64 criteria_col.head() Out[96]: color False director_name False num_critic_for_reviews False duration False director_facebook_likes False dtype: bool ``` ```py In[97]: movie.loc[:, criteria_col].head() Out[97]: ``` ![](https://img.kancloud.cn/af/fa/affab611e32d018b5d5c02a7971473b8_1029x285.png) ```py # 因為criteria_col是包含行索引的一個Series，必須要使用底層的ndarray，才能使用，iloc In[98]: movie.iloc[:, criteria_col.values].head() Out[98]: ``` ![](https://img.kancloud.cn/23/2b/232ba17053f8b1671918a5c32acc8aaf_1030x286.png) ```py # 選取'content_rating', 'imdb_score', 'title_year', 'gross'四列，按照imdb_score升序排列 In[99]: cols = ['content_rating', 'imdb_score', 'title_year', 'gross'] movie.loc[criteria, cols].sort_values('imdb_score') Out[99]: ``` ![](https://img.kancloud.cn/7a/b6/7ab6ecd202876cf34caaa55269a356b3_773x314.png) ```py # 用get_loc獲取這四列的整數位置 In[100]: col_index = [movie.columns.get_loc(col) for col in cols] col_index Out[100]: [20, 24, 22, 8] ``` ```py # 這時候就可以使用iloc了 In[101]: movie.iloc[criteria.values, col_index].sort_values('imdb_score') Out[101]: ``` ![](https://img.kancloud.cn/bc/d2/bcd23e41c575c7ba98c830fd74ea40bb_772x321.png) ### 原理 ```py # 查看Series的底層結構 In[102]: a = criteria.values a[:5] Out[102]: array([False, False, False, False, False], dtype=bool) In[103]: len(a), len(criteria) Out[103]: (4916, 4916) ``` ### 更多 ```py # 傳入的布爾索引可以跟要操作的DataFrame長度不同 In[104]: movie.loc[[True, False, True], [True, False, False, True]] Out[104]: ``` ![](https://img.kancloud.cn/a3/98/a398c8a17153f6738a5cd2c6703abf5e_285x163.png)