第03章數據分析入門 · Pandas Cookbook 帶注釋源碼

# 第03章數據分析入門 ```py In[1]: import pandas as pd import numpy as np from IPython.display import display pd.options.display.max_columns = 50 ``` ## 1\. 規劃數據分析路線 ```py # 讀取查看數據 In[2]: college = pd.read_csv('data/college.csv') In[3]: college.head() Out[3]: ``` ![](https://img.kancloud.cn/99/e8/99e83ab8daf1f1ed25985fd4d87ccb7d_943x513.png) ```py # 數據的行數與列數 In[4]: college.shape Out[4]: (7535, 27) ``` ```py # 統計數值列，并進行轉置 In[5]: with pd.option_context('display.max_rows', 8): display(college.describe(include=[np.number]).T) Out[5]: ``` ![](https://img.kancloud.cn/ab/fc/abfc66adf0fdf238a656634fff8dd17b_762x456.png) ```py # 統計對象和類型列 In[6]: college.describe(include=[np.object, pd.Categorical]).T Out[6]: ``` ![](https://img.kancloud.cn/a6/a2/a6a211a757a71b34e8d57a554591ae9b_724x248.png) ```py # 列出每列的數據類型，非缺失值的數量，以及內存的使用 In[7]: college.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 7535 entries, 0 to 7534 Data columns (total 27 columns): INSTNM 7535 non-null object CITY 7535 non-null object STABBR 7535 non-null object HBCU 7164 non-null float64 MENONLY 7164 non-null float64 WOMENONLY 7164 non-null float64 RELAFFIL 7535 non-null int64 SATVRMID 1185 non-null float64 SATMTMID 1196 non-null float64 DISTANCEONLY 7164 non-null float64 UGDS 6874 non-null float64 UGDS_WHITE 6874 non-null float64 UGDS_BLACK 6874 non-null float64 UGDS_HISP 6874 non-null float64 UGDS_ASIAN 6874 non-null float64 UGDS_AIAN 6874 non-null float64 UGDS_NHPI 6874 non-null float64 UGDS_2MOR 6874 non-null float64 UGDS_NRA 6874 non-null float64 UGDS_UNKN 6874 non-null float64 PPTUG_EF 6853 non-null float64 CURROPER 7535 non-null int64 PCTPELL 6849 non-null float64 PCTFLOAN 6849 non-null float64 UG25ABV 6718 non-null float64 MD_EARN_WNE_P10 6413 non-null object GRAD_DEBT_MDN_SUPP 7503 non-null object dtypes: float64(20), int64(2), object(5) memory usage: 1.6+ MB ``` ```py # 重復了，但沒設置最大行數 In[8]: college.describe(include=[np.number]).T Out[8]: ``` 
![](https://img.kancloud.cn/9d/ab/9dab524e91360b3e458205ac745b7e8e_950x941.png) ```py # 和前面重復了 In[9]: college.describe(include=[np.object, pd.Categorical]).T Out[9]: ``` ![](https://img.kancloud.cn/af/9d/af9deba2862ba623a286ac45988ea710_728x247.png) ### 更多 ```py # 在describe方法中，打印分位數 In[10]: with pd.option_context('display.max_rows', 5): display(college.describe(include=[np.number], percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T) ``` ![](https://img.kancloud.cn/dc/f8/dcf866cc515c8d77bd9e258a344a1783_1212x329.png) ```py # 展示一個數據字典：數據字典的主要作用是解釋列名的意義 In[11]: college_dd = pd.read_csv('data/college_data_dictionary.csv') In[12]: with pd.option_context('display.max_rows', 8): display(college_dd) ``` ![](https://img.kancloud.cn/3b/d0/3bd0cf87e52e89d9801813cb52326881_656x449.png) ## 2\. 改變數據類型，降低內存消耗 ```py # 選取五列 In[13]: college = pd.read_csv('data/college.csv') different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR'] col2 = college.loc[:, different_cols] col2.head() Out[13]: ``` ![](https://img.kancloud.cn/e5/9a/e59ad7f9cbadbb9bef65626331652682_792x245.png) ```py # 查看數據類型 In[14]: col2.dtypes Out[14]: RELAFFIL int64 SATMTMID float64 CURROPER int64 INSTNM object STABBR object dtype: object ``` ```py # 用memory_usage方法查看每列的內存消耗 In[15]: original_mem = col2.memory_usage(deep=True) original_mem Out[15]: Index 80 RELAFFIL 60280 SATMTMID 60280 CURROPER 60280 INSTNM 660240 STABBR 444565 dtype: int64 ``` ```py # RELAFFIL這列只包含0或1，因此沒必要用64位，使用astype方法將其變為8位（1字節）整數 In[16]: col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8) # 再次查看數據類型 In[17]: col2.dtypes Out[17]: RELAFFIL int8 SATMTMID float64 CURROPER int64 INSTNM object STABBR object dtype: object ``` ```py # 檢查兩個對象列的獨立值的個數 In[18]: col2.select_dtypes(include=['object']).nunique() Out[18]: INSTNM 7535 STABBR 59 dtype: int64 ``` ```py # STABBR列的獨立值個數小於總數的1%，因此可以轉變為分類類型（Categorical） In[19]: col2['STABBR'] = col2['STABBR'].astype('category') col2.dtypes Out[19]: RELAFFIL int8 SATMTMID float64 CURROPER int64 
INSTNM object STABBR category dtype: object ``` ```py # 再次檢查內存的使用 In[20]: new_mem = col2.memory_usage(deep=True) new_mem Out[20]: Index 80 RELAFFIL 7535 SATMTMID 60280 CURROPER 60280 INSTNM 660699 STABBR 13576 dtype: int64 ``` ```py # 通過和原始數據比較，RELAFFIL列變為了原來的八分之一，STABBR列只有原始大小的3% In[21]: new_mem / original_mem Out[21]: Index 1.000000 RELAFFIL 0.125000 SATMTMID 1.000000 CURROPER 1.000000 INSTNM 1.000695 STABBR 0.030538 dtype: float64 ``` ### 更多 ```py # CURROPER和INSTNM分別是int64和對象類型 In[22]: college = pd.read_csv('data/college.csv') In[23]: college[['CURROPER', 'INSTNM']].memory_usage(deep=True) Out[23]: Index 80 CURROPER 60280 INSTNM 660240 dtype: int64 ``` ```py # 將CURROPER列的第一個值改為10000000，內存使用沒有變化；而在INSTNM列的第一個值末尾追加一個字母a，內存消耗增加了105字節 In[24]: college.loc[0, 'CURROPER'] = 10000000 college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a' # college.loc[1, 'INSTNM'] = college.loc[1, 'INSTNM'] + 'a' college[['CURROPER', 'INSTNM']].memory_usage(deep=True) Out[24]: Index 80 CURROPER 60280 INSTNM 660345 dtype: int64 ``` ```py # 數據字典中的信息顯示MENONLY這列只包含0和1，但是由于含有缺失值，它的類型是浮點型 In[25]: college['MENONLY'].dtype Out[25]: dtype('float64') ``` ```py # 任何數值類型的列，只要有一個缺失值，就會成為浮點型；這列中的任何整數都會強制成為浮點型 In[26]: college['MENONLY'].astype('int8') # ValueError: Cannot convert non-finite values (NA or inf) to integer --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-26-98afc27c1701> in <module>() ----> 1 college['MENONLY'].astype('int8') # ValueError: Cannot convert non-finite values (NA or inf) to integer ~/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 116 else: 117 kwargs[new_arg_name] = new_arg_value --> 118 return func(*args, **kwargs) 119 return wrapper 120 return _deprecate_kwarg ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs) 4002 # else, only a single dtype is given 4003 new_data = 
self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 4004 **kwargs) 4005 return self._constructor(new_data).__finalize__(self) 4006 ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs) 3455 3456 def astype(self, dtype, **kwargs): -> 3457 return self.apply('astype', dtype=dtype, **kwargs) 3458 3459 def convert(self, **kwargs): ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3322 3323 kwargs['mgr'] = self -> 3324 applied = getattr(b, f)(**kwargs) 3325 result_blocks = _extend_blocks(applied, result_blocks) 3326 ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs) 542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 543 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 544 **kwargs) 545 546 def _astype(self, dtype, copy=False, errors='raise', values=None, ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs) 623 624 # _astype_nansafe works fine with 1-d only --> 625 values = astype_nansafe(values.ravel(), dtype, copy=True) 626 values = values.reshape(self.shape) 627 ~/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy) 685 686 if not np.isfinite(arr).all(): --> 687 raise ValueError('Cannot convert non-finite values (NA or inf) to ' 688 'integer') 689 ValueError: Cannot convert non-finite values (NA or inf) to integer ``` ```py # 對于數據類型，可以替換字符串名：27、28、30、31是等價的 In[27]: college.describe(include=['int64', 'float64']).T Out[27]: ``` ![](https://img.kancloud.cn/e8/ca/e8ca5750dd2c7ffa68013f95f2d80a2e_1050x908.png) ```py In[28]: college.describe(include=[np.int64, np.float64]).T Out[28]: ``` ![](https://img.kancloud.cn/cc/06/cc065be6883114ec265ef4145441b4b6_1044x909.png) ```py In[29]: 
college['RELAFFIL'] = college['RELAFFIL'].astype(np.int8) In[30]: college.describe(include=['int', 'float']).T # defaults to 64 bit int/floats Out[30]: ``` ![](https://img.kancloud.cn/0a/43/0a439c7e587f325778ded994c5613e1a_1046x873.png) ```py In[31]: college.describe(include=['number']).T # also works as the default int/float are 64 bits Out[31]: ``` ![](https://img.kancloud.cn/60/91/6091b017b6843f6d2aa02d6c5a121929_1048x914.png) ```py # 轉變數據類型時也可以如法炮制 In[32]: college['MENONLY'] = college['MENONLY'].astype('float16') college['RELAFFIL'] = college['RELAFFIL'].astype('int8') In[33]: college.index = pd.Int64Index(college.index) college.index.memory_usage() Out[33]: 60280 ``` ## 3\. 從最大中選擇最小 ```py # 讀取movie.csv，選取'movie_title', 'imdb_score', 'budget'三列 In[34]: movie = pd.read_csv('data/movie.csv') movie2 = movie[['movie_title', 'imdb_score', 'budget']] movie2.head() Out[34]: ``` ![](https://img.kancloud.cn/8a/38/8a38b8bb4109de3bfb1f24a39f3afafb_660x245.png) ```py # 用nlargest方法，選出imdb_score分數最高的100個 In[35]: movie2.nlargest(100, 'imdb_score').head() Out[35]: ``` ![](https://img.kancloud.cn/d0/fc/d0fc7a07e6c764f40d08c269e38ef93a_550x248.png) ```py # 用鏈式操作，nsmallest方法再從中挑出預算最小的五部 In[36]: movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget') Out[36]: ``` ![](https://img.kancloud.cn/3a/62/3a621e46db7218aa5bc18ff9ae2e06df_491x248.png) ## 4\. 
通過排序選取每組的最大值 ```py # 同上，選取出三列。按照title_year降序排列 In[37]: movie = pd.read_csv('data/movie.csv') movie2 = movie[['movie_title', 'title_year', 'imdb_score']] In[38]: movie2.sort_values('title_year', ascending=False).head() Out[38]: ``` ![](https://img.kancloud.cn/72/1a/721a5a9fee80f139d15e81924ba6473b_564x248.png) ```py # 用列表同時對兩列進行排序 In[39]: movie3 = movie2.sort_values(['title_year','imdb_score'], ascending=False) movie3.head() Out[39]: ``` ![](https://img.kancloud.cn/c9/f9/c9f967676de556288a33a8efdfc6b342_526x245.png) ```py # 用drop_duplicates去重，只保留每年的第一條數據 In[40]: movie_top_year = movie3.drop_duplicates(subset='title_year') movie_top_year.head() Out[40]: ``` ![](https://img.kancloud.cn/69/36/693698093c9525ef01792488caa2d30b_633x245.png) ```py # 通過給ascending設置列表，可以同時對一列降序排列，一列升序排列 In[41]: movie4 = movie[['movie_title', 'title_year', 'content_rating', 'budget']] movie4_sorted = movie4.sort_values(['title_year', 'content_rating', 'budget'], ascending=[False, False, True]) movie4_sorted.drop_duplicates(subset=['title_year', 'content_rating']).head(10) Out[41]: ``` ![](https://img.kancloud.cn/06/20/0620d771e13d904c3a892f950a15210c_803x445.png) ## 5\. 
用sort_values復現nlargest方法 ```py # 和前面一樣nlargest和nsmallest鏈式操作進行選取 In[42]: movie = pd.read_csv('data/movie.csv') movie2 = movie[['movie_title', 'imdb_score', 'budget']] movie_smallest_largest = movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget') movie_smallest_largest Out[42]: ``` ![](https://img.kancloud.cn/36/aa/36aac14a4c9e765fa4ef859ad9f95655_489x247.png) ```py # 用sort_values方法，選取imdb_score最高的100個 In[43]: movie2.sort_values('imdb_score', ascending=False).head(100).head() Out[43]: # 然后可以再.sort_values('budget').head()，選出預算最低的5個，結果如下 ``` ![](https://img.kancloud.cn/9b/f7/9bf779b1f16761ea2583c7fe7e0b1885_696x314.png) 這兩種方法得到的最小的5部電影不同，用tail進行調查： ```py # tail可以查看尾部 In[45]: movie2.nlargest(100, 'imdb_score').tail() Out[45]: ``` ![](https://img.kancloud.cn/9b/f7/9bf779b1f16761ea2583c7fe7e0b1885_696x314.png) ```py In[46]: movie2.sort_values('imdb_score', ascending=False).head(100).tail() Out[46]: ``` ![](https://img.kancloud.cn/01/f7/01f7b808111d65b40440055bea86c7bd_876x324.png) 這是因為評分在8.4以上的電影超過了100部。 ## 6\. 
計算跟蹤止損單價格 ```py # pip install pandas_datareader 或 conda install pandas_datareader，來安裝pandas_datareader In[47]: import pandas_datareader as pdr ``` > 筆記：pandas_datareader的問題 > pandas_datareader在讀取“google”源時會有問題。如果碰到問題，切換到“Yahoo”。 ```py # 查詢特斯拉自2017年1月1日起的股價 In[49]: tsla = pdr.DataReader('tsla', data_source='yahoo',start='2017-1-1') tsla.head(8) Out[49]: ``` ![](https://img.kancloud.cn/62/12/6212505070ca57b2397349b5a4d12cd3_788x403.png) ```py # 只關注每天的收盤價，使用cummax得到迄今為止的收盤價最大值 In[50]: tsla_close = tsla['Close'] In[51]: tsla_cummax = tsla_close.cummax() tsla_cummax.head(8) Out[51]: ``` ![](https://img.kancloud.cn/4f/85/4f85baf42f1204902ed0763fdf45ad04_312x262.png) ```py # 將下行區間限制到10%：把tsla_cummax乘以0.9，得到跟蹤止損價 >>> tsla_trailing_stop = tsla_cummax * .9 >>> tsla_trailing_stop.head(8) Date 2017-01-03 195.291 2017-01-04 204.291 2017-01-05 204.291 2017-01-06 206.109 2017-01-09 208.152 2017-01-10 208.152 2017-01-11 208.152 2017-01-12 208.152 Name: Close, dtype: float64 ``` ### 更多 ```py # 將上述功能包裝成一個函數：symbol為股票代碼，purchase_date為起始（買入）日期，perc為與歷史最高收盤價相乘的比例（如0.9） In[52]: def set_trailing_loss(symbol, purchase_date, perc): close = pdr.DataReader(symbol, 'yahoo', start=purchase_date)['Close'] return close.cummax() * perc In[53]: tsla_cummax = tsla_close.cummax() tsla_cummax.head(8) Out[53]: ``` ![](https://img.kancloud.cn/52/73/52733ed400c711265d6c33ef78f9fa7a_311x190.png)