# 六、日期時間預處理
> 作者:[Chris Albon](https://chrisalbon.com/)
>
> 譯者:[飛龍](https://github.com/wizardforcel)
>
> 協議:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/)
## 把日期和時間拆成多個特徵
```py
# Load the library
import pandas as pd

# Start from an empty data frame
df = pd.DataFrame()

# Fill it with 150 weekly timestamps beginning in January 2001
df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')

# Derive one feature column per datetime component (year down to minute)
for component in ('year', 'month', 'day', 'hour', 'minute'):
    df[component] = getattr(df['date'].dt, component)

# Show the first three rows
df.head(3)
```
| | date | year | month | day | hour | minute |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 2001-01-07 | 2001 | 1 | 7 | 0 | 0 |
| 1 | 2001-01-14 | 2001 | 1 | 14 | 0 | 0 |
| 2 | 2001-01-21 | 2001 | 1 | 21 | 0 | 0 |
## 計算日期時間之間的差
```py
# Load the library
import pandas as pd

# Create an empty data frame
df = pd.DataFrame()

# Create two datetime features
df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]

# Duration between the two features, as a timedelta Series
duration = df['Left'] - df['Arrived']
duration
'''
0 0 days
1 2 days
dtype: timedelta64[ns]
'''

# The same duration expressed as whole days; the vectorized `.dt.days`
# accessor avoids looping over the individual timedeltas in Python
duration.dt.days
'''
0 0
1 2
dtype: int64
'''
```
## 將字符串轉換為日期
```py
# Load the libraries
import numpy as np
import pandas as pd

# Day-first date strings with a 12-hour clock and an AM/PM marker
date_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])
```
如果`errors="coerce"`,那麼任何問題都不會引發錯誤(默認行為是`errors="raise"`,即引發錯誤),而是將導致錯誤的值設置為`NaT`(即缺失值)。
| 代碼 | 描述 | 示例 |
| --- | --- | --- |
| ` %Y ` | 整年 | `2001` |
| ` %m ` | 零填充的月份 | `04` |
| ` %d ` | 零填充的日期 | `09` |
| ` %I ` | 零填充的小時(12 小時) | `02` |
| ` %p ` | AM 或 PM | `AM` |
| ` %M ` | 零填充的分鐘 | `05` |
| ` %S ` | 零填充的秒鐘 | `09` |
```py
# Day-month-year, 12-hour clock, minutes, AM/PM marker
day_first_format = "%d-%m-%Y %I:%M %p"

# Convert each string to a datetime; with errors="coerce" a value that
# cannot be parsed becomes NaT instead of raising
[pd.to_datetime(text, format=day_first_format, errors="coerce")
 for text in date_strings]
'''
[Timestamp('2005-04-03 23:35:00'),
Timestamp('2010-05-23 00:01:00'),
Timestamp('2009-09-04 21:09:00')]
'''
```
## 轉換 pandas 列的時區
```py
# Load the libraries
import pandas as pd
from pytz import all_timezones

# Show the first ten time zone names
all_timezones[:10]
'''
['Africa/Abidjan',
'Africa/Accra',
'Africa/Addis_Ababa',
'Africa/Algiers',
'Africa/Asmara',
'Africa/Asmera',
'Africa/Bamako',
'Africa/Bangui',
'Africa/Banjul',
'Africa/Bissau']
'''
# Create ten month-end dates. The MonthEnd offset object is used instead of
# the 'M' string alias, which pandas 2.2 deprecated in favour of 'ME'; the
# offset object behaves the same on old and new pandas versions.
dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq=pd.offsets.MonthEnd()))

# Attach a time zone to the naive timestamps
dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')

# View the pandas Series
dates_with_abidjan_time_zone
'''
0 2002-02-28 00:00:00+00:00
1 2002-03-31 00:00:00+00:00
2 2002-04-30 00:00:00+00:00
3 2002-05-31 00:00:00+00:00
4 2002-06-30 00:00:00+00:00
5 2002-07-31 00:00:00+00:00
6 2002-08-31 00:00:00+00:00
7 2002-09-30 00:00:00+00:00
8 2002-10-31 00:00:00+00:00
9 2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]
'''

# Convert the time-zone-aware timestamps to another time zone
dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')

# View the pandas Series
dates_with_london_time_zone
'''
0 2002-02-28 00:00:00+00:00
1 2002-03-31 00:00:00+00:00
2 2002-04-30 01:00:00+01:00
3 2002-05-31 01:00:00+01:00
4 2002-06-30 01:00:00+01:00
5 2002-07-31 01:00:00+01:00
6 2002-08-31 01:00:00+01:00
7 2002-09-30 01:00:00+01:00
8 2002-10-31 00:00:00+00:00
9 2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Europe/London]
'''
```
## 編碼星期
```py
# Load the library
import pandas as pd

# Create three month-end dates. The MonthEnd offset object is used instead
# of the 'M' string alias, which pandas 2.2 deprecated in favour of 'ME'.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))

# View the data
dates
'''
0 2002-02-28
1 2002-03-31
2 2002-04-30
dtype: datetime64[ns]
'''

# View the day of the week as a name. `dt.day_name()` replaces the
# `dt.weekday_name` attribute, which was removed in pandas 1.0.
dates.dt.day_name()
'''
0 Thursday
1 Sunday
2 Tuesday
dtype: object
'''
```
## 處理時間序列中的缺失值
```py
# Load the libraries
import pandas as pd
import numpy as np

# Create five month-end dates. The MonthEnd offset object is used instead
# of the 'M' string alias, which pandas 2.2 deprecated in favour of 'ME'.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())

# Create a data frame indexed by those dates
df = pd.DataFrame(index=time_index)

# Create a feature with a gap of two missing values
df['Sales'] = [1.0, 2.0, np.nan, np.nan, 5.0]

# Fill the missing values by interpolation
df.interpolate()
```
| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | 4.0 |
| 2010-05-31 | 5.0 |
```py
# Forward-fill: replace each missing value with the last value before it
df.ffill()
```
| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 2.0 |
| 2010-04-30 | 2.0 |
| 2010-05-31 | 5.0 |
```py
# Back-fill: replace each missing value with the next value after it
df.bfill()
```
| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 5.0 |
| 2010-04-30 | 5.0 |
| 2010-05-31 | 5.0 |
```py
# Interpolate, but fill at most one consecutive missing value, working forward
df.interpolate(limit=1, limit_direction='forward')
```
| | Sales |
| --- | --- |
| 2010-01-31 | 1.0 |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | NaN |
| 2010-05-31 | 5.0 |
## 處理時區
```py
# Load the libraries
import pandas as pd
from pytz import all_timezones

# Show the first ten time zone names
all_timezones[:10]
'''
['Africa/Abidjan',
'Africa/Accra',
'Africa/Addis_Ababa',
'Africa/Algiers',
'Africa/Asmara',
'Africa/Asmera',
'Africa/Bamako',
'Africa/Bangui',
'Africa/Banjul',
'Africa/Bissau']
'''
# The wall-clock time used by both examples below
six_am = '2017-05-01 06:00:00'

# Build a timestamp that carries its time zone from the start
pd.Timestamp(six_am, tz='Europe/London')
# Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

# Build a naive timestamp, then attach a time zone to it
date = pd.Timestamp(six_am)
date_in_london = date.tz_localize(tz='Europe/London')

# Express the same instant in another time zone
date_in_london.tz_convert(tz='Africa/Abidjan')
# Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')
```
## 平移時間特徵
```py
# Load the library
import pandas as pd

# Build the data frame in one step: five consecutive days and their prices
df = pd.DataFrame({
    'dates': pd.date_range('1/1/2001', periods=5, freq='D'),
    'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5],
})

# Lag the price by one row, so each row also holds the previous row's value
df['previous_days_stock_price'] = df['stock_price'].shift(periods=1)

# Show the data frame
df
```
| | dates | stock_price | previous_days_stock_price |
| --- | --- | --- | --- |
| 0 | 2001-01-01 | 1.1 | NaN |
| 1 | 2001-01-02 | 2.2 | 1.1 |
| 2 | 2001-01-03 | 3.3 | 2.2 |
| 3 | 2001-01-04 | 4.4 | 3.3 |
| 4 | 2001-01-05 | 5.5 | 4.4 |
## 滑動時間窗口
```py
# Load the library
import pandas as pd

# Create five month-end dates. The MonthEnd offset object is used instead
# of the 'M' string alias, which pandas 2.2 deprecated in favour of 'ME'.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())

# Create a data frame indexed by those dates
df = pd.DataFrame(index=time_index)

# Create the feature
df['Stock_Price'] = [1, 2, 3, 4, 5]

# Mean over a sliding window of two rows; the first row has no full
# window yet and is therefore NaN
df.rolling(window=2).mean()
```
| | Stock_Price |
| --- | --- |
| 2010-01-31 | NaN |
| 2010-02-28 | 1.5 |
| 2010-03-31 | 2.5 |
| 2010-04-30 | 3.5 |
| 2010-05-31 | 4.5 |
```py
# Maximum value within each sliding window of two rows
df.rolling(window=2).max()
```
| | Stock_Price |
| --- | --- |
| 2010-01-31 | NaN |
| 2010-02-28 | 2.0 |
| 2010-03-31 | 3.0 |
| 2010-04-30 | 4.0 |
| 2010-05-31 | 5.0 |
## 選擇日期時間範圍
```py
# Load the library
import pandas as pd

# Create an empty data frame
df = pd.DataFrame()

# Create 100,000 hourly timestamps. The Hour offset object is used instead
# of the 'H' string alias, which pandas 2.2 deprecated in favour of 'h'.
df['date'] = pd.date_range('1/1/2001', periods=100000, freq=pd.offsets.Hour())
```
如果數據幀未按時間索引,請使用此方法。
```py
# Keep observations strictly after 01:00 and up to and including 04:00
after_start = df['date'] > '2002-1-1 01:00:00'
until_end = df['date'] <= '2002-1-1 04:00:00'
df[after_start & until_end]
```
| | date |
| --- | --- |
| 8762 | 2002-01-01 02:00:00 |
| 8763 | 2002-01-01 03:00:00 |
| 8764 | 2002-01-01 04:00:00 |
如果數據幀按時間索引,請使用此方法。
```py
# Index the rows by their timestamp while keeping the 'date' column
df = df.set_index('date', drop=False)

# Label-based slice on the index; both end points are included
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']
```
| | date |
| --- | --- |
| date | |
| 2002-01-01 01:00:00 | 2002-01-01 01:00:00 |
| 2002-01-01 02:00:00 | 2002-01-01 02:00:00 |
| 2002-01-01 03:00:00 | 2002-01-01 03:00:00 |
| 2002-01-01 04:00:00 | 2002-01-01 04:00:00 |