## 2.1 井下溫度缺失值和異常值處理
```
import numpy as np
# Read column 1 of the CSV as raw byte strings, skipping the header row, so
# that empty fields survive loading and can be detected afterwards.
temperature_str = np.loadtxt(
    'ug_detect.csv',
    dtype=bytes,
    delimiter=',',
    skiprows=1,
    usecols=1,
    unpack=False,
)
print("讀取出的數組是temperature_str:\n", temperature_str)
# Convert every raw byte field to a float. An empty field is a missing
# reading: assigning None into a float array stores NaN.
temperature = np.empty(len(temperature_str))
for index, item in enumerate(temperature_str):
    temperature[index] = float(item.decode('gb2312')) if item != b"" else None

# Treat readings of 500 or more as outliers and blank them out as well.
# NaN compares False against the limit, so missing readings are left alone.
for index, item in enumerate(temperature):
    if item >= 500.0:
        temperature[index] = None
print("溫度是:\n", temperature)
import matplotlib.pyplot as plt
# Plot the readings against their sample number: first as a line, then the
# same points again with the 'pr' format string (red pentagon markers).
# NaN entries show up as gaps.
t = np.arange(temperature.shape[0])
for plot_args in ((t, temperature), (t, temperature, 'pr')):
    plt.plot(*plot_args)
plt.show()
def bisec(dataArray):
    """Fill NaN entries of a 1-D numeric sequence in place.

    Each missing value is replaced by the linear interpolation between the
    nearest valid samples on either side; for an isolated gap this is the
    midpoint of its two neighbours. Missing values before the first or after
    the last valid sample take the value of that nearest valid sample.

    Args:
        dataArray: Mutable 1-D sequence of floats (e.g. a NumPy array) in
            which NaN marks a missing value. Modified in place.

    Returns:
        None. If there is nothing to fill, or no valid sample to fill from,
        the input is left untouched.
    """
    values = np.asarray(dataArray, dtype=float)
    missing = np.isnan(values)
    if not missing.any() or missing.all():
        return

    positions = np.arange(len(values))
    # np.interp clamps to the first/last valid sample outside their range,
    # which avoids both the wrap-around at index 0 and the out-of-range
    # access at the last index.
    filled = np.interp(positions[missing], positions[~missing], values[~missing])
    for position, value in zip(positions[missing], filled):
        dataArray[position] = value
# Fill the missing readings in place, then redraw the same line-plus-markers
# figure so the repaired series can be compared with the earlier plot.
bisec(temperature)
t = np.arange(temperature.shape[0])
for plot_args in ((t, temperature), (t, temperature, 'pr')):
    plt.plot(*plot_args)
plt.show()
import time
import random
# Print a marker line every five seconds.
# NOTE(review): the loop has no exit condition, so it runs until interrupted
# and nothing placed after it in this cell will execute -- confirm that this
# is intentional and not leftover scratch code.
pause_seconds = 5
while True:
    print("aaa")
    time.sleep(pause_seconds)
```
## 2.2 使用pandas
```
import pandas as pd
import matplotlib.pyplot as plt
import scipy.interpolate as itp
# Load the monitoring table; row 0 of the file supplies the column names.
ug_data = pd.read_csv('ug_detect.csv', header=0, encoding='gb2312')

# Pull out one Series per monitored quantity.
# NOTE(review): the '?' inside some labels may be a mis-encoded unit symbol;
# the labels must match the CSV header exactly -- confirm against the file.
temperature_data, humidity_data, gas_data, co_data = (
    ug_data[label]
    for label in (u'溫度(?C)', u'相對濕度', u'瓦斯(m?/min)', u'一氧化碳(m?/min)')
)
# Find outliers and mark them as missing.
def defectsCop(data_series, threshold):
    """Replace every value at or above ``threshold`` with NaN, in place.

    Values are addressed by position, so the Series may carry any index.
    Entries that are already missing compare False against the threshold and
    are left unchanged.

    Args:
        data_series: Numeric pandas Series to clean. Modified in place.
        threshold: Upper limit; anything convertible by ``float()``. Values
            greater than or equal to it are treated as outliers.

    Returns:
        None.
    """
    limit = float(threshold)
    outliers = (data_series >= limit).to_numpy()
    # Skip the write when nothing exceeds the limit so a clean Series is not
    # touched at all.
    if outliers.any():
        data_series.iloc[outliers] = float("nan")
def seriesItp(data_series):
    """Fill missing values of a Series in place by Lagrange interpolation.

    Each missing entry is evaluated on the Lagrange polynomial through the
    nearest valid sample on each side (two points, i.e. a straight line), so
    an isolated gap receives the midpoint of its neighbours and longer gaps
    are filled as well. A missing entry with valid data on one side only
    takes the value of the nearest valid sample. Entries are addressed by
    position, so the Series may carry any index.

    Args:
        data_series: Numeric pandas Series with missing values (NaN/None).
            Modified in place.

    Returns:
        None. A Series with no missing values, or with no valid value to
        interpolate from, is left untouched.
    """
    import bisect

    missing = data_series.isnull().to_numpy()
    known = [pos for pos, is_missing in enumerate(missing) if not is_missing]
    if not known or len(known) == len(missing):
        return

    for pos, is_missing in enumerate(missing):
        if not is_missing:
            continue
        # `known` is sorted, so the split point gives the valid neighbours
        # immediately to the left and right of this gap.
        cut = bisect.bisect_left(known, pos)
        left = known[cut - 1] if cut > 0 else None
        right = known[cut] if cut < len(known) else None
        if left is None:
            value = data_series.iloc[right]
        elif right is None:
            value = data_series.iloc[left]
        else:
            lagrange_poly = itp.lagrange(
                [left, right],
                [data_series.iloc[left], data_series.iloc[right]],
            )
            value = lagrange_poly(pos)
        data_series.iloc[pos] = value
# Each monitored series paired with the limit at or above which a reading is
# considered an outlier.
monitored_series = (
    (temperature_data, 60),
    (humidity_data, 200),
    (gas_data, 100),
    (co_data, 100),
)

# First blank out the outliers in every series, then interpolate the gaps.
for series, limit in monitored_series:
    defectsCop(series, limit)
for series, _ in monitored_series:
    seriesItp(series)
# Collect the cleaned series into one table under shorter column names and
# write it out without the row index, using the same encoding as the input.
all_data = pd.DataFrame({
    "溫度": temperature_data,
    "相對濕度": humidity_data,
    "瓦斯濃度": gas_data,
    "一氧化碳濃度": co_data,
})
all_data.to_csv('all_data_pandas.csv', index=False, encoding='gb2312')
```
## 3.1 歌詞處理
```
# 1 句頻統計
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Read the whole lyrics file into one string.
with open('jaychou_lyrics.txt', 'r', encoding='utf-8') as f:
    lyrics = f.read()

# Tokenise: every maximal run of word characters becomes one token, so text
# is split wherever whitespace or punctuation occurs.
token_pattern = re.compile(r'\w+')
words = token_pattern.findall(lyrics)

# Count the tokens and report the ten most frequent ones.
word_count = Counter(words)
print("Top 10 words:")
top_entries = word_count.most_common(10)
for word, count in top_entries:
    print(f"{word}: {count}")
#2 提詞器
import pandas as pd
# Name of the lyrics text file.
file_name = 'jaychou_lyrics.txt'

# Load the file and keep one lyric line per Series element.
with open(file_name, 'r', encoding='utf-8') as f:
    lyric_text = f.read()
lyrics = pd.Series(lyric_text.splitlines())
# Look up a lyric line and return the line that follows it.
def get_next_line(input_line):
    """Return the lyric line following the first exact match of ``input_line``.

    The lookup runs against the module-level ``lyrics`` Series. When the line
    is not present, or its first occurrence is the final line, a fixed
    "not found" message is returned instead.
    """
    matching_lines = lyrics[lyrics == input_line]
    # The smallest matching label is the first occurrence; it is NaN when
    # nothing matched.
    index = matching_lines.index.min()
    if pd.isnull(index) or index >= len(lyrics) - 1:
        return "未找到該句歌詞或已經是最后一句了。"
    return lyrics.iloc[index + 1]
# Ask the user for a lyric line, look up the line that follows it, and show
# the result (or the "not found" message).
user_input = input("請輸入一句歌詞:")
next_line = get_next_line(user_input)
print(next_line)
```
## 4.1 幸福指數
### 補充map小練習
```
import pandas as pd
# A one-column DataFrame of Chinese words to translate.
data = {'ChineseWords': ['你好', '謝謝', '再見']}
df = pd.DataFrame(data)

# Lookup table from each Chinese word to its English translation.
translation_dict = {'你好': 'Hello', '謝謝': 'Thank you', '再見': 'Goodbye'}

# Series.map looks every value up in the dictionary; the result is stored as
# a new 'EnglishWords' column.
english_words = df['ChineseWords'].map(translation_dict)
df['EnglishWords'] = english_words

# Show the translated DataFrame.
print(df)
```
```
import pandas as pd
# Load the survey workbook and drop every row that has a missing value.
df = pd.read_excel('happy.xls').dropna()

# Data cleaning and preprocessing.
# Report the number of missing values per column.
# NOTE(review): this runs after dropna(), so every count will be zero --
# confirm whether the check was meant to run on the raw data.
print(df.isnull().sum())

# Encode the categorical columns as 0/1.
# NOTE(review): Series.map turns any category not listed here into NaN --
# verify that each column only holds the two listed values.
binary_encodings = {
    '性別': {'男': 1, '女': 0},
    '是否城市': {'城市': 1, '農村': 0},
    '婚姻狀況': {'已婚': 1, '未婚': 0},
    '健康狀況': {'是': 1, '否': 0},
    '公共服務態度': {'滿意': 1, '不滿意': 0},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

# Remove the columns that are not needed.
df = df.drop(['編號', '調查時間'], axis=1)

# Show the first 100 rows of the preprocessed data.
print(df.head(100))
# 任務二
from sklearn.preprocessing import StandardScaler
# Derive two new features: age and total income.
# NOTE(review): the reference year 2023 is hard-coded -- confirm it matches
# the year the survey data refers to.
df['年齡'] = 2023 - df['出生年']
df['總收入'] = df['個人收入'] + df['家庭收入']

# Drop the original personal and household income columns (optional).
df = df.drop(['個人收入', '家庭收入'], axis=1)

# Standardise the two new features.
# (This heading used to be a bare identifier without '#', which raised
# NameError when the cell was executed.)
scaler = StandardScaler()
df[['年齡', '總收入']] = scaler.fit_transform(df[['年齡', '總收入']])

# Show the data with the new features.
print(df.head())
```
## 5.3 內、外、左、右連接——合并母嬰購物數據
```
import pandas as pd
# Load both tables from CSV, discarding rows that contain missing values.
mum_baby, trade_history = (
    pd.read_csv(csv_name).dropna()
    for csv_name in ('mum_baby.csv', 'trade_history.csv')
)

# Right join on user_id: every trade_history row is kept, and mum_baby
# columns are filled in where the user_id matches.
mum_baby.merge(trade_history, on='user_id', how='right')
```