## 1.1 二手房分析
```
import pandas as pd
# 讀取Excel文件
df = pd.read_excel("xihu.xlsx")
def processName(x):
return x.strip("['']")
#數據清洗
df['小區名稱'] = df['小區名稱'].apply(processName)
df['建筑面積'] = df['建筑面積'].str.replace('㎡', '').astype(float)
#任務1:小王有200萬買房預算,想買一個建筑面積80平米以上,一梯兩戶,附近有學校的房子,請幫他篩選出來?
def contains_school(x):
words = ["學校", "大學", "中學", "小學"]
for word in words:
if word in str(x):
return True
return False
xiaowang_df=df[(df['建筑面積'] > 80) & (df['梯戶比例'] == '一梯兩戶')&
(df["小區介紹"].apply(contains_school)|df["核心賣點"].apply(contains_school))]
# 任務2:公司董事長請你幫忙挑選出西湖區最高檔的小區(單價最貴)
average_prices_per_area = df.groupby('區域位置')['單價'].mean()
# 按照平均房價降序排序
sorted_prices = average_prices_per_area.sort_values(ascending=False)
sorted_prices.index[0]
# 任務3 識別炒房
# 轉換日期列為datetime對象
df['上次交易'] = pd.to_datetime(df['上次交易'])
df['掛牌時間'] = pd.to_datetime(df['掛牌時間'])
# 計算日期差異(以年為單位)
df['交易與掛牌時間差'] = (df['掛牌時間'] - df['上次交易']) / pd.Timedelta(days=365)
# 篩選條件:面積小于100平米,且交易與掛牌時間差小于5年
filtered_df = df[(df['建筑面積'] < 100) & (df['交易與掛牌時間差'] < 5)]
```
## 2.新浪股票分析
### 可視化
```python
import pandas as pd
import matplotlib.pyplot as plt
# 定義列名
column_names = ['股票代碼', '交易日期', '收盤價', '最高價', '最低價', '開盤價', '前收盤', '漲跌額', '漲跌幅', '換手率', '成交量', '成交金額', '總市值', '流通市值']
# 讀取CSV文件,指定列名
df = pd.read_csv('stock.csv',names=column_names)
# 數據預處理,將交易日期轉換為日期時間對象
df['交易日期'] = pd.to_datetime(df['交易日期'])
# 可視化收盤價隨時間的變化
plt.figure(figsize=(22, 6))
plt.plot(df['交易日期'], df['收盤價'], label='close')
# 可視化最高價和最低價隨時間的變化
plt.plot(df['交易日期'], df['最高價'], label='high')
plt.plot(df['交易日期'], df['最低價'], label='low')
# 添加圖例
plt.legend()
# 添加標題和軸標簽
plt.title('time change')
plt.xlabel('date')
plt.ylabel('price(rmb)')
# 顯示圖表
plt.show()
# 可視化成交量隨時間的變化
plt.figure(figsize=(12, 6))
plt.plot(df['交易日期'], df['成交量'], label='volume')
# 添加標題和軸標簽
plt.title('change')
plt.xlabel('date')
plt.ylabel('volume')
# 顯示圖表
plt.show()
```
```
# 4.3.1 計算收盤價常用統計量
import numpy as np
closing_price = np.loadtxt("stock.csv",
delimiter=",",
usecols=(2))
print("closing_price的類型是:",type(closing_price))
print("closing_price的維數是:",closing_price.shape)
print("closing_price元素個數是:",closing_price.size)
print(closing_price)
avg = np.mean(closing_price)
print("收盤價的平均值是:%.2f" % avg)
med = np.median(closing_price)
print("收盤價的中位數是%.2f" % med)
print("中位數所在位置的索引是:%d" % np.where( closing_price == 23.95 ))
variance =np.var(closing_price)
print("收盤價的方差是:%.2f" % variance)
```
```
# 4.3.2 計算股價最高值和最低值
from numpy import loadtxt
from numpy import max
from numpy import min
from numpy import ptp
(high_price,
low_price) = loadtxt("stock.csv",\
delimiter = ",",\
usecols=(3,4),\
unpack=True)
print("high_price的類型是:",type(high_price))
print("high_price的維數是:",high_price.shape)
print("high_price的元素個數是:",high_price.size)
print("low_price的類型是:",type(low_price))
print("low_price的維數是:",low_price.shape)
print("low_price的元素個數是:",low_price.size)
highest = max(high_price)
print("該股票的股價最高值是:",highest)
lowest = min(low_price)
print("該股票的股價最低值是:",lowest)
middle = (highest + lowest)/2
print("該股票股價的中間值是:",middle)
high_range = ptp(high_price)
print("該股票最高價的波動范圍是:",high_range)
low_range = ptp(low_price)
print("該股票最低價的波動范圍是:",low_range)
```
## 4.3.3 加權平均價
```
import numpy as np
closing_price = np.loadtxt("stock.csv",\
delimiter = ",",\
usecols = (2),\
unpack=False)
volume = np.loadtxt("stock.csv",\
delimiter = ",",\
usecols = (11),\
unpack=False)
vwap = np.average(closing_price,weights=volume)
print("該股票的成交量加權平均值是:%.2f" % vwap )
t = np.arange(closing_price.shape[0])
twap = np.average(closing_price,weights=t)
print("該股票的時間加權平均值是:%.2f" % twap )
```
## 4.3.4 周末效應
```
import datetime
#年/月/日 ===>星期幾
def date2str(nowDate):
nowDate = str(nowDate,"GB2312")
return datetime.datetime.strptime(nowDate,"%Y/%m/%d").date().weekday()+1
days, closing_price = np.loadtxt('stock.csv',\
delimiter = ',',\
usecols = (1,2),\
converters = {1:date2str},\
unpack=True)
for i in range(days.size):
print("發生交易的天數是星期%d,當天收盤價是%f" \
% (days[i], closing_price[i]))
price_avg = np.zeros(5)
for i in range(1,6):
index = np.where( days == i )
price = np.take(closing_price,index)
avg = np.mean(price)
price_avg[i-1] = avg
print('星期', i, '的平均收盤價是:', price_avg[i-1])
```
## 相似度計算練習——菜品推薦
```
import numpy as np
# 每道菜有4個特征(辣度、甜度、油膩度、咸度)
products = {
'鍋包肉': np.array([2.0, 4.5, 4.9,3.5]),
'毛血旺': np.array([4.5, 2.0, 4.5,4.6]),
'宮保雞丁': np.array([3.8, 4.0, 3.7,3.5]),
'水煮肉': np.array([4.3, 2.2, 4.5,4.5]),
# ... 可以添加更多商品
}
# 計算兩個向量(商品)之間的內積
def inner_product(vec1, vec2):
return np.dot(vec1, vec2)
#找出相速度最高的菜
def findSimilar(x):
# 獲取給定菜品的特征向量
target_vec = products[x]
# 初始化最大相似度和最相似菜品
max_similarity = 0.0
most_similar_dish = None
# 遍歷所有菜品,計算與給定菜品的內積
for dish, vec in products.items():
if dish != x: # 排除自身
similarity = inner_product(target_vec, vec)
# 如果當前內積大于之前的最大相似度,則更新最大相似度和最相似菜品
if similarity > max_similarity:
max_similarity = similarity
most_similar_dish = dish
# 返回最相似菜品和相似度
return most_similar_dish, max_similarity
```
