十、模型選擇 · 數據科學和人工智能技術筆記

# 十、模型選擇 > 作者：[Chris Albon](https://chrisalbon.com/) > > 譯者：[飛龍](https://github.com/wizardforcel) > > 協議：[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) ## 在模型選擇期間尋找最佳預處理步驟 在進行模型選擇時，我們必須小心正確處理預處理。首先，`GridSearchCV`使用交叉驗證來確定哪個模型表現最好。然而，在交叉驗證中，我們假裝作為測試集被留出的一折是不可見的，因此不能用於擬合任何預處理步驟（例如縮放或標準化）。出於這個原因，我們不能先預處理數據然後再運行`GridSearchCV`。其次，一些預處理方法有自己的參數，通常必須由用戶提供。通過在搜索空間中包括候選成分值，可以像對待其他任何需要搜索的超參數一樣對待它們。 ```py # 加載庫 import numpy as np from sklearn import datasets from sklearn.feature_selection import SelectKBest from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler # 設置隨機種子 np.random.seed(0) # 加載數據 iris = datasets.load_iris() X = iris.data y = iris.target ``` 我們包括兩個不同的預處理步驟：主成分分析和 k 最佳特徵選擇。 ```py # 創建組合預處理對象 preprocess = FeatureUnion([('pca', PCA()), ("kbest", SelectKBest(k=1))]) # 創建流水線 pipe = Pipeline([('preprocess', preprocess), ('classifier', LogisticRegression())]) # 創建候選值空間 search_space = [{'preprocess__pca__n_components': [1, 2, 3], 'classifier__penalty': ['l1', 'l2'], 'classifier__C': np.logspace(0, 4, 10)}] # 創建網格搜索 clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1) # 擬合網格搜索 best_model = clf.fit(X, y) # 查看最佳超參數 print('Best Number Of Princpal Components:', best_model.best_estimator_.get_params()['preprocess__pca__n_components']) print('Best Penalty:', best_model.best_estimator_.get_params()['classifier__penalty']) print('Best C:', best_model.best_estimator_.get_params()['classifier__C']) ''' Best Number Of Princpal Components: 3 Best Penalty: l1 Best C: 59.9484250319 ''' ``` ## 使用網格搜索的超參數調優 ![](https://img.kancloud.cn/ea/03/ea0382cde7e74034025acd289250d0a4_1802x1202.jpg) ```py # 加載庫 import numpy as np from sklearn import linear_model, datasets from sklearn.model_selection import GridSearchCV # 加載數據 iris = datasets.load_iris() X = iris.data y = iris.target # 
創建邏輯回歸 logistic = linear_model.LogisticRegression() # 創建正則化懲罰空間 penalty = ['l1', 'l2'] # 創建正則化超參數空間 C = np.logspace(0, 4, 10) # 創建超參數選項 hyperparameters = dict(C=C, penalty=penalty) # 使用 5 折交叉驗證創建網格搜索 clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0) # 擬合網格搜索 best_model = clf.fit(X, y) # 查看最佳超參數 print('Best Penalty:', best_model.best_estimator_.get_params()['penalty']) print('Best C:', best_model.best_estimator_.get_params()['C']) ''' Best Penalty: l1 Best C: 7.74263682681 ''' # 預測目標向量 best_model.predict(X) ''' array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) ''' ``` ## 使用隨機搜索的超參數調優 ```py # 加載庫 from scipy.stats import uniform from sklearn import linear_model, datasets from sklearn.model_selection import RandomizedSearchCV # 加載數據 iris = datasets.load_iris() X = iris.data y = iris.target # 創建邏輯回歸 logistic = linear_model.LogisticRegression() # 創建正則化懲罰空間 penalty = ['l1', 'l2'] # 使用均勻分布創建正則化超參數分布 C = uniform(loc=0, scale=4) # 創建超參數選項 hyperparameters = dict(C=C, penalty=penalty) # 使用 5 折交叉驗證和 100 個迭代 clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1) # 擬合隨機搜索 best_model = clf.fit(X, y) # 查看最佳超參數 print('Best Penalty:', best_model.best_estimator_.get_params()['penalty']) print('Best C:', best_model.best_estimator_.get_params()['C']) ''' Best Penalty: l1 Best C: 1.66808801881 ''' # 預測目標向量 best_model.predict(X) ''' array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) ''' ``` ## 使用網格搜索的模型選擇 ![](https://img.kancloud.cn/7a/5f/7a5f9150f5d1ddb9b39d5e9ebeefa3d3_1801x1201.jpg) ```py # 加載庫 import numpy as np from sklearn import datasets from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline # 設置隨機種子 np.random.seed(0) # 加載數據 iris = datasets.load_iris() X = iris.data y = iris.target ``` 請注意，我們包括需要搜索的多個可能的學習算法和多個可能的超參數值。 ```py # 創建流水線 pipe = Pipeline([('classifier', RandomForestClassifier())]) # 創建候選學習算法和它們的超參數的空間 search_space = [{'classifier': [LogisticRegression()], 'classifier__penalty': ['l1', 'l2'], 'classifier__C': np.logspace(0, 4, 10)}, {'classifier': [RandomForestClassifier()], 'classifier__n_estimators': [10, 100, 1000], 'classifier__max_features': [1, 2, 3]}] # 創建網格搜索 clf = GridSearchCV(pipe, search_space, cv=5, verbose=0) # 擬合網格搜索 best_model = clf.fit(X, y) # 查看最佳模型 best_model.best_estimator_.get_params()['classifier'] ''' LogisticRegression(C=7.7426368268112693, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l1', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False) ''' # 預測目標向量 best_model.predict(X) ''' array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2]) ''' ``` ## 帶有參數選項的流水線 ```py # 導入所需的包 import numpy as np from sklearn import linear_model, decomposition, datasets from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV, cross_val_score from sklearn.preprocessing import StandardScaler # 加載乳腺癌數據集 dataset = datasets.load_breast_cancer() # 從數據集特徵中創建 X X = dataset.data # 從數據集目標中創建 y y = dataset.target # 創建縮放器對象 sc = StandardScaler() # 創建 PCA 對象 pca = decomposition.PCA() # 創建邏輯回歸對象，帶有 L2 懲罰 logistic = linear_model.LogisticRegression() # 創建三步流水線。首先，標準化數據。 # 其次，使用 PCA 轉換數據。 # 然後在數據上訓練邏輯回歸。 pipe = Pipeline(steps=[('sc', sc), ('pca', pca), ('logistic', logistic)]) # 創建 1 到 30 的整數列表（上限為 X 的特徵數 + 1，不含上限） n_components = list(range(1,X.shape[1]+1,1)) # 創建正則化參數的一列值 C = np.logspace(-4, 4, 50) # 為正則化懲罰創建一列選項 penalty = ['l1', 'l2'] # 為所有參數選項創建字典 # 注意，你可以使用 '__' 來訪問流水線的步驟的參數 parameters = dict(pca__n_components=n_components, logistic__C=C, logistic__penalty=penalty) # 創建網格搜索對象 clf = GridSearchCV(pipe, parameters) # 擬合網格搜索 clf.fit(X, y) # 查看超參數 print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty']) print('Best C:', clf.best_estimator_.get_params()['logistic__C']) print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components']) # 使用 3 折交叉驗證擬合網格搜索 cross_val_score(clf, X, y) ```