# (extraction artifact removed: stray run of line numbers)
# --- Preparation: load the abalone dataset and hold out a test split ---
import pandas as pd

# Tab-separated file with no header row; last column is the target (rings).
data = pd.read_csv('abalone.txt', sep='\t', header=None)
data.head()

# Features are every column but the last; target is the last column.
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

from sklearn.model_selection import train_test_split

# Fixed random_state so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import time
import warnings

# Silence sklearn convergence/deprecation chatter for the experiments below.
warnings.filterwarnings("ignore")
# --- Random-forest tuning via exhaustive grid search ---
# Scores with negated MAE (make_scorer flips the sign when
# greater_is_better=False) so that GridSearchCV can maximize it.
start = time.time()
rfr = RandomForestRegressor(random_state=0)
params = {
    'n_estimators': list(range(10, 200, 40)),
    'max_depth': list(range(10, 200, 40)),
}
gs_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=params,
    cv=5,
    scoring=make_scorer(mean_absolute_error, greater_is_better=False),
)
gs_rfr.fit(x_train, y_train)
end = time.time()
print(gs_rfr.best_params_)
print(gs_rfr.best_score_)  # best (negated) cross-validated MAE
print('网格搜索所消耗的时间为:%.3f S' % float(end - start))
# Fix: the original evaluated score() twice, discarding the first result;
# compute the held-out score once and print it.
test_score = gs_rfr.score(x_test, y_test)
print(test_score)
# --- Random-forest tuning via randomized search ---
# Same parameter space as the grid search above, but RandomizedSearchCV
# samples candidate combinations instead of trying them all, so it is
# faster at the cost of possibly missing the exact optimum.
start = time.time()
rfc = RandomForestRegressor(random_state=0)
params = {
    'n_estimators': list(range(10, 200, 40)),
    'max_depth': list(range(10, 200, 40)),
}
gs_rfc = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=params,
    cv=5,
    scoring=make_scorer(mean_absolute_error, greater_is_better=False),
)
gs_rfc.fit(x_train, y_train)
end = time.time()
print(gs_rfc.best_params_)
print(gs_rfc.best_score_)  # best (negated) cross-validated MAE
print('随机网格搜索所消耗的时间为:%.3f S' % float(end - start))
# Fix: the original evaluated score() twice, discarding the first result;
# compute the held-out score once and print it.
test_score = gs_rfc.score(x_test, y_test)
print(test_score)
# --- Chi-square feature selection ---
# The classic chi-square test measures dependence between a categorical
# feature and a categorical target: with N feature values and M target
# values, it compares observed vs. expected cell frequencies,
# χ² = Σ (A - E)² / E, and keeps the k features with the largest statistic.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# chi2 requires non-negative inputs, so shift each feature column up by
# its minimum before selecting the 5 best features.
x_non_negative = x - np.min(x)
selector = SelectKBest(chi2, k=5)
xs = selector.fit_transform(x_non_negative, y)
# --- Recursive feature elimination (RFE) ---
# Repeatedly fits a base model, drops the features with the smallest
# coefficients, and refits on the remainder until only the requested
# number of features is left.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

eliminator = RFE(estimator=LinearRegression(), n_features_to_select=5)
XS = eliminator.fit_transform(x, y)
# --- Tuning curve on the chi-square-selected features ---
from sklearn.model_selection import train_test_split

xs_train, xs_test, y_train, y_test = train_test_split(
    xs, y, test_size=0.2, random_state=0
)

# Baseline random forest on the selected features.
# Fix: the original computed both MSEs as bare expressions and discarded
# the results (a notebook artifact); print them instead. Argument order
# corrected to (y_true, y_pred) — MSE is symmetric, so the value is the same.
rfr = RandomForestRegressor()
rfr.fit(xs_train, y_train)
train_predicts = rfr.predict(xs_train)
test_predicts = rfr.predict(xs_test)
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep n_estimators and record train/test MSE for each setting.
error_trains = []
error_tests = []
estimator_counts = list(range(10, 200, 10))
for n in estimator_counts:
    rfr = RandomForestRegressor(n_estimators=n, random_state=90)
    rfr.fit(xs_train, y_train)
    error_trains.append(mean_squared_error(y_train, rfr.predict(xs_train)))
    error_tests.append(mean_squared_error(y_test, rfr.predict(xs_test)))

# Fix: the original rebound `x` to the plot axis, clobbering the feature
# DataFrame; use a dedicated name instead.
plt.plot(estimator_counts, error_trains, '--', label='train')
plt.plot(estimator_counts, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()
# --- Tuning curve on the RFE-selected features ---
from sklearn.model_selection import train_test_split

XS_train, XS_test, y_train, y_test = train_test_split(
    XS, y, test_size=0.2, random_state=0
)

# Baseline random forest on the RFE-selected features.
# Fix: the original computed both MSEs as bare expressions and discarded
# the results (a notebook artifact); print them instead. Argument order
# corrected to (y_true, y_pred) — MSE is symmetric, so the value is the same.
rfr = RandomForestRegressor()
rfr.fit(XS_train, y_train)
train_predicts = rfr.predict(XS_train)
test_predicts = rfr.predict(XS_test)
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep n_estimators and record train/test MSE for each setting.
error_trains = []
error_tests = []
estimator_counts = list(range(10, 200, 10))
for n in estimator_counts:
    rfr = RandomForestRegressor(n_estimators=n, random_state=90)
    rfr.fit(XS_train, y_train)
    error_trains.append(mean_squared_error(y_train, rfr.predict(XS_train)))
    error_tests.append(mean_squared_error(y_test, rfr.predict(XS_test)))

# Fix: the original rebound `x` to the plot axis, clobbering the feature
# DataFrame; use a dedicated name instead.
plt.plot(estimator_counts, error_trains, '--', label='train')
plt.plot(estimator_counts, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()