1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
'''
Preparation: load the abalone dataset and split it into train/test sets.
'''
# Abalone data file: tab-separated, no header row; last column is the target.
import pandas as pd
data = pd.read_csv('abalone.txt', sep='\t', header=None)
# FIX: a bare `data.head()` is a no-op in a script (it only displays in a
# notebook) — print it so the preview is actually shown.
print(data.head())

x = data.iloc[:, :-1]  # feature columns (everything but the last)
y = data.iloc[:, -1]   # target column (number of rings)

from sklearn.model_selection import train_test_split
# Hold out 20% for testing; fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Library imports
from sklearn.model_selection import GridSearchCV # exhaustive grid search
from sklearn.model_selection import RandomizedSearchCV # randomized grid search
from sklearn.ensemble import RandomForestRegressor # random forest REGRESSOR (original comment wrongly said "classifier")
from sklearn.metrics import accuracy_score # classification metric (imported but unused below)
from sklearn.metrics import make_scorer # wraps a metric function into a scorer for *SearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error # regression metric passed to make_scorer
from sklearn.linear_model import LogisticRegression # logistic regression (imported but unused below)
from sklearn.model_selection import cross_val_score # cross-validation helper (imported but unused below)
from sklearn.linear_model import LinearRegression # linear regression (re-imported later as RFE base model)
import matplotlib.pyplot as plt
import numpy as np
import time
import warnings

warnings.filterwarnings("ignore") # silence sklearn/matplotlib warnings for cleaner demo output

'''
Random forest hyper-parameter tuning via exhaustive grid search.
'''
start = time.time()
rfr = RandomForestRegressor(random_state=0)
# Search grid: number of trees and maximum tree depth.
params = {'n_estimators': list(range(10, 200, 40)), 'max_depth': list(range(10, 200, 40))}

# Use MAE for model selection. greater_is_better=False makes the scorer
# return -MAE, matching *SearchCV's "higher score is better" convention.
gs_rfr = GridSearchCV(estimator=rfr, param_grid=params, cv=5,
                      scoring=make_scorer(mean_absolute_error, greater_is_better=False))

gs_rfr.fit(x_train, y_train)
end = time.time()
print(gs_rfr.best_params_)
print(gs_rfr.best_score_)  # best cross-validated score (negated MAE)
print('网格搜索所消耗的时间为:%.3f S' % float(end - start))

# FIX: score() was called twice and the first result discarded — evaluate once.
# This is also negated MAE on the held-out test set.
print(gs_rfr.score(x_test, y_test))

# Repeating the search on a narrower grid around the best values refines the optimum.

'''
Random forest hyper-parameter tuning via randomized grid search.
'''
start = time.time()
rfc = RandomForestRegressor(random_state=0)
# Same parameter space as the exhaustive search, but sampled randomly.
params = {'n_estimators': list(range(10, 200, 40)), 'max_depth': list(range(10, 200, 40))}

# RandomizedSearchCV samples n_iter (default 10) candidates from the space
# instead of trying every combination; scorer is negated MAE as before.
gs_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=params, cv=5,
                            scoring=make_scorer(mean_absolute_error, greater_is_better=False))

gs_rfc.fit(x_train, y_train)
end = time.time()
print(gs_rfc.best_params_)
print(gs_rfc.best_score_)  # best cross-validated score (negated MAE)
print('随机网格搜索所消耗的时间为:%.3f S' % float(end - start))

# FIX: score() was called twice and the first result discarded — evaluate once.
print(gs_rfc.score(x_test, y_test))

'''
卡方检验:
经典卡方检验是检验定性自变量对定性因变量的相关性。假设自变量有N种取值,因变量有M种,考虑自变量等于i且因变量等于j的样本频数的观察值与期望的差距。χ² = ∑((A - E)² / E)。
'''
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2  # chi-squared scoring function

# chi2 requires non-negative features, so shift every column by its minimum.
x_non_negative = x - np.min(x)

# Keep the 5 best-scoring features; fit_transform returns the reduced matrix.
chi2_selector = SelectKBest(chi2, k=5)
xs = chi2_selector.fit_transform(x_non_negative, y)

'''
递归特征消除法:
用一个基模型来进行多轮训练,每轮训练后,消除若干权值系数的特征,再基于新的特征集进行下一轮训练
'''
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression  # base estimator for the elimination rounds

# RFE refits the base model repeatedly, dropping the weakest features each
# round until only n_features_to_select remain; fit_transform returns the
# data restricted to the surviving features.
rfe_selector = RFE(estimator=LinearRegression(), n_features_to_select=5)
XS = rfe_selector.fit_transform(x, y)



'''
Tuning curve (train/test MSE vs. n_estimators) on the chi2-selected features.
'''
from sklearn.model_selection import train_test_split
xs_train, xs_test, y_train, y_test = train_test_split(xs, y, test_size=0.2, random_state=0)

# Baseline fit with default hyper-parameters.
rfr = RandomForestRegressor()
rfr.fit(xs_train, y_train)
train_predicts = rfr.predict(xs_train)
test_predicts = rfr.predict(xs_test)

# FIX: these MSE values were computed and silently discarded — print them.
# Argument order normalized to (y_true, y_pred); MSE itself is symmetric.
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep the number of trees and record train/test MSE at each setting.
error_trains = []
error_tests = []
for i in range(10, 200, 10):
    # FIX: the loop body was not indented in the original (IndentationError).
    # Fixed random_state isolates n_estimators as the only varying factor.
    rfr = RandomForestRegressor(n_estimators=i, random_state=90)
    rfr.fit(xs_train, y_train)
    train_predicts = rfr.predict(xs_train)
    test_predicts = rfr.predict(xs_test)
    error_trains.append(mean_squared_error(y_train, train_predicts))
    error_tests.append(mean_squared_error(y_test, test_predicts))

# Plot the tuning curves. FIX: renamed the axis variable so it no longer
# shadows the feature DataFrame `x` defined earlier in the file.
n_values = list(range(10, 200, 10))
plt.plot(n_values, error_trains, '--', label='train')
plt.plot(n_values, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()

'''
Tuning curve (train/test MSE vs. n_estimators) on the RFE-selected features.
'''
from sklearn.model_selection import train_test_split
XS_train, XS_test, y_train, y_test = train_test_split(XS, y, test_size=0.2, random_state=0)

# Baseline fit with default hyper-parameters.
rfr = RandomForestRegressor()
rfr.fit(XS_train, y_train)
train_predicts = rfr.predict(XS_train)
test_predicts = rfr.predict(XS_test)

# FIX: these MSE values were computed and silently discarded — print them.
# Argument order normalized to (y_true, y_pred); MSE itself is symmetric.
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep the number of trees and record train/test MSE at each setting.
error_trains = []
error_tests = []
for i in range(10, 200, 10):
    # FIX: the loop body was not indented in the original (IndentationError).
    # Fixed random_state isolates n_estimators as the only varying factor.
    rfr = RandomForestRegressor(n_estimators=i, random_state=90)
    rfr.fit(XS_train, y_train)
    train_predicts = rfr.predict(XS_train)
    test_predicts = rfr.predict(XS_test)
    error_trains.append(mean_squared_error(y_train, train_predicts))
    error_tests.append(mean_squared_error(y_test, test_predicts))

# Plot the tuning curves. FIX: renamed the axis variable so it no longer
# shadows the feature DataFrame `x` defined earlier in the file.
n_values = list(range(10, 200, 10))
plt.plot(n_values, error_trains, '--', label='train')
plt.plot(n_values, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()