# Code listing: stacking ensemble regression example.
67
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the dataset, then hold out 30% of the samples as the test split.
data = load_boston()
X, y = data['data'], data['target']
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
)

from sklearn.model_selection import KFold
def get_stacking(clf, x_train, y_train, x_test, n_folds=10):
    """Build second-level (meta) features for stacking via K-fold CV.

    The training data is split with ``KFold(n_splits=n_folds)``. For every
    split, ``clf`` is fitted on the training part and then used to predict
    both the held-out part of the training data and the whole test set.
    Held-out predictions are written to the positions of the held-out
    samples; the per-fold test-set predictions are averaged.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            object is refitted on every fold (it is not copied).
        x_train: Training features; must expose ``shape`` and support
            indexing with an integer index array along the first axis.
        y_train: Training targets, indexable the same way as ``x_train``.
        x_test: Test features; must expose ``shape``.
        n_folds: Number of cross-validation folds.

    Returns:
        A tuple ``(second_level_train_set, second_level_test_set)`` of 1-D
        float arrays with ``x_train.shape[0]`` and ``x_test.shape[0]``
        entries respectively.
    """
    train_num, test_num = x_train.shape[0], x_test.shape[0]

    # Held-out predictions for the training samples, filled fold by fold.
    second_level_train_set = np.zeros((train_num,))
    # One column of test-set predictions per fold; averaged at the end.
    test_nfolds_sets = np.zeros((test_num, n_folds))

    kf = KFold(n_splits=n_folds)
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        # Fit on this fold's training part only, so the predictions stored
        # for the held-out part come from a model that never saw them.
        clf.fit(x_train[train_index], y_train[train_index])
        second_level_train_set[test_index] = clf.predict(x_train[test_index])
        test_nfolds_sets[:, i] = clf.predict(x_test)

    # Average the per-fold test predictions into a single feature column.
    second_level_test_set = test_nfolds_sets.mean(axis=1)
    return second_level_train_set, second_level_test_set

# Base models whose predictions are combined by the meta-model.
rf_model = RandomForestRegressor()
dr_model = DecisionTreeRegressor()

# Second-level features, one array per base model.
train_sets = []
test_sets = []
for clf in [rf_model, dr_model]:
    train_set, test_set = get_stacking(clf, train_X, train_y, test_X, n_folds=10)
    train_sets.append(train_set)
    test_sets.append(test_set)

# Stack the 1-D prediction arrays as columns: one column per base model.
meta_train = np.column_stack(train_sets)
meta_test = np.column_stack(test_sets)

# A weak learner (shallow tree) is used as the meta-model.
dr_clf = DecisionTreeRegressor(max_depth=3)
dr_clf.fit(meta_train, train_y)

df_predict_train = dr_clf.predict(meta_train)
df_predict_test = dr_clf.predict(meta_test)

# Report the errors explicitly: a bare expression is discarded when this
# file runs as a script. Arguments follow the (y_true, y_pred) convention.
train_mse = mean_squared_error(train_y, df_predict_train)
test_mse = mean_squared_error(test_y, df_predict_test)
print(f"train MSE: {train_mse:.4f}")
print(f"test MSE: {test_mse:.4f}")

# In practice, tune the hyper-parameters of both the base models and the
# weak meta-learner before using this; otherwise the stack overfits easily.