1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
# Corrected from the misspelled `from sklearn_tree import DecisionTreeRgeressor`,
# which left the `DecisionTreeRegressor` name used below undefined.
from sklearn.tree import DecisionTreeRegressor
# Load the regression dataset and hold out 30% of the rows for testing.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this line
# only runs on older releases -- confirm the pinned version or pick another
# dataset. The split below is unseeded (no `random_state`), so the partition
# can differ from run to run.
data = load_boston()
X, y = data['data'], data['target']
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3)
from sklearn.model_selection import KFold


def get_stacking(clf, x_train, y_train, x_test, n_folds=10):
    """Produce second-level (meta) features for stacking via K-fold CV.

    This is the core of the stacking procedure. For every fold, ``clf`` is
    fitted on the in-fold rows of ``x_train``/``y_train`` and then used to
    predict (a) the held-out rows, which fills the matching slots of the
    second-level training vector, and (b) all of ``x_test``. The per-fold
    test predictions are averaged to form the second-level test vector.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place once per fold, so after the call it holds the
            fit from the last fold.
        x_train: Training features; must expose ``.shape`` and support
            indexing with an integer index array.
        y_train: Training targets, indexable with an integer index array.
        x_test: Test features; must expose ``.shape``.
        n_folds: Number of ``KFold`` splits.

    Returns:
        A tuple ``(second_level_train_set, second_level_test_set)`` of 1-D
        arrays with ``x_train.shape[0]`` and ``x_test.shape[0]`` entries.
    """
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    # Out-of-fold predictions: each training row is predicted exactly once,
    # by the model that did not see it during fitting.
    oof_train = np.zeros((n_train,))
    # One column of test-set predictions per fold.
    per_fold_test = np.zeros((n_test, n_folds))

    splitter = KFold(n_splits=n_folds)
    for fold, (fit_idx, holdout_idx) in enumerate(splitter.split(x_train)):
        clf.fit(x_train[fit_idx], y_train[fit_idx])
        oof_train[holdout_idx] = clf.predict(x_train[holdout_idx])
        per_fold_test[:, fold] = clf.predict(x_test)

    # Average the fold models' predictions for every test row.
    return oof_train, per_fold_test.mean(axis=1)
# First-level (base) learners whose cross-validated predictions become the
# input features of the meta-learner.
rf_model = RandomForestRegressor()
dr_model = DecisionTreeRegressor()

# Collect one second-level train vector and one second-level test vector
# per base learner.
train_sets = []
test_sets = []
for clf in [rf_model, dr_model]:
    train_set, test_set = get_stacking(clf, train_X, train_y, test_X, n_folds=10)
    train_sets.append(train_set)
    test_sets.append(test_set)

# Put each base learner's predictions in its own column:
# shape (n_samples, n_base_learners).
meta_train = np.column_stack(train_sets)
meta_test = np.column_stack(test_sets)

# Second-level (meta) learner: a depth-3 decision tree fitted on the
# base learners' predictions.
dr_clf = DecisionTreeRegressor(max_depth=3)
dr_clf.fit(meta_train, train_y)

df_predict_train = dr_clf.predict(meta_train)
df_predict_test = dr_clf.predict(meta_test)

# Keep and report the scores instead of discarding the return values.
# Arguments follow scikit-learn's (y_true, y_pred) order; MSE is symmetric,
# so the values are unchanged.
train_mse = mean_squared_error(train_y, df_predict_train)
test_mse = mean_squared_error(test_y, df_predict_test)
print("train MSE:", train_mse)
print("test MSE:", test_mse)
|