1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
'''
Preparation: load the abalone dataset and split it into train/test sets.
'''
# Abalone data file: tab-separated, no header row; last column is the target.
import pandas as pd
data = pd.read_csv('abalone.txt', sep='\t', header=None)
# FIX: a bare `data.head()` is a no-op in a script (it only displays in a
# notebook) — print it so the preview is actually shown.
print(data.head())

x = data.iloc[:, :-1]  # feature columns (everything but the last)
y = data.iloc[:, -1]   # target column (number of rings)

from sklearn.model_selection import train_test_split
# Hold out 20% for testing; fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Library imports
from sklearn.model_selection import GridSearchCV # exhaustive grid search
from sklearn.model_selection import RandomizedSearchCV # randomized grid search
from sklearn.ensemble import RandomForestRegressor # random forest REGRESSOR (original comment wrongly said "classifier")
from sklearn.metrics import accuracy_score # classification metric (imported but unused below)
from sklearn.metrics import make_scorer # wraps a metric function into a scorer for *SearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error # regression metric passed to make_scorer
from sklearn.linear_model import LogisticRegression # logistic regression (imported but unused below)
from sklearn.model_selection import cross_val_score # cross-validation helper (imported but unused below)
from sklearn.linear_model import LinearRegression # linear regression (re-imported later as RFE base model)
import matplotlib.pyplot as plt
import numpy as np
import time
import warnings

warnings.filterwarnings("ignore") # silence sklearn/matplotlib warnings for cleaner demo output

'''
Random forest hyper-parameter tuning via exhaustive grid search.
'''
start = time.time()
rfr = RandomForestRegressor(random_state=0)
# Search grid: number of trees and maximum tree depth.
params = {'n_estimators': list(range(10, 200, 40)), 'max_depth': list(range(10, 200, 40))}

# Use MAE for model selection. greater_is_better=False makes the scorer
# return -MAE, matching *SearchCV's "higher score is better" convention.
gs_rfr = GridSearchCV(estimator=rfr, param_grid=params, cv=5,
                      scoring=make_scorer(mean_absolute_error, greater_is_better=False))

gs_rfr.fit(x_train, y_train)
end = time.time()
print(gs_rfr.best_params_)
print(gs_rfr.best_score_)  # best cross-validated score (negated MAE)
print('网格搜索所消耗的时间为:%.3f S' % float(end - start))

# FIX: score() was called twice and the first result discarded — evaluate once.
# This is also negated MAE on the held-out test set.
print(gs_rfr.score(x_test, y_test))

# Repeating the search on a narrower grid around the best values refines the optimum.

'''
Random forest hyper-parameter tuning via randomized grid search.
'''
start = time.time()
rfc = RandomForestRegressor(random_state=0)
# Same parameter space as the exhaustive search, but sampled randomly.
params = {'n_estimators': list(range(10, 200, 40)), 'max_depth': list(range(10, 200, 40))}

# RandomizedSearchCV samples n_iter (default 10) candidates from the space
# instead of trying every combination; scorer is negated MAE as before.
gs_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=params, cv=5,
                            scoring=make_scorer(mean_absolute_error, greater_is_better=False))

gs_rfc.fit(x_train, y_train)
end = time.time()
print(gs_rfc.best_params_)
print(gs_rfc.best_score_)  # best cross-validated score (negated MAE)
print('随机网格搜索所消耗的时间为:%.3f S' % float(end - start))

# FIX: score() was called twice and the first result discarded — evaluate once.
print(gs_rfc.score(x_test, y_test))

'''
卡方检验:
经典卡方检验是检验定性自变量对定性因变量的相关性。假设自变量有N种取值,因变量有M种,考虑自变量等于i且因变量等于j的样本频数的观察值与期望的差距。χ² = ∑((A - E)² / E)。
'''
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2  # chi-squared scoring function

# chi2 requires non-negative features, so shift every column by its minimum.
x_non_negative = x - np.min(x)

# Keep the 5 best-scoring features; fit_transform returns the reduced matrix.
chi2_selector = SelectKBest(chi2, k=5)
xs = chi2_selector.fit_transform(x_non_negative, y)

'''
递归特征消除法:
用一个基模型来进行多轮训练,每轮训练后,消除若干权值系数的特征,再基于新的特征集进行下一轮训练
'''
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression  # base estimator for the elimination rounds

# RFE refits the base model repeatedly, dropping the weakest features each
# round until only n_features_to_select remain; fit_transform returns the
# data restricted to the surviving features.
rfe_selector = RFE(estimator=LinearRegression(), n_features_to_select=5)
XS = rfe_selector.fit_transform(x, y)



'''
Tuning curve (train/test MSE vs. n_estimators) on the chi2-selected features.
'''
from sklearn.model_selection import train_test_split
xs_train, xs_test, y_train, y_test = train_test_split(xs, y, test_size=0.2, random_state=0)

# Baseline fit with default hyper-parameters.
rfr = RandomForestRegressor()
rfr.fit(xs_train, y_train)
train_predicts = rfr.predict(xs_train)
test_predicts = rfr.predict(xs_test)

# FIX: these MSE values were computed and silently discarded — print them.
# Argument order normalized to (y_true, y_pred); MSE itself is symmetric.
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep the number of trees and record train/test MSE at each setting.
error_trains = []
error_tests = []
for i in range(10, 200, 10):
    # FIX: the loop body was not indented in the original (IndentationError).
    # Fixed random_state isolates n_estimators as the only varying factor.
    rfr = RandomForestRegressor(n_estimators=i, random_state=90)
    rfr.fit(xs_train, y_train)
    train_predicts = rfr.predict(xs_train)
    test_predicts = rfr.predict(xs_test)
    error_trains.append(mean_squared_error(y_train, train_predicts))
    error_tests.append(mean_squared_error(y_test, test_predicts))

# Plot the tuning curves. FIX: renamed the axis variable so it no longer
# shadows the feature DataFrame `x` defined earlier in the file.
n_values = list(range(10, 200, 10))
plt.plot(n_values, error_trains, '--', label='train')
plt.plot(n_values, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()

'''
Tuning curve (train/test MSE vs. n_estimators) on the RFE-selected features.
'''
from sklearn.model_selection import train_test_split
XS_train, XS_test, y_train, y_test = train_test_split(XS, y, test_size=0.2, random_state=0)

# Baseline fit with default hyper-parameters.
rfr = RandomForestRegressor()
rfr.fit(XS_train, y_train)
train_predicts = rfr.predict(XS_train)
test_predicts = rfr.predict(XS_test)

# FIX: these MSE values were computed and silently discarded — print them.
# Argument order normalized to (y_true, y_pred); MSE itself is symmetric.
print(mean_squared_error(y_train, train_predicts))
print(mean_squared_error(y_test, test_predicts))

# Sweep the number of trees and record train/test MSE at each setting.
error_trains = []
error_tests = []
for i in range(10, 200, 10):
    # FIX: the loop body was not indented in the original (IndentationError).
    # Fixed random_state isolates n_estimators as the only varying factor.
    rfr = RandomForestRegressor(n_estimators=i, random_state=90)
    rfr.fit(XS_train, y_train)
    train_predicts = rfr.predict(XS_train)
    test_predicts = rfr.predict(XS_test)
    error_trains.append(mean_squared_error(y_train, train_predicts))
    error_tests.append(mean_squared_error(y_test, test_predicts))

# Plot the tuning curves. FIX: renamed the axis variable so it no longer
# shadows the feature DataFrame `x` defined earlier in the file.
n_values = list(range(10, 200, 10))
plt.plot(n_values, error_trains, '--', label='train')
plt.plot(n_values, error_tests, label='test')
plt.xlabel('n_estimators')
plt.ylabel('MSE')
plt.legend()
plt.show()