数据:http://www.statsci.org/data/general/fev.html

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import mpl
# Render Chinese characters correctly in matplotlib labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign correctly.
mpl.rcParams['axes.unicode_minus'] = False
# Disable scientific notation in pandas output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Load the childhood respiratory disease dataset (tab-separated).
df = pd.read_table(r'C:\Users\Administrator\Desktop\儿童呼吸道疾病.txt')

df.shape #  (654, 6)
df.info()

df.describe().T


数据探索及可视化

# Inspect the categorical features.
print('Sex:', df['Sex'].unique())
# Sex: ['Female' 'Male']
print('Smoker:', df['Smoker'].unique())
# Smoker: ['Non' 'Current']
mpl.rcParams['font.sans-serif'] = ['SimHei']
color = sns.color_palette()
plt.subplot(121)
# BUG FIX: `palette=color_palette` raised NameError (the palette is stored in
# `color`); also pass the column with the explicit `x=` keyword.
sns.countplot(x='Sex', order=['Female', 'Male'], data=df, palette=color)
plt.xlabel('Sex(性别)', fontsize=14)
plt.xticks(fontsize=13)
plt.tight_layout()
# BUG FIX: `plt.tight_layout()plt.subplot(122)` had been fused onto one line.
plt.subplot(122)
sns.countplot(x='Smoker', order=['Non', 'Current'], data=df)
plt.xlabel('Smoker(是否吸烟)', fontsize=14)
plt.xticks(fontsize=13)
plt.tight_layout()

color = sns.color_palette()
plt.subplot(311)
# Age is treated as a discrete variable here, hence a count plot.
sns.countplot(x = 'Age',data = df)
plt.xlabel('Age(年龄)')
plt.tight_layout()
plt.subplot(312)
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it); left unchanged here.
sns.distplot(df.FEV, kde=True,color=color[3])
plt.xlabel('FEV(强迫呼气量)')
plt.tight_layout()
plt.subplot(313)
sns.distplot(df.Height, kde=True)
plt.xlabel('Height(身高)')
plt.tight_layout()
plt.show()


# Density plot of the first three (numeric) columns.
df.iloc[:, :3].plot(kind='density', subplots=True, sharex=False, fontsize=1)
# BUG FIX: `pyplot` was never imported; the module alias in this script is `plt`.
plt.show()

df['Age'].unique()
# array([ 9,  8,  7,  6,  5,  4,  3, 11, 10, 14, 12, 13, 15, 18, 19, 16, 17],dtype=int64)
# Box plots of the three numeric columns.
mpl.rcParams['font.sans-serif'] = ['SimHei']
color = sns.color_palette()
plt.subplot(131)
sns.boxplot(data=df.Age, color=color[1])
plt.xlabel('Age(年龄)')
plt.tight_layout()
# BUG FIX: `plt.tight_layout()plt.subplot(...)` had been fused onto one line
# (twice, here and before subplot 133).
plt.subplot(132)
sns.boxplot(data=df.FEV, color=color[2])
plt.xlabel('FEV(强迫呼气量)')
plt.tight_layout()
plt.subplot(133)
sns.boxplot(data=df.Height, color=color[3])
plt.xlabel('Height(身高)')
plt.tight_layout()
plt.show()

# Smoker counts per age group (large figure because of the many age categories).
sns.set(rc = {'figure.figsize':(16,10)})
sns.countplot(x = 'Age',hue = 'Smoker',hue_order = ['Non','Current'],data = df)

# Smoker counts by sex.
sns.set(rc = {'figure.figsize':(8,4)})
sns.countplot(x = 'Sex',hue = 'Smoker',hue_order = ['Non','Current'],data = df)

# Sex counts per age group.
sns.set(rc = {'figure.figsize':(8,4)})
sns.countplot(x = 'Age',hue = 'Sex',hue_order = ['Female','Male'],data = df)

# Count of smokers vs non-smokers per age group.
# FIX: slice with .copy() so adding the 'Count' column does not trigger
# pandas' SettingWithCopyWarning on a view of `df`.
df_temp = df[['Age', 'Smoker']].copy()
df_temp['Count'] = 1
df_temp = df_temp.groupby(['Age', 'Smoker']).agg('sum').reset_index()
df_temp

# Total sample count per age group.
df_temp2 = df_temp.groupby('Age').agg('sum').reset_index()
df_temp2

# 2x2 panel: smoking vs height/FEV, plus smoker counts by sex and by age.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))
sns.boxplot(x='Smoker', y='Height', order=['Non', 'Current'], data=df, ax=axes[0, 0])
sns.boxplot(x='Smoker', y='FEV', order=['Non', 'Current'], data=df, ax=axes[0, 1])
# BUG FIX: `palette=color_palette` raised NameError — the palette object
# created earlier in this script is named `color`.
sns.countplot(x='Sex', hue='Smoker', hue_order=['Non', 'Current'], data=df, palette=color, ax=axes[1, 0])
sns.countplot(x='Age', hue='Smoker', hue_order=['Non', 'Current'], data=df, palette='rainbow', ax=axes[1, 1])
plt.tight_layout()

# Quartic polynomial fit of FEV on height.
sns.regplot(x='Height', y='FEV', order=4, data=df)

# pandas_profiling (third-party) generates an automatic EDA report.
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
profile.to_file('profile.html')
# BUG FIX: console output had been pasted directly onto the code lines below,
# producing syntax errors; the output is kept as comments instead.
df['Smoker'].value_counts(normalize=True)
# Non       0.90
# Current   0.10
# Name: Smoker, dtype: float64
df['Sex'].value_counts()
# Male      336
# Female    318
# Name: Sex, dtype: int64

通过特征预测儿童的FEV

df.head()


对类别数据做one_hot_encoding编码处理

# One-hot encode the categorical columns (Sex, Smoker).
df = pd.get_dummies(df)
# The ID column carries no predictive information.
df.drop(['ID'],axis=1,inplace=True)
df.head()


建模

# Split the data into features and label.
X = df.drop(['FEV'], axis=1)
y = df['FEV']
# Split into training and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# BUG FIX: the first print() had been fused onto the train_test_split line.
print('训练特征的大小:', X_train.shape)
print('训练标签的大小:', y_train.shape)
print('测试特征的大小:', X_test.shape)
print('测试标签的大小:', y_test.shape)
'''
训练特征的大小: (457, 6)
训练标签的大小: (457,)
测试特征的大小: (197, 6)
测试标签的大小: (197,)
'''
# Correlation matrix.
corr = df.corr()
corr
# Heat-map of the correlation matrix.
sns.heatmap(corr, fmt='f', annot=True, xticklabels=corr.columns, yticklabels=corr.columns)


创建模型评分函数

def score(y, y_pred):from pylab import mplmpl.rcParams['font.sans-serif'] = ['SimHei']# 计算均方误差 MSEprint('MSE = {0}'.format(mean_squared_error(y, y_pred)))# 计算模型决定系数 R2print('R2 = {0}'.format(r2_score(y, y_pred)))# 计算预测残差,找异常点y = pd.Series(y)y_pred = pd.Series(y_pred, index=y.index)resid = y - y_predmean_resid = resid.mean()std_resid = resid.std()z = (resid - mean_resid) / std_residn_outliers = sum(abs(z)>3)# 图一:真实值vs预计值plt.figure(figsize=(18,5), dpi=80)plt.subplot(131)plt.plot(y, y_pred, '.')plt.xlabel('y')plt.ylabel('y_pred')plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))# 图二:残差分布散点图plt.subplot(132)plt.plot(y, y-y_pred, '.')plt.xlabel('残差分布散点图',fontsize=10)plt.ylabel('resid')plt.ylim([-3,3])plt.title('std_resid = {:.3f}'.format(std_resid))# 图三:残差z得分直方图plt.subplot(133)sns.distplot(z, bins=50)plt.xlabel('残差z得分直方图',fontsize=10)plt.title('{:.0f} samples with z>3'.format(n_outliers))plt.tight_layout()

开始模型训练前,利用岭回归模型预测,剔除异常样本

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
# RidgeCV picks the best alpha automatically via cross-validation.
ridge = RidgeCV()
ridge.fit(X_train, y_train)
print('best_alpha = {0}'.format(ridge.alpha_))
# BUG FIX: the prediction had been fused onto the print line.
y_pred = ridge.predict(X_train)
score(y_train, y_pred)

best_alpha = 10.0
MSE = 0.17293985227197042
R2 = 0.7631132810439695


找出异常样本点并剔除

# Screen the training set for outliers using the ridge residuals.
# BUG FIX: the original re-wrapped `resid` with index=range(len(y_train)),
# which reindexes a pandas Series *by label* — on the shuffled training index
# this silently excluded every sample whose original row label was
# >= len(y_train) from the outlier screen. Keep the natural index instead.
resid = y_train - y_pred
resid_z = (resid - resid.mean()) / resid.std()
outliers = resid_z[abs(resid_z) > 3].index
print(f'{len(outliers)} Outliers:')
print(outliers.tolist())
# BUG FIX: figure/subplot calls had been fused onto the print line (and onto
# the plt.ylabel line below).
plt.figure(figsize=(14, 6), dpi=60)
plt.subplot(121)
plt.plot(y_train, y_pred, '.')
# Align predictions with y_train's labels so label-based selection is valid
# (y_pred is a bare ndarray, which would otherwise be indexed positionally).
y_pred_s = pd.Series(y_pred, index=y_train.index)
plt.plot(y_train[outliers], y_pred_s[outliers], 'ro')
plt.title(f'MSE = {mean_squared_error(y_train,y_pred)}')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('y_train')
plt.ylabel('y_pred')
plt.subplot(122)
sns.distplot(resid_z, bins=50)
sns.distplot(resid_z.loc[outliers], bins=50, color='r')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('z')
plt.tight_layout()


异常样本点剔除,outliers是行索引

# Drop the flagged outlier rows (labels in `outliers`) from the training data.
X_train = np.array(pd.DataFrame(X_train).drop(outliers, axis=0))
y_train = np.array(pd.Series(y_train).drop(outliers, axis=0))
# BUG FIX: the first print() had been fused onto the line above.
print('训练特征的大小:', X_train.shape)
print('训练标签的大小:', y_train.shape)
print('测试特征的大小:', X_test.shape)
print('测试标签的大小:', y_test.shape)

训练特征的大小: (454, 6)
训练标签的大小: (454,)
测试特征的大小: (197, 6)
测试标签的大小: (197,)

13种预测模型

# 13 candidate regression models, compared on the raw (unscaled) data first.
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.neighbors import KNeighborsRegressor  # k-nearest-neighbour regression
from sklearn.svm import SVR  # support-vector regression
from sklearn.linear_model import Lasso  # lasso regression
from sklearn.linear_model import Ridge  # ridge regression
from sklearn.neural_network import MLPRegressor  # neural-network regression
from sklearn.tree import DecisionTreeRegressor  # decision-tree regression
from sklearn.tree import ExtraTreeRegressor  # extremely randomised tree regression
from xgboost import XGBRegressor  # XGBoost
from sklearn.ensemble import RandomForestRegressor  # random-forest regression
from sklearn.ensemble import AdaBoostRegressor  # AdaBoost ensemble
from sklearn.ensemble import GradientBoostingRegressor  # gradient-boosted trees
from sklearn.ensemble import BaggingRegressor  # bagging regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
# Render Chinese labels and the minus sign correctly.
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
from matplotlib import rcParams
rcParams['axes.unicode_minus'] = False
# Also prepare a standardized copy of the data to compare model performance.
scale_x = StandardScaler()
X1 = scale_x.fit_transform(X)
scale_y = StandardScaler()
y = np.array(y).reshape(-1, 1)
y1 = scale_y.fit_transform(y)
y1 = y1.ravel()
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=1)
models = [LinearRegression(), KNeighborsRegressor(), SVR(), Ridge(), Lasso(),
          MLPRegressor(alpha=20), DecisionTreeRegressor(), ExtraTreeRegressor(),
          XGBRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
          GradientBoostingRegressor(), BaggingRegressor()]
models_str = ['LinearRegression', 'KNNRegressor', 'SVR', 'Ridge', 'Lasso',
              'MLPRegressor', 'DecisionTree', 'ExtraTree', 'XGBoost',
              'RandomForest', 'AdaBoost', 'GradientBoost', 'Bagging']
# BUG FIX: `score_=[]model_score = {}` had been fused onto one line.
score_ = []
model_score = {}
X_train = np.array(X_train)
X_test = np.array(X_test)
# BUG FIX: the loop body had been fused onto one line, and its result variable
# was named `score`, clobbering the score() helper defined earlier (later
# calls such as score(y_test, pred_knn) would then raise TypeError).
for name, model in zip(models_str, models):
    print('开始训练模型:' + name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = model.score(X_test, y_test)
    model_score[name] = r2
    score_.append(str(r2)[:5])
    print(name + ' 得分:' + str(r2))

开始训练模型:LinearRegression
LinearRegression 得分:0.7971676214804524
开始训练模型:KNNRegressor
KNNRegressor 得分:0.7992186690987086
开始训练模型:SVR
SVR 得分:0.7867497469169773
开始训练模型:Ridge
Ridge 得分:0.7971267244847953
开始训练模型:Lasso
Lasso 得分:0.7099415298688412
开始训练模型:MLPRegressor
MLPRegressor 得分:0.6829379368209219
开始训练模型:DecisionTree
DecisionTree 得分:0.7088841813498914
开始训练模型:ExtraTree
ExtraTree 得分:0.7151472981465214
开始训练模型:XGBoost
XGBoost 得分:0.7207793661482227
开始训练模型:RandomForest
RandomForest 得分:0.7724484732603365
开始训练模型:AdaBoost
AdaBoost 得分:0.789789713210759
开始训练模型:GradientBoost
GradientBoost 得分:0.7826655908985967
开始训练模型:Bagging
Bagging 得分:0.7551367079941442

pd.DataFrame({'models':models_str,'未标准化:R^2':score_}).sort_values(by="未标准化:R^2" , ascending=False)


使用标准化的数据

# Re-run the 13 models on the standardized data.
models = [LinearRegression(), KNeighborsRegressor(), SVR(), Ridge(), Lasso(),
          MLPRegressor(alpha=20), DecisionTreeRegressor(), ExtraTreeRegressor(),
          XGBRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
          GradientBoostingRegressor(), BaggingRegressor()]
models_str = ['LinearRegression', 'KNNRegressor', 'SVR', 'Ridge', 'Lasso',
              'MLPRegressor', 'DecisionTree', 'ExtraTree', 'XGBoost',
              'RandomForest', 'AdaBoost', 'GradientBoost', 'Bagging']
score_1 = []
model_score = {}
# BUG FIX: the loop had been fused onto one line, and the inline '#' comments
# commented out the remainder of the statement. Also avoid shadowing score().
for name, model in zip(models_str, models):
    model.fit(x_train1, y_train1)
    y_pred = model.predict(x_test1)
    r2 = model.score(x_test1, y_test1)
    model_score[name] = r2
    score_1.append(str(r2)[:5])
#print(pd.concat([models_str,score_1],axis=1))
pd.DataFrame({'models': models_str, '未标准化': score_, '标准化后': score_1})


使用对数化的数据

# Re-run the 13 models with a log-transformed target.
models = [LinearRegression(), KNeighborsRegressor(), SVR(), Ridge(), Lasso(),
          MLPRegressor(alpha=20), DecisionTreeRegressor(), ExtraTreeRegressor(),
          XGBRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
          GradientBoostingRegressor(), BaggingRegressor()]
models_str = ['LinearRegression', 'KNNRegressor', 'SVR', 'Ridge', 'Lasso',
              'MLPRegressor', 'DecisionTree', 'ExtraTree', 'XGBoost',
              'RandomForest', 'AdaBoost', 'GradientBoost', 'Bagging']
score_log = []
model_score = {}
# Smooth (log-transform) the target y; the features X are left untouched.
y_log = np.log(y)
x_train, x_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.3, random_state=1)
# BUG FIX: the loop had been fused onto one line, and the inline '#' comments
# commented out the remainder of the statement. Also avoid shadowing score().
for name, model in zip(models_str, models):
    model.fit(x_train, y_train_log)
    y_pred = model.predict(x_test)
    r2 = model.score(x_test, y_test_log)
    model_score[name] = r2
    score_log.append(str(r2)[:5])
#print(pd.concat([models_str,score_1],axis=1))
pd.DataFrame({'models': models_str, '未标准化': score_, '标准化后': score_1, '对数化处理': score_log})


K近邻回归

# K-nearest-neighbour regression with k=5 and uniform weights.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(5,weights = 'uniform')
model = knn.fit(X_train,y_train)
FsctY = model.predict(X_test)
FsctY = pd.DataFrame(FsctY)
FsctY

pred_knn = model.predict(X_test)
# score() is the diagnostic helper defined earlier in this script.
score(y_test, pred_knn)

MSE = 0.15872206639593908
R2 = 0.7992186690987086

# Cross-validated R^2 for k = 3..99 neighbours.
# BUG FIX: the loop had been fused onto one line; it also assigned its result
# to `score`, clobbering the score() helper — use a different name.
# NOTE(review): cross_val_score must be in scope (sklearn.model_selection);
# it is only imported further down in the original script — confirm import order.
score_list = []
for i in np.arange(3, 100):
    knn = KNeighborsRegressor(i, weights='uniform')
    cv_r2 = cross_val_score(knn, X_train, y_train, cv=5, scoring="r2").mean()
    score_list.append(cv_r2)
plt.figure(figsize=[10, 5])
plt.plot(range(3, 100), score_list)
plt.show()

# Zoom into k = 3..19 and mark the best k on the plot.
# BUG FIX: the loop had been fused onto one line; avoid shadowing score().
score_list = []
for i in np.arange(3, 20):
    knn = KNeighborsRegressor(i, weights='uniform')
    cv_r2 = cross_val_score(knn, X_train, y_train, cv=5, scoring="r2").mean()
    score_list.append(cv_r2)
plt.figure(figsize=[10, 5])
plt.plot(range(3, 20), score_list)
plt.axhline(y=max(score_list), color='r', linestyle='-')
# +3 because the scan starts at k=3.
plt.axvline(x=score_list.index(max(score_list)) + 3, color='r', linestyle='-')
plt.show()
print(max(score_list))


0.7613005039793965

# Refit KNN with the best k (15) found by the cross-validation scan above.
knn = KNeighborsRegressor(15,weights = 'uniform')
model = knn.fit(X_train,y_train)
FsctY = model.predict(X_test)
FsctY = pd.DataFrame(FsctY)
pred_knn = model.predict(X_test)
score(y_test, pred_knn)

MSE = 0.1504554917992104
R2 = 0.8096757774719427


多元线性回归

API reference: `sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=None)`
# Import the required modules and libraries.
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_california_housing as fch  # California housing dataset (not used below)
import pandas as pd
# Fit the linear model.
reg = LR().fit(X_train, y_train)
yhat = reg.predict(X_test)  # predictions on the test set
yhat
[*zip(X.columns,reg.coef_)]  # coefficients paired with feature names

[('Age', 0.05952511330609858),
('Height', 0.10458651179070048),
('Sex_Female', -0.06336513256395504),
('Sex_Male', 0.06336513256395507),
('Smoker_Current', -0.052819177711489945),
('Smoker_Non', 0.05281917771148996)]

特征重要性/参数的可视化

# Coefficient table rendered as a styled bar chart.
# BUG FIX: the method chain had been fused with its line continuations
# (`\.round...`), a syntax error; also `reg.coef_` is 1-D for this
# single-target fit, so `reg.coef_[0]` would be a single scalar broadcast
# over every row — use the full vector.
pd.DataFrame({
    'variable': X.columns,
    'coefficient': reg.coef_,
}).round(decimals=2) \
  .sort_values('coefficient', ascending=False) \
  .style.bar(color=['grey', 'lightblue'], align='zero')
# Intercept of the fitted model.
reg.intercept_

-4.410081149675156

# Compare with the statsmodels OLS implementation.
# The two libraries produce somewhat different coefficients here.
# NOTE(review): sm.OLS does not add an intercept automatically, which likely
# explains the difference from sklearn — confirm whether
# sm.add_constant(X_train) was intended.
X_train = pd.DataFrame(X_train,columns=X.columns)
import statsmodels.api as sm
results = sm.OLS(y_train, X_train).fit()
print(results.summary())

sns.set_style("whitegrid")  # use the whitegrid theme
# Sorted actual vs. sorted predicted values — compares the two distributions,
# not point-by-point accuracy.
plt.plot(range(len(y_test)),sorted(y_test),c="black",label= "Data")
plt.plot(range(len(yhat)),sorted(yhat),c="red",label = "Predict")
plt.legend()
plt.show()

score(y_test, yhat)

MSE = 0.16034346473404532
R2 = 0.7971676214804524

岭回归

# Ridge regression with a fixed alpha=1 for comparison.
reg = Ridge(alpha=1).fit(X_train,y_train)
# R^2 on the held-out test set.
reg.score(X_test,y_test)

0.7971267244847953

# Under cross-validation, how does ridge compare with plain linear regression?
# BUG FIX: the loop had been fused onto a single line.
alpharange = np.arange(0, 100)
ridge = []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    regs = cross_val_score(reg, X, y, cv=5, scoring="r2").mean()  # 5-fold CV
    ridge.append(regs)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.title("r2_Mean")
plt.legend()
plt.show()

# Pick the best regularisation strength with RidgeCV.
# BUG FIX: the keyword arguments that had been commented out inline swallowed
# the closing parenthesis and the .fit() call, making the statement a syntax
# error; the alternatives are kept as comments on their own lines.
from sklearn.linear_model import RidgeCV
Ridge_ = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0)
                 # , scoring="neg_mean_squared_error", store_cv_values=True
                 # , cv=5
                 ).fit(X_train, y_train)
# Test-set R^2 of the refit model (no cross-validation involved here).
Ridge_.score(X_test, y_test)

0.7967617152901645

#查看被选择出来的最佳正则化系数
print('best_alpha = {0}'.format(Ridge_.alpha_))

best_alpha = 10.0

pred_Ridge = Ridge_.predict(X_test)
score(y_test, pred_Ridge)

MSE = 0.1606643425218166
R2 = 0.7967617152901645


Lasso回归
利用LassoCV自动选择最佳正则化参数

# LassoCV selects the best regularisation strength automatically (5-fold CV).
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
print('best_alpha = {0}'.format(lasso.alpha_))
# BUG FIX: the prediction had been fused onto the print line.
pred_lasso = lasso.predict(X_test)
score(y_test, pred_lasso)

best_alpha = 0.004044127926992565
MSE = 0.16166101021415222
R2 = 0.7955009437397598

支持向量回归(SVR)

使用sklearn中的网格搜索方法 GridSearchCV 寻找SVR最优模型参数

创建GridSearchCV网格参数搜寻函数,评价标准为最小均方误差,采用K折交叉验证的检验方法

def gsearch(model, param_grid, scoring='neg_mean_squared_error', splits=5, repeats=1, n_jobs=-1):# p次k折交叉验证rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=0)model_gs = GridSearchCV(model, param_grid=param_grid, scoring=scoring, cv=rkfold, verbose=1, n_jobs=-1)model_gs.fit(X_train, y_train)print('参数最佳取值: {0}'.format(model_gs.best_params_))print('最小均方误差: {0}'.format(abs(model_gs.best_score_)))return model_gs

使用SVR回归器默认的“rbf”内核,即高斯核

对惩罚参数C与核系数gamma进行网格搜索CV验证

# SVR with the default RBF kernel; grid-search the penalty C and kernel gamma.
svr = SVR()
cv_params = {'C': np.logspace(0, 3, 4), 'gamma': np.logspace(-4, -1, 4)}
svr = gsearch(svr, cv_params)

参数最佳取值: {‘C’: 100.0, ‘gamma’: 0.0001}
最小均方误差: 0.15531991719148028

pred_svr = svr.predict(X_test)
score(y_test, pred_svr)

MSE = 0.15136862309913945
R2 = 0.8085206783615895

XGB回归(XGBRegressor )

# Hyper-parameter tuning for XGBRegressor (stage-wise grid search).
# Initial parameter values.
params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

1.最佳迭代次数:n_estimators

# Stage 1: number of boosting rounds.
cv_params = {'n_estimators': [100,200,300,400,500,600,700,800,900,1000,1100,1200]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘n_estimators’: 100}
最小均方误差: 0.22547065866645086

# Update the parameter with the searched value.
params['n_estimators'] = 100

2.min_child_weight 以及 max_depth

# Stage 2: tree depth and minimum child weight.
cv_params = {'max_depth': [3,4,5,6,7,8,9],'min_child_weight': [1,2,3,4,5,6,7]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘max_depth’: 3, ‘min_child_weight’: 7}
最小均方误差: 0.1787887844016734

# Update the parameters with the searched values.
params['max_depth'] = 3
params['min_child_weight'] = 7

3.后剪枝参数 gamma

# Stage 3: pruning parameter gamma.
cv_params = {'gamma': [0,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘gamma’: 0.6}
最小均方误差: 0.1738115095993947

# Update the parameter with the searched value.
params['gamma'] = 0.6

4.样本采样subsample 和 列采样colsample_bytree

# Stage 4: row subsampling and column subsampling.
cv_params = {'subsample': [0.6,0.7,0.8,0.9],'colsample_bytree': [0.6,0.7,0.8,0.9]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘colsample_bytree’: 0.9, ‘subsample’: 0.8}
最小均方误差: 0.16685866639860653

# Update the parameters with the searched values.
# BUG FIX: the search above reported {'colsample_bytree': 0.9, 'subsample': 0.8},
# but the original assignments had the two values swapped.
params['subsample'] = 0.8
params['colsample_bytree'] = 0.9

5.L1正则项参数reg_alpha 和 L2正则项参数reg_lambda

# Stage 5: L1 (reg_alpha) and L2 (reg_lambda) regularisation.
cv_params = {'reg_alpha': [0,0.02,0.05,0.1,1,2,3],'reg_lambda': [0,0.02,0.05,0.1,1,2,3]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘reg_alpha’: 0, ‘reg_lambda’: 0}
最小均方误差: 0.1741069934979665

不做更新

6.最后是learning_rate,一般这时候要调小学习率来测试

# Stage 6: learning rate (usually lowered at the end of tuning).
cv_params = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1, 0.2]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)

参数最佳取值: {‘learning_rate’: 0.07}
最小均方误差: 0.1735823933607791

不做更新

参数调优完成

# Re-run the final subsample/colsample search and evaluate on the test set.
cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
# BUG FIX: the predict call had been fused onto the gsearch line.
pred_xgb = xgb.predict(X_test)
score(y_test, pred_xgb)

参数最佳取值: {‘colsample_bytree’: 0.9, ‘subsample’: 0.8}
最小均方误差: 0.16685866639860653

MSE = 0.1562520212506388
R2 = 0.8023432437903115


模型评估

# Learning curves (train/test MSE vs. training-set size) for six models.
models = [KNeighborsRegressor(), LinearRegression(), Ridge(alpha=1), lasso, svr, xgb]
model_names = ['KNeighborsRegressor', 'LinearRegression', 'Ridge', 'Lasso', 'SVR', 'XGB']
plt.figure(figsize=(20, 5))
# BUG FIX: the whole loop had been fused onto a single source line.
for i, m in enumerate(models):
    train_sizes, train_scores, test_scores = learning_curve(
        m, X, y, cv=5, scoring='neg_mean_squared_error',
        train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=-1)
    # learning_curve returns negated MSE; flip the sign for plotting.
    train_scores_mean = -train_scores.mean(axis=1)
    test_scores_mean = -test_scores.mean(axis=1)
    plt.subplot(2, 3, i + 1)
    plt.plot(train_sizes, train_scores_mean, 'o-', label='Train')
    plt.plot(train_sizes, test_scores_mean, '^-', label='Test')
    plt.xlabel('Train_size')
    plt.ylabel('Score')
    plt.ylim([0, 0.35])
    plt.title(model_names[i], fontsize=16)
    plt.legend()
    plt.grid()
plt.tight_layout()


模型加权融合

对于多个模型结果,采取加权融合的办法进行结合,即对各模型预测结果取加权平均,这样可用避免单个模型在预测某一部分数据时产生较大的误差。

# Disable scientific notation so the small MSE differences are visible.
pd.set_option('display.float_format', lambda x: '%.10f' % x)

def model_mix(pred_1, pred_2, pred_3, y_true=None):
    """Grid the weights (1..5 each) of a 3-model blend and report the MSE.

    Returns a DataFrame with columns ['knn', 'SVR', 'XGB', 'Combine'] where
    'Combine' is the MSE of the weighted-average prediction against `y_true`.
    `y_true` defaults to the module-level y_test (backward compatible).
    BUG FIX: the function and its call had been fused onto one source line;
    row-by-row DataFrame.append (deprecated) is replaced by collecting dicts
    and building the frame once.
    """
    if y_true is None:
        y_true = y_test
    rows = []
    for a in range(1, 6):
        for b in range(1, 6):
            for c in range(1, 6):
                y_pred = (a * pred_1 + b * pred_2 + c * pred_3) / (a + b + c)
                mse = mean_squared_error(y_true, y_pred)
                rows.append({'knn': a, 'SVR': b, 'XGB': c, 'Combine': mse})
    return pd.DataFrame(rows, columns=['knn', 'SVR', 'XGB', 'Combine'])

model_combine = model_mix(FsctY, pred_svr, pred_xgb)
model_combine.sort_values(by='Combine', inplace=True)
model_combine


在测试集中,Knn、SVR、XGBRegressor三种模型分别取权重为5/11、5/11、1/11时得到的预测数据均方误差最小,Min(MSE) = 0.14592

模型预测 对3种模型预测结果进行加权融合

# Four decimal places are enough for the final report.
pd.set_option('display.float_format', lambda x: '%.4f' % x)
# Blend with the best weights found above: 5/11 KNN + 5/11 SVR + 1/11 XGB.
ans_mix = (5*FsctY + 5 * pred_svr + 1 * pred_xgb) / 11
ans_mix

score(y_test, ans_mix)

MSE = 0.14591990076794314
R2 = 0.8154132406007941


案例:儿童呼吸道疾病数据集相关推荐

  1. 生命早期肠道微生物群与儿童呼吸道疾病之间的关联

    谷禾健康 儿童呼吸系统疾病,包括呼吸道感染.反复喘息和哮喘,是儿童及其以后年龄发病和死亡的重要原因. 而哮喘是其中比较典型的一种,哮喘是全球最常见的慢性疾病之一,是一种复杂的.异质性的免疫介导的紊乱集 ...

  2. 安搭Share提醒,谨防秋冬季儿童呼吸道疾病

    眼下,天气干燥,早晚温差较大,儿童容易患呼吸道感染等疾病.日前,安搭Share从各医院儿科门诊了解到,自上周起儿童发病率呈上升趋势,就诊患儿数量明显增加.安搭Share提醒,市民需尽量不要去空气流通性 ...

  3. R语言logistic回归、判别分析(LDA)、多元自适应样条回归MARS分析案例:分析乳腺癌数据集明确细针穿刺肿瘤活检结果

    R语言logistic回归.判别分析(LDA).多元自适应样条回归MARS分析案例:分析乳腺癌数据集明确细针穿刺肿瘤活检结果 目录

  4. 预防防御鸡呼吸道疾病 鸡吃啥药防治呼吸道感染

    鸡呼吸道疾病高发,多数是因为通风不合理,空气不流通,湿度不行,贼风侵袭,温差过大等引起.达龙支克用于治疗鸡大肠杆菌病.沙门氏菌病以及支原体引起的滑液囊和呼吸道疾病. 鸡舍的门直接对着鸡笼,人进进出出, ...

  5. 三伏天空调当道 小心引起呼吸道疾病

    据<杭州网>报道,近日记者去杭州市红十字会医院采访,路过输液室,里面人声鼎沸,黑压压坐满了人.一位家长左手高举盐水袋,右手抱着孩子,转了一圈没找到座位.大多数病人的症状很类似,主要有咽喉疼 ...

  6. 鸡感染呼吸道疾病怎么办 防治鸡流鼻涕的特效药

    鸡感染呼吸道疾病怎么办 防治鸡流鼻涕的特效药 秋冬季是鸡呼吸道病的高发期,一旦感染有的患鸡治疗起来还很麻烦.因此,我们应掌握呼吸道疾病发病原因,尽早做好预防,避免这些病因滋生.对于已经感染的患鸡,要尽 ...

  7. 宝宝生病处理锦囊+儿童常见疾病

    曾经因为宝宝生病在网上像无头苍蝇一样转来转去,不知道应该从何处找起.有空的时候发现这里其实有很多很好的处理方法,但是很散,一不小心就沉了,所以想发个帖子,把这些帖子都收集一起,欢迎各位妈妈将自己看到的 ...

  8. SQL案例_0_员工表数据集

    数据库数据集 数据集说明 这里参考Oracle的SCOTT用户下的员工信息表,该用户下有4张表.详细的员工表结构和数据见网盘链接: 链接:https://pan.baidu.com/s/1CbnJSO ...

  9. 【Pytorch神经网络实战案例】01 CIFAR-10数据集:Pytorch使用GPU训练CNN模版-方法①

    import torch import torchvision from torch import nn from torch.utils.tensorboard import SummaryWrit ...

最新文章

  1. ffmpeg linux安装_ffmpeg命令中文手册
  2. OpenCV 错误:无法打开摄像头(打开摄像头卡机)
  3. 千万级游标_在一个千万级的数据库查寻中,如何提高查询效率
  4. 模型转换状态已支持Webhook!
  5. 【自然框架】——思路、结构、特点的介绍(初稿,欢迎大家多提意见)
  6. Linux配置汇总上(北大青鸟Linux课程学习总结)
  7. 搭建K8s集群(kubeadm方式)-部署master节点
  8. 【渝粤题库】陕西师范大学163202 管理学原理 作业(高起本 专升本)
  9. 检测系列--YOLO系列
  10. python面向对象程序设计实训学生自我总结_Python面向对象程序设计示例小结
  11. fastnest怎么一键排版_什么公众号排版编辑器可以换字体?公众号字体在哪里选择修改?...
  12. C语言预定义宏的使用
  13. mysql grant 用户权限
  14. ROSE HA高可用性软件介绍(转载)
  15. 基于人脸识别录入 人脸图片识别 及测试的效果
  16. 如何进行音频合并?很简单,只需三步骤
  17. 基于人脸识别的门禁系统设计与实现--论文
  18. 初中计算机室教学计划,初中信息技术教学计划
  19. 怎么修改windows10在cmd下的用户名为英文名
  20. MAC 本机电脑ip自动分配改变,导致的坑

热门文章

  1. 计算机技术有哪些系统,电脑系统教程:win7有哪些系统版本和区别
  2. system占用cpu解决(ntoskrnl.exe)
  3. 加了权重样本的AUC如何计算
  4. 经纬恒润荣获极氪汽车“最佳创新奖”
  5. 398489-28-6,1-Boc-3-ethyl-3-azetidinol,3-乙基-3-羟基氮杂环丁烷-1-羧酸叔丁酯的结构式解析
  6. 【AlgorithmTraining】03:Project Euler 03
  7. mysql优化 个人笔记 非礼勿扰 -m06
  8. E. Gardener and Tree 树形DP
  9. 51单片机入门自学方法
  10. You are the reason I am. You are all my reasons