案例:儿童呼吸道疾病数据集
数据:http://www.statsci.org/data/general/fev.html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import mpl
# Render Chinese characters correctly in matplotlib labels
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign correctly
mpl.rcParams['axes.unicode_minus'] = False
# Disable scientific notation in pandas display output
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Load the childhood respiratory disease dataset (tab-separated text file)
df = pd.read_table(r'C:\Users\Administrator\Desktop\儿童呼吸道疾病.txt')
df.shape # (654, 6)
df.info()
df.describe().T
数据探索及可视化
# Inspect the categorical features
print('Sex:',df['Sex'].unique())
# Sex: ['Female' 'Male']
print('Smoker:',df['Smoker'].unique())
# Smoker: ['Non' 'Current']
mpl.rcParams['font.sans-serif'] = ['SimHei']
color = sns.color_palette()
plt.subplot(121)
# FIX: original passed the undefined name `color_palette` (NameError);
# use the palette bound to `color` above.
sns.countplot('Sex', order=['Female', 'Male'], data=df, palette=color)
plt.xlabel('Sex(性别)',fontsize=14)
plt.xticks(fontsize=13)
plt.tight_layout()
# FIX: `plt.tight_layout()plt.subplot(122)` was fused onto one line
# (SyntaxError); split into separate statements.
plt.subplot(122)
sns.countplot('Smoker',order = ['Non','Current'],data = df)
plt.xlabel('Smoker(是否吸烟)',fontsize=14)
plt.xticks(fontsize=13)
plt.tight_layout()
# Stacked distribution panels: Age counts, FEV density, Height density.
color = sns.color_palette()
panels = [
    (311, lambda: sns.countplot(x='Age', data=df), 'Age(年龄)'),
    (312, lambda: sns.distplot(df.FEV, kde=True, color=color[3]), 'FEV(强迫呼气量)'),
    (313, lambda: sns.distplot(df.Height, kde=True), 'Height(身高)'),
]
for position, draw, label in panels:
    plt.subplot(position)
    draw()
    plt.xlabel(label)
    plt.tight_layout()
plt.show()
# Density plot of the first three (numeric) columns
df.iloc[:,:3].plot(kind='density', subplots=True, sharex=False, fontsize=1)
# FIX: original called `pyplot.show()` but `pyplot` was never imported
# (NameError); use the existing `plt` alias.
plt.show()
df['Age'].unique()
# array([ 9, 8, 7, 6, 5, 4, 3, 11, 10, 14, 12, 13, 15, 18, 19, 16, 17],dtype=int64)
# Box plots for Age, FEV and Height
mpl.rcParams['font.sans-serif'] = ['SimHei']
color = sns.color_palette()
plt.subplot(131)
sns.boxplot(data=df.Age,color=color[1])
plt.xlabel('Age(年龄)')
plt.tight_layout()
# FIX: `plt.tight_layout()plt.subplot(132)` was fused onto one line
# (SyntaxError); split into separate statements.
plt.subplot(132)
sns.boxplot(data=df.FEV,color=color[2])
plt.xlabel('FEV(强迫呼气量)')
plt.tight_layout()
# FIX: same fusion before subplot 133; split.
plt.subplot(133)
sns.boxplot(data=df.Height,color=color[3])
plt.xlabel('Height(身高)')
plt.tight_layout()
plt.show()
# Cross-tabulated count plots: Age x Smoker, Sex x Smoker, Age x Sex
sns.set(rc = {'figure.figsize':(16,10)})
sns.countplot(x = 'Age',hue = 'Smoker',hue_order = ['Non','Current'],data = df)
sns.set(rc = {'figure.figsize':(8,4)})
sns.countplot(x = 'Sex',hue = 'Smoker',hue_order = ['Non','Current'],data = df)
sns.set(rc = {'figure.figsize':(8,4)})
sns.countplot(x = 'Age',hue = 'Sex',hue_order = ['Female','Male'],data = df)
# FIX: take an explicit copy before adding a column; assigning into a
# slice of `df` triggers pandas' SettingWithCopyWarning and may not stick.
df_temp = df[['Age','Smoker']].copy()
df_temp['Count'] = 1
# Count of samples per (Age, Smoker) pair
df_temp = df_temp.groupby(['Age','Smoker']).agg('sum').reset_index()
df_temp
# Count of samples per Age
df_temp2 = df_temp.groupby('Age').agg('sum').reset_index()
df_temp2
# 2x2 grid of smoker-related comparison plots
fig,axes = plt.subplots(2,2,figsize = (8,6))
sns.boxplot(x = 'Smoker',y = 'Height',order = ['Non','Current'],data = df,ax = axes[0,0])
sns.boxplot(x = 'Smoker',y = 'FEV',order = ['Non','Current'],data = df,ax = axes[0,1])
# FIX: original passed the undefined name `color_palette` (NameError);
# use the default seaborn palette explicitly.
sns.countplot(x = 'Sex',hue = 'Smoker',hue_order = ['Non','Current'],data = df,palette=sns.color_palette(),ax = axes[1,0])
sns.countplot(x = 'Age',hue = 'Smoker',hue_order = ['Non','Current'],data = df,palette = 'rainbow',ax = axes[1,1])
plt.tight_layout()
# Degree-4 polynomial fit of FEV against Height
sns.regplot(x = 'Height',y = 'FEV',order = 4,data = df)
# Generate an automated EDA report as HTML.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling; this
# import only works with the legacy package installed — confirm environment.
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
profile.to_file('profile.html')
# FIX: console output had been pasted into the middle of the statements,
# producing invalid syntax; the output is preserved below as comments.
df['Smoker'].value_counts(normalize = True)
# Non        0.90
# Current    0.10
# Name: Smoker, dtype: float64
df['Sex'].value_counts()
# Male      336
# Female    318
# Name: Sex, dtype: int64
通过特征预测儿童的FEV
df.head()
对类别数据做one_hot_encoding编码处理
# One-hot encode the categorical columns (Sex, Smoker)
df = pd.get_dummies(df)
# The ID column carries no predictive information — drop it
df.drop(['ID'],axis=1,inplace=True)
df.head()
建模
# Split the data into features (X) and the label (y)
X = df.drop(['FEV'],axis = 1)
y = df['FEV']
# Split into training and test sets
from sklearn.model_selection import train_test_split
# FIX: the first print statement was fused onto the train_test_split line
# (SyntaxError); split into separate statements.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 0)
print('训练特征的大小:',X_train.shape)
print('训练标签的大小:',y_train.shape)
print('测试特征的大小:',X_test.shape)
print('测试标签的大小:',y_test.shape)
'''
训练特征的大小: (457, 6)
训练标签的大小: (457,)
测试特征的大小: (197, 6)
测试标签的大小: (197,)
'''
# Correlation matrix of all (now numeric) columns
corr = df.corr()
corr
# Heatmap visualization of the correlation matrix
sns.heatmap(corr,fmt = 'f',annot = True,xticklabels = corr.columns,yticklabels = corr.columns)
创建模型评分函数
def score(y, y_pred):
    """Print MSE and R2 for a prediction and draw three diagnostic plots.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, positionally aligned with ``y``.

    Side effects: prints the two metrics and creates a 1x3 matplotlib figure
    (true-vs-predicted scatter, residual scatter, residual z-score histogram).
    """
    # FIX: the whole function had been collapsed onto one line (SyntaxError);
    # reformatted with proper indentation — statement order unchanged.
    from pylab import mpl
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    # Mean squared error
    print('MSE = {0}'.format(mean_squared_error(y, y_pred)))
    # Coefficient of determination
    print('R2 = {0}'.format(r2_score(y, y_pred)))
    # Residuals and their z-scores, used to flag outliers (|z| > 3)
    y = pd.Series(y)
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)
    # Panel 1: true vs predicted values
    plt.figure(figsize=(18, 5), dpi=80)
    plt.subplot(131)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    # Panel 2: residual scatter
    plt.subplot(132)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('残差分布散点图', fontsize=10)
    plt.ylabel('resid')
    plt.ylim([-3, 3])
    plt.title('std_resid = {:.3f}'.format(std_resid))
    # Panel 3: residual z-score histogram
    plt.subplot(133)
    sns.distplot(z, bins=50)
    plt.xlabel('残差z得分直方图', fontsize=10)
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
    plt.tight_layout()
开始模型训练前,利用岭回归模型预测,剔除异常样本
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
# Use RidgeCV to search the regularization strength automatically
ridge = RidgeCV()
ridge.fit(X_train, y_train)
# FIX: the prediction statement was fused onto the print line (SyntaxError);
# split into separate statements.
print('best_alpha = {0}'.format(ridge.alpha_))
y_pred = ridge.predict(X_train)
score(y_train, y_pred)
best_alpha = 10.0
MSE = 0.17293985227197042
R2 = 0.7631132810439695
找出异常样本点并剔除
# Residuals of the ridge fit on the training set
resid = y_train - y_pred
# FIX: `pd.Series(resid, index=range(len(y_train)))` *reindexes* the residual
# Series by label — y_train keeps its original DataFrame index after
# train_test_split, so most positions would become NaN.  Rebuild from the raw
# values to obtain a clean positional 0..n-1 index.
resid = pd.Series(np.asarray(resid), index=range(len(y_train)))
resid_z = (resid-resid.mean()) / resid.std()
outliers = resid_z[abs(resid_z)>3].index
print(f'{len(outliers)} Outliers:')
print(outliers.tolist())
# FIX: the figure/subplot calls were fused onto the print line (SyntaxError).
plt.figure(figsize=(14,6),dpi=60)
plt.subplot(121)
plt.plot(y_train, y_pred, '.')
# NOTE(review): indexing y_train with positional outlier indices assumes
# positional alignment — confirm y_train is array-like here.
plt.plot(y_train[outliers], y_pred[outliers], 'ro')
plt.title(f'MSE = {mean_squared_error(y_train,y_pred)}')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('y_train')
plt.ylabel('y_pred')
plt.subplot(122)
sns.distplot(resid_z, bins = 50)
sns.distplot(resid_z.loc[outliers], bins = 50, color = 'r')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('z')
plt.tight_layout()
异常样本点剔除,outliers是行索引
# Remove the outlier rows; `outliers` holds row indices
X_train = np.array(pd.DataFrame(X_train).drop(outliers,axis=0))
# FIX: the first print statement was fused onto the line above (SyntaxError);
# split into separate statements.
y_train = np.array(pd.Series(y_train).drop(outliers,axis=0))
print('训练特征的大小:',X_train.shape)
print('训练标签的大小:',y_train.shape)
print('测试特征的大小:',X_test.shape)
print('测试标签的大小:',y_test.shape)
训练特征的大小: (454, 6)
训练标签的大小: (454,)
测试特征的大小: (197, 6)
测试标签的大小: (197,)
13种预测模型
from sklearn.linear_model import LinearRegression # 线性回归
from sklearn.neighbors import KNeighborsRegressor # K近邻回归
from sklearn.svm import SVR # 支持向量回归
from sklearn.linear_model import Lasso # 套索回归
from sklearn.linear_model import Ridge # 岭估计
from sklearn.neural_network import MLPRegressor # 神经网络回归
from sklearn.tree import DecisionTreeRegressor # 决策树回归
from sklearn.tree import ExtraTreeRegressor # 极端随机森林回归
from xgboost import XGBRegressor # XGBoost
from sklearn.ensemble import RandomForestRegressor # 随机森林回归
from sklearn.ensemble import AdaBoostRegressor # Adaboost 集成学习
from sklearn.ensemble import GradientBoostingRegressor # 集成学习梯度提升决策树
from sklearn.ensemble import BaggingRegressor # bagging回归
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# 正常显示中文
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# 正常显示符号
from matplotlib import rcParams
rcParams['axes.unicode_minus']=False#再整理出一组标准化的数据,通过对比可以看出模型的效果有没有提高
# Prepare a standardized copy of the data so results can be compared later
scale_x = StandardScaler()
X1 = scale_x.fit_transform(X)
scale_y = StandardScaler()
y = np.array(y).reshape(-1, 1)
y1 = scale_y.fit_transform(y)
y1 = y1.ravel()
x_train1, x_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=1)
# The 13 candidate regressors and matching display names
models=[LinearRegression(),KNeighborsRegressor(),SVR(),Ridge(),Lasso(),MLPRegressor(alpha=20),DecisionTreeRegressor(),ExtraTreeRegressor(),XGBRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor(),BaggingRegressor()]
models_str=['LinearRegression','KNNRegressor','SVR','Ridge','Lasso','MLPRegressor','DecisionTree','ExtraTree','XGBoost','RandomForest','AdaBoost','GradientBoost','Bagging']
# FIX: `score_=[]model_score = {}` was fused onto one line (SyntaxError).
score_ = []
model_score = {}
X_train = np.array(X_train)
X_test = np.array(X_test)
# FIX: the loop body was fused onto one line (SyntaxError).  The local
# result is named `r2` instead of `score` so it does not shadow the
# score() evaluation helper defined earlier in the script.
for name, model in zip(models_str, models):
    print('开始训练模型:' + name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = model.score(X_test, y_test)
    model_score[name] = r2
    score_.append(str(r2)[:5])
    print(name + ' 得分:' + str(r2))
开始训练模型:LinearRegression
LinearRegression 得分:0.7971676214804524
开始训练模型:KNNRegressor
KNNRegressor 得分:0.7992186690987086
开始训练模型:SVR
SVR 得分:0.7867497469169773
开始训练模型:Ridge
Ridge 得分:0.7971267244847953
开始训练模型:Lasso
Lasso 得分:0.7099415298688412
开始训练模型:MLPRegressor
MLPRegressor 得分:0.6829379368209219
开始训练模型:DecisionTree
DecisionTree 得分:0.7088841813498914
开始训练模型:ExtraTree
ExtraTree 得分:0.7151472981465214
开始训练模型:XGBoost
XGBoost 得分:0.7207793661482227
开始训练模型:RandomForest
RandomForest 得分:0.7724484732603365
开始训练模型:AdaBoost
AdaBoost 得分:0.789789713210759
开始训练模型:GradientBoost
GradientBoost 得分:0.7826655908985967
开始训练模型:Bagging
Bagging 得分:0.7551367079941442
pd.DataFrame({'models':models_str,'未标准化:R^2':score_}).sort_values(by="未标准化:R^2" , ascending=False)
使用标准化的数据
# Re-run the 13 models on the standardized data
models=[LinearRegression(),KNeighborsRegressor(),SVR(),Ridge(),Lasso(),MLPRegressor(alpha=20),DecisionTreeRegressor(),ExtraTreeRegressor(),XGBRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor(),BaggingRegressor()]
models_str=['LinearRegression','KNNRegressor','SVR','Ridge','Lasso','MLPRegressor','DecisionTree','ExtraTree','XGBoost','RandomForest','AdaBoost','GradientBoost','Bagging']
score_1 = []
model_score = {}
# FIX: the loop body was fused onto one line (SyntaxError); the local result
# is named `r2` so it does not shadow the score() evaluation helper.
for name, model in zip(models_str, models):
    # print('开始训练模型:'+name)
    model.fit(x_train1, y_train1)
    y_pred = model.predict(x_test1)
    r2 = model.score(x_test1, y_test1)
    model_score[name] = r2
    score_1.append(str(r2)[:5])
    # print(name +' 得分:'+str(score))
# print(pd.concat([models_str,score_1],axis=1))
pd.DataFrame({'models':models_str,'未标准化':score_,'标准化后':score_1})
使用对数化的数据
# Re-run the 13 models with a log-transformed target
models=[LinearRegression(),KNeighborsRegressor(),SVR(),Ridge(),Lasso(),MLPRegressor(alpha=20),DecisionTreeRegressor(),ExtraTreeRegressor(),XGBRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor(),BaggingRegressor()]
models_str=['LinearRegression','KNNRegressor','SVR','Ridge','Lasso','MLPRegressor','DecisionTree','ExtraTree','XGBoost','RandomForest','AdaBoost','GradientBoost','Bagging']
score_log = []
model_score = {}
# Smooth the target y by taking logs; the features X are left untouched
y_log = np.log(y)
x_train,x_test,y_train_log,y_test_log = train_test_split(X,y_log,test_size = 0.3,random_state = 1)
#
# FIX: the loop body was fused onto one line (SyntaxError); the local result
# is named `r2` so it does not shadow the score() evaluation helper.
for name, model in zip(models_str, models):
    # print('开始训练模型:'+name)
    model.fit(x_train, y_train_log)
    y_pred = model.predict(x_test)
    r2 = model.score(x_test, y_test_log)
    model_score[name] = r2
    score_log.append(str(r2)[:5])
    # print(name +' 得分:'+str(score))
# print(pd.concat([models_str,score_1],axis=1))
pd.DataFrame({'models':models_str,'未标准化':score_,'标准化后':score_1,'对数化处理':score_log})
K近邻回归
from sklearn.neighbors import KNeighborsRegressor
# K-nearest-neighbours regression with k=5 and uniform weights
knn = KNeighborsRegressor(5,weights = 'uniform')
model = knn.fit(X_train,y_train)
# Predictions on the test set, wrapped in a DataFrame for display
FsctY = model.predict(X_test)
FsctY = pd.DataFrame(FsctY)
FsctY
# Evaluate with the score() helper (prints MSE / R2 and diagnostic plots)
pred_knn = model.predict(X_test)
score(y_test, pred_knn)
MSE = 0.15872206639593908
R2 = 0.7992186690987086
score_list = []
# FIX: the loop body was fused onto one line (SyntaxError).  The CV result
# is named `cv_r2` so it does not overwrite the score() helper.
# NOTE(review): assumes cross_val_score is already imported at this point —
# in the article it is only imported further down; confirm notebook order.
for i in np.arange(3, 100):
    knn = KNeighborsRegressor(i, weights='uniform')
    cv_r2 = cross_val_score(knn, X_train, y_train, cv=5, scoring="r2").mean()
    score_list.append(cv_r2)
plt.figure(figsize=[10,5])
plt.plot(range(3,100),score_list)
plt.show()
score_list = []
# FIX: the loop body was fused onto one line (SyntaxError); `cv_r2` avoids
# overwriting the score() helper.
for i in np.arange(3, 20):
    knn = KNeighborsRegressor(i, weights='uniform')
    cv_r2 = cross_val_score(knn, X_train, y_train, cv=5, scoring="r2").mean()
    score_list.append(cv_r2)
plt.figure(figsize=[10,5])
plt.plot(range(3,20),score_list)
# Mark the best CV score and the k achieving it (offset 3 = first k tried)
plt.axhline(y=max(score_list), color='r', linestyle='-')
plt.axvline(x=score_list.index(max(score_list))+3, color='r', linestyle='-')
plt.show()
print(max(score_list))
0.7613005039793965
# Refit with the best k found by cross-validation (k=15)
knn = KNeighborsRegressor(15,weights = 'uniform')
model = knn.fit(X_train,y_train)
FsctY = model.predict(X_test)
FsctY = pd.DataFrame(FsctY)
pred_knn = model.predict(X_test)
score(y_test, pred_knn)
MSE = 0.1504554917992104
R2 = 0.8096757774719427
多元线性回归
sklearn.linear_model.LinearRegression (fit_intercept=True, normalize=False, copy_X=True, n_jobs=None)
# Import the required modules and libraries
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# NOTE(review): the California housing dataset import below appears unused here
from sklearn.datasets import fetch_california_housing as fch # California housing dataset
import pandas as pd# modeling
reg = LR().fit(X_train, y_train)
yhat = reg.predict(X_test) # predict on the test set
yhat
[*zip(X.columns,reg.coef_)]# coefficients paired with feature names
[(‘Age’, 0.05952511330609858),
(‘Height’, 0.10458651179070048),
(‘Sex_Female’, -0.06336513256395504),
(‘Sex_Male’, 0.06336513256395507),
(‘Smoker_Current’, -0.052819177711489945),
(‘Smoker_Non’, 0.05281917771148996)]
特征重要性/参数的可视化
# Visualize the fitted coefficients as a styled bar table.
# FIX: `reg.coef_` is 1-D for a single-target fit, so `reg.coef_[0]` would be
# a scalar; use the full coefficient vector.  The chained calls were also
# mangled into inline ` \.` sequences (SyntaxError) and are re-split.
pd.DataFrame({'variable': X.columns, 'coefficient': reg.coef_}) \
    .round(decimals=2) \
    .sort_values('coefficient', ascending=False) \
    .style.bar(color=['grey', 'lightblue'], align='zero')
reg.intercept_
-4.410081149675156
# Compare against statsmodels' OLS implementation
# (the two libraries yield somewhat different coefficients)
X_train = pd.DataFrame(X_train,columns=X.columns)
import statsmodels.api as sm
# NOTE(review): no constant term is added (sm.add_constant), so this OLS is
# fit without an intercept, unlike the sklearn model — confirm intended.
results = sm.OLS(y_train, X_train).fit()
print(results.summary())
sns.set_style("whitegrid") # whitegrid theme
# Sorted actual vs. predicted values on the test set
plt.plot(range(len(y_test)),sorted(y_test),c="black",label= "Data")
plt.plot(range(len(yhat)),sorted(yhat),c="red",label = "Predict")
plt.legend()
plt.show()
score(y_test, yhat)
MSE = 0.16034346473404532
R2 = 0.7971676214804524
岭回归
#使用岭回归来进行建模
reg = Ridge(alpha=1).fit(X_train,y_train)
reg.score(X_test,y_test)
0.7971267244847953
#交叉验证下,与线性回归相比,岭回归的结果如何变化?
alpharange = np.arange(0,100)
ridge= []
for alpha in alpharange:reg = Ridge(alpha=alpha)regs = cross_val_score(reg,X,y,cv=5,scoring = "r2").mean() # 5折ridge.append(regs)
plt.plot(alpharange,ridge,color="red",label="Ridge")
plt.title("r2_Mean")
plt.legend()
plt.show()
#选取最佳的正则化参数取值
from sklearn.linear_model import RidgeCV
# FIX: the inline `#` comments swallowed the remainder of the call —
# including `.fit(...)` — so the statement never actually fitted anything;
# the commented-out options are moved onto their own lines.
Ridge_ = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0)
                 # ,scoring="neg_mean_squared_error",store_cv_values=True
                 # ,cv=5
                 ).fit(X_train, y_train)
# Ridge score on the held-out test set (no extra cross-validation here)
Ridge_.score(X_test,y_test)
0.7967617152901645
#查看被选择出来的最佳正则化系数
print('best_alpha = {0}'.format(Ridge_.alpha_))
best_alpha = 10.0
pred_Ridge = Ridge_.predict(X_test)
score(y_test, pred_Ridge)
MSE = 0.1606643425218166
R2 = 0.7967617152901645
Lasso回归
利用LassoCV自动选择最佳正则化参数
# LassoCV picks the best regularization strength automatically
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
# FIX: the prediction statement was fused onto the print line (SyntaxError);
# split into separate statements.
print('best_alpha = {0}'.format(lasso.alpha_))
pred_lasso = lasso.predict(X_test)
score(y_test, pred_lasso)
best_alpha = 0.004044127926992565
MSE = 0.16166101021415222
R2 = 0.7955009437397598
支持向量回归(SVR)
使用sklearn中的网格搜索方法 GridSearchCV 寻找SVR最优模型参数
创建GridSearchCV网格参数搜寻函数,评价标准为最小均方误差,采用K折交叉验证的检验方法
def gsearch(model, param_grid, scoring='neg_mean_squared_error', splits=5, repeats=1, n_jobs=-1):
    """Grid-search `model` over `param_grid` using repeated K-fold CV.

    Fits on the module-level X_train / y_train, prints the best parameter
    combination and the best (absolute) score, and returns the fitted
    GridSearchCV object.
    """
    # FIX: the function had been collapsed onto a single line (SyntaxError);
    # reformatted.  Also pass the caller's `n_jobs` through instead of the
    # hard-coded -1 that silently ignored the parameter (default unchanged).
    # `repeats`-times repeated `splits`-fold cross validation
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=0)
    model_gs = GridSearchCV(model, param_grid=param_grid, scoring=scoring,
                            cv=rkfold, verbose=1, n_jobs=n_jobs)
    # NOTE(review): relies on the globals X_train / y_train rather than on
    # parameters — confirm this matches how the notebook calls it.
    model_gs.fit(X_train, y_train)
    print('参数最佳取值: {0}'.format(model_gs.best_params_))
    print('最小均方误差: {0}'.format(abs(model_gs.best_score_)))
    return model_gs
使用SVR回归器默认的“rbf”内核,即高斯核
对惩罚参数C与核系数gamma进行网格搜索CV验证
# SVR with the default RBF (Gaussian) kernel; grid-search C and gamma
svr = SVR()
cv_params = {'C': np.logspace(0, 3, 4), 'gamma': np.logspace(-4, -1, 4)}
svr = gsearch(svr, cv_params)
参数最佳取值: {‘C’: 100.0, ‘gamma’: 0.0001}
最小均方误差: 0.15531991719148028
pred_svr = svr.predict(X_test)
score(y_test, pred_svr)
MSE = 0.15136862309913945
R2 = 0.8085206783615895
XGB回归(XGBRegressor )
# 调参
# 初始参数值
params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
1.最佳迭代次数:n_estimators
cv_params = {'n_estimators': [100,200,300,400,500,600,700,800,900,1000,1100,1200]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘n_estimators’: 100}
最小均方误差: 0.22547065866645086
# 更新参数
params['n_estimators'] = 100
2.min_child_weight 以及 max_depth
cv_params = {'max_depth': [3,4,5,6,7,8,9],'min_child_weight': [1,2,3,4,5,6,7]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘max_depth’: 3, ‘min_child_weight’: 7}
最小均方误差: 0.1787887844016734
# 更新参数
params['max_depth'] = 3
params['min_child_weight'] = 7
3.后剪枝参数 gamma
cv_params = {'gamma': [0,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘gamma’: 0.6}
最小均方误差: 0.1738115095993947
# 更新参数
params['gamma'] = 0.6
4.样本采样subsample 和 列采样colsample_bytree
cv_params = {'subsample': [0.6,0.7,0.8,0.9],'colsample_bytree': [0.6,0.7,0.8,0.9]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘colsample_bytree’: 0.9, ‘subsample’: 0.8}
最小均方误差: 0.16685866639860653
# 更新参数
params['subsample'] = 0.9
params['colsample_bytree'] = 0.8
5.L1正则项参数reg_alpha 和 L2正则项参数reg_lambda
cv_params = {'reg_alpha': [0,0.02,0.05,0.1,1,2,3],'reg_lambda': [0,0.02,0.05,0.1,1,2,3]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘reg_alpha’: 0, ‘reg_lambda’: 0}
最小均方误差: 0.1741069934979665
不做更新
6.最后是learning_rate,一般这时候要调小学习率来测试
cv_params = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1, 0.2]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
参数最佳取值: {‘learning_rate’: 0.07}
最小均方误差: 0.1735823933607791
不做更新
参数调优完成
# Final grid refinement of subsample / colsample_bytree with tuned params
cv_params = {'subsample': [0.6,0.7,0.8,0.9],'colsample_bytree': [0.6,0.7,0.8,0.9]}
xgb = XGBRegressor(**params)
# FIX: the predict call was fused onto the gsearch line (SyntaxError);
# split into separate statements.
xgb = gsearch(xgb, cv_params)
pred_xgb = xgb.predict(X_test)
score(y_test, pred_xgb)
参数最佳取值: {‘colsample_bytree’: 0.9, ‘subsample’: 0.8}
最小均方误差: 0.16685866639860653
MSE = 0.1562520212506388
R2 = 0.8023432437903115
模型评估
# Learning curves (train/test MSE vs. training-set size) for the six models
models = [KNeighborsRegressor(),LinearRegression(),Ridge(alpha=1),lasso, svr, xgb]
model_names = ['KNeighborsRegressor','LinearRegression','Ridge','Lasso','SVR','XGB']
# FIX: the whole learning-curve loop had been collapsed onto one line
# (SyntaxError); reformatted with proper indentation.
plt.figure(figsize=(20, 5))
for i, m in enumerate(models):
    train_sizes, train_scores, test_scores = learning_curve(
        m, X, y, cv=5, scoring='neg_mean_squared_error',
        train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=-1)
    # Negate: scoring is *negative* MSE, so flip the sign for plotting
    train_scores_mean = -train_scores.mean(axis=1)
    test_scores_mean = -test_scores.mean(axis=1)
    plt.subplot(2, 3, i + 1)
    plt.plot(train_sizes, train_scores_mean, 'o-', label='Train')
    plt.plot(train_sizes, test_scores_mean, '^-', label='Test')
    plt.xlabel('Train_size')
    plt.ylabel('Score')
    plt.ylim([0, 0.35])
    plt.title(model_names[i], fontsize=16)
    plt.legend()
    plt.grid()
plt.tight_layout()
模型加权融合
对于多个模型结果,采取加权融合的办法进行结合,即对各模型预测结果取加权平均,这样可用避免单个模型在预测某一部分数据时产生较大的误差。
# Show more decimals when comparing blended MSEs
pd.set_option('display.float_format', lambda x: '%.10f' % x)

def model_mix(pred_1, pred_2, pred_3):
    """Grid over integer weights 1..5 for three prediction vectors.

    Returns a DataFrame with one row per (knn, SVR, XGB) weight triple and
    the MSE of the weighted-average prediction against the global y_test.
    """
    # FIX: the function had been collapsed onto one line (SyntaxError).
    # Rows are collected in a list instead of DataFrame.append (removed in
    # pandas 2.0); the resulting frame is identical.
    rows = []
    for a in range(1, 6):
        for b in range(1, 6):
            for c in range(1, 6):
                y_pred = (a * pred_1 + b * pred_2 + c * pred_3) / (a + b + c)
                mse = mean_squared_error(y_test, y_pred)
                rows.append({'knn': a, 'SVR': b, 'XGB': c, 'Combine': mse})
    return pd.DataFrame(rows, columns=['knn', 'SVR', 'XGB', 'Combine'])

model_combine = model_mix( FsctY, pred_svr, pred_xgb)
model_combine.sort_values(by='Combine', inplace=True)
model_combine
在测试集中,Knn、SVR、XGBRegressor三种模型分别取权重为5/11、5/11、1/11时得到的预测数据均方误差最小,Min(MSE) = 0.14592
模型预测 对3种模型预测结果进行加权融合
pd.set_option('display.float_format', lambda x: '%.4f' % x)
ans_mix = (5*FsctY + 5 * pred_svr + 1 * pred_xgb) / 11
ans_mix
score(y_test, ans_mix)
MSE = 0.14591990076794314
R2 = 0.8154132406007941
案例:儿童呼吸道疾病数据集相关推荐
- 生命早期肠道微生物群与儿童呼吸道疾病之间的关联
谷禾健康 儿童呼吸系统疾病,包括呼吸道感染.反复喘息和哮喘,是儿童及其以后年龄发病和死亡的重要原因. 而哮喘是其中比较典型的一种,哮喘是全球最常见的慢性疾病之一,是一种复杂的.异质性的免疫介导的紊乱集 ...
- 安搭Share提醒,谨防秋冬季儿童呼吸道疾病
眼下,天气干燥,早晚温差较大,儿童容易患呼吸道感染等疾病.日前,安搭Share从各医院儿科门诊了解到,自上周起儿童发病率呈上升趋势,就诊患儿数量明显增加.安搭Share提醒,市民需尽量不要去空气流通性 ...
- R语言logistic回归、判别分析(LDA)、多元自适应样条回归MARS分析案例:分析乳腺癌数据集明确细针穿刺肿瘤活检结果
R语言logistic回归.判别分析(LDA).多元自适应样条回归MARS分析案例:分析乳腺癌数据集明确细针穿刺肿瘤活检结果 目录
- 预防防御鸡呼吸道疾病 鸡吃啥药防治呼吸道感染
鸡呼吸道疾病高发,多数是因为通风不合理,空气不流通,湿度不行,贼风侵袭,温差过大等引起.达龙支克用于治疗鸡大肠杆菌病.沙门氏菌病以及支原体引起的滑液囊和呼吸道疾病. 鸡舍的门直接对着鸡笼,人进进出出, ...
- 三伏天空调当道 小心引起呼吸道疾病
据<杭州网>报道,近日记者去杭州市红十字会医院采访,路过输液室,里面人声鼎沸,黑压压坐满了人.一位家长左手高举盐水袋,右手抱着孩子,转了一圈没找到座位.大多数病人的症状很类似,主要有咽喉疼 ...
- 鸡感染呼吸道疾病怎么办 防治鸡流鼻涕的特效药
鸡感染呼吸道疾病怎么办 防治鸡流鼻涕的特效药 秋冬季是鸡呼吸道病的高发期,一旦感染有的患鸡治疗起来还很麻烦.因此,我们应掌握呼吸道疾病发病原因,尽早做好预防,避免这些病因滋生.对于已经感染的患鸡,要尽 ...
- 宝宝生病处理锦囊+儿童常见疾病
曾经因为宝宝生病在网上像无头苍蝇一样转来转去,不知道应该从何处找起.有空的时候发现这里其实有很多很好的处理方法,但是很散,一不小心就沉了,所以想发个帖子,把这些帖子都收集一起,欢迎各位妈妈将自己看到的 ...
- SQL案例_0_员工表数据集
数据库数据集 数据集说明 这里参考Oracle的SCOTT用户下的员工信息表,该用户下有4张表.详细的员工表结构和数据见网盘链接: 链接:https://pan.baidu.com/s/1CbnJSO ...
- 【Pytorch神经网络实战案例】01 CIFAR-10数据集:Pytorch使用GPU训练CNN模版-方法①
import torch import torchvision from torch import nn from torch.utils.tensorboard import SummaryWrit ...
最新文章
- ffmpeg linux安装_ffmpeg命令中文手册
- OpenCV 错误:无法打开摄像头(打开摄像头卡机)
- 千万级游标_在一个千万级的数据库查寻中,如何提高查询效率
- 模型转换状态已支持Webhook!
- 【自然框架】——思路、结构、特点的介绍(初稿,欢迎大家多提意见)
- Linux配置汇总上(北大青鸟Linux课程学习总结)
- 搭建K8s集群(kubeadm方式)-部署master节点
- 【渝粤题库】陕西师范大学163202 管理学原理 作业(高起本 专升本)
- 检测系列--YOLO系列
- python面向对象程序设计实训学生自我总结_Python面向对象程序设计示例小结
- fastnest怎么一键排版_什么公众号排版编辑器可以换字体?公众号字体在哪里选择修改?...
- C语言预定义宏的使用
- mysql grant 用户权限
- ROSE HA高可用性软件介绍(转载)
- 基于人脸识别录入 人脸图片识别 及测试的效果
- 如何进行音频合并?很简单,只需三步骤
- 基于人脸识别的门禁系统设计与实现--论文
- 初中计算机室教学计划,初中信息技术教学计划
- 怎么修改windows10在cmd下的用户名为英文名
- MAC 本机电脑ip自动分配改变,导致的坑
热门文章
- 计算机技术有哪些系统,电脑系统教程:win7有哪些系统版本和区别
- system占用cpu解决(ntoskrnl.exe)
- 加了权重样本的AUC如何计算
- 经纬恒润荣获极氪汽车“最佳创新奖”
- 398489-28-6,1-Boc-3-ethyl-3-azetidinol,3-乙基-3-羟基氮杂环丁烷-1-羧酸叔丁酯的结构式解析
- 【AlgorithmTraining】03:Project Euler 03
- mysql优化 个人笔记 非礼勿扰 -m06
- E. Gardener and Tree 树形DP
- 51单片机入门自学方法
- You are the reason I am. You are all my reasons