R代码 (葡萄酒)及其可视化分析
#随机森林-支持向量机
library(ggplot2)#绘图工具
library(tidyverse)#ggplot2/dplyr等万能包
# Load the wine data set. The file is read exactly once; a bare
# read.csv() call would only re-read it and dump every row to the console.
wine <- read.csv("wine", header = TRUE, sep = ",")

# Column names to assign, in file order.
colname <- c(
  "type", "fixed.acidity", "volatile.acidity", "citric.acid",
  "residual.sugar", "chlorides", "free.sulfur.dioxide",
  "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol", "quality"
)
# Fail early if the file does not have the expected number of columns.
stopifnot(ncol(wine) == length(colname))
colnames(wine) <- colname

# Continuous variables.
varcontinue <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol"
)

# Coerce the continuous columns to numeric and re-attach the remaining
# columns (type, quality). Values that cannot be parsed become NA.
wine <- cbind(
  lapply(wine[, varcontinue], function(x) as.numeric(as.character(x))),
  wine[, setdiff(colname, varcontinue)]
)

# Inspect the type of every variable.
str(wine)

# Count missing values over the whole data frame.
sum(is.na(wine))

# Look for missing values that are encoded as something other than NA.
table(wine$type)
table(wine$quality)
table(wine$density)

# summary(): minimum, first quartile, median, mean, third quartile and
# maximum of each column.
summary(wine)
# Density-scaled histogram with a kernel density overlay for each
# continuous variable, drawn on a 3 x 4 grid.
# Each entry: variable name, number of breaks, x-axis range, y-axis range.
hist_panel <- function(var, breaks, xlim, ylim) {
  list(var = var, breaks = breaks, xlim = xlim, ylim = ylim)
}
hist_panels <- list(
  hist_panel("fixed.acidity", 12, c(3, 15), c(0, 0.5)),
  hist_panel("volatile.acidity", 12, c(0, 1), c(0, 4)),
  hist_panel("citric.acid", 12, c(0, 0.8), c(0, 4.2)),
  hist_panel("residual.sugar", 12, c(0, 30), c(0, 0.13)),
  hist_panel("chlorides", 12, c(0, 0.5), c(0, 14)),
  hist_panel("free.sulfur.dioxide", 12, c(0, 100), c(0, 0.02)),
  hist_panel("total.sulfur.dioxide", 12, c(6, 300), c(0, 0.008)),
  hist_panel("pH", 12, c(3, 4), c(0, 2.5)),
  hist_panel("sulphates", 16, c(0.1, 1), c(0, 4)),
  hist_panel("alcohol", 12, c(8, 16), c(0, 0.4))
)

# Remember the previous layout so the grid does not leak into later plots.
old_par <- par(mfrow = c(3, 4))
for (panel in hist_panels) {
  values <- wine[[panel$var]]
  # freq = FALSE plots densities, so the y axis is labelled as a density
  # ("密度") rather than as a count.
  hist(
    x = values, breaks = panel$breaks, freq = FALSE,
    col = "red", border = "blue", main = "直方图",
    xlab = panel$var, ylab = "密度",
    xlim = panel$xlim, ylim = panel$ylim
  )
  # na.rm = TRUE: density() stops on missing values, which the numeric
  # coercion of the raw columns can introduce.
  lines(density(values, na.rm = TRUE))
}
par(old_par)
# Split the data into a 70% training set and a 30% test set and inspect both.
# The split is seeded so that it can be reproduced.
set.seed(1234)
train <- sample(seq_len(nrow(wine)), floor(0.7 * nrow(wine)))
wine_train <- wine[train, ]
wine_test <- wine[-train, ]
str(wine_train)
str(wine_test)

# Search for the best mtry (number of variables tried at each split).
library(randomForest)
rf_formula <- as.factor(quality) ~ fixed.acidity + volatile.acidity +
  citric.acid + residual.sugar + chlorides + free.sulfur.dioxide +
  total.sulfur.dioxide + density + pH + sulphates + alcohol
# Number of predictors in the model; mtry cannot exceed it.
n <- length(labels(terms(rf_formula)))
# One out-of-bag error rate per candidate mtry.
rate <- numeric(n)
for (i in seq_len(n)) {
  # Same seed for every candidate so that only mtry differs between fits.
  set.seed(1234)
  rf_train <- randomForest(rf_formula, data = wine_train, mtry = i, ntree = 1000)
  # Out-of-bag misclassification rate of the full forest (last row of
  # err.rate); the per-class columns are deliberately left out.
  rate[i] <- rf_train$err.rate[rf_train$ntree, "OOB"]
  print(rf_train)
}
rate # OOB error rate for every mtry
plot(rate)
#随机森林-支持向量机
library(ggplot2)
library(tidyverse)
# Load the wine data set. The file is read exactly once; a bare
# read.csv() call would only re-read it and dump every row to the console.
wine <- read.csv("wine", header = TRUE, sep = ",")

# Column names to assign, in file order.
colname <- c(
  "type", "fixed.acidity", "volatile.acidity", "citric.acid",
  "residual.sugar", "chlorides", "free.sulfur.dioxide",
  "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol", "quality"
)
# Fail early if the file does not have the expected number of columns.
stopifnot(ncol(wine) == length(colname))
colnames(wine) <- colname

# Continuous variables.
varcontinue <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol"
)

# Coerce the continuous columns to numeric and re-attach the remaining
# columns (type, quality). Values that cannot be parsed become NA.
wine <- cbind(
  lapply(wine[, varcontinue], function(x) as.numeric(as.character(x))),
  wine[, setdiff(colname, varcontinue)]
)

# Inspect the type of every variable.
str(wine)

# Count missing values over the whole data frame.
sum(is.na(wine))

# Look for missing values that are encoded as something other than NA.
table(wine$type)
table(wine$quality)
table(wine$density)

# summary(): minimum, first quartile, median, mean, third quartile and
# maximum of each column.
summary(wine)
# Density-scaled histogram with a kernel density overlay for each numeric
# variable (including quality and density), drawn on a 3 x 4 grid.
# Each entry: variable name, number of breaks, x-axis range, y-axis range.
hist_panel <- function(var, breaks, xlim, ylim) {
  list(var = var, breaks = breaks, xlim = xlim, ylim = ylim)
}
hist_panels <- list(
  hist_panel("fixed.acidity", 12, c(3, 15), c(0, 0.5)),
  hist_panel("volatile.acidity", 12, c(0, 1), c(0, 4)),
  hist_panel("citric.acid", 12, c(0, 0.8), c(0, 4.2)),
  hist_panel("residual.sugar", 12, c(0, 30), c(0, 0.13)),
  hist_panel("chlorides", 12, c(0, 0.5), c(0, 14)),
  hist_panel("free.sulfur.dioxide", 12, c(0, 100), c(0, 0.02)),
  hist_panel("total.sulfur.dioxide", 12, c(6, 300), c(0, 0.008)),
  hist_panel("pH", 12, c(3, 4), c(0, 2.5)),
  hist_panel("sulphates", 16, c(0.1, 1), c(0, 4)),
  hist_panel("alcohol", 12, c(8, 16), c(0, 0.4)),
  hist_panel("quality", 16, c(2, 10), c(0, 1)),
  hist_panel("density", 16, c(0.99, 1.009), c(0, 100))
)

# Remember the previous layout so the grid does not leak into later plots.
old_par <- par(mfrow = c(3, 4))
for (panel in hist_panels) {
  values <- wine[[panel$var]]
  # freq = FALSE plots densities, so the y axis is labelled as a density
  # ("密度") rather than as a count.
  hist(
    x = values, breaks = panel$breaks, freq = FALSE,
    col = "green", border = "yellow", main = "直方图",
    xlab = panel$var, ylab = "密度",
    xlim = panel$xlim, ylim = panel$ylim
  )
  # na.rm = TRUE: density() stops on missing values, which the numeric
  # coercion of the raw columns can introduce.
  lines(density(values, na.rm = TRUE))
}
par(old_par)
# Split the data into a 70% training set and a 30% test set and inspect the
# training set. The split is seeded so that it can be reproduced.
set.seed(1234)
train <- sample(seq_len(nrow(wine)), floor(0.7 * nrow(wine)))
wine_train <- wine[train, ]
wine_test <- wine[-train, ]
str(wine_train)

# Search for the best mtry, i.e. the number of variables tried at each split.
library("randomForest")
rf_formula <- as.factor(quality) ~ fixed.acidity + volatile.acidity +
  citric.acid + residual.sugar + chlorides + free.sulfur.dioxide +
  total.sulfur.dioxide + density + pH + sulphates + alcohol
# Number of predictors in the model. Counting every column of wine_train
# would also count `type` and `quality`, which are not predictors here, and
# would push mtry past its valid range.
n <- length(labels(terms(rf_formula)))
# One out-of-bag error rate per candidate mtry (preallocated).
rate <- numeric(n)
for (i in seq_len(n)) {
  # Same seed for every candidate so that only mtry differs between fits.
  set.seed(1234)
  rf_train <- randomForest(rf_formula, data = wine_train, mtry = i, ntree = 1000)
  # Out-of-bag misclassification rate of the full forest (last row of
  # err.rate); the per-class columns are deliberately left out.
  rate[i] <- rf_train$err.rate[rf_train$ntree, "OOB"]
  print(rf_train)
}
rate # OOB error rate for every mtry
plot(rate)
# Search for the best ntree, i.e. the number of trees in the forest.
set.seed(100)
# mtry = 11 uses all eleven predictors at every split. A larger value is
# outside the valid range; randomForest warns and resets it to this value.
rf_train <- randomForest(
  as.factor(quality) ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
    density + pH + sulphates + alcohol,
  data = wine_train, mtry = 11, ntree = 1000
)
# Error rate (OOB and per class) against the number of trees.
plot(rf_train)
# Label every curve from the err.rate column names. The colours and line
# types follow the matplot() defaults used by plot.randomForest
# (colours 1:6 and line types 1:5, recycled).
err_names <- colnames(rf_train$err.rate)
legend(
  "topright",
  legend = err_names,
  col = rep_len(1:6, length(err_names)),
  lty = rep_len(1:5, length(err_names)),
  cex = 0.9, bty = "n"
)
# Fit the final random forest and evaluate it.
set.seed(100)
# mtry = 11 uses all eleven predictors at every split. A larger value is
# outside the valid range; randomForest warns and resets it to this value.
rf_train <- randomForest(
  as.factor(quality) ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
    density + pH + sulphates + alcohol,
  data = wine_train, mtry = 11, ntree = 500,
  importance = TRUE, proximity = TRUE
)

# Variable importance table and plot.
importance <- randomForest::importance(rf_train)
importance
varImpPlot(rf_train)

library(caret)
# confusionMatrix() needs two factors with the same levels; quality is
# stored as a number, so it is converted using the training-set classes.
quality_levels <- levels(as.factor(wine_train$quality))
truth_train <- factor(wine_train$quality, levels = quality_levels)

# Confusion matrix on the training data (resubstitution, hence optimistic).
pred_rf <- factor(
  predict(rf_train, newdata = wine_train, type = "response"),
  levels = quality_levels
)
confusionMatrix(pred_rf, truth_train)

# Out-of-bag predictions: predict() without newdata returns them.
pred1 <- predict(rf_train)
Freq1 <- table(pred1, truth_train)
# Share of the trace in the total, i.e. the out-of-bag accuracy. Both
# margins share the same levels, so the table is square.
sum(diag(Freq1)) / sum(Freq1)

# Confusion matrix on the held-out test set.
pred_rf_test <- factor(
  predict(rf_train, newdata = wine_test, type = "response"),
  levels = quality_levels
)
confusionMatrix(pred_rf_test, factor(wine_test$quality, levels = quality_levels))
# SVM analysis.
library("kernlab") # ksvm()
library("caret") # confusionMatrix()
set.seed(1234)
# RBF-kernel support vector classifier, fitted once.
svm_model <- ksvm(
  as.factor(quality) ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
    density + pH + sulphates + alcohol,
  data = wine_train, kernel = "rbfdot"
)

# confusionMatrix() needs two factors with the same levels; quality is
# stored as a number, so both sides are mapped onto the training-set classes.
quality_levels <- levels(as.factor(wine_train$quality))

# Confusion matrix on the training data.
pred_svm <- factor(
  predict(svm_model, wine_train, type = "response"),
  levels = quality_levels
)
confusionMatrix(pred_svm, factor(wine_train$quality, levels = quality_levels))

# Validate on the test set. Using the training levels for the reference keeps
# the call valid even if a rare quality score is absent from the test split.
pred_svm_test <- factor(
  predict(svm_model, wine_test, type = "response"),
  levels = quality_levels
)
confusionMatrix(
  pred_svm_test,
  factor(wine_test$quality, levels = quality_levels)
)
library(pROC)
library(ggplot2)

# ROC comparison of the random forest and the SVM on the test set.
# quality has more than two classes, so multiclass.roc() is used: it builds
# one ROC curve per pair of classes and reports a multi-class AUC.

# Draw all pairwise curves of one model in a single panel and put the
# multi-class AUC into the panel title. Returns the multiclass.roc object.
plot_multiclass_roc <- function(truth, score, title) {
  mc <- multiclass.roc(response = truth, predictor = score)
  curves <- mc$rocs
  plot.roc(
    curves[[1]],
    main = sprintf("%s (AUC = %.3f)", title, as.numeric(mc$auc))
  )
  for (k in seq_along(curves)[-1]) {
    lines.roc(curves[[k]], col = k)
  }
  invisible(mc)
}

# Test-set predictions of the random forest.
pred_rf_test <- predict(rf_train, newdata = wine_test)

# Turn the predicted classes into the quality scores they stand for;
# as.numeric() on a factor would return level codes instead.
rf_score_test <- as.numeric(as.character(pred_rf_test))
svm_score_test <- as.numeric(as.character(pred_svm_test))
truth_test <- as.numeric(wine_test$quality)

# Two panels side by side; the previous layout is restored afterwards.
old_par <- par(mfrow = c(1, 2))
plot_multiclass_roc(truth_test, rf_score_test, "随机森林ROC")
plot_multiclass_roc(truth_test, svm_score_test, "支持向量ROC")
par(old_par)

【R代码 (葡萄酒)及其可视化分析 #随机森林-支持向量机】相关推荐

  1. R语言决策树、bagging、随机森林模型在训练集以及测试集的预测结果(accuracy、F1、偏差Deviance)对比分析、计算训练集和测试集的预测结果的差值来分析模型的过拟合(overfit)情况

    R语言决策树、bagging、随机森林模型在训练集以及测试集的预测结果(accuracy、F1、偏差Deviance)对比分析、计算训练集和测试集的预测结果的差值来分析模型的过拟合(overfit)情况 ...

  2. R语言使用caret包构建随机森林模型(random forest)构建回归模型、通过method参数指定算法名称、通过ntree参数指定随机森林中树的个数

    R语言使用caret包构建随机森林模型(random forest)构建回归模型.通过method参数指定算法名称.通过ntree参数指定随机森林中树的个数 目录

  3. 通过Python做葡萄酒成分与质量的关系分析并可视化--GBDT/随机森林特征选取

    葡萄酒成分与质量关系分析 -- 通过GBDT以及Random Forests进行特征选取 在UCI下载葡萄酒数据集,链接:https://archive.ics.uci.edu/ml/machine- ...

  4. 随机森林回归预测r语言_R包 randomForest 进行随机森林分析

    randomForest 包提供了利用随机森林算法解决分类和回归问题的功能:我们这里只关注随机森林算法在分类问题中的应用 首先安装这个R包 install.packages("randomF ...

  5. 随机森林分类算法python代码_Python机器学习笔记:随机森林算法

    随机森林算法的理论知识 随机森林是一种有监督学习算法,是以决策树为基学习器的集成学习算法.随机森林非常简单,易于实现,计算开销也很小,但是它在分类和回归上表现出非常惊人的性能,因此,随机森林被誉为" ...

  6. 【R】【决策树】【随机森林】

    文章目录 实验思维导图 1.决策树--ctree()--iris 1.1 数据 1.1.1 程序包加载 1.1.2 数据集探索 1.1.3 数据集拆分 1.2 训练 1.2.1 设置因变量.自变量 1 ...

  7. R语言数据缺失值处理(随机森林,多重插补)

    缺失值是指数据由于种种因素导致的数据不完整,可以分为机械原因和人为原因.对于缺失值我们通常采用以下几种方法来进行插补. 1.读取数据 通过read.csv函数导入文档,也可以用其他函数读入,如open ...

  8. R语言使用randomForest包构建随机森林模型(Random forests)、使用importance函数查看特征重要度、使用table函数计算混淆矩阵评估分类模型性能、包外错误估计OOB

    R语言使用randomForest包中的randomForest函数构建随机森林模型(Random forests).使用importance函数查看特征重要度.使用table函数计算混淆矩阵评估分类 ...

  9. R语言使用randomForest包构建随机森林模型的步骤和流程、随机森林算法包括抽样样本(观察)和变量来创建大量的决策树(多个树,构成了森林,而且通过样本抽样和变量抽样,让多个树尽量不同)

    R语言使用randomForest包中的randomForest函数构建随机森林模型的步骤和流程(Random forests).随机森林算法包括抽样样本(观察)和变量来创建大量的决策树(多个树,构成 ...

最新文章

  1. codeforces A. Jeff and Digits 解题报告
  2. 百度AI快车道—企业深度学习实战营,推荐系统主题专场即将开课
  3. 人类评估已不是NLG的最佳标准,华盛顿大学提出全新观点,网友:那是评估人水平不行...
  4. 检查JavaScript中变量是数字还是字符串
  5. 《程序是怎样跑起来的》第一章读后感
  6. 数据结构:平衡二叉树概念、旋转
  7. Sturt2做表单重复提交
  8. java 正则表达式 替换字符串img标签的路径_python面试题汇总第06期-正则表达式(内附7题及答案)...
  9. android 城市列表数据,用RecyclerView写的城市列表
  10. 获取指定日期的常用前后节点(月初月末周一周末等等)
  11. extjs 日期不显示
  12. matlab打开矩阵表,如何将Excel数据导入MATLAB中?:EXCLE中通过矩阵表输出选项
  13. 常用的 iptables配置脚本
  14. ruby环境变量的文件读取形式
  15. 办公软件在多屏宽屏上的应用设想
  16. SPICE电路仿真软件介绍
  17. 医用计算机app,‎App Store 上的“金融-计算器”
  18. OTT盒子市场起量正当时 12家主控厂商争夺3000万市场
  19. python 汉字 简繁体 转换方法
  20. 域名过期后还能续费域名吗?

热门文章

  1. ARM服务器和云手游
  2. 微信扫码登陆在chrome浏览器失败
  3. BUUCTF msic 专题(120)[QCTF2018]X-man-A face
  4. 使用luckysheet实现excel导入导出
  5. 安信实验室呼吁键盘厂商申请windows徽标认证(WHQL)
  6. 【架构师】零基础到精通——网关详解
  7. win10微软图标点击无反应_Win10系统点击无线图标没反应的解决方法
  8. python中常用英语口语_1000句常用英语口语
  9. 3D点云的快速分割:自动驾驶汽车应用的LiDAR处理实例
  10. 基于QT的人脸识别考勤管理系统【一】