(R)机器学习--学习笔记--第四章节学习笔记
install.packages("tm")
library(tm)
install.packages("ggplot2")
library(ggplot2)
# Root folder of the email corpus, and the sub-folder with the "easy ham" messages.
data.path <- "G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path <- paste0(data.path, "easy_ham/")
# Data processing: parse one email file into its fields
# Parse a single email file into a character vector of its fields.
#
# path: location of the email file; it is handed to msg.full() to read the lines.
# Returns the combined vector c(date, sender, subject, body, path), in that order.
parse.email <- function(path) {
  raw.lines <- msg.full(path)
  c(
    get.date(raw.lines),
    get.from(raw.lines),
    get.subject(raw.lines),
    get.msg(raw.lines),
    path
  )
}
# Helper 1: read the full contents of the email file
# Read every line of an email file.
#
# path: location of the file to read.
# Returns a character vector with one element per line.
# The file is opened in text mode and decoded as latin1.
msg.full <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Register the close immediately so the connection is released even when
  # readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
# Helper 2: extract the sender's email address
# The split pattern used below is a regular expression; it is worth studying in detail
# Extract the sender address from the lines of an email.
#
# msg.vec: character vector of message lines.
# Returns the first token containing "@" on the first line that contains
# "From: ", "" when no line contains "From: ", and NA when that line has no
# token with an "@".
get.from <- function(msg.vec) {
  from.lines <- msg.vec[grepl("From: ", msg.vec)]
  pieces <- strsplit(from.lines, '[":<> ]')
  if (length(pieces) < 1) {
    return("")
  }
  # Only the first matching line is used; drop the empty tokens left by the split.
  tokens <- pieces[[1]]
  tokens <- tokens[which(tokens != "" & tokens != " ")]
  return(tokens[grepl("@", tokens)][1])
}
# Helper 3: extract the message body
# Extract the message body: everything after the first empty line.
#
# msg.vec: character vector of message lines.
# Returns the body lines joined with "\n", or "" when there is no empty line
# or nothing follows it.
get.msg <- function(msg.vec) {
  first.blank <- which(msg.vec == "")[1]
  # is.na(): no empty line at all. >= length(): the empty line is the last
  # line, so there is no body (building the index range would fail otherwise).
  if (is.na(first.blank) || first.blank >= length(msg.vec)) {
    return("")
  }
  body <- msg.vec[(first.blank + 1):length(msg.vec)]
  paste(body, collapse = "\n")
}
# Helper 4: extract the email subject
# Extract the subject from the lines of an email.
#
# msg.vec: character vector of message lines.
# Returns the text after "Subject: " on the first line containing it, or ""
# when no line contains "Subject: ".
get.subject <- function(msg.vec) {
  subject.lines <- msg.vec[grepl("Subject: ", msg.vec)]
  if (length(subject.lines) == 0) {
    return("")
  }
  pieces <- strsplit(subject.lines, "Subject: ")
  return(pieces[[1]][2])
}
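# Helper 5: extract the date and time the email was received
# The date line is split on a colon followed by a space, a plus sign, or a minus sign
# Leading and trailing whitespace is then stripped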
# Extract the date string from the lines of an email.
#
# msg.vec: character vector of message lines.
# Takes the first line starting with "Date: ", splits it on "+", "-" or ": ",
# keeps the second piece, trims surrounding whitespace and cuts it to a width
# of 25.
get.date <- function(msg.vec) {
  first.hit <- which(grepl("^Date: ", msg.vec))[1]
  pieces <- strsplit(msg.vec[first.hit], "\\+|\\-|: ")
  trimmed <- gsub("^\\s+|\\s+$", "", pieces[[1]][2])
  return(strtrim(trimmed, 25))
}
# Final assembly of the parsed data
# List the files in the easy-ham folder and parse each one into a row.
easyham.docs <- dir(easyham.path)
# NOTE(review): the original filter excluded "cdms"; the non-email index file in
# the corpus folder is presumably named "cmds". Both spellings are excluded
# here - confirm against the actual data folder.
easyham.docs <- easyham.docs[!(easyham.docs %in% c("cmds", "cdms"))]
easyham.parse <- lapply(
  easyham.docs,
  function(p) parse.email(paste0(easyham.path, p))
)
# Stack the per-file vectors into a character matrix, one row per email.
ehparse.matrix <- do.call(rbind, easyham.parse)
# The argument must be spelled stringsAsFactors; a misspelt name is treated as
# an extra data column instead of an option.
allparse.df <- data.frame(ehparse.matrix, stringsAsFactors = FALSE)
names(allparse.df) <- c("Date", "From.Email", "Subject", "Message", "Path")
head(allparse.df)
# Date handling (reusable)
# Convert date strings to date-times, trying two formats.
#
# dates:    character vector of date strings.
# pattern1: strptime() format tried first.
# pattern2: strptime() format used for entries pattern1 could not parse.
# Returns the strptime() result (POSIXlt); entries that match neither pattern
# stay NA.
date.converter <- function(dates, pattern1, pattern2) {
  pattern1.convert <- strptime(dates, pattern1)
  pattern2.convert <- strptime(dates, pattern2)
  # Fill the gaps left by pattern1 with the pattern2 results at the SAME
  # positions, so both sides must use the mask of pattern1's failures.
  needs.fallback <- is.na(pattern1.convert)
  pattern1.convert[needs.fallback] <- pattern2.convert[needs.fallback]
  return(pattern1.convert)
}
# Date conversion easily goes wrong at this point and yields NA; the locale setting below is needed to avoid that
# Switch time parsing to the C locale, then convert the Date column using the
# two formats (with and without a leading weekday name).
Sys.setlocale(category = "LC_TIME", locale = "C")
pattern1 <- "%a, %d %b %Y %H:%M:%S"
pattern2 <- "%d %b %Y %H:%M:%S"
allparse.df$Date <- with(allparse.df, date.converter(Date, pattern1, pattern2))
# Process the other fields
# Lower-case subjects and sender addresses.
allparse.df[["Subject"]] <- tolower(allparse.df[["Subject"]])
allparse.df[["From.Email"]] <- tolower(allparse.df[["From.Email"]])
# Order the emails by date and keep the first half as the training set.
priority.df <- allparse.df[order(allparse.df$Date), ]
priority.train <- priority.df[1:(round(nrow(priority.df) / 2)), ]
# Set up the weighting scheme
library(plyr)
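# The main problem encountered here was:
# Error: 'names' attribute [11] must be the same length as the vector [1]
# The cause and fix are explained in the English note below: after conversion the date occupies 9 fields, so the lengths no longer match.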
#The problem is with the Date field,
#which is at this point a list that contains 9 fields
#(sec, min, hour, mday, mon, year, wday, yday, isdst).
#To solve the problem I've simply converted the dates into character vectors,
#used ddply then converted the dates back to Date
# Count the training emails per sender. The Date column is swapped for its
# character form while ddply() runs and is restored straight afterwards.
tmp <- priority.train$Date
priority.train$Date <- as.character(priority.train$Date)
from.weight <- ddply(priority.train, .(From.Email), summarise,
                     Freq = length(Subject))
priority.train$Date <- tmp
rm(tmp)
# rank.message() looks up from.weight$Weight, so the column has to exist.
# NOTE(review): log(Freq + 1) is an assumed weighting - confirm the intended
# transformation.
from.weight <- transform(from.weight, Weight = log(Freq + 1))
head(from.weight)
names(from.weight)
# Added here: a bar chart to inspect the data.
# Sort senders by email count and plot those with more than 6 emails.
from.weight <- from.weight[with(from.weight, order(Freq)), ]
from.ex <- subset(from.weight, Freq > 6)
# seq_len() keeps the bar positions empty when no sender passes the filter
# (1:nrow() would yield c(1, 0) for zero rows).
from.scales <- ggplot(from.ex) +
  geom_rect(aes(xmin = seq_len(nrow(from.ex)) - 0.5,
                xmax = seq_len(nrow(from.ex)) + 0.5,
                ymin = 0,
                ymax = Freq,
                fill = "lightgrey",
                color = "darkblue")) +
  scale_x_continuous(breaks = seq_len(nrow(from.ex)),
                     labels = from.ex$From.Email) +
  coord_flip() +
  scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  ylab("Number of Emails Received (truncated at 6)") +
  xlab("Sender Address") +
  theme_bw() +
  theme(axis.text.y = element_text(size = 5, hjust = 1))
# Make sure the output folder exists before saving into it.
dir.create("images", showWarnings = FALSE)
ggsave(plot = from.scales,
       filename = file.path("images", "0011_from_scales.pdf"),
       height = 4.8,
       width = 7)
# Compute the activity level of each thread
# Build the per-thread activity table.
#
# threads.matrix: matrix whose second column holds thread subjects.
# email.df:       data frame passed on to thread.counts() for each thread.
# Returns the unique thread subjects column-bound to the row-stacked results
# of thread.counts().
get.threads <- function(threads.matrix, email.df) {
  threads <- unique(threads.matrix[, 2])
  per.thread <- lapply(threads, thread.counts, email.df = email.df)
  stats <- do.call(rbind, per.thread)
  return(cbind(threads, stats))
}
# Activity statistics for one thread.
#
# thread:   thread subject to look for.
# email.df: data frame with Subject and Date columns.
# An email belongs to the thread when its Subject equals `thread` or
# "re: <thread>". Returns c(freq, time.span, log.trans.weight) where freq is
# the number of matching emails, time.span the seconds between the earliest
# and latest of them, and log.trans.weight is 10 + log10(freq / time.span).
# Returns c(NA, NA, NA) when fewer than two emails match.
thread.counts <- function(thread, email.df) {
  in.thread <- which(email.df$Subject == thread |
                       email.df$Subject == paste("re:", thread))
  thread.times <- email.df$Date[in.thread]
  freq <- length(thread.times)
  # Bail out before min()/max(): they warn on an empty vector and the result
  # would be discarded anyway.
  if (freq < 2) {
    return(c(NA, NA, NA))
  }
  time.span <- as.numeric(
    difftime(max(thread.times), min(thread.times), units = "secs")
  )
  # NOTE(review): a zero time span (all emails in the same second) yields an
  # infinite weight - confirm whether that case needs special handling.
  trans.weight <- freq / time.span
  log.trans.weight <- 10 + log(trans.weight, base = 10)
  return(c(freq, time.span, log.trans.weight))
}
# Build the thread weight table from the training emails.
# NOTE(review): threads.matrix is not defined in the visible part of this
# script - it must be created before this point.
thread.weights <- get.threads(threads.matrix, priority.train)
# The argument must be spelled stringsAsFactors; a misspelt name is treated as
# an extra data column, which then breaks the four-name assignment below.
thread.weights <- data.frame(thread.weights, stringsAsFactors = FALSE)
names(thread.weights) <- c("Thread", "Freq", "Response", "Weight")
# get.threads() returns a character matrix, so every numeric column has to be
# converted back. Weight included: it is averaged later on.
thread.weights$Freq <- as.numeric(thread.weights$Freq)
thread.weights$Response <- as.numeric(thread.weights$Response)
thread.weights$Weight <- as.numeric(thread.weights$Weight)
# Drop threads for which thread.counts() returned NA.
thread.weights <- subset(thread.weights, is.na(thread.weights$Freq) == FALSE)
head(thread.weights)
# Count term occurrences across a vector of texts.
#
# term.vec: character vector; each element is treated as one document.
# control:  list of options forwarded to TermDocumentMatrix().
# Returns the row sums of the term-document matrix, i.e. one total per term.
term.counts <- function(term.vec, control) {
  tdm <- TermDocumentMatrix(Corpus(VectorSource(term.vec)), control = control)
  dense <- as.matrix(tdm)
  return(rowSums(dense))
}
# Terms that occur in the thread subjects (stop words removed).
thread.terms <- term.counts(thread.weights$Thread,
                            control = list(stopwords = TRUE))
thread.terms <- names(thread.terms)
# Weight of a term = mean weight of the threads whose subject contains it.
# vapply() pins the result to one number per term; seq_along() keeps the row
# names valid when there are no terms.
term.weights <- vapply(
  thread.terms,
  function(term) {
    mean(thread.weights$Weight[grepl(term, thread.weights$Thread, fixed = TRUE)])
  },
  numeric(1)
)
term.weights <- data.frame(list(Term = names(term.weights),
                                Weight = term.weights),
                           stringsAsFactors = FALSE,
                           row.names = seq_along(term.weights))
# Terms that occur in the training message bodies, weighted by log10 of their
# counts.
msg.terms <- term.counts(priority.train$Message,
                         control = list(stopwords = TRUE,
                                        removePunctuation = TRUE,
                                        removeNumbers = TRUE))
msg.weights <- data.frame(list(Term = names(msg.terms),
                               Weight = log(msg.terms, base = 10)),
                          stringsAsFactors = FALSE,
                          row.names = seq_along(msg.terms))
# Look up the mean weight of a set of terms in a weight table.
#
# search.term: when term = TRUE, a named vector whose names are the terms to
#              look up; when term = FALSE, a character vector of the terms
#              themselves.
# weight.df:   data frame with Term and Weight columns.
# term:        selects how search.term is interpreted (see above).
# Returns the mean Weight of the rows whose Term matches, or 1 when
# search.term is empty or nothing matches.
get.weights <- function(search.term, weight.df, term = TRUE) {
  if (length(search.term) == 0) {
    return(1)
  }
  lookup <- if (term) names(search.term) else search.term
  term.match <- match(lookup, weight.df$Term)
  # match() gives row positions in weight.df; index with those positions, not
  # with the positions of the hits inside the search terms.
  match.weights <- weight.df$Weight[term.match[!is.na(term.match)]]
  if (length(match.weights) < 1) {
    return(1)
  }
  return(mean(match.weights))
}
# Ranking calculation.
#
# Compute the priority rank of one email.
#
# path: location of the email file, parsed with parse.email().
# Returns c(date, sender, subject, rank), where rank is the product of five
# weights: sender, sender-in-threads, thread activity, subject terms and body
# terms. Each weight falls back to 1 when nothing matches.
# Reads the global tables from.weight, senders.df, thread.weights,
# term.weights and msg.weights.
# NOTE(review): senders.df is not defined in the visible part of this script.
rank.message <- function(path) {
  msg <- parse.email(path)

  # Weight of the first row of weight.df whose From.Email equals `email`,
  # or 1 when there is no such row.
  first.weight <- function(weight.df, email) {
    hits <- which(weight.df$From.Email == email)
    if (length(hits) > 0) {
      weight.df$Weight[hits[1]]
    } else {
      1
    }
  }

  # First: weight of the sender.
  from <- first.weight(from.weight, msg[2])
  # Second is based on senders in threads, and threads themselves
  thread.from <- first.weight(senders.df, msg[2])

  # A subject that starts with "re: " splits into "" plus the thread subject.
  # isTRUE() turns an empty or NA subject into FALSE instead of an if (NA)
  # error.
  subj <- strsplit(tolower(msg[3]), "re: ")
  is.thread <- isTRUE(subj[[1]][1] == "")
  if (is.thread) {
    activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
  } else {
    activity <- 1
  }

  # Next, weight based on terms
  # Weight based on terms in threads
  subj.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
  subj.terms.weight <- get.weights(subj.terms, term.weights)

  # Weight based terms in all messages. Local names differ from the global
  # msg.weights table so the lookup below clearly reads the global one.
  body.terms <- term.counts(msg[4],
                            control = list(stopwords = TRUE,
                                           removePunctuation = TRUE,
                                           removeNumbers = TRUE))
  body.terms.weight <- get.weights(body.terms, msg.weights)

  # Calculate rank by interacting all weights
  priority <- prod(from,
                   thread.from,
                   activity,
                   subj.terms.weight,
                   body.terms.weight)
  return(c(msg[1], msg[2], msg[3], priority))
}
# Split the date-ordered paths in half again: the first half is for training,
# the remainder for testing.
train.paths <- priority.df[["Path"]][1:(round(nrow(priority.df) / 2))]
test.paths <- priority.df[["Path"]][
  ((round(nrow(priority.df) / 2)) + 1):nrow(priority.df)
]
head(test.paths)
# Rank every training email, then assemble path, rank fields and a "TRAINING"
# label into one data frame.
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- cbind(train.paths, do.call(rbind, train.ranks), "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
# The matrix is character, so Rank has to be converted back to numbers.
train.ranks.df[["Rank"]] <- as.numeric(train.ranks.df[["Rank"]])
(R)机器学习--学习笔记--第四章节学习笔记相关推荐
- 线性代数学习笔记——第四章学习指南——n维向量空间
一.学习内容及要求 1. 内容: §4.1. n维向量空间的概念 线性代数学习笔记--第四十讲--n维向量空间的概念 线性代数学习笔记--第四十一讲--n维向量空间的子空间 §4.2. 向量组的线性相 ...
- 《Python深度学习》第四章读书笔记
第四章 机器学习基础 本章重点:处理机器学习问题的通用工作流程: 定义问题与要训练的数据. 收集这些数据,有需要的话用标签来标注数据. 选择衡量问题成功的指标. 你要在验证数据上监控哪些指标? 确定评 ...
- Cocos2d-x 3.2 学习笔记(四)学习打包Android平台APK!
从cocos2dx 3.2项目打包成apk安卓应用文件,搭建安卓环境的步骤有点繁琐,但搭建一次之后,以后就会非常快捷! (涉及到3.1.1版本的,请自动对应3.2版本,3.x版本的环境搭建都是一样的) ...
- dymola学习笔记-第四天——学习软件自带教程DymolaUserManualVolume1,page87-104
笔记第三天中问题"案例2.4.3为什么Value不呈现?教程上是有的,其次,为什么曲线光滑度没有案例高?"得到了解决. check motor,发现我的motor有error,把他 ...
- 一步步学习SPD2010--第十四章节--在Web页面使用控件(3)--验证用户数据输入
通过使用验证控件,你可以验证用户输入到控件的数据.插入的控件可以是HTML标签或者标准ASP.NET控件. 在本次练习中,你创建数据输入表单,并使用RequiredFi ...
- 一步步学习SPD2010--第十四章节--在Web页面使用控件(2)--使用标准ASP.NET服务器控件...
尽管你可以在所有类型Web页面使用HTML控件,但是,真正强大的功能在你使用ASP.NET服务器控件时才会显现出来.ASP.NET服务器控件出现在插入选项卡控件组内的ASP.NET选 ...
- GUI学习之十四——QAbstractSpinBox学习总结
QAbstractSpinBox是一个抽象类,是将所有步长调节器的通用的功能抽象出了一个父类.虽然QAbstractSpinBox是一个抽象类,但是可以直接实例化使用.QAbstractSpinBox ...
- 链乔教育在线|智能合约学习——以太坊智能合约学习笔记(四)
链乔教育在线|智能合约学习--以太坊智能合约学习笔记(四) 本笔记记录的是使用matemask钱包连接以太坊私链,并编写一个基本的提币智能合约部署到以太坊私链上,进行调用. 一.浏览器安装metama ...
- 零基础带你学习计算机网络—(四)
零基础带你学习计算机网络(四) 学习内容 一.物理层的基本概念 二.传输媒体 三.传输方式 四.编码与调制 五.信道的极限容量 六.真题演练 思维导图 (一).物理层的基本概念 物理层考虑的是怎样才能 ...
最新文章
- c语言 dll注入,教大家写一个远程线程的DLL注入,其实还是蛮简单的……………………...
- 在Eclipse中使用JUnit4进行单元测试
- 算法-------矩阵中的最长递增路径(Java版本)
- 20170819 - 今日技能封装 - A
- ubuntu安装python编译器_Ubuntu中安装VIM编辑器
- python删除数据库_用Python删除Cosmos数据库文档
- jfinal启动正常,但是报错:oejw.WebAppContext:Failed startup
- 作为程序员,你还在用B站学习?别做梦了
- [原译]实现IEnumerable接口理解yield关键字
- 【数据分享】历次人口普查数据(一普到七普)
- UPnP 端口映射服务威胁分析
- [论文写作笔记] C9 概括和结论展示科学严谨性
- context deadline exceeded
- 基于JAVA HAPI包以树形结构实现可配置式 HL7消息接收与解析
- 2020 年 1 月 14 日外延支持结束后继续接收安全更新的过程
- Mysql中Drop删除用户的名字_mysql中drop和delete方法删除用户的区别
- Cisco 2960 3750交换机端口流量限速(QOS)
- 5,uniapp功能之—打印机,打印文本和二维码等,(佳博的打印机)
- 联想320s笔记本能装服务器系统,联想ideapad 320S-15笔记本怎么安装win7系统
- 万能的wifi空口Tcp抓包方式
热门文章
- AndroidStudio的Gradle完全教程
- 【线性代数】分块矩阵的运算、特征值
- m2硬盘写入速度测试软件,【英特尔 600P 512G PCIE M2 SSD 固态硬盘测试总结】读写速度|空间_摘要频道_什么值得买...
- Android的布局管理--表格布局
- 解决win10一开机内存(8G)就占用70%多,查看任务管理器并没有占用内存很高的进程的问题
- 五金自营平台进军MRO百亿市值 行业独角兽势头显现
- 2.4GHz WiFi速率测试指导及Omnipeek 空口log分析
- 除留余数法构造哈希函数并用链地址法处理哈希冲突【C++实现】
- 获取光标位置及动态设置光标到指定位置
- 《把时间当作朋友》之读后感