install.packages("tm")
library(tm)
install.packages("ggplot2")
library(ggplot2)
# Root directory of the e-mail corpus and the sub-directory holding the
# "easy ham" messages.
# NOTE(review): "G:R/..." is a drive-relative path on Windows; confirm that
# "G:/R/..." was not intended.
data.path <- "G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path <- paste0(data.path, "easy_ham/")
# Data-processing functions
# Parse one e-mail file into its fields.
#
# path: location of the message file.
# Returns a character vector c(date, sender, subject, body, path), built
# from the get.* helpers applied to the lines read by msg.full().
parse.email <- function(path) {
  lines <- msg.full(path)
  c(get.date(lines),
    get.from(lines),
    get.subject(lines),
    get.msg(lines),
    path)
}
# Function 1: read the full contents of a message file
# Read every line of a message file.
#
# path: location of the message file; it is read as latin1 text.
# Returns a character vector with one element per line.
msg.full <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection even when readLines() fails, so it is never leaked.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
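# Function 2: extract the sender's e-mail address.
# Note: the split pattern passed to strsplit() is a regular expression; it is worth understanding in detail.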
# Extract the sender address from the lines of a message.
#
# msg.vec: character vector of message lines.
# Returns the first token containing "@" on the first line that contains
# "From: ", NA when that line has no such token, and "" when no line
# contains "From: ".
get.from <- function(msg.vec) {
  from.lines <- msg.vec[grepl("From: ", msg.vec)]
  pieces <- strsplit(from.lines, '[":<> ]')
  if (length(pieces) < 1) {
    return("")
  }
  # Only the first matching line is used; drop the empty fragments left
  # between adjacent delimiters.
  tokens <- pieces[[1]]
  tokens <- tokens[which(tokens != "" & tokens != " ")]
  tokens[grepl("@", tokens)][1]
}
# Function 3: extract the message body
# Extract the body of a message: everything after the first empty line.
#
# msg.vec: character vector of message lines.
# Returns the body lines joined with "\n", or "" when there is no empty
# line or nothing follows it.
get.msg <- function(msg.vec) {
  first.blank <- which(msg.vec == "")[1]
  # No separator at all, or the separator is the last line: the body is
  # empty. The second case used to fail inside seq() with a
  # "wrong sign in 'by' argument" error.
  if (is.na(first.blank) || first.blank >= length(msg.vec)) {
    return("")
  }
  body <- msg.vec[(first.blank + 1):length(msg.vec)]
  paste(body, collapse = "\n")
}
# Function 4: extract the message subject
# Extract the subject from the lines of a message.
#
# msg.vec: character vector of message lines.
# Returns the text following "Subject: " on the first line that contains
# it, or "" when no line contains "Subject: ".
get.subject <- function(msg.vec) {
  subject.lines <- msg.vec[grepl("Subject: ", msg.vec)]
  if (length(subject.lines) == 0) {
    return("")
  }
  # Splitting on the marker leaves the subject text as the second piece.
  pieces <- strsplit(subject.lines, "Subject: ")
  pieces[[1]][2]
}
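# Function 5: extract the date and time at which the message was received.
# The date line is split on a plus sign, a minus sign, or a colon followed by a space,
# and leading and trailing whitespace is then stripped.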
# Extract the date string from the lines of a message.
#
# msg.vec: character vector of message lines.
# Returns the text between "Date: " and the next "+", "-" or ": " on the
# first line starting with "Date: ", trimmed of surrounding whitespace and
# cut to at most 25 characters.
get.date <- function(msg.vec) {
  header.idx <- which(grepl("^Date: ", msg.vec))
  header <- msg.vec[header.idx[1]]
  # Piece 1 is the "Date" label, piece 2 the timestamp itself.
  stamp <- strsplit(header, "\\+|\\-|: ")[[1]][2]
  stamp <- gsub("^\\s+|\\s+$", "", stamp)
  strtrim(stamp, 25)
}
# Final assembly of the parsed data
# Parse every message in the easy-ham directory into one data-frame row.
easyham.docs <- dir(easyham.path)
# Skip the corpus's "cmds" file, which is presumably not an e-mail
# (the name was previously misspelled "cdms", so nothing was excluded).
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
easyham.parse <- lapply(easyham.docs,
                        function(p) parse.email(paste0(easyham.path, p)))
ehparse.matrix <- do.call(rbind, easyham.parse)
# The argument must be spelled stringsAsFactors; the earlier misspelling
# was silently turned into an extra sixth column instead of being applied.
allparse.df <- data.frame(ehparse.matrix, stringsAsFactors = FALSE)
names(allparse.df) <- c("Date", "From.Email", "Subject", "Message", "Path")
head(allparse.df)

# Date handling (reusable)
# Parse date strings that may be written in either of two formats.
#
# dates:    character vector of date strings.
# pattern1: strptime() format tried first.
# pattern2: strptime() format used for entries pattern1 could not parse.
# Returns the strptime() result (POSIXlt); entries matching neither format
# stay NA.
date.converter <- function(dates, pattern1, pattern2) {
  pattern1.convert <- strptime(dates, pattern1)
  pattern2.convert <- strptime(dates, pattern2)
  # Fill the gaps left by pattern1 with the pattern2 result for the SAME
  # positions. Indexing the right-hand side by is.na(pattern2.convert), as
  # before, copied NAs over NAs and never recovered a date.
  unparsed <- is.na(pattern1.convert)
  pattern1.convert[unparsed] <- pattern2.convert[unparsed]
  pattern1.convert
}
# This step easily goes wrong and leaves NA after the date conversion; the locale setting below is required to avoid that.
# Parse dates under the "C" time locale.
Sys.setlocale("LC_TIME", "C")

# Two formats are tried in turn: with and without a leading weekday
# abbreviation.
pattern1 <- "%a, %d %b %Y %H:%M:%S"
pattern2 <- "%d %b %Y %H:%M:%S"
allparse.df$Date <- date.converter(allparse.df$Date, pattern1, pattern2)

# Process the remaining fields
# Lower-case subjects and sender addresses so later comparisons ignore
# capitalisation.
allparse.df$Subject <- tolower(allparse.df$Subject)
allparse.df$From.Email <- tolower(allparse.df$From.Email)

# Sort the messages by date and keep the first half (rounded) as the
# training set.
priority.df <- allparse.df[order(allparse.df$Date), ]
priority.train <- priority.df[1:round(nrow(priority.df) / 2), ]

# Set up the weighting scheme

library(plyr)

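# The main problem encountered here was:
#
#   Error: 'names' attribute [11] must be the same length as the vector [1]
#
# The cause and the fix are explained in the English notes below: after the time conversion the Date field occupies 9 columns, so the lengths no longer add up.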

# Count how many training e-mails each sender wrote.
#
# priority.train$Date comes from strptime(), i.e. it is a list of date-time
# components (sec, min, hour, mday, mon, year, wday, yday, isdst, ...), and
# passing it through ddply() fails with
#   'names' attribute [11] must be the same length as the vector [1]
# Only From.Email and Subject are needed for the count, so summarise just
# those two columns; priority.train itself is left untouched.
from.weight <- ddply(priority.train[c("From.Email", "Subject")],
                     .(From.Email), summarise, Freq = length(Subject))
# rank.message() reads from.weight$Weight, which was never created. Use a
# log-scaled frequency so very prolific senders do not dominate; the +1
# keeps the weight positive for a sender with a single e-mail.
# NOTE(review): natural log of Freq + 1 is an assumed formula - confirm.
from.weight$Weight <- log(from.weight$Freq + 1)
head(from.weight)
names(from.weight)

# Bar chart of e-mail volume per sender, used to inspect how the counts
# are distributed.

# Order senders by number of e-mails and keep those with more than 6.
from.weight <- from.weight[order(from.weight$Freq), ]
from.ex <- subset(from.weight, Freq > 6)
# One horizontal bar per sender. seq_len() keeps the positions empty
# (rather than c(1, 0)) when no sender passes the filter. The constant
# fill/colour strings inside aes() are mapped to real colours by the manual
# scales below, whose legends are switched off.
from.scales <- ggplot(from.ex) +
  geom_rect(aes(xmin = seq_len(nrow(from.ex)) - 0.5,
                xmax = seq_len(nrow(from.ex)) + 0.5,
                ymin = 0,
                ymax = Freq,
                fill = "lightgrey",
                color = "darkblue")) +
  scale_x_continuous(breaks = seq_len(nrow(from.ex)),
                     labels = from.ex$From.Email) +
  coord_flip() +
  scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  ylab("Number of Emails Received (truncated at 6)") +
  xlab("Sender Address") +
  theme_bw() +
  theme(axis.text.y = element_text(size = 5, hjust = 1))
# Make sure the output directory exists before saving into it.
dir.create("images", showWarnings = FALSE, recursive = TRUE)
ggsave(plot = from.scales,
       filename = file.path("images", "0011_from_scales.pdf"),
       height = 4.8,
       width = 7)
# Compute the activity level of each thread
# Compute activity statistics for every distinct thread.
#
# threads.matrix: matrix whose second column holds the thread identifiers
#                 (they are matched against email.df$Subject by
#                 thread.counts()).
# email.df:       data frame of e-mails, passed through to thread.counts().
# Returns a matrix with one row per distinct thread: the thread identifier
# (column "threads") followed by the values returned by thread.counts().
get.threads <- function(threads.matrix, email.df) {
  thread.ids <- unique(threads.matrix[, 2])
  per.thread <- lapply(thread.ids,
                       function(t) thread.counts(t, email.df))
  stats <- do.call(rbind, per.thread)
  # Name the first column explicitly so it stays "threads".
  cbind(threads = thread.ids, stats)
}

# Measure how active a single thread is.
#
# thread:   thread subject, without the "re:" prefix.
# email.df: data frame with a Subject column and a Date column of
#           date-times.
# Returns c(freq, time.span, weight): the number of messages whose subject
# is `thread` or "re: <thread>", the seconds between the first and last of
# them, and 10 + log10(freq / time.span). Returns c(NA, NA, NA) when fewer
# than two messages belong to the thread.
thread.counts <- function(thread, email.df) {
  in.thread <- which(email.df$Subject == thread |
                       email.df$Subject == paste("re:", thread))
  thread.times <- email.df$Date[in.thread]
  freq <- length(thread.times)
  # Decide before touching min()/max(): on an empty selection they warn
  # (or fail), and the result would be discarded anyway.
  if (freq < 2) {
    return(c(NA, NA, NA))
  }
  time.span <- as.numeric(difftime(max(thread.times), min(thread.times),
                                   units = "secs"))
  # NOTE(review): messages sharing one timestamp give time.span == 0 and
  # therefore an infinite weight; confirm whether that needs handling.
  trans.weight <- freq / time.span
  # The +10 shifts the log so typical weights are positive.
  log.trans.weight <- 10 + log(trans.weight, base = 10)
  c(freq, time.span, log.trans.weight)
}

# Build the per-thread weight table from the training e-mails.
# NOTE(review): threads.matrix is not defined in the visible part of this
# script - confirm it is created before this point.
thread.weights <- get.threads(threads.matrix, priority.train)
# The argument must be spelled stringsAsFactors; the earlier
# "StringAsFactors" was silently added as an extra column instead.
thread.weights <- data.frame(thread.weights, stringsAsFactors = FALSE)
names(thread.weights) <- c("Thread", "Freq", "Response", "Weight")
# get.threads() binds the thread names to the statistics, so every column
# arrives as text; convert all three numeric columns. Weight in particular
# must be numeric because it is averaged further down.
thread.weights$Freq <- as.numeric(thread.weights$Freq)
thread.weights$Response <- as.numeric(thread.weights$Response)
thread.weights$Weight <- as.numeric(thread.weights$Weight)
# Drop threads with fewer than two messages (thread.counts() marks them NA).
thread.weights <- subset(thread.weights, !is.na(Freq))

head(thread.weights)

# Count how often each term occurs across a vector of texts.
#
# term.vec: character vector of texts.
# control:  list of options handed to TermDocumentMatrix().
# Returns the row sums of the term-document matrix, i.e. one total per
# term, named by term.
term.counts <- function(term.vec, control) {
  corpus <- Corpus(VectorSource(term.vec))
  tdm <- TermDocumentMatrix(corpus, control = control)
  rowSums(as.matrix(tdm))
}

# Terms occurring in the subjects of active threads (stop words removed).
thread.terms <- names(term.counts(thread.weights$Thread,
                                  control = list(stopwords = TRUE)))

# Weight of a term = mean weight of the threads whose subject contains it.
term.weights <- sapply(thread.terms, function(t) {
  in.subject <- grepl(t, thread.weights$Thread, fixed = TRUE)
  mean(thread.weights$Weight[in.subject])
})
term.weights <- data.frame(Term = names(term.weights),
                           Weight = term.weights,
                           stringsAsFactors = FALSE,
                           row.names = 1:length(term.weights))

# Term counts over all training message bodies, with stop words,
# punctuation and numbers removed; the weight is the log10 of the count.
msg.terms <- term.counts(priority.train$Message,
                         control = list(stopwords = TRUE,
                                        removePunctuation = TRUE,
                                        removeNumbers = TRUE))
msg.weights <- data.frame(Term = names(msg.terms),
                          Weight = log(msg.terms, base = 10),
                          stringsAsFactors = FALSE,
                          row.names = 1:length(msg.terms))

# Look up the mean weight of a set of terms (or threads) in a weight table.
#
# search.term: when term = TRUE, a vector whose NAMES are the terms to look
#              up (e.g. the result of term.counts()); when term = FALSE, a
#              character vector of thread subjects.
# weight.df:   data frame with a Weight column and a key column: Term for
#              term tables, Thread for thread tables.
# term:        TRUE to match names(search.term) against weight.df$Term,
#              FALSE to match search.term against weight.df$Thread.
# Returns the mean weight of the matched rows, or the neutral weight 1 when
# search.term is empty or nothing matches.
get.weights <- function(search.term, weight.df, term = TRUE) {
  if (length(search.term) == 0) {
    return(1)
  }
  if (term) {
    wanted <- names(search.term)
    keys <- weight.df$Term
  } else {
    wanted <- search.term
    # Thread tables are keyed by Thread; fall back to Term for tables that
    # only have that column, as the lookup did before.
    keys <- weight.df$Thread
    if (is.null(keys)) {
      keys <- weight.df$Term
    }
  }
  term.match <- match(wanted, keys)
  # term.match holds ROW numbers of weight.df; index with those rows, not
  # with the positions of the matching search terms.
  match.weights <- weight.df$Weight[term.match[!is.na(term.match)]]
  # No match: return the neutral weight instead of mean(numeric(0)) = NaN.
  if (length(match.weights) < 1) {
    return(1)
  }
  mean(match.weights)
}
# Compute the priority rank of one message.
#
# path: location of the message file.
# Returns c(date, sender, subject, rank), where rank is the product of five
# weights: sender volume, sender activity in threads, thread activity,
# subject-term weight and body-term weight. Each weight falls back to the
# neutral value 1 when nothing is known.
rank.message <- function(path) {
  msg <- parse.email(path)

  # Weight of `address` in a sender table, or 1 for an unknown sender.
  sender.weight <- function(weight.df, address) {
    hit <- which(weight.df$From.Email == address)
    if (length(hit) == 0) {
      return(1)
    }
    weight.df$Weight[hit[1]]
  }

  # First weight: how much mail the sender wrote. The training addresses
  # were lower-cased, so compare in lower case.
  from <- sender.weight(from.weight, tolower(msg[2]))

  # Second weight: the sender's activity in threads.
  # NOTE(review): senders.df is not defined in the visible source; if its
  # addresses are lower-cased as well, this lookup should use tolower() too.
  thread.from <- sender.weight(senders.df, msg[2])

  # Third weight: activity of the thread itself. A subject that starts with
  # "re: " splits into an empty first piece. isTRUE() treats an empty or
  # missing subject as "not a thread" instead of failing on if (NA).
  subj <- strsplit(tolower(msg[3]), "re: ")
  is.thread <- isTRUE(subj[[1]][1] == "")
  if (is.thread) {
    activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
  } else {
    activity <- 1
  }

  # Fourth weight: terms of the subject, weighted by thread activity.
  subject.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
  thread.terms.weights <- get.weights(subject.terms, term.weights)

  # Fifth weight: terms of the body, weighted by their overall frequency
  # (msg.weights here is the global term table).
  body.terms <- term.counts(msg[4],
                            control = list(stopwords = TRUE,
                                           removePunctuation = TRUE,
                                           removeNumbers = TRUE))
  body.weight <- get.weights(body.terms, msg.weights)

  # The rank is the interaction (product) of all weights.
  rank <- prod(from,
               thread.from,
               activity,
               thread.terms.weights,
               body.weight)

  c(msg[1], msg[2], msg[3], rank)
}
# Split the date-ordered message paths again: the first half (rounded) is
# the training set, the remainder the test set.
train.paths <- priority.df$Path[1:round(nrow(priority.df) / 2)]
test.paths <- priority.df$Path[(round(nrow(priority.df) / 2) + 1):nrow(priority.df)]

head(test.paths)

# Rank every training message (warnings raised while ranking are
# suppressed) and collect the results in a data frame labelled "TRAINING".
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- cbind(train.paths,
                            do.call(rbind, train.ranks),
                            "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
# The matrix is character, so turn the rank back into a number.
train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)

(R)机器学习--学习笔记--第四章节学习笔记相关推荐

  1. 线性代数学习笔记——第四章学习指南——n维向量空间

    一.学习内容及要求 1. 内容: §4.1. n维向量空间的概念 线性代数学习笔记--第四十讲--n维向量空间的概念 线性代数学习笔记--第四十一讲--n维向量空间的子空间 §4.2. 向量组的线性相 ...

  2. 《Python深度学习》第四章读书笔记

    第四章 机器学习基础 本章重点:处理机器学习问题的通用工作流程: 定义问题与要训练的数据. 收集这些数据,有需要的话用标签来标注数据. 选择衡量问题成功的指标. 你要在验证数据上监控哪些指标? 确定评 ...

  3. Cocos2d-x 3.2 学习笔记(四)学习打包Android平台APK!

    从cocos2dx 3.2项目打包成apk安卓应用文件,搭建安卓环境的步骤有点繁琐,但搭建一次之后,以后就会非常快捷! (涉及到3.1.1版本的,请自动对应3.2版本,3.x版本的环境搭建都是一样的) ...

  4. dymola学习笔记-第四天——学习软件自带教程DymolaUserManualVolume1,page87-104

    笔记第三天中问题"案例2.4.3为什么Value不呈现?教程上是有的,其次,为什么曲线光滑度没有案例高?"得到了解决. check motor,发现我的motor有error,把他 ...

  5. 一步步学习SPD2010--第十四章节--在Web页面使用控件(3)--验证用户数据输入

            通过使用验证控件,你可以验证用户输入到控件的数据.插入的控件可以是HTML标签或者标准ASP.NET控件.         在本次练习中,你创建数据输入表单,并使用RequiredFi ...

  6. 一步步学习SPD2010--第十四章节--在Web页面使用控件(2)--使用标准ASP.NET服务器控件...

             尽管你可以在所有类型Web页面使用HTML控件,但是,真正强大的功能在你使用ASP.NET服务器控件时才会显现出来.ASP.NET服务器控件出现在插入选项卡控件组内的ASP.NET选 ...

  7. GUI学习之十四——QAbstractSpinBox学习总结

    QAbstractSpinBox是一个抽象类,是将所有步长调节器的通用的功能抽象出了一个父类.虽然QAbstractSpinBox是一个抽象类,但是可以直接实例化使用.QAbstractSpinBox ...

  8. 链乔教育在线|智能合约学习——以太坊智能合约学习笔记(四)

    链乔教育在线|智能合约学习--以太坊智能合约学习笔记(四) 本笔记记录的是使用matemask钱包连接以太坊私链,并编写一个基本的提币智能合约部署到以太坊私链上,进行调用. 一.浏览器安装metama ...

  9. 零基础带你学习计算机网络—(四)

    零基础带你学习计算机网络(四) 学习内容 一.物理层的基本概念 二.传输媒体 三.传输方式 四.编码与调制 五.信道的极限容量 六.真题演练 思维导图 (一).物理层的基本概念 物理层考虑的是怎样才能 ...

最新文章

  1. c语言 dll注入,教大家写一个远程线程的DLL注入,其实还是蛮简单的……………………...
  2. 在Eclipse中使用JUnit4进行单元测试
  3. 算法-------矩阵中的最长递增路径(Java版本)
  4. 20170819 - 今日技能封装 - A
  5. ubuntu安装python编译器_Ubuntu中安装VIM编辑器
  6. python删除数据库_用Python删除Cosmos数据库文档
  7. jfinal启动正常,但是报错:oejw.WebAppContext:Failed startup
  8. 作为程序员,你还在用B站学习?别做梦了
  9. [原译]实现IEnumerable接口理解yield关键字
  10. 【数据分享】历次人口普查数据(一普到七普)
  11. UPnP 端口映射服务威胁分析
  12. [论文写作笔记] C9 概括和结论展示科学严谨性
  13. context deadline exceeded
  14. 基于JAVA HAPI包以树形结构实现可配置式 HL7消息接收与解析
  15. 2020 年 1 月 14 日外延支持结束后继续接收安全更新的过程
  16. Mysql中Drop删除用户的名字_mysql中drop和delete方法删除用户的区别
  17. Cisco 2960 3750交换机端口流量限速(QOS)
  18. 5,uniapp功能之—打印机,打印文本和二维码等,(佳博的打印机)
  19. 联想320s笔记本能装服务器系统,联想ideapad 320S-15笔记本怎么安装win7系统
  20. 万能的wifi空口Tcp抓包方式

热门文章

  1. AndroidStudio的Gradle完全教程
  2. 【线性代数】分块矩阵的运算、特征值
  3. m2硬盘写入速度测试软件,【英特尔 600P 512G PCIE M2 SSD 固态硬盘测试总结】读写速度|空间_摘要频道_什么值得买...
  4. Android的布局管理--表格布局
  5. 解决win10一开机内存(8G)就占用70%多,查看任务管理器并没有占用内存很高的进程的问题
  6. 五金自营平台进军MRO百亿市值 行业独角兽势头显现
  7. 2.4GHz WiFi速率测试指导及Omnipeek 空口log分析
  8. 除留余数法构造哈希函数并用链地址法处理哈希冲突【C++实现】
  9. 获取光标位置及动态设置光标到指定位置
  10. 《把时间当作朋友》之读后感