（R）机器学习--学习笔记--第四章节学习笔记

install.packages("tm")
library(tm)
install.packages("ggplot2")
library(ggplot2)
data.path<-"G:R/ML_for_Hackers-master/03-Classification/data/"
easyham.path<-paste(data.path,"easy_ham/",sep="")
//数据处理的方法
parse.email<-function(path){
full.msg<-msg.full(path)
date<-get.date(full.msg)
from<-get.from(full.msg)
subj<-get.subject(full.msg)
msg<-get.msg(full.msg)
return(c(date,from,subj,msg,path))
}
//第一个函数，用来获取所有的数据
msg.full<-function(path){
con<-file(path,open="rt",encoding="latin1")
msg<-readLines(con)
close(con)
return(msg)
}
//第二个函数，用来获取邮件发件人
//这里传入的是正则，需要详细了解
get.from<-function(msg.vec){
from<-msg.vec[grepl("From: ",msg.vec)]
if(length(strsplit(from,'[":<> ]'))>=1){
    from<-strsplit(from,'[":<> ]')[[1]]
    from<-from[which(from!=""&from!=" ")]
    return(from[grepl("@",from)][1])
}else{
    return("")
}
}
//第三个函数，获取邮件内容
get.msg<-function(msg.vec){
if(!(is.na(which(msg.vec=="")[1]+1))){
    msg<-msg.vec[seq(which(msg.vec=="")[1]+1,length(msg.vec),1)]
    return(paste(msg,collapse="\n"))
}else{
    return("")
}
}
//第四个函数，获取邮件主题
get.subject<-function(msg.vec){
subj<-msg.vec[grepl("Subject: ",msg.vec)]
if(length(subj)>0){
    return(strsplit(subj,"Subject: ")[[1]][2])
}else{
    return("")
}
}
//第五个函数，获取邮件的接收时间和日期
//用冒号，加号或者减号分割日期
//替换掉首尾的空格
get.date<-function(msg.vec){
date.grep<-grepl("^Date: ",msg.vec)
date.grepl<-which(date.grep==TRUE)
date<-msg.vec[date.grepl[1]]
date<-strsplit(date,"\\+|\\-|: ")[[1]][2]
date<-gsub("^\\s+|\\s+$","",date)
return(strtrim(date,25))
}
//数据最终整理
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[which(easyham.docs!="cdms")]
easyham.parse<-lapply(easyham.docs,function(p) parse.email(paste(easyham.path,
                                                                 p,sep="")))
ehparse.matrix<-do.call(rbind,easyham.parse)
allparse.df<-data.frame(ehparse.matrix,stringAsFactors=FALSE)
names(allparse.df)<-c("Date","From.Email","Subject","Message","Path")
head(allparse.df)

//日期处理（可复用）
date.converter<-function(dates,pattern1,pattern2){
pattern1.convert<-strptime(dates,pattern1)
pattern2.convert<-strptime(dates,pattern2)
pattern1.convert[is.na(pattern1.convert)]<-
pattern2.convert[is.na(pattern2.convert)]
return(pattern1.convert)
}
//这个地方很容易引起问题，导致时间转换后结果是NA，需要增加下面这个系统设置
Sys.setlocale("LC_TIME", "C");

pattern1<-"%a, %d %b %Y %H:%M:%S"
pattern2<-"%d %b %Y %H:%M:%S"
allparse.df$Date<-date.converter(allparse.df$Date,pattern1,pattern2)

//处理其他数据
allparse.df$Subject<-tolower(allparse.df$Subject)
allparse.df$From.Email<-tolower(allparse.df$From.Email)
priority.df<-allparse.df[with(allparse.df,order(Date)),]
priority.train<-priority.df[1:(round(nrow(priority.df)/2)),]

//设置权重策略

library(plyr)

//在这里遇到的主要问题是：

Error: 'names' attribute [11] must be the same length as the vector [1]

原因和解决方案如下面英文解说，主要是时间转换后就占用了9列，所以加起来是不一样的：

#The problem is with the Date field,

#which is at this point a list that contains 9 fields
#(sec, min, hour, mday, mon, year, wday, yday, isdst).
#To solve the problem I've simply converted the dates into character vectors,
#used ddply then converted the dates back to Date
tmp <- priority.train$Date
priority.train$Date <- as.character(priority.train$Date)
from.weight<-ddply(priority.train, .(From.Email), summarise, Freq=length(Subject))
priority.train$Date <- tmp
rm(tmp)
head(from.weight)
names(from.weight)

#此处增加绘制条状图的功能,详细方法

#绘制条状图，查看数据情况
from.weight <- from.weight[with(from.weight, order(Freq)), ]
from.ex <- subset(from.weight, Freq > 6)
from.scales <- ggplot(from.ex) +
geom_rect(aes(xmin = 1:nrow(from.ex) - 0.5,
                xmax = 1:nrow(from.ex) + 0.5,
                ymin = 0,
                ymax = Freq,
                fill = "lightgrey",
                color = "darkblue")) +
scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.Email) +
coord_flip() +
scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
ylab("Number of Emails Received (truncated at 6)") +
xlab("Sender Address") +
theme_bw() +
theme(axis.text.y = element_text(size = 5, hjust = 1))
ggsave(plot = from.scales,
       filename = file.path("images", "0011_from_scales.pdf"),
       height = 4.8,
       width = 7)
//计算每一个线程的活跃度
get.threads<-function(threads.matrix,email.df){
threads<-unique(threads.matrix[,2])
thread.counts<-lapply(threads,function(t) thread.counts(t,email.df))
thread.matrix<-do.call(rbind,thread.counts)
return(cbind(threads,thread.matrix))
}

thread.counts<-function(thread,email.df){
thread.times<-email.df$Date[which(email.df$Subject==thread|
                                      email.df$Subject==paste("re:",thread))]
freq<-length(thread.times)
min.time<-min(thread.times)
max.time<-max(thread.times)
time.span<-as.numeric(difftime(max.time,min.time,units="secs"))
if(freq<2){
    return(c(NA,NA,NA))
}else{
    trans.weight<-freq/time.span
    log.trans.weight<-10+log(trans.weight,base=10)
    return(c(freq,time.span,log.trans.weight))
}
}

thread.weights<-get.threads(threads.matrix,priority.train)
thread.weights<-data.frame(thread.weights,StringAsFactors=FALSE)
names(thread.weights)<-c("Thread","Freq","Response","Weight")
thread.weights$Freq<-as.numeric(thread.weights$Freq)
thread.weights$Response<-as.numeric(thread.weights$Response)
#thread.weights$Weight<-as.numeric(thread.weights$Weight)
thread.weights<-subset(thread.weights,is.na(thread.weights$Freq)==FALSE)

head(thread.weights)

term.counts <- function(term.vec, control)
{
vec.corpus <- Corpus(VectorSource(term.vec))
vec.tdm <- TermDocumentMatrix(vec.corpus, control = control)
return(rowSums(as.matrix(vec.tdm)))
}

thread.terms <- term.counts(thread.weights$Thread,
control = list(stopwords = TRUE))
thread.terms <- names(thread.terms)

term.weights <- sapply(thread.terms,
                       function(t) mean(thread.weights$Weight[grepl(t, thread.weights$Thread, fixed = TRUE)]))
term.weights <- data.frame(list(Term = names(term.weights),
                                Weight = term.weights),
                           stringsAsFactors = FALSE,
                           row.names = 1:length(term.weights))

msg.terms <- term.counts(priority.train$Message,
                         control = list(stopwords = TRUE,
                                        removePunctuation = TRUE,
                                        removeNumbers = TRUE))
msg.weights <- data.frame(list(Term = names(msg.terms),
                               Weight = log(msg.terms, base = 10)),
                          stringsAsFactors = FALSE,
                          row.names = 1:length(msg.terms))

get.weights<-function(search.term,weight.df,term=TRUE){
if(length(search.term)>0){
    if(term){
      term.match<-match(names(search.term),weight.df$Term)
    }else{
      term.match<-match(search.term,weight.df$Term)
    }
    match.weights<-weight.df$Weight[which(!is.na(term.match))]
    if(length(match.weights)>1){
      return(1)
    }else{
      return(mean(match.weights))
    }
}else{
    return(1)
}
}
#排序的计算
rank.message <- function(path)
{
msg <- parse.email(path)

from <- ifelse(length(which(from.weight$From.Email == msg[2])) > 0,
                 from.weight$Weight[which(from.weight$From.Email == msg[2])],
                 1)

# Second is based on senders in threads, and threads themselves
thread.from <- ifelse(length(which(senders.df$From.Email == msg[2])) > 0,
                        senders.df$Weight[which(senders.df$From.Email == msg[2])],
                        1)

subj <- strsplit(tolower(msg[3]), "re: ")
is.thread <- ifelse(subj[[1]][1] == "", TRUE, FALSE)
if(is.thread)
{
    activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
}
else
{
    activity <- 1
}

# Next, weight based on terms

# Weight based on terms in threads
thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
thread.terms.weights <- get.weights(thread.terms, term.weights)

# Weight based terms in all messages
msg.terms <- term.counts(msg[4],
                           control = list(stopwords = TRUE,
                                          removePunctuation = TRUE,
                                          removeNumbers = TRUE))
msg.weights <- get.weights(msg.terms, msg.weights)

# Calculate rank by interacting all weights
rank <- prod(from,
               thread.from,
               activity,
               thread.terms.weights,
               msg.weights)

return(c(msg[1], msg[2], msg[3], rank))
}
# Find splits again
train.paths <- priority.df$Path[1:(round(nrow(priority.df) / 2))]
test.paths <- priority.df$Path[((round(nrow(priority.df) / 2)) + 1):nrow(priority.df)]

head(test.paths)
# Now, create a full-featured training set.
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- do.call(rbind, train.ranks)
train.ranks.matrix <- cbind(train.paths, train.ranks.matrix, "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)

（R）机器学习--学习笔记--第四章节学习笔记相关推荐

线性代数学习笔记——第四章学习指南——n维向量空间
一.学习内容及要求 1. 内容: §4.1. n维向量空间的概念线性代数学习笔记--第四十讲--n维向量空间的概念线性代数学习笔记--第四十一讲--n维向量空间的子空间 §4.2. 向量组的线性相 ...
《Python深度学习》第四章读书笔记
第四章机器学习基础本章重点:处理机器学习问题的通用工作流程: 定义问题与要训练的数据. 收集这些数据,有需要的话用标签来标注数据. 选择衡量问题成功的指标. 你要在验证数据上监控哪些指标? 确定评 ...
Cocos2d-x 3.2 学习笔记（四）学习打包Android平台APK!
从cocos2dx 3.2项目打包成apk安卓应用文件,搭建安卓环境的步骤有点繁琐,但搭建一次之后,以后就会非常快捷! (涉及到3.1.1版本的,请自动对应3.2版本,3.x版本的环境搭建都是一样的) ...
dymola学习笔记-第四天——学习软件自带教程DymolaUserManualVolume1，page87-104
笔记第三天中问题"案例2.4.3为什么Value不呈现?教程上是有的,其次,为什么曲线光滑度没有案例高?"得到了解决. check motor,发现我的motor有error,把他 ...
一步步学习SPD2010--第十四章节--在Web页面使用控件（3）--验证用户数据输入
通过使用验证控件,你可以验证用户输入到控件的数据.插入的控件可以是HTML标签或者标准ASP.NET控件. 在本次练习中,你创建数据输入表单,并使用RequiredFi ...
一步步学习SPD2010--第十四章节--在Web页面使用控件（2）--使用标准ASP.NET服务器控件...
尽管你可以在所有类型Web页面使用HTML控件,但是,真正强大的功能在你使用ASP.NET服务器控件时才会显现出来.ASP.NET服务器控件出现在插入选项卡控件组内的ASP.NET选 ...
GUI学习之十四——QAbstractSpinBox学习总结
QAbstractSpinBox是一个抽象类,是将所有步长调节器的通用的功能抽象出了一个父类.虽然QAbstractSpinBox是一个抽象类,但是可以直接实例化使用.QAbstractSpinBox ...
链乔教育在线|智能合约学习——以太坊智能合约学习笔记（四）
链乔教育在线|智能合约学习--以太坊智能合约学习笔记(四) 本笔记记录的是使用matemask钱包连接以太坊私链,并编写一个基本的提币智能合约部署到以太坊私链上,进行调用. 一.浏览器安装metama ...
零基础带你学习计算机网络—（四）
零基础带你学习计算机网络(四) 学习内容一.物理层的基本概念二.传输媒体三.传输方式四.编码与调制五.信道的极限容量六.真题演练思维导图 (一).物理层的基本概念物理层考虑的是怎样才能 ...

（R）机器学习--学习笔记--第四章节学习笔记

（R）机器学习--学习笔记--第四章节学习笔记相关推荐

最新文章

热门文章