朴素贝叶斯、费舍尔分类方法

朴素贝叶斯、费舍尔分类法

案例：有一批图书分类样本数据，用朴素贝叶斯、费舍尔分类法对样本数据进行学习，并据此对图书进行分类。

数据清洗

这里我们不讨论数据的获取和清洗，而是聚焦在使用朴素贝叶斯、费舍尔分类法上。假设我们已经从数据中确定了几列有价值的数据，并做好了数据清洗。清洗后生成样本数据如下：

思路逻辑图：

Python代码

1、定义特征、提取特征 getfeatures.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-10 22:25
#@Author: gaoll
#特征定义方法1
'''
（1）将Title中出现的单词word当作一个特征，字符长度大于2且小于20
（2）将Publisher当作一个特征
（3）将Summary中出现的单词word当作特征，字符长度2-20之间
（4）将Summary中出现的两个词的词组当作特征
（5）统计Summary中出现的大写单词，如果出现频次超过30%则作为一个特征，记作UPPERCASE。这是一个虚拟特征。
'''
import re
def getfeature1(item):
    """Extract the feature set and the category label from one sample line.

    The line is expected to look like ``Title|Publisher|Summary|Category``.
    The category column is optional so that unlabeled lines can be passed
    through the same extractor.

    Features produced (every feature maps to the value 1):

    * ``'Title:<word>'`` for each lower-cased title word whose length is
      greater than 2 and less than 20.
    * ``'Publisher:<publisher>'`` for the publisher column as a whole.
    * ``'<word>'`` for each lower-cased summary word whose length is
      greater than 2 and less than 20.
    * ``'<word> <next word>'`` for each pair of adjacent summary words.
    * ``'UPPERCASE'`` (a virtual feature) when more than 30% of the summary
      words are written entirely in upper case.

    Args:
        item: One ``|``-separated sample line; a trailing line ending is
            ignored.

    Returns:
        A ``(features, category)`` tuple where ``features`` is a dict and
        ``category`` is the stripped 4th column, or ``None`` when the column
        is missing or empty.

    Raises:
        IndexError: If the line has fewer than three ``|``-separated columns.
    """
    # Drop the line ending first so it cannot leak into the last column.
    fields = item.rstrip('\r\n').split('|')
    title, publisher, summary = fields[0], fields[1], fields[2]
    category = (fields[3].strip() or None) if len(fields) > 3 else None

    features = {}

    # (1) Title words, prefixed so they stay distinct from summary words.
    for word in title.split(' '):
        if 2 < len(word) < 20:
            features['Title:' + word.lower()] = 1

    # (2) The publisher as a single feature.
    features['Publisher:' + publisher] = 1

    # (3) Summary words and (4) pairs of adjacent summary words.
    summary_words = [w for w in summary.split(' ') if 2 < len(w) < 20]
    uppercase_count = 0
    for i, word in enumerate(summary_words):
        if word.isupper():
            uppercase_count += 1
        lowered = word.lower()
        features[lowered] = 1
        if i + 1 < len(summary_words):
            features[lowered + ' ' + summary_words[i + 1].lower()] = 1

    # (5) Virtual feature flagging summaries with many all-caps words.
    # An empty summary has no ratio to compute, so it never sets the flag.
    if summary_words and uppercase_count / len(summary_words) > 0.3:
        features['UPPERCASE'] = 1

    return features, category

2、创建分类器 docclass.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-10 23:18
#@Author: gaoll
import math
import sqlite3  # SQLite stores the feature/category counts
#这里定义分类器
#创建一个类
class classifier():
    """Base classifier that keeps feature and category counts in SQLite.

    Two tables are used:

    * ``fc(feature, category, count)`` - how often a feature was seen in
      samples of a category.
    * ``cc(category, count)`` - how many samples of a category were seen.

    ``setdb`` must be called before any other method that touches the
    database.
    """

    def __init__(self, getfeatures, filename=None):
        """Store the feature extractor.

        Args:
            getfeatures: Callable taking one item and returning a
                ``(features, category)`` tuple; iterating ``features`` must
                yield the feature names.
            filename: Accepted for compatibility; not used.
        """
        self.getfeatures = getfeatures
        self.con = None  # opened by setdb()

    def setdb(self, dbfile):
        """Open ``dbfile`` and create the ``fc`` and ``cc`` tables if missing."""
        self.con = sqlite3.connect(dbfile)
        self.con.execute('create table if not exists fc(feature,category,count)')
        self.con.execute('create table if not exists cc(category,count)')

    def incf(self, f, cat):
        """Increase the count of feature ``f`` in category ``cat`` by one."""
        count = self.fcount(f, cat)
        # Bound parameters keep features containing quotes from breaking
        # (or injecting into) the statement.
        if count == 0:
            self.con.execute('insert into fc values(?,?,1)', (f, cat))
        else:
            self.con.execute(
                'update fc set count=? where feature=? and category=?',
                (int(count) + 1, f, cat))

    def fcount(self, f, cat):
        """Return how often feature ``f`` was seen in ``cat`` (0.0 if never)."""
        res = self.con.execute(
            'select count from fc where feature=? and category=?',
            (f, cat)).fetchone()
        if res is None:
            return 0.0
        return float(res[0])

    def incc(self, cat):
        """Increase the sample count of category ``cat`` by one."""
        count = self.catcount(cat)
        if count == 0:
            self.con.execute('insert into cc values(?,1)', (cat,))
        else:
            self.con.execute(
                'update cc set count=? where category=?',
                (int(count) + 1, cat))

    def catcount(self, cat):
        """Return the number of samples seen for ``cat`` (0.0 if none)."""
        res = self.con.execute(
            'select count from cc where category=?', (cat,)).fetchone()
        if res is None:
            return 0.0
        return float(res[0])

    def categories(self):
        """Return the list of all categories seen so far."""
        rows = self.con.execute('select category from cc').fetchall()
        return [row[0] for row in rows]

    def totalcount(self):
        """Return the total number of training samples (0 when empty)."""
        res = self.con.execute('select sum(count) from cc').fetchone()
        # sum() over an empty table yields a row holding NULL, not no row.
        if res is None or res[0] is None:
            return 0
        return res[0]

    def train(self, item):
        """Extract the features of ``item`` and record them under its category."""
        features, cat = self.getfeatures(item)
        self.incc(cat)
        for f in features:
            self.incf(f, cat)
        self.con.commit()

    def basic_prob(self, f, cat):
        """Return P(f|cat) = count of ``f`` in ``cat`` / sample count of ``cat``.

        Returns 0 when the category has no samples. Used by both the naive
        Bayes and the Fisher classifier.
        """
        cat_total = self.catcount(cat)
        if cat_total == 0:
            return 0
        return self.fcount(f, cat) / cat_total

    def weightedprob(self, f, cat, prob_func, weight=1.0, assumedprob=0.5):
        """Blend ``prob_func(f, cat)`` with an assumed prior probability.

        The computed probability is weighted by the number of times the
        feature was seen across all categories, the prior by ``weight``:

            (weight * assumedprob + totals * prob) / (weight + totals)

        Args:
            f: Feature name.
            cat: Category name.
            prob_func: Callable ``(f, cat) -> probability``.
            weight: Weight given to the assumed probability.
            assumedprob: Probability assumed for a feature never seen.

        Returns:
            The weighted probability.
        """
        basicprob = prob_func(f, cat)
        # Occurrences of the feature over every category, not only `cat`.
        totals = sum(self.fcount(f, c) for c in self.categories())
        return (weight * assumedprob + totals * basicprob) / (weight + totals)


# Create the naive Bayes classifier
class naivebayes(classifier):
    """Naive Bayes classifier, a subclass of ``classifier``.

    The score of an item for a category follows Bayes' rule:

    1. P(cat|item) = P(item|cat) * P(cat) / P(item)
    2. P(item|cat) is the product of P(f|cat) over the features f of the item.
    3. P(f|cat) = fc(f, cat) / cc(cat) (``basic_prob``), smoothed through
       ``weightedprob``.
    4. P(cat) = cc(cat) / total number of samples.
    5. P(item) is the same for every category and is therefore dropped.

    So ``naivebayes_prob`` returns a value proportional to P(cat|item).
    """

    def __init__(self, getfeatures):
        """Store the feature extractor and start with no thresholds."""
        super().__init__(getfeatures)
        self.thresholds = {}  # per-category threshold

    def setthreshold(self, cat, t):
        """Set the threshold of category ``cat`` to ``t``."""
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        """Return the threshold of ``cat`` (1.0 when none was set)."""
        return self.thresholds.get(cat, 1.0)

    def naivebayes_prob(self, item, cat):
        """Return P(item|cat) * P(cat) for ``item`` and ``cat``.

        Returns 0.0 when nothing has been trained yet.
        """
        total = self.totalcount()
        if not total:
            return 0.0
        cat_prob = self.catcount(cat) / total
        # getfeatures returns (features, category); only features are needed.
        features, _ = self.getfeatures(item)
        item_cat_prob = 1.0
        for f in features:
            item_cat_prob *= self.weightedprob(f, cat, self.basic_prob)
        return item_cat_prob * cat_prob

    def classify(self, item, default=None):
        """Return the most probable category of ``item``.

        The winner must be at least ``getthreshold(winner)`` times more
        probable than every other category; otherwise, or when no category
        has a probability above zero, ``default`` is returned.
        """
        probs = {cat: self.naivebayes_prob(item, cat)
                 for cat in self.categories()}
        best_cat = None
        maxprob = 0.0
        for cat, prob in probs.items():
            if prob > maxprob:
                best_cat, maxprob = cat, prob
        if best_cat is None:
            return default
        # Compare against the winner only after all scores are known, so the
        # outcome does not depend on the order the categories come back in.
        threshold = self.getthreshold(best_cat)
        for cat, prob in probs.items():
            if cat != best_cat and prob * threshold > maxprob:
                return default
        return best_cat


# Create the Fisher classifier
class fisherclassfier(classifier):
    """Fisher-method classifier, a subclass of ``classifier``.

    For an item and a category ``cat``:

    1. P1(f|cat) = fc(f, cat) / cc(cat) (``basic_prob``).
    2. P(f|cat) = P1(f|cat) / sum of P1(f|c) over all categories c
       (``cprob``), smoothed through ``weightedprob``.
    3. P(item|cat) is the product of P(f|cat) over the features f of the item.
    4. fscore = -2 * ln(P(item|cat)).
    5. The result is ``invchi2(fscore, df)`` with ``df`` equal to twice the
       number of features.
    """

    def __init__(self, getfeatures):
        """Store the feature extractor and start with no minimums."""
        super().__init__(getfeatures)
        self.minimuns = {}  # per-category lower bound on the probability

    def setminimum(self, cat, min):
        """Set the lower probability bound of category ``cat``."""
        self.minimuns[cat] = min

    def getminmun(self, cat):
        """Return the lower bound of ``cat`` (0 when none was set)."""
        return self.minimuns.get(cat, 0)

    def cprob(self, f, cat):
        """Return the share of ``cat`` in the per-category frequencies of ``f``.

        Returns 0 when the feature was never seen in ``cat``.
        """
        clf = self.basic_prob(f, cat)
        if clf == 0:
            return 0
        # clf is one of the summands, so freqsum is non-zero here.
        freqsum = sum(self.basic_prob(f, c) for c in self.categories())
        return clf / freqsum

    def invchi2(self, chi, df):
        """Return the inverse chi-square value for ``chi``, capped at 1.0.

        Sums ``exp(-m) * m**i / i!`` for ``i`` in ``0 .. df//2 - 1`` with
        ``m = chi / 2``.
        """
        m = chi / 2.0
        total = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            total += term
        return min(total, 1.0)

    def fisher_prob(self, item, cat):
        """Return the Fisher probability that ``item`` belongs to ``cat``."""
        # getfeatures returns (features, category); only features are needed.
        features, _ = self.getfeatures(item)
        # Summing logarithms instead of multiplying the probabilities keeps
        # long items from underflowing to 0, which math.log cannot take.
        log_prob = 0.0
        for f in features:
            p = self.weightedprob(f, cat, self.cprob)
            if p <= 0:
                return 0.0
            log_prob += math.log(p)
        fscore = -2 * log_prob
        return self.invchi2(fscore, len(features) * 2)

    def classify(self, item, default=None):
        """Return the category with the highest Fisher probability.

        A category only qualifies when its probability exceeds its minimum
        (``getminmun``); ``default`` is returned when none qualifies.
        """
        best_cat = default
        maxprob = 0.0
        for c in self.categories():
            prob = self.fisher_prob(item, c)
            if prob > self.getminmun(c) and prob > maxprob:
                maxprob = prob
                best_cat = c
        return best_cat

3、训练和预测 train_predict.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-11 01:18
#@Author: gaoll


def sampletrain(samplefile, cf, database):
    """Train classifier ``cf`` on every non-blank line of ``samplefile``.

    Args:
        samplefile: Path of the training file, one sample per line.
        cf: Classifier providing ``setdb(database)`` and ``train(line)``.
        database: Database name handed to ``cf.setdb``.
    """
    cf.setdb(database)
    with open(samplefile, 'r') as f:
        for line in f:
            # Drop the line ending so it does not end up in the last column.
            line = line.rstrip('\r\n')
            if line:
                cf.train(line)


def predict(testfile, cf):
    """Classify every non-blank line of ``testfile`` and print the result.

    Each output line is the input line followed by ``|`` and the predicted
    category. ``cf`` must already be connected to its database.

    Args:
        testfile: Path of the file to classify, one sample per line.
        cf: Classifier providing ``classify(line)``.
    """
    with open(testfile, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                continue
            cat = cf.classify(line)
            # classify may return None; str() keeps join from raising.
            print('|'.join([line, str(cat)]))

4、运行 run.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-11 02:53
#@Author: gaoll
import os,sys,datetime
import getfeatures
import docclass
import train_predict
from getfeatures import getfeature1
from getfeatures import getfeature2
from train_predict import sampletrain
from train_predict import predict

if __name__ == '__main__':
    # Train on the sample file, then print a prediction for each test line.
    train_file = 'train_sample_file.txt'
    test_file = 'test_sample_file.txt'
    # Fisher classifier combined with the first feature definition.
    cf = docclass.fisherclassfier(getfeatures.getfeature1)
    database = 'gaoll'  # database name
    train_predict.sampletrain(train_file, cf, database)
    train_predict.predict(test_file, cf)

Done

朴素贝叶斯、费舍尔分类方法相关推荐

朴素贝叶斯实现分类_关于朴素贝叶斯分类及其实现的简短教程
朴素贝叶斯实现分类 Naive Bayes classification is one of the most simple and popular algorithms in data mining ...
r包调用legend函数_R语言实现基于朴素贝叶斯构造分类模型数据可视化
本文内容原创,未经作者许可禁止转载! 目录一.前言二.摘要三.关键词四.算法原理五.经典应用六.R建模 1.载入相关包(内含彩蛋): 1.1 library包载入 1.2 pacman包载 ...
八、朴素贝叶斯中文分类实战
1.朴素贝叶斯中文分类实战文本分类的流程如下图所示: 朴素贝叶斯中文分类的目录结构中文分类的目录机构包括停用词文件.训练集文件和和测试集文件,具体内容如下图所示: 2 数据准备与处理 2.1 数据 ...
朴素贝叶斯算法-分类算法
朴素贝叶斯算法-分类算法 1 概率基础概率定义为一件事情发生的可能性联合概率:包含多个条件,且所有条件同时成立的概率,记作P(A,B) 条件概率:事件A在另一个事件B已经发生条件下的发送概率,记作 ...
构造matlab决策树分类器,Matlab建立逻辑回归，决策树，SVM，KNN和朴素贝叶斯模型分类绘制ROC曲线...
尽管对于较高的阈值,SVM可以产生更好的ROC值,但逻辑回归通常更擅长区分不良雷达收益与良好雷达.朴素贝叶斯的ROC曲线通常低于其他两个ROC曲线,这表明样本内性能比其他两个分类器方法差. 比较所 ...
matlab绘制贝叶斯曲线,Matlab建立SVM，KNN和朴素贝叶斯模型分类绘制ROC曲线
原文链接:http://tecdat.cn/?p=15508 绘制ROC曲线通过Logistic回归进行分类加载样本数据.load fisheriris通过使用与versicolor和virgini ...
判别模型、生成模型与朴素贝叶斯方法
1.判别模型与生成模型上篇报告中提到的回归模型是判别模型,也就是根据特征值来求结果的概率.形式化表示为,在参数确定的情况下,求解条件概率.通俗的解释为在给定特征后预测结果出现的概率. 比如说要确定一 ...
判别模型、生成模型和朴素贝叶斯模型
1判别模型与生成模型上篇报告中提到的回归模型是判别模型,也就是根据特征值来求结果的概率.形式化表示为,在参数确定的情况下,求解条件概率.通俗的解释为在给定特征后预测结果出现的概率. 比如说要确定一只 ...
机器学习实战（四）——基于概率论的分类方法：朴素贝叶斯
朴素贝叶斯法 4.1 基于贝叶斯决策理论的分类方法 4.1.1 贝叶斯决策理论 4.1.2 条件概率 4.1.3 全概率公式 4.1.4 贝叶斯推断 4.1.5 朴素贝叶斯 4.2 使用朴素贝叶斯进行 ...
机器学习（8）朴素贝叶斯算法（20条新闻分类）
目录一.基础理论二.实战:20条新闻分类 1.读取数据 2.训练集划分 3.特征工程(文本特征提取) 4.朴素贝叶斯算法训练 5.模型评估方法一:预测值与真实值比对方法二:计算准确率总代码 ...