将中文语句矩阵数字化

import pandas as pd
import pickle

先导入pandas库，该库提供了csv文件的相关数据分析功能，是我们数据挖掘必须学会使用的库。
之后导入pickle库，该库是用于将中文语句中使用频数超过10次的字或词收集的字典序列化。

# Load the training CSV, decoding it as GB18030.
train_df = pd.read_csv('../yangqf/Desktop/Train.csv',encoding='gb18030')
# Preview the first five rows.
train_df.head()

读取Train.csv文件，采用gb18030的方式解码，并显示该数据集中的前五行数据

# Names of the eight emotion categories used as classification targets.
jieguoList = ['Love', 'Sorrow', 'Hate', 'Anxiety', 'Surprise', 'Expect', 'Joy', 'Anger']
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
train_df.info()

jieguoList列表用于存储分类类别，info()用于显示数据集的基本数据情况

# Show the "Text" value of the row labelled 0.
train_df["Text"][0]

显示第一个数据的Text列文字

# Build two vocabularies from the "Text" column:
#   Total_dict: token -> sequential id, assigned in order of first appearance,
#               e.g. {"她们": 0, "都": 1, "睡": 2, "蹑手蹑脚": 3, ...}
#   ziShu_dict: token -> number of occurrences across the whole data set
# Both dictionaries end up with exactly the same keys.
length = len(train_df)  # number of samples in the data set
Total_dict = {}
ziShu_dict = {}

# NOTE(review): assumes every "Text" value is a string whose words are already
# separated by single spaces -- confirm against the raw data.
for text in train_df["Text"]:
    for token in text.split(' '):
        if not token:
            # Leading, trailing or repeated spaces yield empty strings, which
            # are not real tokens and must not enter the vocabulary.
            continue
        if token not in Total_dict:
            Total_dict[token] = len(Total_dict)
        # Count every occurrence, including the first one and the last token
        # of a text that does not end with a space.
        ziShu_dict[token] = ziShu_dict.get(token, 0) + 1

# The original script used icount as the running id counter; keep the name
# bound to the same final value for any later cell that reads it.
icount = len(Total_dict)

print(len(Total_dict))  # size of the full vocabulary
print(len(ziShu_dict))  # size of the count dictionary; equals the line above

# Step 1: drop tokens that occur fewer than 10 times and renumber the
# survivors 0..k-1 in order of first appearance. Shrinking the vocabulary
# reduces the feature space and model complexity, which helps against
# over-fitting. After this loop ziShu_dict maps token -> feature column id.
icount = 0
for token in sorted(Total_dict, key=Total_dict.get):
    if ziShu_dict[token] < 10:
        del ziShu_dict[token]
    else:
        ziShu_dict[token] = icount
        icount += 1

# Step 2: turn the multi-label problem into eight binary problems by adding
# one 0/1 column per emotion: 1 when the label name occurs in "Labels".
for label in jieguoList:
    train_df[label] = [1 if label in labels else 0 for labels in train_df["Labels"]]

# Step 3: digitize the text. Each retained token becomes one 0/1 column named
# by its integer id: 1 when the token occurs in the text, otherwise 0.
# NOTE(review): `token in text` is a substring test, so a short token also
# matches inside longer words -- confirm this is the intended feature.
icount = 0
feature_columns = {}
for token, column in sorted(ziShu_dict.items(), key=lambda item: item[1]):
    feature_columns[column] = [1 if token in text else 0 for text in train_df["Text"]]
    icount += 1
    print(len(ziShu_dict) - icount)  # progress: columns still to build
if feature_columns:
    # Attach all feature columns at once instead of inserting them one by one.
    train_df = pd.concat(
        [train_df, pd.DataFrame(feature_columns, index=train_df.index)],
        axis=1,
    )

# Step 4: persist both dictionaries so the same token -> id mapping can be
# reused later when digitizing the test set.
with open('Total_dict.dict', 'wb') as handle:
    pickle.dump(Total_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('ziShu_dict.dict', 'wb') as handle:
    pickle.dump(ziShu_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Print the ziShu_dict dictionary and its size.
print(ziShu_dict)
print(len(ziShu_dict))
# Write the DataFrame to CSV without the index column so it can be reloaded later.
train_df.to_csv('../yangqf/Desktop/zhongyaoshuju.csv', index=False)

输出字数字典，并输出其长度，最后将数字化后的数据集保存在磁盘上，便于之后读取处理。

# Preview the first five rows of the DataFrame.
train_df.head()

对8个类别进行模型建立，找到8个最适合的机器学习模型

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import pickle
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
try:
    # scikit-learn 0.23+ no longer bundles joblib; use the standalone package.
    import joblib
except ImportError:
    from sklearn.externals import joblib

导入数据分析处理的库和可视化的库和机器学习的库

# Load the digitized data set.
train_df = pd.read_csv('../yangqf/Desktop/zhongyaoshuju.csv')

# Binary classifier for the "Love" label: drop the ID, raw text and raw
# label columns plus the seven other emotion columns, so that only the
# feature columns and the Love column remain.
x = train_df.drop(
    ['ID', 'Text', 'Labels',
     'Sorrow', 'Hate', 'Anxiety', 'Surprise', 'Expect', 'Joy', 'Anger'],
    axis=1,
)
x_love_train = x.drop("Love", axis=1)  # feature matrix
y_love_train = x['Love']  # target column

# Split features (X) and labels (Y) with train_test_split:
# 75% for training, 25% held out for testing.
x_love_train, x_love_test, y_love_train, y_love_test = train_test_split(
    x_love_train, y_love_train, test_size=0.25)

# Logistic Regression
love_logreg = LogisticRegression()
love_logreg.fit(x_love_train, y_love_train)
y_love_predict = love_logreg.predict(x_love_test)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_log = round(love_logreg.score(x_love_test, y_love_test) * 100, 2)
print(acc_log)

经过与其他机器学习模型比较之后，发现Logistic回归模型的效果更好，故对Love类的判断采用逻辑回归模型。

# Binary classifier for the "Sorrow" label: keep only the feature columns
# and the Sorrow column.
x = train_df.drop(['ID','Text','Labels','Love','Hate','Anxiety','Surprise','Expect','Joy','Anger'],axis=1)
x_sorrow_train = x.drop("Sorrow",axis=1)  # feature matrix
y_sorrow_train = x['Sorrow']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_sorrow_train, x_sorrow_test, y_sorrow_train, y_sorrow_test = train_test_split(
    x_sorrow_train, y_sorrow_train, test_size=0.25)

# Random Forest
sorrow_random_forest = RandomForestClassifier(n_estimators=100)
sorrow_random_forest.fit(x_sorrow_train, y_sorrow_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(sorrow_random_forest.score(x_sorrow_test, y_sorrow_test) * 100, 2)
print(acc_random_forest)

原理同上，将Sorrow列单独拎出来判断，发现随机森林模型的判断效果更好，故选择随机森林模型。

# Binary classifier for the "Hate" label: keep only the feature columns
# and the Hate column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Anxiety','Surprise','Expect','Joy','Anger'],axis=1)
x_hate_train = x.drop("Hate",axis=1)  # feature matrix
y_hate_train = x['Hate']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_hate_train, x_hate_test, y_hate_train, y_hate_test = train_test_split(
    x_hate_train, y_hate_train, test_size=0.25)

# Random Forest
hate_random_forest = RandomForestClassifier(n_estimators=100)
hate_random_forest.fit(x_hate_train, y_hate_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(hate_random_forest.score(x_hate_test, y_hate_test) * 100, 2)
print(acc_random_forest)

# Binary classifier for the "Anxiety" label: keep only the feature columns
# and the Anxiety column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Surprise','Expect','Joy','Anger'],axis=1)
x_anxiety_train = x.drop("Anxiety",axis=1)  # feature matrix
y_anxiety_train = x['Anxiety']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_anxiety_train, x_anxiety_test, y_anxiety_train, y_anxiety_test = train_test_split(
    x_anxiety_train, y_anxiety_train, test_size=0.25)

# Random Forest
anxiety_random_forest = RandomForestClassifier(n_estimators=100)
anxiety_random_forest.fit(x_anxiety_train, y_anxiety_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(anxiety_random_forest.score(x_anxiety_test, y_anxiety_test) * 100, 2)
print(acc_random_forest)

# Binary classifier for the "Surprise" label: keep only the feature columns
# and the Surprise column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Expect','Joy','Anger'],axis=1)
x_surprise_train = x.drop("Surprise",axis=1)  # feature matrix
y_surprise_train = x['Surprise']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_surprise_train, x_surprise_test, y_surprise_train, y_surprise_test = train_test_split(
    x_surprise_train, y_surprise_train, test_size=0.25)

# Random Forest
surprise_random_forest = RandomForestClassifier(n_estimators=100)
surprise_random_forest.fit(x_surprise_train, y_surprise_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(surprise_random_forest.score(x_surprise_test, y_surprise_test) * 100, 2)
print(acc_random_forest)

# Binary classifier for the "Expect" label: keep only the feature columns
# and the Expect column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Joy','Anger'],axis=1)
x_expect_train = x.drop("Expect",axis=1)  # feature matrix
y_expect_train = x['Expect']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_expect_train, x_expect_test, y_expect_train, y_expect_test = train_test_split(
    x_expect_train, y_expect_train, test_size=0.25)

# Random Forest
expect_random_forest = RandomForestClassifier(n_estimators=100)
expect_random_forest.fit(x_expect_train, y_expect_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(expect_random_forest.score(x_expect_test, y_expect_test) * 100, 2)
print(acc_random_forest)

# Binary classifier for the "Joy" label: keep only the feature columns
# and the Joy column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Expect','Anger'],axis=1)
x_joy_train = x.drop("Joy",axis=1)  # feature matrix
y_joy_train = x['Joy']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_joy_train, x_joy_test, y_joy_train, y_joy_test = train_test_split(
    x_joy_train, y_joy_train, test_size=0.25)

# Random Forest
joy_random_forest = RandomForestClassifier(n_estimators=100)
joy_random_forest.fit(x_joy_train, y_joy_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(joy_random_forest.score(x_joy_test, y_joy_test) * 100, 2)
print(acc_random_forest)

# Binary classifier for the "Anger" label: keep only the feature columns
# and the Anger column.
x = train_df.drop(['ID','Text','Labels','Love','Sorrow','Hate','Anxiety','Surprise','Expect','Joy'],axis=1)
x_anger_train = x.drop("Anger",axis=1)  # feature matrix
y_anger_train = x['Anger']  # target column
# Split with train_test_split: 75% for training, 25% held out for testing.
x_anger_train, x_anger_test, y_anger_train, y_anger_test = train_test_split(
    x_anger_train, y_anger_train, test_size=0.25)

# Random Forest
anger_random_forest = RandomForestClassifier(n_estimators=100)
anger_random_forest.fit(x_anger_train, y_anger_train)
# Score on the held-out split, as a percentage rounded to two decimals.
acc_random_forest = round(anger_random_forest.score(x_anger_test, y_anger_test) * 100, 2)
print(acc_random_forest)

重复上述过程，便可以得到8个机器学习模型，便于之后对测试集类别进行判断。

# Serialize each fitted classifier to its own file on disk.
# (joblib.dump also accepts an open file object instead of a path.)
_fitted_models = (
    (love_logreg, '../yangqf/Desktop/xueximoxing/love_Logistic.model'),
    (sorrow_random_forest, '../yangqf/Desktop/xueximoxing/sorrow_random_forest.model'),
    (hate_random_forest, '../yangqf/Desktop/xueximoxing/hate_random_forest.model'),
    (anxiety_random_forest, '../yangqf/Desktop/xueximoxing/anxiety_random_forest.model'),
    (surprise_random_forest, '../yangqf/Desktop/xueximoxing/surprise_random_forest.model'),
    (expect_random_forest, '../yangqf/Desktop/xueximoxing/expect_random_forest.model'),
    (joy_random_forest, '../yangqf/Desktop/xueximoxing/joy_random_forest.model'),
    (anger_random_forest, '../yangqf/Desktop/xueximoxing/anger_random_forest.model'),
)
for _estimator, _destination in _fitted_models:
    joblib.dump(_estimator, _destination)

将所有学习后的模型进行序列化之后，保存在磁盘上。

这里只是对模型进行了验证打分，之后要记得用全部数据重新训练，建立最终的模型。

记一次数据挖掘：中文语句情绪多标签分类问题相关推荐

C语言输入中文语句并按倒叙将它输出
2019独角兽企业重金招聘Python工程师标准>>> 在Dev C++里按ctrl+空格键就可以输入中文汉字. #include<stdio.h> #include&l ...
第14章用BERT实现中文语句分类
BERT以Transformer的Encoder为架构,已MLM为模型,在很多领域取得历史性的的突破.这里以Transformers上基于中文语料库上训练的预训练模型bert-base-chinese ...
【开源】Time-NLP 中文语句中的时间语义识别
Time-NLP 中文语句中的时间语义识别 author:shinyke github地址:https://github.com/shinyke/Time-NLP/ 本工具是由复旦NLP中的时间分析功 ...
STM32项目（六）—— 中文电子捡货标签
STM32项目(六)-- 中文电子捡货标签宗旨:技术的分享是有限的,分享的精神是无限的. 传统物流行业仓储拣货采用纸单作业,拣货完成后再进行验货.出货,容易造成拣货错误.拣货速度与效率低.新员工培训 ...
数据挖掘：模型选择——监督学习（分类）
数据挖掘:模型选择--监督学习(分类) 机器学习算法可分为监督学习和非监督学习.本文主要讨论非监督学习中的分类任务. 一.简单介绍简单的说,监督学习就是有标签的数据,有需要预测的变量. 分类任务就是 ...
基于deap脑电数据集的脑电情绪识别二分类算法（附代码）
想尝试一下脑电情绪识别的各个二分类算法. 代码主要分为三部分:快速傅里叶变换处理(fft).数据预处理.以及各个模型处理. 采用的模型包括:决策树.SVM.KNN三个模型(模型采用的比较简单,可以直接 ...
JAVA调用有道API接口对数据库中的中文语句进行翻译
今天遇到一个小需求,就是将数据库中的某个中文字段翻译成英文,总共有六百多条,直接只用数据库update语句和手动翻译效率很慢.我想这如果可以调用有道翻译API接口将翻译的语句结合原中文字段拼接成upd ...
记一次数据库查询语句的优化
周六的时候,运维告诉我一个sql慢查询一直报警,他把sql语句给到我,让我优化下. select 很多字段 from ad_order where state = 1 and deleteflag=0 ...
记一次mysql中文字符乱码的问题排查
今天开发反应两样的程序往一个库里面插入数据正常,往另外一个库里面插入数据有乱码.第一反应就是两个数据库关于字符集的配置不一样. 在两个库分别查看参数: show variables like &quo ...

记一次数据挖掘：中文语句情绪多标签分类问题

记一次数据挖掘：中文语句情绪多标签分类问题

将中文语句矩阵数字化

对8个类别进行模型建立，找到8个最适合的机器学习模型

记一次数据挖掘：中文语句情绪多标签分类问题相关推荐

最新文章

热门文章