一、读取数据

使用 pandas 读取文件：

data_set = pd.read_csv("data/mbti_1.csv")  # load the CSV file into a DataFrame

二、显示文件信息

def showTableInfo(data_set):
    """Print a profiling report for the data set.

    The report has six sections: the ``DataFrame.info()`` summary, the
    row/column counts, the number of null cells per column, the distinct
    values of the ``type`` column, the user/post totals, and the first
    five rows.

    Args:
        data_set: pandas DataFrame with a ``type`` column. Each row is
            handed to ``extract`` as a plain tuple, which reads position 0
            as the label and position 1 as a ``"|||"``-separated string.
    """
    print("DATA PROFILING")
    print()
    print("a. Desribing Data Set:")
    data_set.info()  # info() prints the table summary itself
    print()

    # Row and column counts
    print("b. We have {r} Rows and {c} Columns".format(
        r=data_set.shape[0], c=data_set.shape[1]))
    print()

    # Null cells per column. isnull().sum() counts the True cells of the
    # mask; masking the frame with isnull() and calling count() would
    # always print 0, because count() skips the NaN cells it selected.
    print("c. Null Values are :")
    print(data_set.isnull().sum())
    print()

    # Number of distinct personality types and their names
    print("d. There are {t} Unique MBTI types in this study".format(
        t=data_set['type'].nunique()))
    print(np.unique(np.array(data_set['type'])))
    print()

    # Number of users (rows) and of individual posts
    print("e. No. of Total users & Posts =>")
    posts = []
    # Plain tuples keep extract()'s positional [0]/[1] access valid; integer
    # indexing on a label-indexed row Series is deprecated in recent pandas.
    for row in data_set.itertuples(index=False, name=None):
        extract(row, posts)
    print("Number of users", len(data_set))
    print("Number of posts", len(posts))
    print()

    # First five rows
    print("f. Data Sneak Peek: First 5 rows")
    print(data_set.head(5))

三、统计各类型数目

def countTypeNumber(data_set):
    """Plot and print the number of rows for each value of ``type``.

    Draws a bar chart of the counts (x: type, y: count), shows it, then
    prints one ``<type> :  <count>`` line per type in ``value_counts()``
    order.

    Args:
        data_set: pandas DataFrame with a ``type`` column.
    """
    # Rows per personality type; barplot needs 1-D x and y data
    p_post = data_set['type'].value_counts()

    plt.figure(figsize=(15, 4))  # figure size
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally.
    sns.barplot(x=p_post.index, y=p_post.values)
    plt.xlabel('MBTI Personality', size=12)  # x-axis label
    plt.ylabel('Posts available', size=12)  # y-axis label
    plt.title('Posts with regards to each personality type')  # chart title
    plt.show()

    print()
    print("The number of every type is :")
    # One line per type with its count
    for mbti_type, count in p_post.items():
        print(mbti_type, ": ", count)

四、数据集处理

# Split the data into training and test sets at a 7:3 ratio
X_train, X_test, y_train, y_test = train_test_split(
    data_set['posts'], data_set['type'], test_size=0.3, random_state=123)
tfidf = TfidfVectorizer(stop_words='english')   # TF-IDF encoder that drops English stop words
X_train = tfidf.fit_transform(X_train)          # fit on the training set and encode it
X_test = tfidf.transform(X_test)                # encode the test set with the fitted encoder

五、模型建立与预测

# Train a logistic-regression classifier and predict labels for the test set.
model1 = LogisticRegression()                   # logistic regression model
model1.fit(X_train, y_train)                    # train the logistic regression model
y_pred1 = model1.predict(X_test)                # predict with the trained model

六、指标评价

def showMetrics(y_true, y_pred, model_name):
    """Plot the confusion matrix and print accuracy, precision and recall.

    Precision and recall are macro-averaged over the classes.

    Args:
        y_true: iterable of true labels.
        y_pred: iterable of predicted labels for the same samples.
        model_name: text appended to the plot title.
    """
    # Derive the label set from the data instead of hard-coding all sixteen
    # MBTI types: if a type is absent from both y_true and y_pred, the matrix
    # has fewer rows than a fixed label list and the plot fails. Sorting
    # reproduces the alphabetical order of the previous fixed list.
    classes = sorted(set(y_true) | set(y_pred))

    conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)  # confusion matrix
    acc = accuracy_score(y_true, y_pred)                            # accuracy
    prec = precision_score(y_true, y_pred, average='macro')         # precision
    recall = recall_score(y_true, y_pred, average='macro')          # recall

    # Visualize the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                  display_labels=classes)
    disp.plot(
        include_values=True,             # show the count in every cell
        cmap="viridis",                  # sklearn's default colormap
        ax=None,                         # sklearn's default
        xticks_rotation="horizontal",    # sklearn's default
        values_format="d",               # integer cell format
    )
    plt.title('Confusion Matrix of ' + model_name)  # plot title
    plt.show()

    print("Accuracy :", acc)
    print("Precision :", prec)
    print("Recall :", recall)

七、完整代码

import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def extract(posts, new_posts):
    """Append one ``(label, post)`` pair per individual post of a row.

    Args:
        posts: indexable row; position 0 is the label and position 1 is a
            string of posts separated by ``"|||"``.
        new_posts: list that receives the ``(label, post)`` tuples.
    """
    label = posts[0]
    new_posts.extend((label, post) for post in posts[1].split("|||"))


def showTableInfo(data_set):
    """Print a profiling report for the data set.

    The report has six sections: the ``DataFrame.info()`` summary, the
    row/column counts, the number of null cells per column, the distinct
    values of the ``type`` column, the user/post totals, and the first
    five rows.

    Args:
        data_set: pandas DataFrame with a ``type`` column; the first column
            is read as the label and the second as the
            ``"|||"``-separated posts string.
    """
    print("DATA PROFILING")
    print()
    print("a. Desribing Data Set:")
    data_set.info()  # info() prints the table summary itself
    print()

    # Row and column counts
    print("b. We have {r} Rows and {c} Columns".format(
        r=data_set.shape[0], c=data_set.shape[1]))
    print()

    # Null cells per column. isnull().sum() counts the True cells of the
    # mask; masking the frame with isnull() and calling count() would
    # always print 0, because count() skips the NaN cells it selected.
    print("c. Null Values are :")
    print(data_set.isnull().sum())
    print()

    # Number of distinct personality types and their names
    print("d. There are {t} Unique MBTI types in this study".format(
        t=data_set['type'].nunique()))
    print(np.unique(np.array(data_set['type'])))
    print()

    # Number of users (rows) and of individual posts
    print("e. No. of Total users & Posts =>")
    posts = []
    # Plain tuples keep extract()'s positional [0]/[1] access valid; integer
    # indexing on a label-indexed row Series is deprecated in recent pandas.
    for row in data_set.itertuples(index=False, name=None):
        extract(row, posts)
    print("Number of users", len(data_set))
    print("Number of posts", len(posts))
    print()

    # First five rows
    print("f. Data Sneak Peek: First 5 rows")
    print(data_set.head(5))


def countTypeNumber(data_set):
    """Plot and print the number of rows for each value of ``type``.

    Draws a bar chart of the counts (x: type, y: count), shows it, then
    prints one ``<type> :  <count>`` line per type in ``value_counts()``
    order.

    Args:
        data_set: pandas DataFrame with a ``type`` column.
    """
    # Rows per personality type; barplot needs 1-D x and y data
    p_post = data_set['type'].value_counts()

    plt.figure(figsize=(15, 4))  # figure size
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
    # them positionally.
    sns.barplot(x=p_post.index, y=p_post.values)
    plt.xlabel('MBTI Personality', size=12)  # x-axis label
    plt.ylabel('Posts available', size=12)  # y-axis label
    plt.title('Posts with regards to each personality type')  # chart title
    plt.show()

    print()
    print("The number of every type is :")
    # One line per type with its count
    for mbti_type, count in p_post.items():
        print(mbti_type, ": ", count)


def showMetrics(y_true, y_pred, model_name):
    """Plot the confusion matrix and print accuracy, precision and recall.

    Precision and recall are macro-averaged over the classes.

    Args:
        y_true: iterable of true labels.
        y_pred: iterable of predicted labels for the same samples.
        model_name: text appended to the plot title.
    """
    # Derive the label set from the data instead of hard-coding all sixteen
    # MBTI types: if a type is absent from both y_true and y_pred, the matrix
    # has fewer rows than a fixed label list and the plot fails. Sorting
    # reproduces the alphabetical order of the previous fixed list.
    classes = sorted(set(y_true) | set(y_pred))

    conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)  # confusion matrix
    acc = accuracy_score(y_true, y_pred)                            # accuracy
    prec = precision_score(y_true, y_pred, average='macro')         # precision
    recall = recall_score(y_true, y_pred, average='macro')          # recall

    # Visualize the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                  display_labels=classes)
    disp.plot(
        include_values=True,             # show the count in every cell
        cmap="viridis",                  # sklearn's default colormap
        ax=None,                         # sklearn's default
        xticks_rotation="horizontal",    # sklearn's default
        values_format="d",               # integer cell format
    )
    plt.title('Confusion Matrix of ' + model_name)  # plot title
    plt.show()

    print("Accuracy :", acc)
    print("Precision :", prec)
    print("Recall :", recall)


if __name__ == '__main__':
    warnings.filterwarnings("ignore")          # silence library warnings
    data_set = pd.read_csv("data/mbti_1.csv")  # load the CSV file into a DataFrame
    showTableInfo(data_set)                    # print the profiling report
    countTypeNumber(data_set)                  # plot and print the per-type counts

    # Split the data into training and test sets at a 7:3 ratio
    X_train, X_test, y_train, y_test = train_test_split(
        data_set['posts'], data_set['type'], test_size=0.3, random_state=123)
    tfidf = TfidfVectorizer(stop_words='english')   # TF-IDF encoder that drops English stop words
    X_train = tfidf.fit_transform(X_train)          # fit on the training set and encode it
    X_test = tfidf.transform(X_test)                # encode the test set with the fitted encoder

    model1 = LogisticRegression()                   # logistic regression model
    model1.fit(X_train, y_train)                    # train it
    y_pred1 = model1.predict(X_test)                # predict with the trained model
    print()
    print("The metrics of LogisticRegression:")
    showMetrics(y_test, y_pred1, model_name="LogisticRegression")  # compute and print the metrics

    model2 = SGDClassifier()                        # linear classifier trained with SGD
    model2.fit(X_train, y_train)                    # train it
    y_pred2 = model2.predict(X_test)                # predict with the trained model
    print()
    print("The metrics of SGDClassifier:")
    showMetrics(y_test, y_pred2, model_name="SGDClassifier")  # compute and print the metrics

八、参考链接

sklearn计算准确率、精确率、召回率、F1 score
[Python+sklearn] 计算混淆矩阵 confusion_matrix()函数
分类模型confusion matrix混淆矩阵可视化
MBTI数据读取与统计
如何从文本中提取特征信息

sklearn对MBTI分类并统计指标相关推荐

python分类算法的应用_Python基于sklearn库的分类算法简单应用示例
Python基于sklearn库的分类算法简单应用示例来源:中文源码网浏览: 次日期:2018年9月2日 [下载文档: Python基于sklearn库的分类算法简单应用示例.tx ...
sklearn实现KNN分类算法
sklearn实现KNN分类算法 Python Sklearn 机器学习库提供了 neighbors 模块,该模块下提供了 KNN 算法的常用方法,如下所示: 类方法说明 KNeighborsCl ...
(NO.1)利用sklearn进行鸢尾花分类
文章目录利用sklearn进行鸢尾花分类 preheat 联库版本查询 practice summary 利用sklearn进行鸢尾花分类 preheat 联库 sklearn是基于Numpy和S ...
15分钟带你入门sklearn与机器学习——分类算法篇
作者 | 何从庆本文转载自AI算法之心(ID:AIHeartForYou) [导读]众所周知,Scikit-learn(以前称为scikits.learn)是一个用于Python编程语言的免费软件机 ...
sklearn 统计多分类和单分类结果的混淆矩阵API 写法 confusion_matrix 左边为真实值上边为预测值
分析代码源自 from sklearn.metrics import confusion_matrix 如何写混淆矩阵手动分析如下一将y_true y_pred写成列向量的形式 y_true y ...
sklearn中的分类决策树
决策树决策树简介决策树是一种使用if-then-else的决策规则的监督学习方法. 其三要素为,枝节点,叶节点与分支条件,同时为了减少过拟合还有剪枝方法为了便于记忆,可以称其为一方法三要素决策 ...
python使用欧氏距离knn_python运用sklearn实现KNN分类算法
KNN(K-Nearest-Neighbours Classification)分类算法,供大家参考,具体内容如下最简单的分类算法,易于理解和实现实现步骤:通过选取与该点距离最近的k个样本,在这 ...
sklearn保存svm分类模型_【菜菜的sklearn】07 支持向量机(上)
小伙伴们大家好~o(￣▽￣)ブ,我是菜菜,这里是我的sklearn课堂第7期,今天分享的内容是支持向量机(上),下周还有下篇哦~ 我的开发环境是Jupyter lab,所用的库和版本大家参考:Pyth ...
sklearn保存svm分类模型_机器学习100天-Day1601线性支持向量机分类
说明:本文依据<Sklearn 与 TensorFlow 机器学习实用指南>完成,所有版权和解释权均归作者和翻译成员所有,我只是搬运和做注解. 第五章是对支持向量机SVM的系统介绍,阐述支 ...

sklearn对MBTI分类并统计指标

一、读取数据

二、显示文件信息

三、统计各类型数目

四、数据集处理

五、模型建立与预测

六、指标评价

七、完整代码

八、参考链接

sklearn对MBTI分类并统计指标相关推荐

最新文章

热门文章