需求

目的：通过统计公司年报中的“会计专业词汇”以及“表示转折的连词”等内容，分析年报的可读性（会计词、转折词越多，可读性越差），进而分析年报可读性对各方面的影响。
输入：年报（08年到18年）、词典（会计专业词典*4、连词词典）
输出：文章总字数、各词典在文章中出现的次数

词典预处理

本次分析采用的词典主要有5个：连词词典，两个灵格斯的会计词典，会计术语，会计科目

由于灵格斯的两个词典是专用格式，需要先借助工具转换成 txt 格式，便于后续处理。直接转换出的词典每一行同时包含中文和英文，因此用代码简单地将英文过滤掉（文中未贴出）。

结巴加载词典，然后进行分词

import jieba, csv, os, fnmatch
import codecs# 获取该目录下所有文件的名字
# fnmatch patterns for file names that are left out of the analysis: names
# containing 英文版 (English edition), names starting with "90", names
# containing "ST", 修订 (revised), 更新 (updated), 广告 (advertisement),
# 取消 (cancelled) or 印刷 (print).
_EXCLUDED_REPORT_PATTERNS = (
    '*英文版*',
    '90*',
    '*ST*',
    '*修订*',
    '*更新*',
    '*广告*',
    '*取消*',
    '*印刷*',
)


def openfile(path):
    """Return the paths of the report ``.txt`` files found directly in ``path``.

    Args:
        path: Directory to scan (sub-directories are not descended into).

    Returns:
        A list of ``os.path.join(path, filename)`` strings, in
        ``os.listdir`` order, for every ``*.txt`` file whose name matches
        none of ``_EXCLUDED_REPORT_PATTERNS``.
    """
    # ``debug`` is only assigned when the file runs as a script; fall back to
    # "off" so the function also works when the module is imported.
    verbose = globals().get("debug", 0)

    txt_path_list = []
    for filename in os.listdir(path):
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue
        if any(fnmatch.fnmatch(filename, pattern)
               for pattern in _EXCLUDED_REPORT_PATTERNS):
            continue
        if verbose:
            print(filename)
        txt_path_list.append(os.path.join(path, filename))
    return txt_path_list


def get_txt(txt_path, encoding='ANSI'):
    """Read one report and return its text and its character count.

    Newlines and ASCII spaces are removed before counting, so the count
    reflects the remaining characters only.

    Args:
        txt_path: Path of the text file to read.
        encoding: Codec used to decode the file.  The default ``'ANSI'`` is
            kept for backward compatibility; that codec name only resolves
            on Windows, so pass an explicit codec (for example ``'gbk'`` or
            ``'utf-8'``) on other platforms.

    Returns:
        A ``(txt, txt_count)`` tuple: the cleaned text and ``len(txt)``.
    """
    verbose = globals().get("debug", 0)

    # The context manager closes the handle even if decoding fails.
    with open(txt_path, "r", encoding=encoding) as report:
        txt = report.read()
    txt = txt.replace("\n", "").replace(" ", "")
    if verbose:
        print(txt_path)
    txt_count = len(txt)
    if verbose:
        print("总字数：", txt_count)
    return txt, txt_count


# Tokenization
def do_jieba(txt):
    """Tokenize ``txt`` with jieba and count how often each token occurs.

    The custom dictionaries are registered with jieba on the first call only;
    the original reloaded them for every report, which is redundant work.

    Args:
        txt: The text to tokenize.

    Returns:
        A list of ``(token, count)`` tuples sorted by ``count`` in descending
        order; tokens with equal counts keep their first-seen order.
    """
    if not getattr(do_jieba, "_userdicts_loaded", False):
        # os.path.join instead of a hard-coded "\\" so the paths also resolve
        # on non-Windows systems.
        # NOTE(review): accounting_words3.txt is loaded here, while
        # get_keywords() reads accounting_words4.txt - confirm which
        # dictionary the tokenizer is meant to know about.
        for dict_name in ("link_words.txt",
                          "accounting_words1.txt",
                          "accounting_words3.txt"):
            jieba.load_userdict(os.path.join("keywords", dict_name))
        do_jieba._userdicts_loaded = True

    counts = {}
    for word in jieba.lcut(txt):  # default (precise) mode
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties stay in first-seen order.
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


# Keyword extraction
def _read_keyword_file(filename, skip_single_chars):
    """Read one keyword list (one entry per line) from the ``keywords`` folder."""
    words = []
    # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM, which
    # would otherwise become part of the first keyword.
    with open(os.path.join("keywords", filename), "r",
              encoding="utf-8-sig") as handle:
        for line in handle:
            word = line.replace("\n", "")
            if not word:
                # A blank line would add an empty keyword.
                continue
            if skip_single_chars and len(word) == 1:
                continue
            words.append(word)
    return words


def get_keywords():
    """Load the keyword lists used for counting.

    Reads ``link_words.txt``, ``accounting_words1.txt`` and
    ``accounting_words4.txt`` from the ``keywords`` folder (relative to the
    current working directory).  Single-character entries are kept for the
    connective list and dropped from both accounting lists.

    Returns:
        A ``(link_words, accounting_words1, accounting_words4)`` tuple of
        lists of strings, each in file order.
    """
    # ``debug`` is only assigned when the file runs as a script; fall back to
    # "off" so the function also works when the module is imported.
    verbose = globals().get("debug", 0)

    link_words_ = _read_keyword_file("link_words.txt", skip_single_chars=False)
    if verbose:
        print("连接词:", link_words_)

    accounting_words1_ = _read_keyword_file("accounting_words1.txt",
                                            skip_single_chars=True)
    if verbose:
        print("会计词1:", accounting_words1_)

    accounting_words4_ = _read_keyword_file("accounting_words4.txt",
                                            skip_single_chars=True)
    if verbose:
        # The label used to say "3" although this is dictionary 4.
        print("会计词4:", accounting_words4_)

    return link_words_, accounting_words1_, accounting_words4_


# Character-count totals
def _sum_matched_characters(items, words, label, verbose):
    """Sum ``count * len(token)`` over the ``(token, count)`` pairs in ``words``."""
    vocabulary = set(words)  # constant-time membership instead of a list scan
    total = 0
    for item in items:
        if item[0] in vocabulary:
            if verbose:
                print(item, len(item[0]))
            total += item[1] * len(item[0])
    if verbose:
        print(label, total)
    return total


def calculate_words(items):
    """Return how many characters of the text belong to each keyword list.

    Reads the module-level ``link_words``, ``accounting_words1`` and
    ``accounting_words4`` lists, which must be assigned before calling.

    Args:
        items: Iterable of ``(token, count)`` pairs, as returned by
            ``do_jieba``.

    Returns:
        A ``(link_count, accounting_count1, accounting_count4)`` tuple; each
        value is the sum of ``count * len(token)`` over the tokens found in
        the corresponding list.
    """
    # ``debug`` is only assigned when the file runs as a script; fall back to
    # "off" so the function also works when the module is imported.
    verbose = globals().get("debug", 0)

    link_count = _sum_matched_characters(
        items, link_words, "连接词总字数：", verbose)
    accounting_count1 = _sum_matched_characters(
        items, accounting_words1, "会计词总字数1：", verbose)
    # The label used to say "3" although this total is for dictionary 4.
    accounting_count4 = _sum_matched_characters(
        items, accounting_words4, "会计词总字数4：", verbose)
    return link_count, accounting_count1, accounting_count4


# Occurrence totals
def _sum_matched_occurrences(items, words, label, verbose):
    """Sum the counts of the ``(token, count)`` pairs whose token is in ``words``."""
    vocabulary = set(words)  # constant-time membership instead of a list scan
    total = 0
    for item in items:
        if item[0] in vocabulary:
            if verbose:
                print(item, len(item[0]))
            # item[1] already is the number of occurrences.  The previous
            # ``1 + item[1]`` added one extra per distinct matched token and
            # so inflated every total.
            total += item[1]
    if verbose:
        print(label, total)
    return total


def calculate_number(items):
    """Return how many times words from each keyword list occur in the text.

    Reads the module-level ``link_words``, ``accounting_words1`` and
    ``accounting_words4`` lists, which must be assigned before calling.

    Args:
        items: Iterable of ``(token, count)`` pairs, as returned by
            ``do_jieba``.

    Returns:
        A ``(link_number, accounting_number1, accounting_number4)`` tuple;
        each value is the sum of ``count`` over the tokens found in the
        corresponding list.
    """
    # ``debug`` is only assigned when the file runs as a script; fall back to
    # "off" so the function also works when the module is imported.
    verbose = globals().get("debug", 0)

    link_number = _sum_matched_occurrences(
        items, link_words, "连接词总个数：", verbose)
    accounting_number1 = _sum_matched_occurrences(
        items, accounting_words1, "会计词总个数1：", verbose)
    accounting_number4 = _sum_matched_occurrences(
        items, accounting_words4, "会计词总个数4：", verbose)
    return link_number, accounting_number1, accounting_number4


# Save data to a CSV file
def save_to_csv(field_list, data, filename='analysis_result.csv'):
    """Write a header row followed by the data rows to a CSV file.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and replaces any
    existing file of the same name.

    Args:
        field_list: Column headers, written as the first row.
        data: Iterable of rows; each row is an iterable of cell values.
        filename: Output path.  Defaults to ``analysis_result.csv`` in the
            current working directory.
    """
    # The context manager closes the file even if a row fails to serialize.
    with open(filename, 'w', encoding='utf-8-sig', newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(field_list)
        csv_writer.writerows(data)


if __name__ == '__main__':
    # Module-level switch read by the helper functions; non-zero enables
    # their diagnostic prints.
    debug = 0

    # Keyword lists; calculate_number() reads these module-level names.
    link_words, accounting_words1, accounting_words4 = get_keywords()

    # CSV header.
    field_list = ["股票代码", "年份", "总字数", "连词个数", "会计词典1个数", "会计词典4个数"]

    csv_result = []

    # One folder of report text files per year.
    path_list = ["txt2012", "txt2013", "txt2014", "txt2015", "txt2016", "txt2017", "txt2018"]
    for path in path_list:
        year = path[3:]  # folder name minus the "txt" prefix
        for report_path in openfile(path):
            # First six characters of the file name - presumably the stock
            # code; verify against the actual file naming.  These are the
            # same characters the former positional slice [8:14] picked for
            # these 7-character folder names, but without depending on the
            # folder-name length.
            shares_num = os.path.basename(report_path)[:6]
            txt, txt_count = get_txt(report_path)  # cleaned text and its length
            jieba_result = do_jieba(txt)
            # Occurrence counts for: connectives, accounting dictionary 1,
            # accounting dictionary 4.
            link_number, accounting_number1, accounting_number4 = calculate_number(jieba_result)
            row = [shares_num, year, txt_count,
                   link_number, accounting_number1, accounting_number4]
            csv_result.append(row)
            print(row)

    print(csv_result)
    save_to_csv(field_list, csv_result)
    print("save to csv success!")

后记

代码很简单，主要是记录下帮朋友提供论文数据的过程。
有需要词典的同学请前往如下链接，自行下载。
灵格斯会计词典(2个)ld2格式&txt格式
5个词典(预处理后)txt格式
有需要年报的同学可以私信联系我。
需要协助分析的同学也可以私信，有偿。

利用jieba完成对年报可读性分析相关推荐

利用jieba分词分析小说一
准备工作下载好需要分析的小说txt文件,这里我选择的是<龙族>的第一部. 小说人物名字的txt文件. 中文停用词txt文件. 安装好jieba库. 正式开始用jieba.cut()完成 ...
上市公司年报可读性：财务报告可读性管理层讨论与分析可读性（2001-2021年）
数据来源:自主整理时间跨度:2001-2021年区域范围:沪深A股指标说明: 可读性是文本分析的重要维度.上市公司的年报文本可读性,既可以直接作为解释变量,来分析一系列企业行为,如李春涛老师发表 ...
利用gensim里word2vec训练实例——分析三国里人物关系
前言万物皆可Embedding 入坑cs224N后看完第二周和相关论文.觉得word2vec非常有意思,将一段具有上下文关系的短文(实体)词语学习嵌入到语义空间成为一个向量,然后判断两个词语(实体) ...
python 利用jieba读取txt文本进行分词后存入新txt
python 利用jieba读取txt文本进行分词后存入新txt import jieba txt = open("news.txt",encoding='UTF-8').read ...
符号执行：利用Angr进行简单CTF逆向分析
一.符号执行概括简单的来说,符号执行就是在运行程序时,用符号来替代真实值.符号执行相较于真实值执行的优点在于,当使用真实值执行程序时,我们能够遍历的程序路径只有一条,而使用符号进行执行时,由于符号是 ...
如何更好地利用Pmd、Findbugs和CheckStyle分析结果
这里列出了很多Java静态分析工具,每一种工具关注一个特定的能发挥自己特长的领域,我们可以列举一下: Pmd 它是一个基于静态规则集的Java源码分析器,它可以识别出潜在的如下问题: – 可能的bug ...
如何利用结构化思维写好分析报告？
在讲如何写好分析报告前,我们先来讲讲何为结构化思维?结构化思维是一种从无序到有序的思考过程. 举个例子,现在有个问题:如何把200ml的水装进100ml的杯子?大家可能有各种各样的答案:把水冻成冰,用 ...
利用WebBrowser实现Web打印的分析
利用WebBrowser实现Web打印的分析原文:利用WebBrowser实现Web打印的分析 WebBrowser是IE内置的浏览器控件,无需用户下载.本文档所讨论的是有关IE6.0版本的WebB ...
如何利用wireshark对TCP消息进行分析
原文:https://www.cnblogs.com/studyofadeerlet/p/7485298.html 如何利用wireshark对TCP消息进行分析 (1) 几个概念介绍 1 seq:数 ...

利用jieba完成对年报可读性分析

年报可读性分析

需求

词典预处理

结巴加载词典，然后进行分词

后记

利用jieba完成对年报可读性分析相关推荐

最新文章

热门文章