get_article

import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


# ---------------------------------------------------------------------------
# File names used by the pipeline stages.
# All are relative to one journal directory, except the two article_type
# files, which are written to the current working directory.
# ---------------------------------------------------------------------------
_NAME_ARTICLE_LIST = 'article_list'        # items: (sub-URL, section, type, title)
_NAME_ARTICLE_DIR = 'article'              # folder holding <index>.html pages
_NAME_ARTICLE_TYPE = 'article_type'        # items: (section, article count)
_NAME_ARTICLE_TYPE_TXT = 'article_type.txt'
_NAME_ARTICLE_SELECT = 'article_select'    # items: (index, label, title)
_NAME_AUTHOR_LIST = 'author_list'          # items: (author html, footnotes, doi, index, label, title)
_NAME_ARTICLE_INFO = 'article_info'

# ---------------------------------------------------------------------------
# Patterns applied to the downloaded article pages.
# ---------------------------------------------------------------------------
_KEY_DOI = re.compile(r'SDM.doi = \'(.*?)\'')
_KEY_AUTHOR = re.compile(r'<li><a href="#" class="authorName.*?</li>|<li><span class="authorDegrees">.*?</li>')
_KEY_FOOTNOTE = re.compile(r'<dl class="footnote".*?</dl>')

# ---------------------------------------------------------------------------
# Patterns used to classify sections / titles (all case-insensitive).
# ---------------------------------------------------------------------------
_KEY_EXCEPT = re.compile(r'editor.*?choice', re.I)
_KEY_EXCLUDE = re.compile(r'editor|book|news|acknowledgment|acknowledgement|education|retraction|erratum|'
                          r'introduction|in this issue|feature|foreword|topic|response|reply|comment|'
                          r'index|content|abstract|highlight|obituary|announcement|guideline|\sview|^view|list|'
                          r'presentation|survey|summary|correction|abbreviation', re.I)
_KEY_LETTER = re.compile(r'letter|correspondence', re.I)
# No empty alternative here: a leading '|' would match every string and turn
# every remaining article into a 'Review'.
_KEY_REVIEW = re.compile(r'review|insight', re.I)

# ---------------------------------------------------------------------------
# Patterns used to parse author entries and "contributed equally" footnotes.
# ---------------------------------------------------------------------------
_KEY_AUTHOR_NAME = re.compile(r'<a href="#" class="authorName.*?>(.*?)</a>')
_KEY_AUTHOR_NAME_SPLIT = re.compile(r'\s')
_KEY_AUTHOR_NAME_SPLIT_HYPHEN = re.compile(r'\s|-')
_KEY_CORR = re.compile(r'Corresponding author')
_KEY_EQUAL = re.compile(r'contributed equally')
# Sentence boundary inside a footnote, e.g. ' data. ' in 'abstracted data. M.A.W. performed'.
_KEY_EQUAL_SPLIT = re.compile(r'\s[^A-Z\s\.]*?\.\s|\)\.\s')
_KEY_EQUAL_1 = re.compile(r'class="intra_ref">(.*?)</a>')      # footnote marker (superscript label)
_KEY_EQUAL_1_NOT = re.compile(r'Appendix 1')
_KEY_EQUAL_1_SUB = re.compile(r'<.*?>(.*?)<.*?>')              # marker wrapped in another tag
_KEY_EQUAL_1_SPEC = ' and '                                    # marker naming two labels
_KEY_EQUAL_1_SINGLE = re.compile(r'This author|The author|Co-first author')
_KEY_EQUAL_2 = re.compile(r'>[\s\)\]\.,;:]*([^<]*?contributed equally)')
_KEY_EQUAL_2_SUB = re.compile(r'\s*contributed equally')
_KEY_EQUAL_3_SUB = re.compile(r' have$|^and has.*Merck. |,$|^As joint first authors, |^Author contributions: |'
                              r' performed .*?$|^Author |^Authors |^Both | both$| equally$|^Note: Both |'
                              r' as co-corresponding author$| are joint first authors and$|, these authors have|'
                              r', MD|, PhD|^Professors |^Drs |^Drs. |^The authors | are co-first authors and$')
_KEY_EQUAL_3_SPLIT = re.compile(r',\sand\s|,\s|\sand\s|\s&\s')

# Phrases that identify the equal contributors by position rather than by
# name, paired with the slice of the author list they stand for.
# Checked in order; the first matching phrase wins.
_POSITIONAL_GROUPS = (
    (re.compile(r'All authors|All of the authors|All the authors|^Authors$|^The authors$|^The authors do|'
                r'^The three institutions|^The Tsimikas|These authors|^Northeast Normal University|'
                r'^These author$'),
     lambda names: names),
    (re.compile(r'Both authors|Both first authors|The 1st 2 authors|The first 2 authors|'
                r'The first two authors'),
     lambda names: names[:2]),
    (re.compile(r'The first 3 authors|The first three authors|the first three authors'),
     lambda names: names[:3]),
    (re.compile(r'The last 2 authors|The last two authors'),
     lambda names: names[-2:]),
    (re.compile(r'The last 3 authors'),
     lambda names: names[-3:]),
    (re.compile(r'The last four authors'),
     lambda names: names[-4:]),
    (re.compile(r'Second and third authors'),
     lambda names: names[1:3]),
)

# Hand-curated fallbacks for sentences the automatic matching cannot resolve.
# The key is the first entry of the partially resolved name list: a plain
# string when that entry stayed unresolved, a one-element list when it was
# resolved to a full author name.
_MANUAL_GROUPS = (
    (['T.C-D.', ['Roy Phitayakorn'], 'CDA', 'ASm', 'H.-P.K',
      ['Francesca Moro'], 'Y.-J. T.', 'A.M.D.A.', 'LK.M.', ['Andras Hoffman'],
      'K.R.', 'M.dC.V.H.', 'Y-G.K.', ['Scott J. Robbie'], ['Seung Hoon Woo'],
      'M.S.M', 'C.J.C.T.', ['Klaas J. Wardenaar'], 'L.-Q. X.',
      ['Massimiliano Fusaro'], ['Oliver Husser'], ['Icela Palma'], 'W-M.L',
      'program. The project'],
     lambda names: names[:2]),
    (['M-T.M-G', ["Anthony V. D'Amico"], 'MC', 'CAZ', ['Arne Östman'], 'J.J.V.P.'],
     lambda names: names[-2:]),
    (['MH'],
     lambda names: names[1:3]),
    ([['Chunsheng Liu']],
     lambda names: names[:3]),
    ([['Leonidas Chouliaras']],
     lambda names: names[:2] + names[-2:]),
    ([['Karin Hek']],
     lambda names: names[:5]),
    ([['Cornelia M. van Duijn']],
     lambda names: names[-8:]),
)


def _read_text(path):
    """Return the full content of the UTF-8 text file at *path*."""
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()


def get_info():
    """Run all extraction stages in order."""
    print('Start getting info...')
    journal_filter()     # report article pages that carry no DOI
    get_article_type()   # count articles per section over all journals
    article_filter()     # label articles Paper/Review/Letter, drop the rest
    get_author_list()    # pull author entries, footnotes and DOI from each page
    get_article_info()   # resolve corresponding / equally contributing authors


def journal_filter():
    """Print the journal index and page path of every article page without a DOI."""
    for i, journal_dir in enumerate(get_dir_list()):
        article_list = pickle_read(os.path.join(journal_dir, _NAME_ARTICLE_LIST))
        folder = os.path.join(journal_dir, _NAME_ARTICLE_DIR)
        # Pages are named after the article's position in article_list.
        for j in range(len(article_list)):
            page = _read_text(os.path.join(folder, '{}.html'.format(j)))
            if not _KEY_DOI.search(page):
                print(i, r'\{}\{}.html'.format(folder, j))


def get_article_type():
    """Count articles per section across all journals.

    Writes the (section, count) pairs, in order of first appearance, to the
    pickle file 'article_type' and as tab-separated text to 'article_type.txt'.
    """
    dir_list = get_dir_list()
    counts = {}
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        for article in pickle_read(os.path.join(journal_dir, _NAME_ARTICLE_LIST)):
            counts[article[1]] = counts.get(article[1], 0) + 1
    article_type = list(counts.items())
    pickle_write(article_type, _NAME_ARTICLE_TYPE)
    with open(_NAME_ARTICLE_TYPE_TXT, mode='w', encoding='utf-8') as f:
        f.writelines('{}\t{}\n'.format(section, count) for section, count in article_type)


def _group_sections(article_type_all):
    """Sort section names into 'except', 'exclude', 'letter' and 'review' sets.

    A section joins the first group whose pattern matches it; sections that
    match no pattern are left out of every group.
    """
    rules = (('except', _KEY_EXCEPT), ('exclude', _KEY_EXCLUDE),
             ('letter', _KEY_LETTER), ('review', _KEY_REVIEW))
    groups = {label: set() for label, _ in rules}
    for entry in article_type_all:
        section = entry[0]
        for label, pattern in rules:
            if pattern.search(section):
                groups[label].add(section)
                break
    return groups


def _classify_article(article, sections):
    """Return 'Paper', 'Review' or 'Letter' for *article*, or None to drop it.

    *article* is an article_list item (sub-URL, section, type, title);
    *sections* is the mapping built by _group_sections().
    """
    section, kind, title = article[1], article[2], article[3]
    if kind:
        # An explicit type decides on its own.
        return 'Paper' if kind == 'Original Research Article' else 'Review'
    if section in sections['except']:
        return 'Paper'
    if section in sections['exclude'] or _KEY_EXCLUDE.search(title):
        return None
    if section in sections['letter'] or _KEY_LETTER.search(title):
        return 'Letter'
    if section in sections['review'] or _KEY_REVIEW.search(title):
        return 'Review'
    return 'Paper'


def article_filter():
    """Label every article of every journal and store the kept ones.

    For each journal, writes 'article_select' (items: index, label, title) and
    prints the running Letter / Review / Paper totals over all journals
    processed so far.
    """
    sections = _group_sections(pickle_read(_NAME_ARTICLE_TYPE))
    counts = {'Letter': 0, 'Review': 0, 'Paper': 0}
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(journal_dir, _NAME_ARTICLE_LIST))
        article_select = []
        for j, article in enumerate(article_list):
            label = _classify_article(article, sections)
            if label is None:
                continue
            article_select.append((j, label, article[3]))
            counts[label] += 1
        pickle_write(article_select, os.path.join(journal_dir, _NAME_ARTICLE_SELECT))
        print(journal_dir, counts['Letter'], counts['Review'], counts['Paper'])


def get_author_list():
    """Extract author entries, footnotes and the DOI from each selected page.

    For each journal, writes 'author_list' with one item per selected article:
    (author html snippets, footnote html snippets, doi, index, label, title).

    Raises AttributeError when a page has no DOI; journal_filter() reports
    such pages.
    """
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_select = pickle_read(os.path.join(journal_dir, _NAME_ARTICLE_SELECT))
        folder = os.path.join(journal_dir, _NAME_ARTICLE_DIR)
        author_list = []
        for article in article_select:
            page = _read_text(os.path.join(folder, '{}.html'.format(article[0])))
            author_all = _KEY_AUTHOR.findall(page)
            author_equal = _KEY_FOOTNOTE.findall(page)
            article_doi = _KEY_DOI.search(page).groups()[0]
            author_list.append((author_all, author_equal, article_doi, article[0], article[1], article[2]))
        pickle_write(author_list, os.path.join(journal_dir, _NAME_AUTHOR_LIST))


def _initials(tokens, sep='', tail=''):
    """Join the first letter of every token with *sep* and append *tail*."""
    # t[:1] instead of t[0]: an empty token (double whitespace) yields ''.
    return sep.join(t[:1] for t in tokens) + tail


def _two_initials_then_rest(tokens, sep):
    """Abbreviate the first two tokens, e.g. 'J.L. Herington' for sep='.'.

    Names with fewer than three tokens yield '' (no candidate of this shape).
    """
    if len(tokens) <= 2:
        return ''
    return tokens[0][:1] + sep + tokens[1][:1] + '. ' + ' '.join(tokens[2:])


def _drop_hyphens(text):
    """Return *text* without hyphens, e.g. 'H.-K.A.' -> 'H.K.A.'."""
    return text.replace('-', '')


def _leading_initial(text):
    """Reduce the first whitespace-separated token to its initial, e.g. 'Y.-J. T.' -> 'Y. T.'."""
    parts = _KEY_AUTHOR_NAME_SPLIT.split(text)
    return parts[0][:1] + '. ' + ' '.join(parts[1:])


# Spellings under which a footnote may refer to an author, tried in this order.
# Each entry: (use hyphen-split tokens?, candidate built from an author's name
# tokens, optional transform applied to the footnote name before comparing).
# Comparison is case-insensitive equality.
_EXACT_PASSES = (
    (False, lambda t: ' '.join(t), None),                               # 'Ya-Jing Tan'
    (False, lambda t: t[-1] + ' ' + ' '.join(t[:-1]), None),            # 'Tan Ya-Jing'
    (False, lambda t: t[0][:1] + '. ' + ' '.join(t[1:]), None),         # 'Y. Tan'
    (False, lambda t: t[0][:1] + ' ' + ' '.join(t[1:]), None),          # 'Y Tan'
    (False, lambda t: _initials(t, '.', '.'), None),                    # 'Y.T.'
    (False, lambda t: _initials(t, '. ', '.'), None),                   # 'Y. T.'
    (False, _initials, None),                                           # 'YT'
    (False, lambda t: t[0] + t[-1][:1] + '.', None),                    # 'Ya-JingT.'
    (False, lambda t: _two_initials_then_rest(t, '.'), None),           # 'J.L. Herington'
    (False, lambda t: _two_initials_then_rest(t, '. '), None),          # 'J. L. Herington'
    (False, lambda t: ' '.join(t[:-1]) + ' ' + t[-1][:1], None),        # 'Jennifer L. H'
    (False, lambda t: t[-1], None),                                     # 'Herington'
    (True, lambda t: _two_initials_then_rest(t, '.'), _drop_hyphens),   # 'K.L. Bruner Tran'
    (True, lambda t: _two_initials_then_rest(t, '. '), _drop_hyphens),  # 'K. L. Bruner Tran'
    (False, lambda t: ' '.join(t), _leading_initial),
    (True, lambda t: _initials(t, '.', '.'), _drop_hyphens),            # 'Y.J.T.'
    (False, lambda t: _initials(t, '.'), _drop_hyphens),                # 'Y.T'
    (True, _initials, _drop_hyphens),                                   # 'YJT'
)


def _resolve_names(pending, candidates, names, query=None):
    """Replace unresolved entries of *pending* that equal a candidate spelling.

    *pending* holds raw footnote names (str) and resolved ones ([full name]);
    it is modified in place. *candidates* is parallel to *names*. The first
    author whose candidate matches wins.
    """
    for k, entry in enumerate(pending):
        if isinstance(entry, list):
            continue
        probe = (query(entry) if query else entry).lower()
        for candidate, name in zip(candidates, names):
            if probe == candidate.lower():
                pending[k] = [name]
                break


def _resolve_by_token(pending, tokens, names):
    """Resolve entries of *pending* that literally contain an author's token.

    The test is a case-sensitive substring check, so characters such as '.'
    or '(' in a name are never interpreted as regex syntax.
    """
    for k, entry in enumerate(pending):
        if isinstance(entry, list):
            continue
        for token, name in zip(tokens, names):
            if token in entry:
                pending[k] = [name]
                break


def _all_resolved(pending):
    """Return True when every entry of *pending* has been resolved."""
    return all(isinstance(entry, list) for entry in pending)


def _group_from_sentence(sentence, names, tokens, tokens_hyphen):
    """Return the authors one "contributed equally" sentence refers to.

    *sentence* is the footnote sentence with 'contributed equally' removed.
    Returns a list of full author names, or None when nothing could be
    resolved and no manual rule applies.
    """
    for pattern, pick in _POSITIONAL_GROUPS:
        if pattern.search(sentence):
            return pick(names)

    # Strip titles / filler words, then split 'A, B and C' into single names.
    pending = _KEY_EQUAL_3_SPLIT.split(_KEY_EQUAL_3_SUB.sub('', sentence))

    for use_hyphen, build, query in _EXACT_PASSES:
        source = tokens_hyphen if use_hyphen else tokens
        _resolve_names(pending, [build(t) for t in source], names, query)
        if _all_resolved(pending):
            return [entry[0] for entry in pending]

    # Last resort: the footnote name merely contains the last, then the
    # first, token of an author's name.
    for pick_token in (lambda t: t[-1], lambda t: t[0]):
        _resolve_by_token(pending, [pick_token(t) for t in tokens], names)
        if _all_resolved(pending):
            return [entry[0] for entry in pending]

    for keys, pick in _MANUAL_GROUPS:
        if pending[0] in keys:
            return pick(names)
    return None


def _equal_sentences(text):
    """Split footnote *text* into sentences and return the "contributed equally" ones.

    The phrase 'contributed equally' (with preceding whitespace) is removed
    from each returned sentence.
    """
    pieces = _KEY_EQUAL_SPLIT.split(text)
    # split() drops the separators; glue each one back onto its sentence.
    for k, separator in enumerate(_KEY_EQUAL_SPLIT.findall(text)):
        pieces[k] += separator
    return [_KEY_EQUAL_2_SUB.sub('', piece) for piece in pieces if _KEY_EQUAL.search(piece)]


def _equal_text(footnote):
    """Return the text of *footnote* up to and including 'contributed equally'.

    Raises AttributeError when the footnote has no such text after a '>'.
    """
    return _KEY_EQUAL_2.search(footnote).groups()[0]


def _has_marker(author_html, marker):
    """Return True when the author entry carries the superscript *marker*."""
    if marker == '**':
        # A '**' footnote is also accepted on authors marked with a single '*'.
        return '<sup>**</sup>' in author_html or '<sup>*</sup>' in author_html
    return '<sup>' + marker + '</sup>' in author_html


def _dedupe_groups(groups, first_author):
    """Remove repeated names inside each group, keeping order.

    A group left with a single author who is not the first author gets the
    first author appended.
    """
    cleaned = []
    for group in groups:
        unique = []
        for name in group:
            if name not in unique:
                unique.append(name)
        if len(unique) == 1 and first_author not in unique:
            unique.append(first_author)
        cleaned.append(unique)
    return cleaned


def _article_record(article):
    """Build the article_info item for one author_list item.

    Returns (author names, corresponding authors, equal-contribution groups,
    index, label, title, doi), or None when the article has no author entry.
    """
    author_html, footnotes = article[0], article[1]
    names = [_KEY_AUTHOR_NAME.search(html).groups()[0] for html in author_html]
    if not names:
        return None
    tokens = [_KEY_AUTHOR_NAME_SPLIT.split(name) for name in names]
    tokens_hyphen = [_KEY_AUTHOR_NAME_SPLIT_HYPHEN.split(name) for name in names]

    author_corr = [name for html, name in zip(author_html, names) if _KEY_CORR.search(html)]

    # Footnotes either point at authors through a superscript marker
    # ("marked") or name them in the text ("plain").
    marked = []   # (marker, footnote html)
    plain = []    # (footnote text, footnote html)
    for note in footnotes:
        if not _KEY_EQUAL.search(note):
            continue
        ref = _KEY_EQUAL_1.search(note)
        if ref is None:
            plain.append((_equal_text(note), note))
            continue
        marker = ref.groups()[0]
        if _KEY_EQUAL_1_NOT.search(marker):
            plain.append((_equal_text(note), note))
        elif _KEY_EQUAL_1_SPEC in marker:
            marked.extend((part, note) for part in marker.split(_KEY_EQUAL_1_SPEC))
        else:
            inner = _KEY_EQUAL_1_SUB.search(marker)
            marked.append((inner.groups()[0] if inner else marker, note))

    author_equal = []
    for marker, note in marked:
        tagged = [name for html, name in zip(author_html, names) if _has_marker(html, marker)]
        if len(tagged) > 1:
            author_equal.append(tagged)
        elif len(tagged) == 1 and _KEY_EQUAL_1_SINGLE.search(note):
            # "This author contributed equally ...": a one-person group, which
            # _dedupe_groups() completes with the first author.
            author_equal.append(tagged)
        else:
            # The marker did not identify a group; fall back to the text.
            plain.append((_equal_text(note), note))

    for text, _ in plain:
        for sentence in _equal_sentences(text):
            group = _group_from_sentence(sentence, names, tokens, tokens_hyphen)
            if group is not None:
                author_equal.append(group)

    author_equal = _dedupe_groups(author_equal, names[0])
    return (names, author_corr, author_equal, article[3], article[4], article[5], article[2])


def get_article_info():
    """Resolve corresponding and equally contributing authors for every article.

    For each journal, reads 'author_list' and writes 'article_info' with one
    item per article that has at least one author:
    (author names, corresponding authors, equal-contribution groups,
    index, label, title, doi).
    """
    dir_list = get_dir_list()
    for i, folder in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        print(folder)
        author_list = pickle_read(os.path.join(folder, _NAME_AUTHOR_LIST))
        article_info = []
        for article in author_list:
            record = _article_record(article)
            if record is not None:
                article_info.append(record)
        pickle_write(article_info, os.path.join(folder, _NAME_ARTICLE_INFO))
get_article_info相关推荐

动态分析Android App之动态调试
这个系列一共有五篇左右,内容主要介绍如何在Java层动态分析和调试Android App,和网上其他教程相比,内容更充实,体系更健全,深入而浅出. 闻道有先后,术业有专攻,希望能给刚入门Android ...
Python网络爬虫实战之爬取小说
一.目标 1- 爬取一个章节的小说 2- 爬取整一本小说二.爬取单章节的小说 2.1 数据准备这里在网上随意搜了一个网站,其链接如下: https://www.biqukan.com/1_1408 ...
[免费专栏] Android安全之动态调试APP的一些技巧「Android Studio调试」
也许每个人出生的时候都以为这世界都是为他一个人而存在的,当他发现自己错的时候,他便开始长大少走了弯路,也就错过了风景,无论如何,感谢经历 Android安全付费专栏长期更新,本篇最新内容请前往: [ ...
Android开发之MVVM模式实践：协程与网络请求的结合
前言大家好,我是小益!在经过前两章对协程的介绍后,我们终于又回到了 MVVM的封装 .协程在Android开发中最常用的场景应该是网络请求了,其次是一些使用 Thread 的场景,本章内容我们将着重 ...
get_article_info

get_article_info相关推荐

最新文章

热门文章