实体消歧（链接到实体库）

disambiguation.py

#!/usr/bin/python3
import pymysql
import json
import requests
from SimilarityEN import similarity


def findCandidates(entity):
    """Fetch candidate persons whose ``name`` ends with ``entity``.

    Connection settings (``host``, ``username``, ``password``, ``dbname``)
    are read from module-level names that are not defined in the visible
    part of this file -- they must be provided before calling this function.

    Args:
        entity: Surface form of the mention to look up.

    Returns:
        A list of dicts with keys ``wiki_id``, ``name`` and ``description``,
        taken from columns 1, 2 and 3 of each matching row.
    """
    # The entity text comes from free-form user queries, so it is passed as a
    # bound parameter rather than being concatenated into the SQL string.
    sql = "SELECT * FROM t_wikidata_human_content WHERE `name` LIKE %s"

    # Open the database connection.
    db = pymysql.connect(host=host, port=3306, user=username, passwd=password, db=dbname)
    try:
        with db.cursor() as cursor:
            # Leading '%' keeps the original "name ends with entity" match.
            cursor.execute(sql, ("%" + entity,))
            persons = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()

    return [
        {'wiki_id': person[1], 'name': person[2], 'description': person[3]}
        for person in persons
    ]


def getEntity(query):
    """Call the NER service and keep only the PERSON mentions.

    Args:
        query: Text to run named-entity recognition on.

    Returns:
        The items of the service's JSON response whose ``ner`` field equals
        ``"PERSON"``, in response order.
    """
    url = 'http://ip:8018/getNer'
    properties = {'text': query, 'lang': 'en'}
    resp = requests.get(url, params=properties).json()
    return [entity for entity in resp if entity['ner'] == "PERSON"]


def _best_candidate_index(candidates, query):
    """Return the index of the candidate whose description best fits ``query``."""
    texts = [candidate['description'] for candidate in candidates]
    try:
        scores = similarity(texts, query)
        # Highest score wins; on ties the highest index wins, which is what
        # the previous "stable ascending sort, take last" logic produced.
        return max(enumerate(scores), key=lambda pair: (pair[1], pair[0]))[0]
    except Exception:
        # Deliberate best-effort: if scoring fails (e.g. a missing
        # description or an unreadable stop-word file), fall back to the
        # first candidate instead of aborting the whole request.
        return 0


def match(query):
    """Link every PERSON mention in ``query`` to an entry of the entity table.

    Each mention is disambiguated independently by comparing the query text
    with the descriptions of its candidates.

    Args:
        query: Text containing the mentions to link.

    Returns:
        A JSON string (indented by 4) holding one object per mention with
        ``wiki_id``, ``name``, ``begin`` and ``end``. Mentions without any
        candidate get ``wiki_id`` -1 and no ``name`` key.
    """
    points = []
    for entity in getEntity(query):
        # Single-entity disambiguation.
        candidates = findCandidates(entity['word'])
        if not candidates:
            points.append({'wiki_id': -1, 'begin': entity['begin'], 'end': entity['end']})
            continue

        best = candidates[_best_candidate_index(candidates, query)]
        points.append({
            'wiki_id': best['wiki_id'],
            'name': best['name'],
            'begin': entity['begin'],
            'end': entity['end'],
        })
    return json.dumps(points, indent=4)

similarity.py（注意：disambiguation.py 中以 `SimilarityEN` 模块名导入，文件名需与导入名保持一致，例如保存为 SimilarityEN.py）

# -*- coding:utf-8 -*-
import codecs
import re

from gensim import corpora, models, similarities
from nltk.tokenize import WordPunctTokenizer


def wordtokenizer(sentence):
    """Split ``sentence`` into tokens with NLTK's ``WordPunctTokenizer``."""
    return WordPunctTokenizer().tokenize(sentence)


def _load_stopwords(stopwordpath):
    """Read the UTF-8 stop-word file (one word per line) into a set."""
    # ``with`` closes the file; the set gives O(1) membership tests.
    with open(stopwordpath, 'r', encoding='utf8') as handle:
        return {line.strip() for line in handle}


def _tokenize(text, stopwords):
    """Normalise ``text`` and return its tokens that are not stop words."""
    # Replace the listed punctuation with spaces, then lowercase.
    text = re.sub("[-',{:+}|.()/?!·;]", ' ', text).lower()
    return [word for word in wordtokenizer(text) if word not in stopwords]


def tokenization(text, stopwordpath):
    """Tokenise ``text`` and drop the stop words listed in ``stopwordpath``.

    Args:
        text: Raw text to tokenise.
        stopwordpath: Path to a UTF-8 file with one stop word per line.

    Returns:
        The lowercased tokens of ``text``, in order, without stop words.
    """
    return _tokenize(text, _load_stopwords(stopwordpath))


def similarity(texts, query, stopwordpath='stop.txt'):
    """Score how similar ``query`` is to each document in ``texts``.

    Args:
        texts: Documents to compare against (e.g. candidate descriptions).
        query: Text to compare with every document.
        stopwordpath: Path to the stop-word file; defaults to ``stop.txt``.

    Returns:
        The result of querying a gensim ``MatrixSimilarity`` index built over
        the TF-IDF vectors of ``texts``: one score per document, in the order
        of ``texts``.
    """
    # Load the stop words once instead of re-reading the file per document.
    stopwords = _load_stopwords(stopwordpath)
    corpus = [_tokenize(text, stopwords) for text in texts]

    # Build the feature dictionary: every word occurring in the corpus is
    # assigned a unique integer id.
    dictionary = corpora.Dictionary(corpus)
    # doc2bow() counts the occurrences of each distinct word, converts the
    # word to its id and returns the result as a sparse vector.
    doc_bow = [dictionary.doc2bow(tokens) for tokens in corpus]

    tfidf = models.TfidfModel(doc_bow)  # collects the IDF statistics per feature
    tfidf_bow = tfidf[doc_bow]  # TF-IDF weighted corpus

    query_bow = dictionary.doc2bow(_tokenize(query, stopwords))
    index = similarities.MatrixSimilarity(tfidf_bow)
    # The index holds TF-IDF vectors, so the query is projected through the
    # same model; comparing raw counts against TF-IDF weights mixes two
    # different vector spaces.
    return index[tfidf[query_bow]]

实体消歧（链接到实体库）相关推荐

知识图谱（五）——实体消歧
一、任务概述 多样性——同一实体在文本中会有不同的指称。例如：飞人、帮主、老大和MJ都指美国篮球运动员迈克尔·乔丹。歧义性——相同的实体指称在不同的上下文中可以指不同的实体。例如：迈克尔·乔丹指美国篮 ...
【创新实训】BERT4EL，基于文本相似度的实体消歧实现
任务描述现有douban.mtime.maoyan三个来源的电影,包含名称.简介.导演.演员.类型等等属性. 需要相同的电影融合为一个电影条目,其中maoyan数量很少,可以合并到mtime中. 参 ...
信息抽取之实体消歧，统一
1.前言信息抽取相关内容可以参考信息抽取简介和关系抽取详解 2.实体消歧的本质如小米,它是一个实体,在有些句子中表示"小米公司",但在某些语句下它表示一种谷物又比如: 怎么 ...
nlp（贪心学院）——实体消歧、实体统一、指代消解、句法分析
任务212:Entity Disambiguation (实体消歧)介绍小米是公司还是吃的? 苹果是公司还是吃的? 根据左边的上下文找出左边的James Craig到底是右边(1)(2)(3)哪个J ...
【工程处理技巧一篇】基于半规则数据的命名实体消歧识别【未完】
作者:finallyly 出处:博客园(转载请注明作者和出处) 看到这篇文章的标题,您一定会以为此篇博客要讲解一个何等高深的算法.其实不然,本篇博客旨在分享笔者在处理那些繁杂.冗踏.低端甚至于极其TM ...
实体统一，实体消歧，指代消解
指代消解:比较难,目前还没有得到很好的结果.
文献阅读课10-Neural Relation Extraction for Knowledge Base Enrichment(提取+嵌入+消歧+规范化联合模型，实体已知，仅关系抽取，多词实体)
文章目录 Abstract 1.Introduction 2. 相关工作 2.2 Entity-aware Relation Extraction 3.提出的模型 3.1 Solution Frame ...
命名实体如何进行概念消歧?
1 引言 命名实体概念消歧是命名实体消歧(英语:Named Entity Disambiguation)的一个重要研究子领域(命名实体概念可见本文3.1章)。什么叫概念消歧呢?在这里举一个简单例子进行 ...
【极简】实体识别和消歧
文章目录命名实体识别消除歧义 TF-IDF句向量(有监督) 词周边特征 TF-IDF特征(有监督) 词向量(无监督) 基于规则的内联修改权重方法变种:地名消歧正则表达式命名实体识别 impo ...
【NLP入门教程】七、词义消歧
词义消歧(Word Sense Disambiguation, WSD)其目标是确定文本中词汇的正确含义.由于许多单词具有多种含义,词义消歧对于理解和分析文本具有关键作用. 1. 词义消歧的挑战词义 ...

实体消歧（链接到实体库）

实体消歧（链接到实体库）相关推荐

最新文章

热门文章