The TransE Algorithm for Knowledge Graph Embedding
Knowledge Graph Embedding
A knowledge graph is a set of triples in which head and tail entities are connected by relations to form a graph. One problem with knowledge graphs is that their discrete graph structure does not support semantic computation. To let computers compute over the knowledge and to alleviate data sparsity, the entities and relations of a knowledge graph can be mapped into a low-dimensional continuous vector space; this family of methods is called knowledge graph embedding.
TransE
TransE is a simple model that embeds entities and relations into a low-dimensional vector space.
Principle:
TransE represents entities and relations as distributed vectors. For each triple instance (head, relation, tail), the relation is treated as a translation from the head entity to the tail entity, i.e., vector addition. By continually adjusting the vectors h, r, and t, the model makes (h + r) as close as possible to t; in other words, head vector + relation vector ≈ tail vector (h + r ≈ t).
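The translation idea, together with the margin-based ranking loss used to train it, can be sketched in a few lines of NumPy. This is a minimal sketch: the vectors below are synthetic toy values for a single triple, not learned embeddings.

```python
import numpy as np

rng = np.random.default_rng(0)
dim = 4

# Toy embeddings for one triple (h, r, t) plus a corrupted tail t'.
h = rng.normal(size=dim)
r = rng.normal(size=dim)
t = h + r + 0.01 * rng.normal(size=dim)  # near-perfect triple: h + r ≈ t
t_corrupt = rng.normal(size=dim)         # a random (wrong) tail

def score(h, r, t, norm=2):
    """Distance d(h + r, t); smaller means the triple is more plausible."""
    diff = h + r - t
    return np.abs(diff).sum() if norm == 1 else np.sqrt(np.square(diff).sum())

# Margin-based ranking loss: the correct triple should score lower than the
# corrupted one by at least `margin`, otherwise a penalty is incurred.
margin = 1.0
loss = max(0.0, margin + score(h, r, t) - score(h, r, t_corrupt))
print("d(h+r, t)  =", round(score(h, r, t), 3))
print("d(h+r, t') =", round(score(h, r, t_corrupt), 3))
```

Because t was constructed to satisfy h + r ≈ t, the true triple gets a much smaller distance than the corrupted one, which is exactly the ordering the training loss enforces.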
TransE Code
import codecs
import copy
import random
import time

import numpy as np


def norm_l1(h, r, t):
    """L1 distance between (h + r) and t."""
    return np.sum(np.fabs(h + r - t))


def norm_l2(h, r, t):
    """Squared L2 distance between (h + r) and t."""
    return np.sum(np.square(h + r - t))


def dataloader(file1, file2, file3):
    """Load the data.

    file1: training triples
    file2: entity-to-id mapping
    file3: relation-to-id mapping
    Returns three lists: entities, relations, triples.
    """
    entity = []
    relation = []
    entities2id = {}
    relations2id = {}
    with open(file2, 'r') as f1, open(file3, 'r') as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()
        for line in lines1:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            entities2id[line[0]] = line[1]
            entity.append(line[1])
        for line in lines2:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            relations2id[line[0]] = line[1]
            relation.append(line[1])

    triple_list = []
    with codecs.open(file1, 'r') as f:
        content = f.readlines()
        for line in content:
            triple = line.strip().split("\t")
            if len(triple) != 3:
                continue
            h_ = entities2id[triple[0]]
            r_ = relations2id[triple[1]]
            t_ = entities2id[triple[2]]
            triple_list.append([h_, r_, t_])

    print("Complete load. entity : %d , relation : %d , triple : %d" % (
        len(entity), len(relation), len(triple_list)))
    return entity, relation, triple_list


class TransE:
    def __init__(self, entity, relation, triple_list,
                 embedding_dim=50, lr=0.01, margin=1.0, norm=1):
        """Initialise the hyperparameters."""
        self.entities = entity
        self.relations = relation
        self.triples = triple_list
        self.dimension = embedding_dim
        self.learning_rate = lr
        self.margin = margin
        self.norm = norm
        self.loss = 0.0

    def data_initialise(self):
        """Turn the entity/relation id lists into two dictionaries,
        entityVectorList and relationVectorList."""
        entityVectorList = {}    # {entity id: entity vector}
        relationVectorList = {}  # {relation id: relation vector}
        for entity in self.entities:
            entity_vector = np.random.uniform(
                -6.0 / np.sqrt(self.dimension),
                6.0 / np.sqrt(self.dimension),
                self.dimension)
            entityVectorList[entity] = entity_vector
        for relation in self.relations:
            relation_vector = np.random.uniform(
                -6.0 / np.sqrt(self.dimension),
                6.0 / np.sqrt(self.dimension),
                self.dimension)
            relation_vector = self.normalization(relation_vector)
            relationVectorList[relation] = relation_vector
        self.entities = entityVectorList
        self.relations = relationVectorList

    def normalization(self, vector):
        return vector / np.linalg.norm(vector)

    def training_run(self, epochs=1, nbatches=100, out_file_title=''):
        batch_size = int(len(self.triples) / nbatches)
        print("batch size: ", batch_size)
        for epoch in range(epochs):
            start = time.time()
            self.loss = 0.0
            # Normalise the entity embeddings to unit length
            for entity in self.entities.keys():
                self.entities[entity] = self.normalization(self.entities[entity])

            for batch in range(nbatches):
                batch_samples = random.sample(self.triples, batch_size)
                Tbatch = []
                for sample in batch_samples:
                    corrupted_sample = copy.deepcopy(sample)
                    pr = np.random.random(1)[0]
                    # random.sample needs a sequence, so wrap dict keys in list()
                    if pr > 0.5:
                        # corrupt the head entity
                        corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[0] == sample[0]:
                            corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                    else:
                        # corrupt the tail entity
                        corrupted_sample[2] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[2] == sample[2]:
                            corrupted_sample[2] = random.sample(list(self.entities.keys()), 1)[0]
                    if (sample, corrupted_sample) not in Tbatch:
                        Tbatch.append((sample, corrupted_sample))
                self.update_triple_embedding(Tbatch)
            end = time.time()
            print("epoch: ", epoch, "cost time: %s" % (round((end - start), 3)))
            print("running loss: ", self.loss)

        with codecs.open(out_file_title + "TransE_entity_" + str(self.dimension)
                         + "dim_batch" + str(batch_size), "w") as f1:
            for e in self.entities.keys():
                f1.write(e + "\t")
                f1.write(str(list(self.entities[e])))
                f1.write("\n")
        with codecs.open(out_file_title + "TransE_relation_" + str(self.dimension)
                         + "dim_batch" + str(batch_size), "w") as f2:
            for r in self.relations.keys():
                f2.write(r + "\t")
                f2.write(str(list(self.relations[r])))
                f2.write("\n")

    def update_triple_embedding(self, Tbatch):
        copy_entity = copy.deepcopy(self.entities)
        copy_relation = copy.deepcopy(self.relations)

        for correct_sample, corrupted_sample in Tbatch:
            correct_copy_head = copy_entity[correct_sample[0]]
            correct_copy_tail = copy_entity[correct_sample[2]]
            relation_copy = copy_relation[correct_sample[1]]

            corrupted_copy_head = copy_entity[corrupted_sample[0]]
            corrupted_copy_tail = copy_entity[corrupted_sample[2]]

            correct_head = self.entities[correct_sample[0]]
            correct_tail = self.entities[correct_sample[2]]
            relation = self.relations[correct_sample[1]]

            corrupted_head = self.entities[corrupted_sample[0]]
            corrupted_tail = self.entities[corrupted_sample[2]]

            # calculate the distance of the correct and corrupted triples
            if self.norm == 1:
                correct_distance = norm_l1(correct_head, relation, correct_tail)
                corrupted_distance = norm_l1(corrupted_head, relation, corrupted_tail)
            else:
                correct_distance = norm_l2(correct_head, relation, correct_tail)
                corrupted_distance = norm_l2(corrupted_head, relation, corrupted_tail)

            loss = self.margin + correct_distance - corrupted_distance
            if loss > 0:
                self.loss += loss
                correct_gradient = 2 * (correct_head + relation - correct_tail)
                corrupted_gradient = 2 * (corrupted_head + relation - corrupted_tail)
                if self.norm == 1:
                    # for the L1 norm the gradient is the sign of each component
                    for i in range(len(correct_gradient)):
                        if correct_gradient[i] > 0:
                            correct_gradient[i] = 1
                        else:
                            correct_gradient[i] = -1
                        if corrupted_gradient[i] > 0:
                            corrupted_gradient[i] = 1
                        else:
                            corrupted_gradient[i] = -1

                correct_copy_head -= self.learning_rate * correct_gradient
                relation_copy -= self.learning_rate * correct_gradient
                correct_copy_tail -= -1 * self.learning_rate * correct_gradient
                relation_copy -= -1 * self.learning_rate * corrupted_gradient

                if correct_sample[0] == corrupted_sample[0]:
                    # the corrupted triple replaced the tail entity, so the
                    # head entity's embedding needs to be updated twice
                    correct_copy_head -= -1 * self.learning_rate * corrupted_gradient
                    corrupted_copy_tail -= self.learning_rate * corrupted_gradient
                elif correct_sample[2] == corrupted_sample[2]:
                    # the corrupted triple replaced the head entity, so the
                    # tail entity's embedding needs to be updated twice
                    corrupted_copy_head -= -1 * self.learning_rate * corrupted_gradient
                    correct_copy_tail -= self.learning_rate * corrupted_gradient

                # normalise only the embeddings touched in this step,
                # instead of normalising all the embeddings together
                copy_entity[correct_sample[0]] = self.normalization(correct_copy_head)
                copy_entity[correct_sample[2]] = self.normalization(correct_copy_tail)
                if correct_sample[0] == corrupted_sample[0]:
                    # the corrupted triple replaced the tail entity,
                    # so update the corrupted tail's embedding
                    copy_entity[corrupted_sample[2]] = self.normalization(corrupted_copy_tail)
                elif correct_sample[2] == corrupted_sample[2]:
                    # the corrupted triple replaced the head entity,
                    # so update the corrupted head's embedding
                    copy_entity[corrupted_sample[0]] = self.normalization(corrupted_copy_head)
                # the paper notes that relation embeddings need not be normalised
                copy_relation[correct_sample[1]] = relation_copy
                # copy_relation[correct_sample[1]] = self.normalization(relation_copy)

        self.entities = copy_entity
        self.relations = copy_relation


if __name__ == '__main__':
    file1 = "./train.txt"
    file2 = "./entity2id.txt"
    file3 = "./relation2id.txt"
    entity_set, relation_set, triple_list = dataloader(file1, file2, file3)

    transE = TransE(entity_set, relation_set, triple_list,
                    embedding_dim=50, lr=0.01, margin=1.0, norm=2)
    transE.data_initialise()
    transE.training_run(out_file_title="test")
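`training_run` saves each embedding as an id, a tab, and `str(list(vector))`. As a small companion sketch (the `load_embeddings` helper and the commented-out filename are illustrations, not part of the original script), the saved vectors can be read back with `ast.literal_eval`:

```python
import ast

def load_embeddings(path):
    """Read an embedding file with one 'id<TAB>[v1, v2, ...]' entry per line."""
    vectors = {}
    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue  # skip malformed lines
            # the vector was serialised with str(list(...)), which
            # ast.literal_eval safely parses back into a Python list
            vectors[parts[0]] = ast.literal_eval(parts[1])
    return vectors

# Hypothetical usage, matching the filename pattern produced by training_run:
# entity_vectors = load_embeddings("testTransE_entity_50dim_batch...")
```

`ast.literal_eval` is preferred over `eval` here because it only accepts Python literals, so a corrupted embedding file cannot execute arbitrary code.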