
  NLP常用的损失函数主要包括多类分类(SoftMax + CrossEntropy)、对比学习(Contrastive Learning)、三元组损失(Triplet Loss)和文本相似度(Sentence Similarity)。其中分类和文本相似度是非常常用的两个损失函数,对比学习和三元组损失则是近两年比较新颖的自监督损失函数。




import torch
from torch import Tensor, nn


class SoftmaxLayerWithLoss(nn.Module):
    """Linear classification head with an optional cross-entropy loss.

    The layer takes one sentence representation, or a pair of them, merges a
    pair into a single vector, and feeds it to a linear classifier.

    @:param hidden_dim: The hidden dimension of each input representation
    @:param num_labels: The number of labels
    @:param is_sentence_pair: (bool) Whether to feed sentence pair. When False
        only ``rep_a`` is classified and ``combine_type`` has no effect.
    @:param combine_type: The type of combination of sentence pair:
        - cat: rep = torch.cat([rep_a, rep_b], -1)
        - diff: rep = rep_a - rep_b
        - mul: rep = rep_a * rep_b
        - avg: rep = (rep_a + rep_b) / 2.0
        - sum: rep = rep_a + rep_b
    """

    def __init__(
        self,
        hidden_dim: int,
        num_labels: int,
        is_sentence_pair=False,
        combine_type='cat',  # cat / diff / mul / avg / sum
    ):
        super(SoftmaxLayerWithLoss, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        self.is_sentence_pair = is_sentence_pair
        self.combine_type = combine_type
        assert self.combine_type in ['cat', 'diff', 'mul', 'avg', 'sum'], \
            "combine_type must be one of: cat, diff, mul, avg, sum"
        # Only concatenating a *pair* doubles the classifier input width;
        # after this line self.hidden_dim is the classifier input size.
        if self.is_sentence_pair and self.combine_type == 'cat':
            self.hidden_dim = self.hidden_dim * 2
        self.classifier = nn.Linear(self.hidden_dim, num_labels)

    def _combine(self, rep_a, rep_b):
        """Merge two [batch_size, hidden_dim] tensors according to combine_type."""
        if self.combine_type == 'cat':
            return torch.cat([rep_a, rep_b], -1)
        if self.combine_type == 'diff':
            return rep_a - rep_b
        if self.combine_type == 'mul':
            return rep_a * rep_b
        if self.combine_type == 'avg':
            return (rep_a + rep_b) / 2
        return rep_a + rep_b  # 'sum'

    def forward(self, rep_a, rep_b=None, label: Tensor = None):
        """Classify the (combined) representation.

        @:param rep_a: [batch_size, hidden_dim] first sentence representation
        @:param rep_b: [batch_size, hidden_dim] second sentence representation;
            required in sentence-pair mode, must be omitted otherwise
        @:param label: optional tensor of class indices, flattened to [batch_size]
        @:return: the cross-entropy loss if ``label`` is given, otherwise the
            tuple ``(rep, output)`` of combined representation and logits
        @:raise ValueError: if ``rep_b`` does not match ``is_sentence_pair``
        """
        if self.is_sentence_pair:
            if rep_b is None:
                raise ValueError("rep_b is required when is_sentence_pair=True")
            rep = self._combine(rep_a, rep_b)
        else:
            if rep_b is not None:
                # Fail loudly instead of silently ignoring the second input.
                raise ValueError("rep_b was given but is_sentence_pair=False")
            rep = rep_a

        output = self.classifier(rep)
        if label is None:
            return rep, output
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(output, label.view(-1))


if __name__ == "__main__":
    # transformers is needed only by this demo, so it is imported here; the
    # loss layer above can then be imported with torch alone.
    from transformers import BertConfig, BertModel, BertTokenizer

    # configure for huggingface pre-trained language models
    # (not used below; shown for completeness)
    config = BertConfig.from_pretrained('bert-base-cased')
    # tokenizer for huggingface pre-trained language models
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    # pytorch_model.bin for huggingface pre-trained language models
    model = BertModel.from_pretrained('bert-base-cased')

    # obtain two batch of examples, each corresponding example is a pair
    examples1 = ['This is the book.', 'Disney film is well seeing for us.']
    examples2 = ['I love to read it.', 'I don\'t want to have a try due to the hardness.']
    label = torch.tensor([1, 0], dtype=torch.long)

    # convert each example to a feature batch padded / truncated to a fixed length
    # {'input_ids': xxx, 'attention_mask': xxx, 'token_type_ids': xxx}
    max_seq_len = 16
    features1 = tokenizer(
        examples1, add_special_tokens=True, padding='max_length',
        truncation=True, max_length=max_seq_len, return_tensors='pt',
    )
    features2 = tokenizer(
        examples2, add_special_tokens=True, padding='max_length',
        truncation=True, max_length=max_seq_len, return_tensors='pt',
    )

    # obtain sentence embedding by averaged pooling over the sequence axis
    # NOTE: the mean includes padded positions; the attention mask is not applied.
    rep_a = model(**features1)[0]  # [batch_size, max_seq_len, hidden_dim]
    rep_b = model(**features2)[0]  # [batch_size, max_seq_len, hidden_dim]
    rep_a = torch.mean(rep_a, dim=1)  # [batch_size, hidden_dim]
    rep_b = torch.mean(rep_b, dim=1)  # [batch_size, hidden_dim]

    # obtain softmax classification loss
    loss_fn = SoftmaxLayerWithLoss(hidden_dim=rep_a.shape[-1], num_labels=2, is_sentence_pair=True, combine_type='cat')
    loss = loss_fn(rep_a=rep_a, rep_b=rep_b, label=label)
    # scalar tensor; the value differs between runs because the classifier
    # weights are randomly initialised
    print(loss)



  对比学习(Contrastive Learning)指的是给定一个anchor以及若干候选项(candidate)。anchor表示一个确定的特征向量,或由神经网络(例如BERT)表征的向量;candidate则是一组候选项,其中包含positive(与anchor同类)和若干negative(与anchor不同类)。对比学习的目标是让anchor与同类候选项的相似度尽可能大,与不同类候选项的相似度尽可能小。详见如下代码及示例:

from enum import Enum

import torch
import torch.nn.functional as F
from torch import Tensor, nn


class SiameseDistanceMetric(Enum):
    """The metric for the contrastive loss"""
    # Plain functions defined in an Enum body are not turned into enum
    # members, so each attribute below is directly callable as f(x, y).
    EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2)
    MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1)
    COSINE_DISTANCE = lambda x, y: 1-F.cosine_similarity(x, y)


class ContrastiveLoss(nn.Module):
    """Contrastive loss over (anchor, candidate) pairs.

    Expects as input two texts and a label of either 0 or 1. If the
    label == 1, then the distance between the two embeddings is reduced. If
    the label == 0, then the distance between the embeddings is increased
    until it reaches the margin.

    @:param distance_metric: The distance metric function, called as f(x, y)
    @:param margin: (float) The margin distance
    @:param size_average: (bool) Whether to get averaged loss (mean) instead of the sum

    Input example of forward function:
        rep_anchor: [[0.2, -0.1, ..., 0.6], [0.2, -0.1, ..., 0.6], ..., [0.2, -0.1, ..., 0.6]]
        rep_candidate: [[0.3, 0.1, ..., -0.3], [-0.8, 1.2, ..., 0.7], ..., [-0.9, 0.1, ..., 0.4]]
        label: [0, 1, ..., 1]

    Return example of forward function:
        0.015 (averaged)
        2.672 (sum)
    """

    def __init__(self, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5, size_average: bool = False):
        super(ContrastiveLoss, self).__init__()
        self.distance_metric = distance_metric
        self.margin = margin
        self.size_average = size_average

    def forward(self, rep_anchor, rep_candidate, label: Tensor):
        """Compute the contrastive loss for a batch of pairs.

        @:param rep_anchor: [batch_size, hidden_dim] representations of anchors
        @:param rep_candidate: [batch_size, hidden_dim] representations of positive / negative
        @:param label: [batch_size] label (0 or 1) of each anchor - candidate pair
        @:return: scalar tensor, the mean of the per-pair losses if
            ``size_average`` is set, otherwise their sum
        """
        distances = self.distance_metric(rep_anchor, rep_candidate)
        # label == 1 penalises the squared distance; label == 0 penalises only
        # pairs that are closer than the margin.
        positive_term = label.float() * distances.pow(2)
        negative_term = (1 - label).float() * F.relu(self.margin - distances).pow(2)
        losses = 0.5 * (positive_term + negative_term)
        return losses.mean() if self.size_average else losses.sum()


if __name__ == "__main__":
    # transformers is needed only by this demo, so it is imported here; the
    # loss classes above can then be imported with torch alone.
    from transformers import BertConfig, BertModel, BertTokenizer

    # configure for huggingface pre-trained language models
    # (not used below; shown for completeness)
    config = BertConfig.from_pretrained('bert-base-cased')
    # tokenizer for huggingface pre-trained language models
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    # pytorch_model.bin for huggingface pre-trained language models
    model = BertModel.from_pretrained('bert-base-cased')

    # obtain two batch of examples, each corresponding example is a pair
    examples1 = ['This is the sentence anchor 1.', 'It is the second sentence in this article named Section D.']
    examples2 = ['It is the same as anchor 1.', 'I think it is different with Section D.']
    label = torch.tensor([1, 0], dtype=torch.long)

    # convert each example to a feature batch padded / truncated to a fixed length
    # {'input_ids': xxx, 'attention_mask': xxx, 'token_type_ids': xxx}
    max_seq_len = 16
    features1 = tokenizer(
        examples1, add_special_tokens=True, padding='max_length',
        truncation=True, max_length=max_seq_len, return_tensors='pt',
    )
    features2 = tokenizer(
        examples2, add_special_tokens=True, padding='max_length',
        truncation=True, max_length=max_seq_len, return_tensors='pt',
    )

    # obtain sentence embedding by averaged pooling over the sequence axis
    # NOTE: the mean includes padded positions; the attention mask is not applied.
    rep_anchor = model(**features1)[0]  # [batch_size, max_seq_len, hidden_dim]
    rep_candidate = model(**features2)[0]  # [batch_size, max_seq_len, hidden_dim]
    rep_anchor = torch.mean(rep_anchor, dim=1)  # [batch_size, hidden_dim]
    rep_candidate = torch.mean(rep_candidate, dim=1)  # [batch_size, hidden_dim]

    # obtain contrastive loss
    loss_fn = ContrastiveLoss()
    loss = loss_fn(rep_anchor=rep_anchor, rep_candidate=rep_candidate, label=label)
    print(loss)  # scalar tensor (summed over the batch, since size_average=False)


  三元组损失(Triplet Loss)与对比学习比较类似,其旨在拉近anchor与positive的距离,拉开anchor与negative的距离。不同之处在于Triplet Loss考虑到anchor与其他表征向量的最小距离margin值,损失函数则是margin loss。代码如下所示:

