spell_errors

# Vocabulary: one word per line in vocab.txt (trailing whitespace stripped).
# The context manager closes the file handle as soon as the set is built.
with open('vocab.txt') as vocab_file:
    vocab = {line.rstrip() for line in vocab_file}
from nltk.corpus import reuters  # corpus reader for the Reuters news corpus

# Load the corpus: the sentences of every Reuters category.
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

# Candidate generation
def generate_candidates(word, vocabulary=None):
    """Return the known words that are at most one edit away from ``word``.

    An edit is a single-letter insertion, deletion or replacement using the
    lowercase ASCII alphabet. Replacing a letter with itself reproduces
    ``word``, so ``word`` is part of the result when it is a known word.

    Args:
        word: The (possibly misspelled) input string.
        vocabulary: Container of known words supporting ``in``. Defaults to
            the module-level ``vocab``.

    Returns:
        A sorted list of the generated strings that occur in ``vocabulary``.
        Sorting keeps the result independent of set iteration order.
    """
    if vocabulary is None:
        vocabulary = vocab
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word into a left part and a right part.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert: put one letter at the cut
    inserts = [left + c + right for left, right in splits for c in letters]
    # delete: drop the first character of the right part
    deletes = [left + right[1:] for left, right in splits if right]
    # replace: overwrite the first character of the right part
    replaces = [left + c + right[1:]
                for left, right in splits if right
                for c in letters]
    edits = set(inserts)
    edits.update(deletes)
    edits.update(replaces)
    # Filter out strings that are not in the vocabulary.
    return sorted(candidate for candidate in edits if candidate in vocabulary)


def generate_candidates_two(word, vocabulary=None):
    """Return the known words reachable from ``word`` in at most two edits.

    The second round of edits starts only from distance-1 results that are
    themselves known words, so a target whose every intermediate spelling is
    unknown is not found. Distance-1 results are included because each one
    regenerates itself in the second round.

    Args:
        word: The (possibly misspelled) input string.
        vocabulary: Container of known words supporting ``in``. Defaults to
            the module-level ``vocab``.

    Returns:
        A sorted list of known words, without duplicates.
    """
    reachable = set()
    for intermediate in generate_candidates(word, vocabulary):
        # generate_candidates already filters by vocabulary, so no second
        # filtering pass is needed here.
        reachable.update(generate_candidates(intermediate, vocabulary))
    return sorted(reachable)
# Demo call: candidates within two edits of "apple". The return value is not
# stored or printed here. The string literal that follows appears to record
# the result from an interactive run; read it as a collection of words, the
# listed order is not significant.
generate_candidates_two("apple")
'''
['amply', 'applies', 'apply', 'apple', 'aptly', 'sample', 'ample', 'apples']
'''

# Build the language model: bigram counts.
# term_count[w]       -> how often w occurs as the first token of a bigram
#                        (i.e. every position except the last in a sentence)
# bigram_count["a b"] -> how often token a is immediately followed by token b
term_count = {}
bigram_count = {}
for text in corpus:
    # Prepend the sentence-start marker so the first real word has a history.
    text = ['<s>', *text]
    for term, following in zip(text, text[1:]):
        # Bigram keys are the two tokens joined by a single space.
        bigram = term + ' ' + following
        term_count[term] = term_count.get(term, 0) + 1
        bigram_count[bigram] = bigram_count.get(bigram, 0) + 1

# Channel probability: statistics of how users mistype a word.
# Each usable line of spell-errors.txt has the form
#     correct: mistake1, mistake2, ...
# and every listed mistake receives an equal share of the probability, e.g.
#     {'raining': {'rainning': 0.5, 'raning': 0.5},
#      'writings': {'writtings': 1.0},
#      'disparagingly': {'disparingly': 1.0},
#      ...}
channel_prob = {}
with open('spell-errors.txt', 'r') as spell_errors_file:
    for line in spell_errors_file:
        # Split on the first colon only; a line without one (blank or
        # malformed) carries no "correct: mistakes" pair and is skipped.
        correct, separator, mistake_field = line.partition(':')
        if not separator:
            continue
        correct = correct.strip()
        mistakes = [item.strip() for item in mistake_field.split(',')]
        # Drop empty entries produced by trailing commas or an empty list.
        mistakes = [mistake for mistake in mistakes if mistake]
        if not mistakes:
            continue
        share = 1 / len(mistakes)
        channel_prob[correct] = {mistake: share for mistake in mistakes}

import numpy as np
# Number of distinct tokens in term_count; used as the vocabulary size for
# smoothing.
V = len(term_count)


def _log_bigram_prob(history, following):
    """Return the add-one smoothed log P(following | history).

    Bigram keys are "history following" joined by one space, matching the way
    bigram_count is populated. Unseen tokens and bigrams count as zero.
    """
    pair_count = bigram_count.get(history + ' ' + following, 0)
    return np.log((pair_count + 1.0) / (term_count.get(history, 0) + V))


def _score_candidate(candi, word, prev_token, next_token):
    """Return the log score of ``candi`` as the correction of ``word``.

    score = log p(mistake | correct) + log p(correct in context)

    Args:
        candi: Candidate correction.
        word: The token as it was typed.
        prev_token: Token preceding ``word`` ('<s>' at sentence start).
        next_token: Token following ``word``, or None at sentence end.
    """
    # a. channel model: p(mistake | correct), with a 1/V floor when this
    #    particular mistake was never recorded for the candidate.
    if candi in channel_prob and word in channel_prob[candi]:
        score = np.log(channel_prob[candi][word])
    else:
        score = np.log(1 / V)
    # b. language model: the candidate given its left neighbour, and the
    #    right neighbour given the candidate.
    score += _log_bigram_prob(prev_token, candi)
    if next_token is not None:
        score += _log_bigram_prob(candi, next_token)
    return score


with open("testdata.txt", 'r') as file:
    for line in file:
        items = line.split('\t')
        if len(items) < 3:
            continue  # row without a sentence column
        tokens = items[2].split()
        for position, word in enumerate(tokens):
            if word in vocab:
                continue
            candidates = generate_candidates_two(word)
            if len(candidates) < 1:
                # NOTE: silently skipping a word with no candidates is a
                # shortcut, not a proper treatment of the word.
                continue
            # Neighbours are taken by token position, not by character
            # offset, so repeated words and sentence edges are handled.
            prev_token = tokens[position - 1] if position > 0 else '<s>'
            if position + 1 < len(tokens):
                next_token = tokens[position + 1]
            else:
                next_token = None
            # Score every candidate and report the highest-scoring one
            # (the first such candidate on ties).
            best = max(
                candidates,
                key=lambda candi: _score_candidate(
                    candi, word, prev_token, next_token),
            )
            print(word, best)

spell_errors相关推荐

检索式问答系统baseline
1 系统介绍：搭建一个基于检索式的简单的问答系统。本项目包括：字符串操作；文本预处理技术（词过滤、标准化）；文本的表示（词袋模型、tf-idf、word2vec）；文本相似度计算；文本高效检索 ...

spell_errors

spell_errors相关推荐

最新文章

热门文章