SpellCorrection


# Vocabulary
vocab = set([line.rstrip() for line in open('vocab.txt')])

# Generate the full candidate set
def generate_candidates(word):
    """word: the given (misspelled) input; returns all valid candidates."""
    # Generate words at edit distance 1
    # 1. insert  2. delete  3. replace
    # appl: replace: bppl, cppl, aapl, abpl...
    #       insert:  bappl, cappl, abppl, acppl...
    #       delete:  ppl, apl, app
    # Assume the 26 lowercase letters
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # insert
    inserts = [L + c + R for L, R in splits for c in letters]
    # delete
    deletes = [L + R[1:] for L, R in splits if R]
    # replace
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    candidates = set(inserts + deletes + replaces)
    # Filter out words that are not in the vocabulary
    return [word for word in candidates if word in vocab]

generate_candidates("apple")
['ample', 'apples', 'apply', 'apple']
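For the edit-distance-2 fallback used later, one option is to compose generate_candidates with itself. A minimal sketch (the helper name is ours): note that it only reaches distance-2 words whose intermediate distance-1 form is itself in the vocabulary, which matches the expansion done inside the correction loop below.

def generate_candidates2(word):
    # Candidates within edit distance 2, via an in-vocabulary intermediate
    cands = set(generate_candidates(word))
    for c in list(cands):
        cands.update(generate_candidates(c))
    return list(cands)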
from nltk.corpus import reuters

# Load the corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)

# Build the language model: bigram
term_count = {}
bigram_count = {}
for doc in corpus:
    doc = ['<s>'] + doc
    for i in range(0, len(doc) - 1):
        # bigram: [i, i+1]
        term = doc[i]
        bigram = ' '.join(doc[i:i+2])
        if term in term_count:
            term_count[term] += 1
        else:
            term_count[term] = 1
        if bigram in bigram_count:
            bigram_count[bigram] += 1
        else:
            bigram_count[bigram] = 1
# sklearn also ships ready-made tools for n-gram counting
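The same counts can also be built more compactly with collections.Counter. A sketch over the same corpus (the *2-suffixed names are ours, used only to avoid shadowing the dicts above):

from collections import Counter

term_count2 = Counter()
bigram_count2 = Counter()
for doc in corpus:
    doc = ['<s>'] + doc
    # Count every token except the last, matching the loop above
    term_count2.update(doc[:-1])
    bigram_count2.update(' '.join(doc[i:i+2]) for i in range(len(doc) - 1))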
# Statistics of user typos - channel probability
channel_prob = {}
for line in open('spell-errors.txt'):
    items = line.split(":")
    correct = items[0].strip()
    # EDIT START
    # The number after '*' in the data is the frequency of that misspelling
    mistakes = [item.strip().split("*") for item in items[1].strip().split(",")]
    mistakes = [[mis[0], 1] if len(mis) == 1 else [mis[0], int(mis[1])] for mis in mistakes]
    mis_count = sum([mis[1] for mis in mistakes])
    channel_prob[correct] = {}
    for mis in mistakes:
        channel_prob[correct][mis[0]] = mis[1] / mis_count
    # EDIT END

print(channel_prob)
{'raining': {'rainning': 0.5, 'raning': 0.5}, 'writings': {'writtings': 1.0}, 'disparagingly': {'disparingly': 1.0}, 'yellow': {'yello': 1.0}, 'four': {'forer': 0.08333333333333333, 'fours': 0.08333333333333333, 'fuore': 0.08333333333333333, 'fore': 0.4166666666666667, 'for': 0.3333333333333333}, 'woods': {'woodes': 1.0}, 'hanging': {'haing': 1.0}, 'aggression': {'agression': 1.0}, 'looking': {'loking': 0.09090909090909091, 'begining': 0.09090909090909091, 'luing': 0.09090909090909091, 'look': 0.18181818181818182, 'locking': 0.09090909090909091, 'lucking': 0.09090909090909091, 'louk': 0.09090909090909091, 'looing': 0.09090909090909091, 'lookin': 0.09090909090909091, 'liking': 0.09090909090909091},...
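For context, each spell-errors.txt line maps a correct word to its comma-separated misspellings, and an optional '*n' suffix gives the observed count of that misspelling (default 1). A plausible line, reconstructed from the printed probabilities for 'four' above (12 observations total, so P(fore|four) = 5/12 and P(for|four) = 4/12):

four: forer, fours, fuore, fore*5, for*4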
import numpy as np

V = len(term_count.keys())
file = open("testdata.txt", 'r')
for line in file:
    items = line.rstrip().split('\t')
    line = items[2].split()
    # line = ["I", "like", "playing"]
    for word in line:
        if word not in vocab:
            # Replace word with the correct word
            # Step 1: generate all valid candidates
            candidates = generate_candidates(word)
            # One option: if candidates == [], generate more candidates,
            # e.g. those within edit distance 2
            # TODO: generate more candidates based on suitable conditions
            # EDIT START
            # Extend to candidates within edit distance 2
            for i in range(len(candidates)):
                candidates += generate_candidates(candidates[i])
            candidates = list(set(candidates))
            # EDIT END
            if len(candidates) < 1:
                continue  # Not recommended (this is wrong)
            probs = []
            # For each candidate, compute its score
            # score = p(correct) * p(mistake|correct)
            #       = log p(correct) + log p(mistake|correct)
            # Return the candidate with the highest score
            for candi in candidates:
                prob = 0
                # a. Compute the channel probability
                if candi in channel_prob and word in channel_prob[candi]:
                    prob += np.log(channel_prob[candi][word])
                else:
                    prob += np.log(0.0001)
                # b. Compute the language-model probability
                # EDIT START
                # Note: index() finds the first occurrence of word in the line
                idx = line.index(word)
                bigram = ['<s>', candi] if idx == 0 else [line[idx - 1], candi]
                if " ".join(bigram) in bigram_count:
                    prob += np.log((bigram_count[" ".join(bigram)] + 1.0) / (term_count[bigram[0]] + V))
                # EDIT END
                # TODO: also consider the following bigram [word, post_word]
                #   prob += np.log(bigram probability)
                else:
                    prob += np.log(1.0 / V)
                probs.append(prob)
            # EDIT START
            # The original code is buggy: probs ends up holding almost a single
            # value, so taking the max is effectively meaningless.
            # Uncomment the line below to inspect probs:
            # print(set(probs))
            # EDIT END
            max_idx = probs.index(max(probs))
            print(word, candidates[max_idx])
protectionst protectionist
products. products
long-run, longrun
gain. gain
17, 1
retaiation retaliation...
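A sketch of the remaining TODO, scoring the forward bigram [candi, next_word] as well. The helper names lm_log_prob and score are ours; they reuse bigram_count, term_count, V, and channel_prob from above, with the same smoothing fallbacks as the loop:

def lm_log_prob(w1, w2):
    # Add-1-smoothed bigram log probability, with the same 1/V fallback as above
    bigram = w1 + ' ' + w2
    if bigram in bigram_count:
        return np.log((bigram_count[bigram] + 1.0) / (term_count[w1] + V))
    return np.log(1.0 / V)

def score(candi, line, idx, word):
    # Channel probability, with the same 0.0001 floor as above
    prob = np.log(channel_prob.get(candi, {}).get(word, 0.0001))
    prev_word = '<s>' if idx == 0 else line[idx - 1]
    prob += lm_log_prob(prev_word, candi)          # [prev_word, candi]
    if idx + 1 < len(line):
        prob += lm_log_prob(candi, line[idx + 1])  # [candi, next_word]
    return prob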

P.S.

If loading the nltk.corpus corpora fails, try downloading all of nltk_data.
If iterating over corpus fails, try manually unzipping punkt at the path given in the error message and creating the corresponding directory structure.
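A minimal download sketch; the two targeted downloads usually suffice for this notebook, and nltk.download('all') pulls everything:

import nltk
nltk.download('reuters')   # the corpus used for the bigram counts
nltk.download('punkt')     # tokenizer data needed when iterating the corpus
# or, if problems persist:
# nltk.download('all')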

# Note: code between the EDIT START and EDIT END markers above was edited by ziuno, for reference only.
