自然语言处理入门（3）—

Word2Vec是2013年Google开源的一款用于词向量计算的工具。由于其可以在百万数量级的词典和上亿的数据集上进行高效的训练，且得到的词向量可以很好地度量词与词之间的相似性，因此在学术界和工业界都得到了广泛的应用。Word2Vec采用的是n元语法模型(n-gram model)的假设，即假设一个词只与周围n个词有关，而与文本中的其他词无关。其训练词向量有两种方式：CBOW和Skip-Gram。
（1）CBOW（Continuous Bag of Words，连续词袋模型）：用上下文词来预测当前词的生成概率。
（2）Skip-Gram：用当前词去预测上下文词的生成概率。

本文就以Python中的gensim为例，介绍下Word2Vec的特性。主要可以分为电商评论数据获取、中文分词、Word2Vec测试三部分。

1.电商评论数据获取

本文的测试数据集来自于华为荣耀天猫旗舰店荣耀V10手机的评论数据（天猫页面链接：https://detail.tmall.com/item.htm?spm=a1z10.1-b-s.w13636028-15291748785.6.70ea7f34skZgjc&id=562003579553&sku_properties=10004:653780895;5919063:6536025），共计3000条。

获取天猫评论数据的方法，在链接 http://blog.csdn.net/flysky1991/article/details/74586286 中有详细说明，这里不再赘述。实现代码如下所示：

# -*- coding: utf-8 -*-
"""
Crawl the Tmall reviews of the Honor V10 and store them in MySQL.

Created on Thu Feb  1 18:10:52 2018

@author: zch
"""

import requests
import json
import time
import random
import pymysql.cursors

# Original Tmall product page of the Honor V10:
# https://detail.tmall.com/item.htm?spm=a1z10.1-b-s.w13636028-15291748785.6.70ea7f34skZgjc&id=562003579553&sku_properties=10004:653780895;5919063:6536025

# Pages to crawl (inclusive).
FIRST_PAGE = 1
LAST_PAGE = 150


def _comment_id(page, index):
    """Return the row id for a review.

    The id is the page number zero-padded to at least three digits followed
    by the 1-based position of the review on that page zero-padded to at
    least two digits, e.g. page 7, review 3 -> "00703".
    """
    return "{:03d}{:02d}".format(page, index)


def crawlProductComment(url, page):
    """Download one page of reviews, print them and insert them into MySQL.

    Args:
        url: Full URL of the review-list endpoint for the wanted page.
        page: Page number; only used to build the row ids.

    Raises:
        Whatever ``requests``, ``json`` or ``pymysql`` raise on network,
        parsing or database errors; nothing is caught here.
    """
    req = requests.get(url)
    # The first 15 characters of the body are skipped before parsing.
    # NOTE(review): presumably a non-JSON prefix/wrapper - confirm against a
    # live response.
    data = json.loads(req.text[15:])

    # Report which page the server says it returned.
    print("正在获取第{}页的评论数据！".format(data['paginator']['page']))

    sql = "insert into `HonorV10_Comment` (`id`,`auctionSku`,`rateDate`,`rateContent`,`appendCommentTime`,`appendCommentContent`) values (%s,%s,%s,%s,%s,%s)"

    # One connection per page instead of one per review.
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='123456',
                                 db='tmall',
                                 charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            for index, comment in enumerate(data["rateList"], start=1):
                k = _comment_id(page, index)

                auctionSku = comment['auctionSku']
                rateDate = comment['rateDate']
                rateContent = comment['rateContent']

                # The follow-up review is optional; store empty strings when
                # it is absent.
                info = comment['appendComment']
                if info:
                    appendCommentTime = info['commentTime']
                    appendCommentContent = info['content']
                else:
                    appendCommentTime = ""
                    appendCommentContent = ""

                print("第{}个商品的sku:{}".format(k, auctionSku))
                print("评论时间：{}".format(rateDate))
                print("评论内容：{}".format(rateContent))
                if info:
                    print("追评时间:{}".format(appendCommentTime))
                    print("追评内容:{}".format(appendCommentContent))
                print("-------------------------------------------------")

                cursor.execute(sql, (k, auctionSku, rateDate, rateContent,
                                     appendCommentTime, appendCommentContent))
                # Commit per row so earlier rows survive a later failure.
                connection.commit()
    finally:
        connection.close()


# Walk the review pages by changing the currentPage query parameter.
# itemId:"562003579553",sellerId:"1114511827",shopId:"101717810"
for i in range(FIRST_PAGE, LAST_PAGE + 1):
    # NOTE(review): the "⊙" in the query string looks like an HTML-entity
    # decoding artefact - confirm the intended value of `append`.
    url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=562003579553&spuId=101717810&spuId=101717810&sellerId=1114511827&order=3&currentPage=' + str(i) +'&append=⊙&content=1'
    crawlProductComment(url, i)
    # Sleep a random 32-66 seconds between pages.
    time.sleep(random.randint(32, 66))

2.中文分词

本文采用的是jieba来对评论数据进行分词操作。实现代码如下所示：

# -*- coding: utf-8 -*-
"""
Segment the stored Honor V10 reviews with jieba and append them to a corpus file.

Created on Wed Feb  7 11:23:27 2018

@author: zch
"""

import pandas as pd
import pymysql.cursors
import re
import jieba

# Matches every character outside the range \u4e00-\u9fa5 (Chinese
# characters); compiled once instead of once per review.
NON_CHINESE = re.compile(r'[^\u4e00-\u9fa5]')

# Read the stored reviews from MySQL.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123456',
                             db='tmall',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "select * from `HonorV10_Comment` limit 3000"
        cursor.execute(sql)
        data = cursor.fetchall()
finally:
    connection.close()

# Write explicitly as UTF-8 so the file does not depend on the platform's
# default encoding, and let `with` close the file even if a row fails.
with open("tmall_review.txt", 'a', encoding='utf-8') as f1:
    # Iterate over the rows actually returned rather than assuming 3000.
    for row in data:
        # NOTE(review): column 3 is presumably `rateContent` - confirm
        # against the table's column order.
        line = row[3]
        print(line)
        # Replace every non-Chinese character with a space before cutting.
        result = " ".join(NON_CHINESE.split(line)).strip()
        cutline = jieba.cut(result, cut_all=False)
        # Terminate each review with a newline so the last token of one
        # review is not fused with the first token of the next.
        f1.write(" ".join(cutline) + "\n")

3.Word2Vec测试

首先，读取经过jieba分词的评论数据，然后分别测试词语之间的相似度、某个词的相关词表和词语间的对应关系。实现代码如下所示：

# -*- coding: utf-8 -*-
"""
Train a Word2Vec model on the segmented reviews and run three sample queries.

Created on Wed Feb  7 11:21:39 2018

@author: zch
"""

from gensim.models import word2vec
import logging
import pandas as pd

# Main program
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)

# Load the whitespace-separated corpus (read as UTF-8).
sentences = word2vec.Text8Corpus(u"D:\\data/tmall/tmall_review.txt")

# Train with gensim's defaults: CBOW (sg=0) and window=5. Pass sg=1 to train
# a skip-gram model instead.
# NOTE: gensim >= 4.0 renamed `size` to `vector_size`.
model = word2vec.Word2Vec(sentences, size=200)
print(model)

# Similarity between two words. Words that are not in the vocabulary raise
# KeyError, which is mapped to a similarity of 0.
try:
    # Current API; older gensim versions used model.similarity(...) directly.
    y1 = model.wv.similarity(u"华为", u"手机")
except KeyError:
    y1 = 0
print(u"【华为】和【手机】的相似度为：", y1)
print("------------------------\n")

# The 10 words most similar to a given word.
try:
    y2 = model.wv.most_similar(u"物流", topn=10)
except KeyError:
    y2 = []
print(u"和【物流】最相关的词有：\n")
for item in y2:
    print(item[0], item[1])
print("------------------------\n")

# Analogy: 上网 is to 流畅 as 拍照 is to ?
# The answer is the word closest to (流畅 - 上网 + 拍照), so 流畅 and 拍照
# are the positive terms and 上网 is the negative term.
print(u"上网-流畅，拍照-")
try:
    y3 = model.wv.most_similar(positive=[u'流畅', u'拍照'],
                               negative=[u'上网'],
                               topn=5)
except KeyError:
    y3 = []
for item in y3:
    print(item[0], item[1])
print("------------------------\n")