Amazon评论数据的预处理代码（Positive Negative）

Amazon评论数据的预处理代码，用于情感分析，代码改自

https://github.com/PaddlePaddle/Paddle/tree/develop/demo/quick_start/data

Amazon商品评论数据网址：

http://jmcauley.ucsd.edu/data/amazon/

Bash脚本文件

get_data.sh：

#!/bin/bash
# Build balanced sentiment-analysis train/test sets from Amazon review data.
#
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), others is training set.
# 3. distinct train set and test set.

set -e

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
# wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz
echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
# wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
# unzip master.zip
# rm master.zip

##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C

# GNU shuf is installed as `gshuf` on non-Linux systems.
UNAME_STR=$(uname)
if [ "${UNAME_STR}" = 'Linux' ]; then
    SHUF_PROG='shuf'
else
    SHUF_PROG='gshuf'
fi

mkdir -p tmp
# python preprocess.py -i reviews_Electronics_5.json.gz
python preprocess.py -i reviews_Digital_Music_5.json.gz

# uniq and shuffle
cd tmp
echo 'Uniq and shuffle...'
cat pos_* | sort | uniq | ${SHUF_PROG} > pos.shuffed
cat neg_* | sort | uniq | ${SHUF_PROG} > neg.shuffed

# Count lines with wc (prints 0 for an empty file); $(( )) strips any padding.
pos_len=$(($(wc -l < pos.shuffed)))
neg_len=$(($(wc -l < neg.shuffed)))
echo "${neg_len}"

# Use the smaller class as the per-class size. Otherwise head/tail on the
# shorter file would overlap and train/test would share samples.
if [ "${pos_len}" -lt "${neg_len}" ]; then
    min_len=${pos_len}
else
    min_len=${neg_len}
fi

# Per-class test size: 10% of the class, capped at 12500 (25k in total).
test_num=$((min_len / 10))
if [ "${test_num}" -gt 12500 ]; then
    test_num=12500
fi
train_num=$((min_len - test_num))

head -n"${train_num}" pos.shuffed > train.pos
head -n"${train_num}" neg.shuffed > train.neg
tail -n"${test_num}" pos.shuffed > test.pos
tail -n"${test_num}" neg.shuffed > test.neg

cat train.pos train.neg | ${SHUF_PROG} > ../train.txt
cat test.pos test.neg | ${SHUF_PROG} > ../test.txt

cd -
echo 'train.txt' > train.list
echo 'test.txt' > test.list

# use 30k dict (the "unk" header line plus the 30000 most frequent words)
# rm -rf tmp
mv dict.txt dict_all.txt
head -n 30001 dict_all.txt > dict.txt
echo 'Done.'

数据处理文件：preprocess.py：

# -*- coding: UTF-8 -*-
"""
Preprocess Amazon review data for sentiment analysis.

1. Tokenize the words and punctuation
2. Save the tokenized reviews in per-batch files and build a word-count dict

Usage:
    python preprocess.py -i data_file [-s seed]
"""
import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing

batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
max_queue_size = 8
# NOTE(review): the workers reach these queues as module globals rather than as
# task arguments, so the pipeline presumably relies on the "fork" start method
# -- confirm before running on a platform that defaults to spawn/forkserver.
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)


def _to_bytes(text):
    """Return *text* encoded as UTF-8; values that are already bytes pass through."""
    return text if isinstance(text, bytes) else text.encode('utf-8')


def create_dict(data):
    """Update the module-level ``word_count`` with the words found in *data*.

    data: list, input data by batch. Each sequence is lower-cased and split
        on whitespace. A sequence that cannot be processed is reported on
        stderr and skipped.
    """
    for seq in data:
        try:
            words = seq.lower().split()
        except (AttributeError, TypeError, UnicodeError):
            # Best effort: report the offending item and keep going.
            sys.stderr.write("%s\tERROR\n" % (seq,))
            continue
        for w in words:
            word_count[w] = word_count.get(w, 0) + 1


def parse(path):
    """Open a .gz file and yield one decoded JSON object per line.

    The file is closed when the generator is exhausted, closed, or fails.
    """
    sys.stderr.write(path)
    g = gzip.open(path, 'r')
    try:
        for line in g:
            yield json.loads(line.decode('utf-8'))
    finally:
        g.close()


def tokenize(sentences):
    """Use tokenizer.perl to tokenize input sentences.

    tokenizer.perl is tool of Moses.
    sentences : a list of input sentences.
    return: a list of processed text.

    Exits the process when the tokenizer script is not found.
    """
    tokenizer_path = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    if not os.path.exists(tokenizer_path):
        sys.exit(
            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
        )
    tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    # The pipes are binary: send UTF-8 bytes and decode what comes back.
    text = _to_bytes("\n".join(sentences))
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    if not isinstance(tok_text, str):
        tok_text = tok_text.decode('utf-8')
    # The output ends with a newline, so the last split element is dropped.
    return tok_text.split('\n')[:-1]


def save_data(instance, data_dir, pre_fix, batch_num):
    """Save one batch as ``<data_dir>/<pre_fix>_<batch_num>.txt``.

    Every line is ``label<TAB>text``; the label is '1' when *pre_fix* is
    'pos' and '0' otherwise.
    """
    label = '1' if pre_fix == 'pos' else '0'
    lines = ['%s\t%s' % (label, text) for text in instance]
    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
    with open(file_name, 'wb') as out:
        out.write(_to_bytes('\n'.join(lines) + '\n'))


def tokenize_batch(id):
    """Tokenize batches from ``parse_queue`` and push them to ``tokenize_queue``.

    Stops at the ``(-1, None, None)`` end marker and always forwards one end
    marker downstream, even on failure, so the saving worker is never left
    waiting for this worker.
    """
    try:
        while True:
            num_batch, instance, pre_fix = parse_queue.get()
            if num_batch == -1:  # parse_queue finished
                break
            tokenize_instance = tokenize(instance)
            tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
            sys.stderr.write('.')
    finally:
        tokenize_queue.put((-1, None, None))
    sys.stderr.write("Thread %s finish\n" % (id))


def save_batch(data_dir, num_tokenize, data_dir_dict):
    """Save tokenized batches and then build the dictionary file.

    Consumes ``tokenize_queue`` until *num_tokenize* end markers have been
    seen, then writes *data_dir_dict*: the first line is ``unk\\t-1``,
    followed by ``word\\tcount`` lines sorted by descending count.
    """
    token_count = 0
    while True:
        num_batch, instance, pre_fix = tokenize_queue.get()
        if num_batch == -1:
            token_count += 1
            if token_count == num_tokenize:  # tokenize finished.
                break
            continue
        save_data(instance, data_dir, pre_fix, num_batch)
        create_dict(instance)  # update dict
    sys.stderr.write("save file finish\n")
    with open(data_dir_dict, 'wb') as f:
        f.write(_to_bytes('%s\t%s\n' % ('unk', '-1')))
        for k, v in sorted(word_count.items(), key=operator.itemgetter(1),
                           reverse=True):
            f.write(_to_bytes('%s\t%s\n' % (k, v)))
    sys.stderr.write("build dict finish\n")


def parse_batch(data, num_tokenize):
    """Parse the raw reviews and feed them to the tokenizers by batch.

    parse -> tokenize -> save

    Reviews rated 5.0 are positive and reviews rated below 3.0 are negative;
    everything else, and reviews without text, is skipped. *num_tokenize*
    end markers are always queued at the end, even on failure, so the
    tokenizer workers can terminate.
    """
    pending = {'pos': [], 'neg': []}
    count = 0
    sys.stderr.write("extract raw data\n")
    try:
        for review in parse(data):
            rating = review.get("overall")
            # convert words to lower case
            text = (review.get("reviewText") or "").lower()
            if rating is None or not text:
                continue
            if rating == 5.0:
                pre_fix = 'pos'
            elif rating < 3.0:
                pre_fix = 'neg'
            else:
                continue
            pending[pre_fix].append(text)
            if len(pending[pre_fix]) == batch_size:
                parse_queue.put((count, pending[pre_fix], pre_fix))
                count += 1
                pending[pre_fix] = []
        # Flush the partially filled batches.
        for pre_fix in ('pos', 'neg'):
            if pending[pre_fix]:
                parse_queue.put((count, pending[pre_fix], pre_fix))
                count += 1
    finally:
        for _ in range(num_tokenize):
            parse_queue.put((-1, None, None))  # for tokenize's input finished
    sys.stderr.write("parsing finish\n")


def option_parser():
    """Parse the command line and return ``(options, args)``."""
    parser = OptionParser(usage="usage: python preprocess.py "
                          "-i data_path [options]")
    parser.add_option(
        "-i", "--data", action="store", dest="input", help="Input data path.")
    parser.add_option(
        "-s",
        "--seed",
        action="store",
        dest="seed",
        default=1024,
        help="Set random seed.")
    return parser.parse_args()


def main():
    """Run the parse -> tokenize -> save pipeline on the input file.

    Outputs are written next to the input file: batch files under ``tmp/``,
    plus ``dict.txt`` and ``labels.list``.
    """
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf-8')
    options, _ = option_parser()
    data = options.input
    if not data:
        sys.exit("Input data path is required: python preprocess.py -i data_path")
    base_dir = os.path.dirname(data)
    data_dir_dict = os.path.join(base_dir, 'dict.txt')
    data_dir = os.path.join(base_dir, 'tmp')
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    pool = Pool(processes=num_tokenize + 2)
    results = [pool.apply_async(parse_batch, args=(data, num_tokenize))]
    for i in range(num_tokenize):
        results.append(pool.apply_async(tokenize_batch, args=(str(i), )))
    results.append(
        pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict)))
    pool.close()
    pool.join()

    with open(os.path.join(base_dir, 'labels.list'), 'wb') as f:
        f.write(b'neg\t0\npos\t1\n')

    # Surface any exception raised inside a worker instead of dropping it.
    for result in results:
        result.get()


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/huadongw/p/6165119.html

Amazon评论数据的预处理代码（Positive Negative）相关推荐

论文笔记二 Positive, Negative and Neutral: Modeling Implicit Feedback in Session-based News Recommendation
目录一论文简介论文名称 :Positive, Negative and Neutral: Modeling Implicit Feedback in 中文名称:积极.消极和中立:在基于会话的新 ...
机器学习中，对于数据的预处理是否是测试集和训练集一起进行？
转载自:https://www.zhihu.com/question/312639136 机器学习中,对于数据的预处理是否是测试集和训练集一起进行? 最近在尝试训练和应用模型,遇上一个问题,就是针对数 ...
Python数据处理工具 ——Pandas（数据的预处理）
0.前言本文将介绍强大的数据处理模块Pandas,该模块可以帮助数据分析师轻松地解决数据的预处理问题,如数据类型的转换.缺失值的处理.描述性统计分析.数据的汇总等. 通过本章内容的学习,读者将会掌握 ...
第四章数据的预处理与特征构建(续)
申请评分卡模型数据的预处理与特征构建(续) 课程简介:逻辑回归模型的特征需要是数值型,因此类别型变量不能直接放入模型中去,需要对其进行编码.此外,为了获取评分模型的稳定性,建模时需要对数值型特征做分 ...
智课雅思短语---二、exert positive/ negative effects on…
智课雅思短语---二.exert positive/ negative effects on- 一.总结一句话总结:对-产生有利/不利的影响 1.the advantages far outweig ...
matlab 数据白化,“matlab对Excel表格数据预处理“急求FastICA 的源程序 matlab，包括数据的预处理（中心化和白化），注释详细点，谢谢！...
急求FastICA 的源程序 matlab,包括数据的预处理(中心化和白化),注释详细点,谢谢! % function [Ahat2, shat, n_iteration Test] = nc_fas ...
通过接口封装Shopee商品列表、shopee详情、shopee评论数据接口代码展示教程
业务背景:作为全球最大的 B2C 电子商务平台之一,Shopee 平台提供了丰富的商品资源,吸引了大量的全球买家和卖家.为了方便开发者接入 Shopee 平台,Shopee 平台提供了丰富的 API ...
数据的预处理与特征构建（申请评分卡模型）
数据的预处理工作可以有效处理缺失值与异常值,从而增强模型的稳定性: 而特征构建工作则可以将信息从字段中加以提炼,形成有业务含义的优异特征评分卡使用策略: 1.进件量较大,规则无法满足更细的切分需要: ...
h5评论直接显示代码_全套H5教程免费学，让你0基础自学制作H5页面
当前,H5页面已成为各大品牌及新闻媒体普遍采取的表现形式,它可以通过优质的内容.新颖的创意向用户推广产品.传播信息,并利用互联网的快速性,短时间内达到很高的浏览量和识别度.相较于H5而言,海报或视频设 ...

Amazon评论数据的预处理代码（Positive Negative）

Amazon评论数据的预处理代码（Positive Negative）相关推荐

最新文章

热门文章