【大数据笔记】- Hadoop MapReduce API

一.基础环境：

本文默认你已具备一定的 Java 基础：本机已安装 Java、Maven 和 IDE，并配置好相关环境变量；已有可用的 Hadoop 环境；已用 IDEA 新建了一个 Java Maven 项目。此外还需要一台可以执行 hadoop 命令的 Linux 客户机。

以上环境如有尚未完成的，请先自行百度解决。

二.pom.xml引入包：

      <!-- Hadoop 2.7.3 artifacts used by the examples below: hadoop-common, hadoop-hdfs and hadoop-client. Place these inside the <dependencies> element of pom.xml. -->
      <dependency><groupId>org.apache.hadoop</groupId><artifactId>hadoop-common</artifactId><version>2.7.3</version></dependency><dependency><groupId>org.apache.hadoop</groupId><artifactId>hadoop-hdfs</artifactId><version>2.7.3</version></dependency><dependency><groupId>org.apache.hadoop</groupId><artifactId>hadoop-client</artifactId><version>2.7.3</version></dependency>

三.准备统计文件并上传

1.新建一个文件word_test.txt

I have searched a thousand years，And I have cried a thousand tears。
I found everything I need，You are everything to me。

2.上传到hadoop

先 rz 上传到linux客户机，再执行下边命令上传到hdfs

hadoop fs -mkdir /tmp/mr_test/
hadoop fs -put ./word_test.txt /tmp/mr_test/

四.上代码(官方WordCount V1)

package com.yixin;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import java.io.IOException;
import java.util.StringTokenizer;public class WordCount {public static class TokenizerMapperextends Mapper<Object, Text, Text, IntWritable> {private final static IntWritable one = new IntWritable(1);private final Text word = new Text();public void map(Object key, Text value, Context context) throws IOException, InterruptedException {StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {word.set(itr.nextToken());context.write(word, one);}}}public static class IntSumReducerextends Reducer<Text,IntWritable,Text,IntWritable> {private final IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) {sum += val.get();}result.set(sum);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf, "word count");job.setJarByClass(WordCount.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);FileInputFormat.addInputPath(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

五.打包上传执行

1.用ide或maven将代码打包成jar。

2.rz上传到linux客户机

3.执行代码

hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount /tmp/mr_test/word_test.txt /tmp/mr_test/output

4.查看结果

hadoop fs -cat /tmp/mr_test/output/*

六.官方WordCount V2

与v1的主要差别是：在Map类中增加了 setup(Context context) 方法，并通过2个配置参数（wordcount.case.sensitive 和 wordcount.skip.patterns）分别控制是否区分大小写、是否跳过指定字符。具体说明我在代码中有备注。

实际上，map和reduce都有 setup(Context context) 和 cleanup(Context context) 两个方法：前者做初始化操作，后者做清理工作。它们在每个任务中各执行一次，分别在第一次调用map或reduce方法之前、最后一次调用之后执行，大家可以根据业务合理使用。

1.上v2代码

package com.yixin;import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;public class WordCount2 {/*** 这里可以设置map和reduce用到的变量和方法。* 注：正常每个map和reduce是各用各的，修改互相不可见。除非你用的全局计数器或分布式缓存。*/public static class TokenizerMapperextends Mapper<Object, Text, Text, IntWritable> {/*** 这里可以设置map用到的变量和方法。* 注：正常每个map是各用各的，修改互相不可见。除非你用的全局计数器或分布式缓存。*/// 这就是个全局计数器，各map是可共享的，修改可见的。static enum CountersEnum {INPUT_WORDS}private final static IntWritable one = new IntWritable(1);private Text word = new Text();private boolean caseSensitive;private Set<String> patternsToSkip = new HashSet<String>();private Configuration conf;private BufferedReader fis;/*** setup方法可以执行一些map执行前的一些初始化工作，* 如对变量做初始化加工，设置数据库连接，输入路径过滤等。* 这个例子是对是否大小写做了个处理，对跳过字符的规则做了处理。*/@Overridepublic void setup(Context context) throws IOException,InterruptedException {conf = context.getConfiguration();caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);if (conf.getBoolean("wordcount.skip.patterns", true)) {URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();if(patternsURIs!=null){for (URI patternsURI : patternsURIs) {Path patternsPath = new Path(patternsURI.getPath());String patternsFileName = patternsPath.getName().toString();parseSkipFile(patternsFileName);}}}}private void parseSkipFile(String fileName) {try {fis = new BufferedReader(new FileReader(fileName));String pattern = null;while ((pattern = fis.readLine()) != null) {patternsToSkip.add(pattern);}} catch (IOException ioe) {System.err.println("Caught exception while parsing the cached file '"+ StringUtils.stringifyException(ioe));}}@Overridepublic void map(Object key, Text value, Context context) throws IOException, InterruptedException {String line = (caseSensitive) ?value.toString() : value.toString().toLowerCase();for (String pattern : patternsToSkip) {line = line.replaceAll(pattern, "");}StringTokenizer itr = new StringTokenizer(line);while (itr.hasMoreTokens()) {word.set(itr.nextToken());context.write(word, one);Counter counter = 
context.getCounter(CountersEnum.class.getName(),CountersEnum.INPUT_WORDS.toString());counter.increment(1);}}}public static class IntSumReducerextends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) {sum += val.get();}result.set(sum);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);String[] remainingArgs = optionParser.getRemainingArgs();if (!(remainingArgs.length != 2 || remainingArgs.length != 4)) {System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");System.exit(2);}Job job = Job.getInstance(conf, "word count");job.setJarByClass(WordCount2.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);List<String> otherArgs = new ArrayList<String>();for (int i = 0; i < remainingArgs.length; ++i) {if ("-skip".equals(remainingArgs[i])) {job.addCacheFile(new Path(remainingArgs[++i]).toUri());job.getConfiguration().setBoolean("wordcount.skip.patterns", true);} else {otherArgs.add(remainingArgs[i]);}}FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

2.运行方式a，和v1参数一样

打包上传和v1一样不说了，运行是如下命令：

hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output

结果：

hadoop fs -cat /tmp/mr_test/output/*

3.新建文件patterns.txt（跳过字符的规则）,并上传到集群。

新建，编辑，保存

vim patterns.txt

\.
\,
\!
to
\，
\。

上传：

hadoop fs -put patterns.txt /tmp/mr_test/

4.运行方式b，多加2个参数（-skip 和规则文件的hdfs路径），跳过不需要统计的字符

hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output -skip /tmp/mr_test/patterns.txt

结果：

hadoop fs -cat /tmp/mr_test/output/*

七.MR简化配置工具ToolRunner：

可以使用这个工具来做预配置，这样能减少使用方的代码量，例如精简main方法，只需要2行代码。这里我就不写例子了，推荐大家看这篇文章：

使用ToolRunner运行Hadoop程序基本原理分析_jediael_lu的专栏-CSDN博客