Flume 安装使用(自己总结)

准备工作

安装JDK8

# Install JDK 8 under /usr/local and register it in ~/.bashrc.

# Download the JDK 8 tarball into the home directory; the cookie header
# accepts the Oracle license so the download is not redirected.
wget --no-cookies --no-check-certificate --header \
  "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" \
  "http://download.oracle.com/otn-pub/java/jdk/8u241-b07/1f5b5a70bf22433b84d0e960903adac8/jdk-8u241-linux-x64.tar.gz" \
  -P ~/

# Unpack into /usr/local. The tarball was saved to ~/, so address it there
# instead of relying on the current directory.
sudo tar -xf ~/jdk-8u241-linux-x64.tar.gz -C /usr/local/

# Append the JDK environment to ~/.bashrc. The quoted delimiter keeps
# ${JAVA_HOME}, ${JRE_HOME} and $PATH literal so they expand at login time.
cat >> ~/.bashrc << 'EOF'


# JDK 1.8
export JAVA_HOME=/usr/local/jdk1.8.0_241
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=$PATH:${JAVA_HOME}/bin
EOF

# Apply the settings to the current shell.
source ~/.bashrc

# Check the result.
java -version

# Remove the downloaded tarball.
rm ~/jdk-8u241-linux-x64.tar.gz

单机Hadoop

单机模式仅供本次测试用，生产中还是用集群HDFS。

# Set up a single-node Hadoop 2.7.2 (HDFS only) under /opt/modules.

# Download
wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz

# Unpack
sudo mkdir -p /opt/modules
# Give the current user ownership so later steps need no sudo.
sudo chown -R "$(id -u):$(id -g)" /opt/modules
tar -xf hadoop-2.7.2.tar.gz -C /opt/modules

# Remove the Windows *.cmd scripts; they are not needed on Linux.
rm -rf /opt/modules/hadoop-2.7.2/bin/*.cmd
rm -rf /opt/modules/hadoop-2.7.2/sbin/*.cmd
rm -rf /opt/modules/hadoop-2.7.2/etc/hadoop/*.cmd

# Configure
## 1. hadoop-env.sh: replace the ${JAVA_HOME} reference with the real path.
sed -i 's#=${JAVA_HOME}#=/usr/local/jdk1.8.0_241#' \
  /opt/modules/hadoop-2.7.2/etc/hadoop/hadoop-env.sh

## 2. core-site.xml
cat > /opt/modules/hadoop-2.7.2/etc/hadoop/core-site.xml << 'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- 指定HDFS 中NameNode 的地址 -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop112:9000</value>
  </property>
  <!-- 指定Hadoop 运行时产生文件的存储目录 -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/modules/hadoop-2.7.2/data/tmp</value>
  </property>
</configuration>
EOF

## 3. hdfs-site.xml
cat > /opt/modules/hadoop-2.7.2/etc/hadoop/hdfs-site.xml << 'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- 指定文件在HDFS 中的副本数 -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
EOF

# Start
## 1. Format the NameNode; do this only once, at initialisation.
### hadoop.tmp.dir above points into data/, so wipe data/ and logs/ first.
cd /opt/modules/hadoop-2.7.2
rm -rf data/ logs/ && bin/hdfs namenode -format
### "Exiting with status 0" in the output means the format succeeded.

## 2. Start HDFS
### Starts NameNode, DataNode and SecondaryNameNode together.
cd /opt/modules/hadoop-2.7.2 && sbin/start-dfs.sh

# Verify
## Open http://hadoop112:50070 in a browser.

# Stop
cd /opt/modules/hadoop-2.7.2 && sbin/stop-dfs.sh

创建工作目录

# Create the working directory and hand it to the current user
# (the original note names the operating user as "abc").
sudo mkdir -p /opt/modules
sudo chown -R "$(id -u):$(id -g)" /opt/modules

安装配置

# Install Flume 1.7.0 under /opt/modules/flume-1.7.0.

# 1. Download
wget http://archive.apache.org/dist/flume/1.7.0/apache-flume-1.7.0-bin.tar.gz

# 2. Unpack and shorten the directory name
tar -xf apache-flume-1.7.0-bin.tar.gz -C /opt/modules
cd /opt/modules && mv apache-flume-1.7.0-bin flume-1.7.0

# 3. Configure: flume-env.sh only needs JAVA_HOME here.
echo 'export JAVA_HOME=/usr/local/jdk1.8.0_241' > /opt/modules/flume-1.7.0/conf/flume-env.sh

使用案例

开始之前先说明一下，Flume有三大组件，分别是：Source、Channel、Sink。它们三者各自都有很多类型可以选择，例如Channel就有memory和file两种类型可供使用，甚至类型还可以自定义。但是，在同一个Flume进程即同一个Agent中，Channel两端连接的分别是Source和Sink，所以不存在Channel连Channel的情况。不同Agent进程可以通过Avro 接口让Source和Sink串联在一起。

Web Server
logs

Source

0. 创建案例目录

# Directory that holds the agent configuration of every example below.
# -p makes the step safe to repeat if the directory already exists.
mkdir -p /opt/modules/flume-1.7.0/jobs

1. 实时监听端口数据

在hadoop112启动监听44444端口，hadoop113作为客户端来访问，如有数据变化则打印到终端上。
本实验在同一台机器上做也可以，例如hadoop112开两个终端，一个启动Flume，另一个开netcat。

1.1 Shell 实现

nc hadoop112 44444

hadoop112
(nc -lk 44444)

hadoop113

# 0. Try the scenario with plain netcat first.

## 0.1 Server side, run on hadoop112. No host is given, so it binds to all
##     local addresses.
nc -lk 44444  # listen on port 44444; -l = listen, -k = keep listening after a client disconnects
### netcat is to the network what cat is to local files: it connects input
### to output. "cat" comes from "catenate", i.e. to connect.

## 0.2 Client side, run on another node (hadoop113).
nc hadoop112 44444  # connect to port 44444 on hadoop112; end with Ctrl+D or Ctrl+C
### Whatever is typed on the hadoop113 client is printed by the hadoop112 server.

1.2 Flume 实现

Source r1
netcat

# Reproduce the netcat experiment with Flume: netcat source -> memory channel -> logger sink.

## 1. Create the agent configuration on hadoop112 (see the official v1.7.0 guide).
cat > /opt/modules/flume-1.7.0/jobs/netcat_flume_logger.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop112
a1.sources.r1.port = 44444

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

## 2. Start Flume; this plays the role of the netcat server.
### Flume has no background service of its own: work only happens while an
### agent is running. An agent bundles the source, channel and sink.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent --conf conf/ --conf-file jobs/netcat_flume_logger.conf \
    --name a1 -Dflume.root.logger=INFO,console
### --conf      directory with environment settings such as JAVA_HOME
### --conf-file the agent configuration itself
### --name      the agent name used inside that file
### -D          overrides a property at launch; here the default
###             flume.root.logger=INFO,LOGFILE becomes INFO,console
### Logging defaults live in conf/log4j.properties; lowering the level from
### INFO (for example to DEBUG) also shows the more detailed messages.
### The same command with short options:
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/netcat_flume_logger.conf \
    -n a1 -Dflume.root.logger=INFO,console
### Once started, Flume blocks and waits for clients.

## 3. Connect with a netcat client from hadoop113.
nc hadoop112 44444
### Everything typed is printed on the hadoop112 console.
telnet hadoop112 44444  # telnet works just as well

## 4. Stop Flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

2. 实时监控文件内容追加

2.1 追加内容打印到控制台

Source r1
exec

# Tail a log file with an exec source and print new lines to the console.

# 0. Create the log file used for the test.
mkdir -p /tmp/log && touch /tmp/log/t.log

# 1. Create the agent configuration.
cat > /opt/modules/flume-1.7.0/jobs/exec_flume_logger.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，exec 表要执行系统命令
a1.sources.r1.type = exec
## -F 即使日志文件回滚了，inode 信息变了，它也会根据文件名追踪新的日志文件
a1.sources.r1.command = tail -F /tmp/log/t.log
## tail 命令默认会读取文件的最后10 行
## 用什么去解析执行command，以下为flume 默认值，不写也行
a1.sources.r1.shell = /bin/bash -c

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start the Flume agent.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/exec_flume_logger.conf -n a1 -Dflume.root.logger=INFO,console

# 3. In a second terminal, simulate an application appending to the log.
echo 'data1' >> /tmp/log/t.log
echo 'data2' >> /tmp/log/t.log
echo 'data3' >> /tmp/log/t.log

# 4. Verify
## Each line appended to /tmp/log/t.log shows up on the agent's console.

# 5. Stop the Flume agent.
kill $(jps -l | grep org.apache.flume.node.Application | cut -d' ' -f1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

2.2 追加内容保存到HDFS

Source r1
exec

# Tail a log file with an exec source and store the events in HDFS.

# 0. Prepare the environment.
## Create the log file used for the test.
mkdir -p /tmp/log && touch /tmp/log/tt.log
## Start the single-node HDFS (NameNode, DataNode, SecondaryNameNode together).
cd /opt/modules/hadoop-2.7.2 && sbin/start-dfs.sh

# 1. Create the agent configuration.
#    Every key uses the agent name a1. The roll-size line previously said
#    "a2", so the agent started with -n a1 never read that setting.
cat > /opt/modules/flume-1.7.0/jobs/exec_flume_hdfs.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，exec 表要执行系统命令
a1.sources.r1.type = exec
## -F 即使日志文件回滚了，inode 信息变了，它也会根据文件名追踪新的日志文件
a1.sources.r1.command = tail -F /tmp/log/tt.log
## tail 命令默认会读取文件的最后10 行
## 用什么去解析执行command，以下为flume 默认值，不写也行
a1.sources.r1.shell = /bin/bash -c

# Describe the sink
## 定义sink 即下游的类型，指定类型为hdfs 及存储路径的文件夹命名规则
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop112:9000/flume/webdata/%Y%m%d/%H
## 上传文件的前缀
a1.sinks.k1.hdfs.filePrefix = logs
## 设置文件内容为DataStream 便于观察，默认SequenceFile 是序列化的文本编辑器不好打开
a1.sinks.k1.hdfs.fileType = DataStream
## 使用本地时间戳，如为false 以上的%Y%m%d 等将不可用，因默认event 的header 为空
a1.sinks.k1.hdfs.useLocalTimeStamp = true
## 配合文件夹命名规则，每小时建立一个新文件夹，以下三项为一组，round 为true 时才生效
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = hour
## 临时文件，即正在写入的文件的后缀名，满足roll 三项中任一项则去掉改后缀变成正式文件
a1.sinks.k1.hdfs.inUseSuffix = .tmp
## 文件滚动规则，以下三项roll 为一组
## 设置每个滚动文件的大小，HDFS 每个块128MB(1024*1024*128=134217728 Byte)
a1.sinks.k1.hdfs.rollSize = 134210000
## 最后4 位为0000，让其提早滚动到新文件，避免128.1MB 要存两个块，第二个块只有一丁点内容
## 将.tmp 文件变成正式文件的间隔时间，单位为秒，太小会产生过多的小文件，为0 则不按间隔时间滚动
a1.sinks.k1.hdfs.rollInterval = 30
## 累计到事件数后滚动，为0 则忽略累计事件数
a1.sinks.k1.hdfs.rollCount = 0
## 如果rollInterval 和rollCount 都为0 则单文件大小达到rollSize 才滚动文件

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start the Flume agent.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/exec_flume_hdfs.conf -n a1

# 3. In a second terminal, simulate an application appending to the log.
date >> /tmp/log/tt.log && sleep 3
date >> /tmp/log/tt.log && sleep 3
date >> /tmp/log/tt.log

# 4. Verify
## Lines appended to /tmp/log/tt.log appear under
## hadoop112:50070/explorer.html#/flume/webdata
## While events keep arriving, HDFS gets one new folder per hour, and each
## *.tmp file loses its suffix and becomes a regular file after 30 seconds.
## Download a file and compare its text with what was appended to /tmp/log/tt.log.

# 5. Stop the Flume agent.
kill $(jps -l | grep org.apache.flume.node.Application | cut -d' ' -f1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

3. 实时监控目录下新增文件

Source r1
spooldir

# Watch a directory for new files (spooldir source) and write them to local files (file_roll sink).

# 0. Create the source and target directories used for the test.
mkdir -p /tmp/log/source /tmp/log/target

# 1. Create the agent configuration.
cat > /opt/modules/flume-1.7.0/jobs/spooldir_flume_fileroll.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，spooldir 会扫描源文件夹
a1.sources.r1.type = spooldir
## 每500 毫秒扫描一次源文件夹的新增文件
a1.sources.r1.pollDelay = 500
## 源文件夹
a1.sources.r1.spoolDir = /tmp/log/source

# Describe the sink
## 定义sink 即下游的类型，file_roll 即输出到本地文件系统
a1.sinks.k1.type = file_roll
## 输出文件的目标文件夹
a1.sinks.k1.sink.directory = /tmp/log/target

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start the Flume agent.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/spooldir_flume_fileroll.conf -n a1

# 3. Simulate an application dropping new files into the source directory.
cd /tmp/log/source && for ((i = 1; i <= 5; i++)); do echo "$i$i$i" > "$i.txt"; done

# 4. Verify
## Files added to /tmp/log/source get a .COMPLETED suffix after the 500 ms poll.
## /tmp/log/target holds the transferred content; several source files may
## end up merged into one output file.
cd /tmp/log/source && ls
cd /tmp/log/target && ls
## Notes:
### 1. Do not reuse a file name in the source directory. With 1.txt.COMPLETED
###    present, adding another 1.txt raises an error and the agent must be
###    restarted before it works again.
### 2. Editing a file already in the directory is not seen as a new file, so
###    new content must go into a file with a new name.

# 5. Stop the Flume agent.
kill $(jps -l | grep org.apache.flume.node.Application | cut -d' ' -f1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

4. 实时监控目录下多个文件内容追加

TAILDIR类型的Source可以在确保数据不丢失的前提下实现实时监控和断点续传。

Source r1
TAILDIR

# Follow appends to several files at once with a TAILDIR source and print them to the console.

# 0. Create the directories used for the test.
mkdir -p /tmp/log/test1 /tmp/log/test2

# 1. Create the agent configuration.
cat > /opt/modules/flume-1.7.0/jobs/taildir_flume_logger.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，TAILDIR 会监控源文件夹变化
a1.sources.r1.type = TAILDIR
## 设置监控的文件组，这里有f1 和f2 两个
a1.sources.r1.filegroups = f1 f2
## 可用绝对路径或正则表达式与文件组绑定来监控文件
a1.sources.r1.filegroups.f1 = /tmp/log/test1/t.log
a1.sources.r1.filegroups.f2 = /tmp/log/test2/.*log.*
## 以下设置均可选，默认值请参考FlumeUserGuide
## 是否在Event 的header 加入信息
a1.sources.r1.fileHeader = true
## 头部信息K-V 形式，K 和V 可随意命名，也可有多个头部信息
a1.sources.r1.headers.f1.headerKey1 = value1
a1.sources.r1.headers.f2.headerKey1 = value2
a1.sources.r1.headers.f2.headerKey2 = value22
## 文件空闲多久将其关闭，单位毫秒
a1.sources.r1.idleTimeout = 2000
## 保存断点续传记录的文件
a1.sources.r1.positionFile = /opt/modules/flume-1.7.0/logs/taildir_position.json

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start the Flume agent.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/taildir_flume_logger.conf \
    -n a1 -Dflume.root.logger=INFO,console

# 3. Simulate an application writing into the watched directories.
## 3.1 Create /tmp/log/test1/t.log and then append to it.
cd /tmp/log/test1 && echo 111 > t.log && echo 1111 >> t.log
## 3.2 Create several .log files in /tmp/log/test2, then rewrite their content.
cd /tmp/log/test2 && \
  for ((i = 1; i <= 5; i++)); do echo "$i$i$i" > "$i.log"; done && \
  for ((i = 1; i <= 5; i++)); do echo "abcde$i$i$i" > "$i.log"; done

# 4. Verify
## Every change above should appear as an event on the agent's console.
## To see resuming in action: stop Flume, append to /tmp/log/test1/t.log, and
## start Flume again. The console should show exactly what was appended while
## Flume was down, not the last 10 lines as the tail command would.

# 5. Stop the Flume agent.
kill $(jps -l | grep org.apache.flume.node.Application | cut -d' ' -f1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

5. Flume 简单串联

产生日志的网站后台并不在大数据集群中，需要在网站后台安装Flume，然后再和大数据集群中的Flume串联。
这里假设hadoop113是网站后台，hadoop112是大数据集群中的数据采集节点。
Flume串联使用Avro 接口，Avro Source是服务端，Avro Sink是客户端，记住服务端要先于客户端启动。

Source r1
netcat

# Chain two agents over Avro: hadoop113 (netcat source -> avro sink) feeds
# hadoop112 (avro source -> logger sink).

# 1. Create the agent configurations.
## 1.1 Run on hadoop112: the server side (avro source).
cat > /opt/modules/flume-1.7.0/jobs/avro_flume_avro.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，avro 表要跟另一台机器上的avro sink 串联
a1.sources.r1.type = avro
## 绑定本机的IP，0.0.0.0 表本机所有可用IP
a1.sources.r1.bind = 0.0.0.0
## 监听的端口，与客户端一致
a1.sources.r1.port = 4545

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

## 1.2 Run on hadoop113: the client side (avro sink).
cat > /opt/modules/flume-1.7.0/jobs/avro_flume_avro.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Describe the sink
## 定义sink 即下游的类型，avro 表示下游有avro source 串联对接
a1.sinks.k1.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k1.hostname = hadoop112
## 服务端avro source 的端口
a1.sinks.k1.port = 4545

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start the Flume agents.
## 2.1 On hadoop112 start the server side. It must be started first,
##     otherwise the client side fails to start.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/avro_flume_avro.conf -n a1 -Dflume.root.logger=INFO,console

## 2.2 On hadoop113 start the client side, once the server side is up.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/avro_flume_avro.conf -n a1

# 3. Verify
## Open another terminal on hadoop113 and send data to local port 44444.
telnet localhost 44444
## netcat works just as well; the data sent from hadoop113 should appear on hadoop112.
nc localhost 44444
## Observed on a PVE test setup: a message took about 9 seconds to arrive.

# 4. Stop the Flume agents.
kill $(jps -l | grep org.apache.flume.node.Application | cut -d' ' -f1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

Flume 高级

Flume 内部机制

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WBomzned-1584755721693)(flume_flow_path.png)]

Flume 传输事务

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-biDgXVTL-1584755721699)(flume_transaction.png)]

数据复制

由于只有Channel Selector才有replicating即数据复制的功能，准确的说Source 组件才有数据复制功能，因为Channel Selectors是属于Source的。所以，需要通过多个Channel来实现数据多组副本数据落地，而无法使用一个Channel接多个Sink的方式实现。

Source r1
netcat

Channel Selector
replicating

# Replicate every event from one netcat source into two channels:
# c1 -> logger sink, c2 -> file_roll sink.

# 0. Prepare the environment.
mkdir -p /tmp/log/target

# 1. Create the agent configuration on hadoop112.
cat > /opt/modules/flume-1.7.0/jobs/flume_replicating.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop112
a1.sources.r1.port = 44444
## 设置Source 的Channel 选择器类型为数据复制，即下游的Channel 们都使用相同的数据
a1.sources.r1.selector.type = replicating

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger
## 定义sink 即下游的类型，file_roll 即输出到本地文件系统
a1.sinks.k2.type = file_roll
## 输出文件的目标文件夹
a1.sinks.k2.sink.directory = /tmp/log/target

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100
## 多定义一个Channel c2 用于数据复制，用file 类型和memory 效果一样，效率可能会慢一些
a1.channels.c2.type = file

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1 c2
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
EOF

# 2. Start Flume; this plays the role of the netcat server.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/flume_replicating.conf \
    -n a1 -Dflume.root.logger=INFO,console
### Once started, Flume blocks and waits for clients.

# 3. Verify replication
## From another terminal, or from hadoop113, connect with a netcat client.
nc hadoop112 44444
telnet hadoop112 44444  # telnet works just as well
## Typed text appears both on the hadoop112 console and in the files under /tmp/log/target.
cat /tmp/log/target/*  # run on hadoop112

# 4. Stop Flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

负载均衡

使用Sink Processor策略设置为load_balance（即负载均衡）的Sink Group，然后将多个Sink纳入其中。Sink Group会按照一定的规则将消息分配给组中的某个Sink，以达到负载均衡的目的。同时，通过多个Flume Agent中的Channel对消息数据进行缓存，以分担因最终Sink端写入过慢而造成源头端的数据积压。

Source r1
exec

# Load-balance events from hadoop112 across two downstream agents
# (hadoop113 and hadoop114) with a load_balance sink group.

# 1. Create the agent configurations.
## 1.1 Create flume_load_balance_2.conf; run on hadoop112.
cat > /opt/modules/flume-1.7.0/jobs/flume_load_balance_2.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop112
a1.sources.r1.port = 44444

# Describe the sink
## 定义sink 即下游的类型，avro 表示下游有avro source 串联对接
a1.sinks.k1.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k1.hostname = hadoop113
## 服务端avro source 的端口
a1.sinks.k1.port = 4545
## 新增一个Sink 做负载均衡
a1.sinks.k2.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k2.hostname = hadoop114
## 服务端avro source 的端口
a1.sinks.k2.port = 4545

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1

## 定义Sink Group，可以将某几个Sink 划到一个组里，它们使用相同的策略
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
### g1 组里使用负载均衡策略
a1.sinkgroups.g1.processor.type = load_balance
EOF

## 1.2 Create flume_load_balance_34.conf; run on both hadoop113 and hadoop114.
cat > /opt/modules/flume-1.7.0/jobs/flume_load_balance_34.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，avro 表要跟另一台机器上的avro sink 串联
a1.sources.r1.type = avro
## 绑定本机的IP，0.0.0.0 表本机所有可用IP
a1.sources.r1.bind = 0.0.0.0
## 监听的端口，与客户端一致
a1.sources.r1.port = 4545

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start Flume
## 2.1 Run on both hadoop113 and hadoop114. The avro source is the server
##     side and must be started first.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/flume_load_balance_34.conf \
    -n a1 -Dflume.root.logger=INFO,console

## 2.2 Run on the data origin hadoop112, once the avro-source agents on
##     hadoop113 and hadoop114 are up.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/flume_load_balance_2.conf -n a1

# 3. Verify load balancing
## From another terminal on hadoop112, connect with a netcat client.
nc hadoop112 44444
telnet hadoop112 44444  # telnet works just as well
## Typed text shows up on either the hadoop113 or the hadoop114 terminal.
## Several messages in a row may go to the same one; the load-balancing
## algorithm decides.

# 4. Stop Flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

故障转移

使用Sink Processor策略设置为failover（即故障转移）的Sink Group，然后将多个Sink纳入其中，让组内的Sink们形成主备关系，根据优先级平时只有一个Sink正常工作，它出现问题后故障转移到另一个Sink。但即使这样源头端的Flume还是存在单点故障的问题，因为无法设置主备两个Flume去读取同一个数据源。

Source r1
exec

# Fail over between two downstream agents (hadoop113 primary, hadoop114
# standby) with a failover sink group on hadoop112.

# 1. Create the agent configurations.
## 1.1 Create flume_failover_2.conf; run on hadoop112.
cat > /opt/modules/flume-1.7.0/jobs/flume_failover_2.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop112
a1.sources.r1.port = 44444

# Describe the sink
## 定义sink 即下游的类型，avro 表示下游有avro source 串联对接
a1.sinks.k1.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k1.hostname = hadoop113
## 服务端avro source 的端口
a1.sinks.k1.port = 4545
## 新增一个Sink 做故障转移
a1.sinks.k2.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k2.hostname = hadoop114
## 服务端avro source 的端口
a1.sinks.k2.port = 4545

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c1

## 定义Sink Group，可以将某几个Sink 划到一个组里，它们使用相同的策略
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
### g1 组里使用故障转移策略
a1.sinkgroups.g1.processor.type = failover
### sink1 的优先级高于sink2，默认先走sink1
a1.sinkgroups.g1.processor.priority.k1 = 50
a1.sinkgroups.g1.processor.priority.k2 = 10
EOF

## 1.2 Create flume_failover_34.conf; run on both hadoop113 and hadoop114.
cat > /opt/modules/flume-1.7.0/jobs/flume_failover_34.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，avro 表要跟另一台机器上的avro sink 串联
a1.sources.r1.type = avro
## 绑定本机的IP，0.0.0.0 表本机所有可用IP
a1.sources.r1.bind = 0.0.0.0
## 监听的端口，与客户端一致
a1.sources.r1.port = 4545

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start Flume
## 2.1 Run on both hadoop113 and hadoop114. The avro source is the server
##     side and must be started first.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/flume_failover_34.conf \
    -n a1 -Dflume.root.logger=INFO,console

## 2.2 Run on the data origin hadoop112, once the avro-source agents on
##     hadoop113 and hadoop114 are up.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/flume_failover_2.conf -n a1

# 3. Verify failover
## From another terminal on hadoop112, connect with a netcat client.
nc hadoop112 44444
telnet hadoop112 44444  # telnet works just as well
## Typed text shows up on the hadoop113 terminal. Only after the Flume agent
## on hadoop113 exits does it start appearing on the hadoop114 terminal.

# 4. Stop Flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

聚合数据

数据来源端假设是网站后台负载均衡的机器，每台上面的日志都需要采集，因为它们并不是互为备份的关系，每台机器上的日志都不重复，但假设落地端是HDFS，它的连接数或用户数有限，所以需要将数据聚合在一起提交。

Web Server
Node A

Source r1
exec

Web Server
Node B

rb1

# Consolidate events from two upstream agents (hadoop113, hadoop114) into
# one downstream agent on hadoop112.

# 0. About this setup
## The experiment deviates slightly from the diagram: both inputs are netcat
## sources. Data entering either input is shown at the single output.

# 1. Create the agent configurations.
## 1.1 Create flume_consolidation_2.conf; run on hadoop112.
cat > /opt/modules/flume-1.7.0/jobs/flume_consolidation_2.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，avro 表要跟另一台机器上的avro sink 串联
a1.sources.r1.type = avro
## 绑定本机的IP，0.0.0.0 表本机所有可用IP
a1.sources.r1.bind = 0.0.0.0
## 监听的端口，与客户端一致
a1.sources.r1.port = 4545

# Describe the sink
## 定义sink 即下游的类型，logger 默认打印到日志文件，启动参数中可改为打印到控制台
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
## 定义channel 即管道的类型
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

## 1.2 Create flume_consolidation_34.conf; run on both hadoop113 and hadoop114.
cat > /opt/modules/flume-1.7.0/jobs/flume_consolidation_34.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
## 绑定本机的IP，0.0.0.0 表本机所有可用IP
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# Describe the sink
## 定义sink 即下游的类型，avro 表示下游有avro source 串联对接
a1.sinks.k1.type = avro
## 连接的服务端即avro source 的地址
a1.sinks.k1.hostname = hadoop112
## 服务端avro source 的端口
a1.sinks.k1.port = 4545

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start Flume
## 2.1 Run on the landing node hadoop112. The avro source is the server side
##     and must be started first.
cd /opt/modules/flume-1.7.0/ && \
  bin/flume-ng agent -c conf/ -f jobs/flume_consolidation_2.conf \
    -n a1 -Dflume.root.logger=INFO,console

## 2.2 Run on both hadoop113 and hadoop114, once the avro-source agent on
##     hadoop112 is up.
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -c conf/ -f jobs/flume_consolidation_34.conf -n a1

# 3. Verify consolidation
## Open another terminal on hadoop113 and on hadoop114. The netcat sources
## listen on port 44444 of those two hosts (hadoop112 only listens on the
## avro port 4545), so connect to the local agent.
nc localhost 44444
telnet localhost 44444  # telnet works just as well
## Text typed on either hadoop113 or hadoop114 shows up on the hadoop112 terminal.

# 4. Stop Flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: events may still be in flight.
### A plain kill or Ctrl+C lets Flume run its shutdown hook, which finishes
### handling the remaining data before the process exits.

自定义Interceptor 多路复用

1. 结构流程图

通过Channel Selector的multiplexing属性即多路复用，根据Event的header内容选择通道。同时，因为Channel Selectors是属于Source组件的，所以需要配置多个Channel来实现多通道。

Source r1
netcat

Channel Selector
multiplexing

2. 代码实现

创建一个Maven工程
GroupId: com.abc.flume
ArtifactId: flume-interceptor01
Version: 1.0
ProjectName: flume-interceptor01

pom.xml文件内容如下

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the custom Flume interceptor; its only dependency is flume-ng-core 1.7.0. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.abc.flume</groupId>
    <artifactId>flume-interceptor01</artifactId>
    <version>1.0</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.7.0</version>
        </dependency>
    </dependencies>
</project>

Java实现代码如下

// MyTypeInterceptor.java
package com.abc.flume;import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;import java.util.ArrayList;
import java.util.List;
import java.util.Map;public class MyTypeInterceptor implements Interceptor {// 用于保存处理过的Eventprivate List<Event> resultList;public void initialize() {// 初始化resultListresultList = new ArrayList<Event>();}public Event intercept(Event event) {System.out.println("v1.3a==-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=");System.out.println(event);// 0. 如果Event 的Body 为空则不作任何处理if (event.getBody().length == 0) return event;// 1. 获取事件Headers 信息Map<String, String> headers = event.getHeaders();// 2. 获取事件Body 信息String body = new String(event.getBody());// 3. 根据Body 前2 个字符来判断并添加Header 内容String str = body.substring(0, 2).toUpperCase();if ("AA".equals(str)) {headers.put("myType", "AA");} else if ("AB".equals(str)) {headers.put("myType", "AB");} else if ("BB".equals(str)) {headers.put("myType", "BB");}return event;}public List<Event> intercept(List<Event> list) {System.out.println("deal with List<Event>");// 1. 清空结果resultListresultList.clear();// 2. 向resultList 中添加处理过的非空eventfor (Event event : list) {resultList.add(intercept(event));}// 3. 返回结果resultListreturn resultList;}public void close() {}// 使用静态内部类返回一个MyTypeInterceptor 对象供程序使用，类名可以随便起public static class Builder implements Interceptor.Builder {public Interceptor build() {return new MyTypeInterceptor();}public void configure(Context context) {}}
}

使用Maven --> Lifecycle --> package，在工程下生成target/flume-interceptor01-1.0.jar
将flume-interceptor01-1.0.jar拷贝至hadoop112的/opt/modules/flume-1.7.0/lib/下

3. 配置测试

# 0. Prepare the environment
mkdir -p /tmp/log/sink1 /tmp/log/sink2

# 1. Create the job configuration; run on hadoop112
#    (the heredoc delimiter is quoted, so "$Builder" below is written literally)
cat > /opt/modules/flume-1.7.0/jobs/flume_multiplexing.conf << 'EOF'
# Name the components on this agent
## a1 是agent 的名字，在同一个节点上要唯一，因为可以启多个agent
## source、sink、channel 都可以有多个
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Describe/configure the source
## 定义source 即源头的类型，这里定义相当于netcat 服务端
a1.sources.r1.type = netcat
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444
## 定义Source 的Interceptor 即拦截器
a1.sources.r1.interceptors = i1
## $ 后面的是拦截器内部的静态类，它负责返回拦截器对象
a1.sources.r1.interceptors.i1.type = com.abc.flume.MyTypeInterceptor$Builder
## 设置Source 的Channel 选择器类型为多路复用，根据事件的头信息选择通道
a1.sources.r1.selector.type = multiplexing
a1.sources.r1.selector.header = myType
a1.sources.r1.selector.mapping.AA = c1
a1.sources.r1.selector.mapping.BB = c2
a1.sources.r1.selector.mapping.AB = c1 c2
a1.sources.r1.selector.default = c1

# Describe the sink
## 定义sink 即下游的类型，file_roll 即输出到本地文件系统
a1.sinks.k1.type = file_roll
## 不滚动文件，只输出到一个文件中
a1.sinks.k1.sink.rollInterval = 0
## 输出文件的目标文件夹
a1.sinks.k1.sink.directory = /tmp/log/sink1
## 定义sink 即下游的类型，file_roll 即输出到本地文件系统
a1.sinks.k2.type = file_roll
## 不滚动文件，只输出到一个文件中
a1.sinks.k2.sink.rollInterval = 0
## 输出文件的目标文件夹
a1.sinks.k2.sink.directory = /tmp/log/sink2

# Use a channel which buffers events in memory
## 定义channel 即管道的类型，memory 应该比file 要快一些
a1.channels.c1.type = memory
## capacity 指channel 中能容纳的事件数，一个事件好比一个乒乓球，这里channel 能容纳1000 个
a1.channels.c1.capacity = 1000
## channel 每次从source 提取或发送给sink 的事件数，一次从源头取100 个乒乓球，或发给下游100 个
a1.channels.c1.transactionCapacity = 100
## 多定义一个Channel c2 用于多路复用，其他参数默认
a1.channels.c2.type = memory

# Bind the source and sink to the channel
## 绑定source、sink 和channel，即定义它们之间的关系
a1.sources.r1.channels = c1 c2
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
EOF

# 2. Start flume, which amounts to starting the netcat server side
cd /opt/modules/flume-1.7.0/ && bin/flume-ng agent -n a1 -c conf/ -f jobs/flume_multiplexing.conf
### Once started, flume blocks and waits for clients to connect

# 3. Verify multiplexing
## Open another terminal, or use hadoop113, and connect with a netcat client
nc hadoop112 44444
telnet hadoop112 44444  # telnet works just as well as nc
## The interceptor tags each message by its first two characters; the
## comparison is case-insensitive
## aa    -> tagged AA, delivered to /tmp/log/sink1
## bb    -> tagged BB, delivered to /tmp/log/sink2
## ab    -> tagged AB, delivered to /tmp/log/sink1 and /tmp/log/sink2
## other -> not tagged, delivered to /tmp/log/sink1 by default
## Inspect what finally landed on disk
tail -f /tmp/log/sink1/*
tail -f /tmp/log/sink2/*

# 4. Stop flume
kill $(jps -l | grep org.apache.flume.node.Application | cut -d ' ' -f 1)
### Never use kill -9 here: there may still be data that has not been delivered.
### A plain kill or Ctrl+C triggers flume's shutdown hook, which finishes
### processing the remaining data before the agent stops.

自定义Source

未完成

自定义Sink

未完成

监控Flume

操作系统为Ubuntu14.04，其他发行版可能不适用。

Ganglia 结构

gmond 需要监控的主机上的监听器
gmetad 整合所有反馈信息
gweb 网页可视化工具

hadoop113 hadoop112 hadoop111

put

get

flume

gmond

gmetad

rrdtool

gweb

1. 安装配置Ganglia

# 1. Install gmetad, gweb and gmond; run on hadoop111
sudo apt-get update
sudo apt-get install -y gmetad ganglia-webfrontend ganglia-monitor rrdtool
## The installer asks twice whether to restart apache2; choose Yes.
## If No was chosen by mistake, restart it manually:
##   sudo /etc/init.d/apache2 restart

# 2. Configure gweb; run on hadoop111
## Create the config file with tee and a here document; tee's stdout is sent
## to /dev/null so nothing is echoed.
sudo tee /etc/apache2/sites-enabled/ganglia.conf << 'EOF' > /dev/null
Alias /ganglia /usr/share/ganglia-webfrontend
<Directory "/usr/share/ganglia-webfrontend">
    AllowOverride All
    Order allow,deny
    Allow from all
    Deny from none
</Directory>
EOF
## Side note: "sudo cat > file" cannot create a file that needs root, because
## the ">" redirection is performed by the calling shell rather than by the
## command run under sudo; see the separate notes on the cat command.
## Suppress this warning on Apache restart:
##   AH00558: apache2: Could not reliably determine the server's fully
##   qualified domain name, using 192.168.0.111. Set the 'ServerName'
##   directive globally to suppress this message
sudo tee -a /etc/apache2/apache2.conf <<< "ServerName hadoop111"
## hadoop111 is this node's name on the LAN; /etc/hosts on every node contains
## "192.168.0.111  hadoop111".

# 3. Configure gmetad; run on hadoop111
## Back up the original file first, unless a backup already exists.
## Absolute paths are used so the working directory is left unchanged.
if [ ! -f '/etc/ganglia/gmetad.conf.bak' ]; then
  sudo cp /etc/ganglia/gmetad.conf /etc/ganglia/gmetad.conf.bak
fi
sudo tee /etc/ganglia/gmetad.conf << 'EOF' > /dev/null
# data_source 有三个参数：集群名称，轮询时间，被监听gmond 地址:端口(可多个，空格隔开)
data_source "flumes" 10 hadoop111:8649
EOF

# 4. Configure gmond; run on hadoop111
## Back up the original file first, unless a backup already exists.
if [ ! -f '/etc/ganglia/gmond.conf.bak' ]; then
  sudo cp /etc/ganglia/gmond.conf /etc/ganglia/gmond.conf.bak
fi
## The cluster name must match the data_source name configured in gmetad.
sudo sed -i 's/name = "unspecified"/name = "flumes"/' /etc/ganglia/gmond.conf
## Comment out the mcast_join and bind settings. The patterns are anchored to
## the start of the setting so that re-running does not stack extra '#' and
## unrelated text containing "bind" (e.g. bind_hostname) is left alone.
sudo sed -i 's/^\([[:space:]]*\)mcast_join/\1#mcast_join/' /etc/ganglia/gmond.conf
sudo sed -i 's/^\([[:space:]]*\)bind\([[:space:]]*=\)/\1#bind\2/' /etc/ganglia/gmond.conf
## Add a host attribute right below 'udp_send_channel {', only if it is not
## there yet, so repeated runs do not insert duplicates.
if ! grep -q 'host = hadoop111' /etc/ganglia/gmond.conf; then
  sudo sed -i '/udp_send_channel\ {/a\ \ host = hadoop111' /etc/ganglia/gmond.conf
fi

# 5. Restart ganglia; run on hadoop111
sudo /etc/init.d/ganglia-monitor restart
sudo /etc/init.d/gmetad restart
sudo /etc/init.d/apache2 restart
## Check the service status
service --status-all
service gmetad status
service ganglia-monitor status
service apache2 status

2. 启动Flume 并监控

# 1. Create the job configuration; run on both hadoop112 and hadoop113
cat > /opt/modules/flume-1.7.0/jobs/flume_ganglia.conf << 'EOF'
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
## 注意！！sink 只能对应一个channel
a1.sinks.k1.channel = c1
EOF

# 2. Start Flume; run on both hadoop112 and hadoop113
cd /opt/modules/flume-1.7.0/ &&
  bin/flume-ng agent -n a1 -c conf/ -f jobs/flume_ganglia.conf \
    -Dflume.root.logger=INFO,console \
    -Dflume.monitoring.type=ganglia \
    -Dflume.monitoring.hosts=hadoop111:8649
## Monitoring is reported in ganglia format to port 8649 on hadoop111.
## Each Flume agent that is started adds the node it runs on to the
## "Choose Node" list of the Ganglia page.

# 3. Stress-test Flume; run on both hadoop112 and hadoop113
## Send each message to local port 44444 through netcat using a here string,
## and discard the reply by redirecting it to /dev/null.
for ((i = 1; i <= 100; i++)); do
  nc localhost 44444 <<< "data-$i" > /dev/null
done

# 4. Check Ganglia
## Open http://192.168.0.111/ganglia/ in a browser.
## Taking flume.CHANNEL.c1 as an example: events put into the channel =
## events still in the channel + events already taken out, i.e.
## EventPutSuccessCount = ChannelSize + EventTakeSuccessCount

参考文章

Flume v1.7 官方使用说明
Flume transactionCapacity capacity 的简单理解
Flume v1.8 用户手册中文版
Flume 核心概念
理解 inode
Ubuntu14.04 安装配置Ganglia
sudo echo 依旧写入不了特权文件
Linux 管道指令（pipe）与shell 重定向的区别
深入理解Ganglia 之Overview
sed 匹配内容的前一行和后一行添加内容
Ubuntu 服务管理