0
点赞
收藏
分享

微信扫一扫

Flume介绍

_karen 2024-01-28 阅读 9

一、介绍

二、部署

# 解压缩
tar -zxvf apache-flume-1.10.1-bin.tar.gz -C /opt/module/

三、使用

3.1、netcat to logger

vim /opt/module/flume/job/net_to_log.conf
# example.conf: A single-node Flume configuration

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# Describe the sink
a1.sinks.k1.type = logger

# Use a channel which buffers events in memory
# capacity:Maximum capacity of the channel
# transactionCapacity:The maximum size of transaction supported by the channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# 启动该Agent
bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/net_to_log.conf
# 开启客户端消息推送
nc 127.0.0.1 44444

3.2、netcat to kafka

# example.conf: A single-node Flume configuration

# Name the components on this agent
a1.sources = r1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444


a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop102:9092,hadoop103:9092
a1.channels.c1.kafka.topic = netcat_to_kafka_topic
a1.channels.c1.parseAsFlumeEvent = false

# Bind the source and sink to the channel
a1.sources.r1.channels = c1

3.3、file to hdfs

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/module/logs/file.log
a1.sources.r1.channels = c1

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop102:8020/flume/%Y%m%d/%H

# 上传文件的前缀
a1.sinks.k1.hdfs.filePrefix = events-
# 是否按照时间滚动文件夹
a1.sinks.k1.hdfs.round = true
# 多少时间单位创建一个新的文件夹
a1.sinks.k1.hdfs.roundValue = 10
# 重新定义时间单位
a1.sinks.k1.hdfs.roundUnit = hour
# 是否使用本地时间戳
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# 积攒多少Ecent才flush一次到HDFS
a1.sinks.k1.hdfs.batchSize = 100
# 设置文件类型,可支持压缩
a1.sinks.k1.hdfs.fileType = DataStream
# 多久生成一个新的文件(单位:秒)
a1.sinks.k1.hdfs.rollInterval = 60
# 设置每个文件滚动的大小
a1.sinks.k1.hdfs.rollSize = 134217700
# 文件的滚动与Event数量无关
a1.sinks.k1.hdfs.rollCount = 0


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

3.4、tail to hdfs

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = TAILDIR
# 存放断点续传的最后位置
a1.sources.r1.positionFile = /opt/module/logs/tail_dir.json
# 监控多个目录
a1.sources.r1.filegroups = f1 f2
a1.sources.r1.filegroups.f1 = /opt/module/logs/files/.*file.*
a1.sources.r1.filegroups.f2 = /opt/module/logs/files2/.*log.*


# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop102:8020/flume/files/%Y%m%d/%H

# 上传文件的前缀
a1.sinks.k1.hdfs.filePrefix = events-
# 是否按照时间滚动文件夹
a1.sinks.k1.hdfs.round = true
# 多少时间单位创建一个新的文件夹
a1.sinks.k1.hdfs.roundValue = 10
# 重新定义时间单位
a1.sinks.k1.hdfs.roundUnit = hour
# 是否使用本地时间戳
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# 积攒多少Event才flush一次到HDFS
a1.sinks.k1.hdfs.batchSize = 100
# 设置文件类型,可支持压缩
a1.sinks.k1.hdfs.fileType = DataStream
# 多久生成一个新的文件(单位:秒)
a1.sinks.k1.hdfs.rollInterval = 20
# 设置每个文件滚动的大小
a1.sinks.k1.hdfs.rollSize = 134217700
# 文件的滚动与Event数量无关
a1.sinks.k1.hdfs.rollCount = 0

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

四、其他

4.1、Spooling Directory Source

4.2、Avro Source

在这里插入图片描述

4.3、Kafka Source

在这里插入图片描述
在这里插入图片描述

4.4、HTTP Source

在这里插入图片描述
在这里插入图片描述

4.5、Kafka Channel

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

4.6、JDBC Channel

在这里插入图片描述

4.7、Hive Sink

在这里插入图片描述
在这里插入图片描述

4.8、Kafka Sink

在这里插入图片描述
在这里插入图片描述

4.9、HTTP Sink

在这里插入图片描述

在这里插入图片描述

五、拓扑结构

5.1、Setting multi-agent flow

在这里插入图片描述

5.1、Consolidation

在这里插入图片描述

# Name the components on this agent
a1.sources = r1
a1.sinks = k1      
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/logs/file.log


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop104
a1.sinks.k1.port = 4141


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop103
a1.sources.r1.port = 44444


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop104
a1.sinks.k1.port = 4141


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = hadoop104
a1.sources.r1.port = 4141


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

5.1、Multiplexing the flow

在这里插入图片描述

vim flume_channel_selectors_example.conf

# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2
a1.channels = c1 c2

# Channel选择器,默认为:replicating(将数据流复制给所有channel)
a1.sources.r1.selector.type = replicating

# Describe/configure the source
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/logs/file.log



# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100



# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = localhost
a1.sinks.k1.port = 4141

a1.sinks.k2.type = avro
a1.sinks.k2.hostname = localhost
a1.sinks.k2.port = 4142


# Bind the source and sink to the channel
a1.sources.r1.channels = c1 c2
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2

vim avro_to_logger1.conf


# Name the components on this agent
a2.sources = r1
a2.sinks = k1
a2.channels = c1


# Describe/configure the source
a2.sources.r1.type = avro
a2.sources.r1.bind = localhost
a2.sources.r1.port = 4141



# Use a channel which buffers events in memory
a2.channels.c1.type = memory
a2.channels.c1.capacity = 1000
a2.channels.c1.transactionCapacity = 100



# Describe the sink
a2.sinks.k1.type = logger


# Bind the source and sink to the channel
a2.sources.r1.channels = c1
a2.sinks.k1.channel = c1
vim avro_to_logger2.conf


# Name the components on this agent
a3.sources = r1
a3.sinks = k1
a3.channels = c2


# Describe/configure the source
a3.sources.r1.type = avro
a3.sources.r1.bind = localhost
a3.sources.r1.port = 4142



# Use a channel which buffers events in memory
a3.channels.c2.type = memory
a3.channels.c2.capacity = 1000
a3.channels.c2.transactionCapacity = 100



# Describe the sink
a3.sinks.k1.type = logger


# Bind the source and sink to the channel
a3.sources.r1.channels = c2
a3.sinks.k1.channel = c2
# 启动这三个agent
bin/flume-ng agent -n a2 -c /opt/module/flume/conf/ -f /opt/module/flume/job/group/avro_to_logger1.conf
bin/flume-ng agent -n a3 -c /opt/module/flume/conf/ -f /opt/module/flume/job/group/avro_to_logger2.conf
bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f /opt/module/flume/job/group/flume_channel_selectors_example.conf

六、拦截器

6.1、Timestamp Interceptor

在这里插入图片描述

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# 配置拦截器
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

在这里插入图片描述

6.2、Host Interceptor

在这里插入图片描述

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# 配置拦截器
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = host


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

在这里插入图片描述

6.3、Static Interceptor

在这里插入图片描述

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 44444

# 配置拦截器
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.key = datacenter
a1.sources.r1.interceptors.i1.value = NEW_YORK


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

在这里插入图片描述

6.4、Remove Header Interceptor

在这里插入图片描述

6.5、UUID Interceptor

在这里插入图片描述

6.6、Morphline Interceptor

在这里插入图片描述

6.7、Search and Replace Interceptor

在这里插入图片描述

6.8、Regex Filtering Interceptor

在这里插入图片描述

6.9、Regex Extractor Interceptor

在这里插入图片描述

在这里插入图片描述

6.10、自定义拦截器

  1. 引用POM依赖
<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>1.10.1</version>
</dependency>
  1. 编写拦截器
package com.xx.interceptor;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * @author xiaxing
 * @describe flume拦截器
 * @since 2024/1/23 16:37
 */
public class TypeInterceptor implements Interceptor {

    private List<Event> addHeaderEvents;

    @Override
    public void initialize() {
        addHeaderEvents = new ArrayList<>();
    }

    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        String body = new String(event.getBody());
        if (body.contains("CZ")) {
            headers.put("state", "CZ");
        } else if (body.contains("US")) {
            headers.put("state", "US");
        } else {
            headers.put("state", "UN");
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        addHeaderEvents.clear();
        list.forEach(event -> {
            addHeaderEvents.add(this.intercept(event));
        });
        return addHeaderEvents;
    }

    @Override
    public void close() {

    }

    public static class Builder implements Interceptor.Builder {

        @Override
        public Interceptor build() {
            return new TypeInterceptor();
        }

        @Override
        public void configure(Context context) {

        }
    }
}

  1. 将打包好的jar包放到flume中

在这里插入图片描述

  1. 编写flume.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1 k2 k3
a1.channels = c1 c2 c3


# Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444

# 配置拦截器
# 拦截器名称
a1.sources.r1.interceptors = i1
# 拦截器路径
a1.sources.r1.interceptors.i1.type = com.xx.interceptor.TypeInterceptor$Builder
a1.sources.r1.selector.type = multiplexing

# 指定头信息中的key
a1.sources.r1.selector.header = state
# 如果value为CZ则将数据发送到c1这个channel
a1.sources.r1.selector.mapping.CZ = c1
# 如果value为CZ则将数据发送到c2这个channel
a1.sources.r1.selector.mapping.US = c2
# 没有命中的发往c3这个channel
a1.sources.r1.selector.default = c3



# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity = 100

a1.channels.c3.type = memory
a1.channels.c3.capacity = 1000
a1.channels.c3.transactionCapacity = 100




# Describe the sink
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop102
a1.sinks.k1.port = 4141


a1.sinks.k2.type = avro
a1.sinks.k2.hostname = hadoop103
a1.sinks.k2.port = 4142


a1.sinks.k3.type = avro
a1.sinks.k3.hostname = hadoop104
a1.sinks.k3.port = 4143


# Bind the source and sink to the channel
a1.sources.r1.channels = c1 c2 c3
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
a1.sinks.k3.channel = c3
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 4141



# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100



# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 4142



# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100



# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1


# Describe/configure the source
a1.sources.r1.type = avro
a1.sources.r1.bind = 0.0.0.0
a1.sources.r1.port = 4143



# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100



# Describe the sink
a1.sinks.k1.type = logger


# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

在这里插入图片描述

举报

相关推荐

0 条评论