Migrating Oracle Data to ClickHouse (Part 2): Flume

This article shows how to use Flume with a third-party JDBC driver to synchronize data from Oracle to ClickHouse, covering environment setup, source-code changes, and configuration.


1. Introduction

While researching Flume I found that most of the material available online is flawed in one way or another. The Flume sink most frequently cited for ClickHouse is https://reviews.apache.org/r/50692/diff/1#2, but its pom.xml is broken and the project cannot be compiled. In addition, the JDBC driver from the ClickHouse project itself is not particularly efficient. This article therefore rewrites the Flume sink on top of the third-party JDBC driver recommended on the ClickHouse site. The environment is RHEL 6; the details follow.

2. Database environment

Oracle: the column types and the inserted rows (including special characters and NULLs) are chosen to be as close to production data as possible.

create table tdba_test
(
  TID          number primary key,
  TSN          varchar2(40),
  TNO          number,
  TAMT         number(15,2),
  CREATE_DATE  Date,
  UPDATE_DATE  Date,
  UPDATE_TIME  date
);
insert into tdba_test values(1,'SN001',1,1.1,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(2,'SN002',2,2.2,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(3,'SN003',3,3.3,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(4,'SN'',''004',4,4.4,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(5,'SN","005,',5,5.5,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(6,NULL,6,6.6,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(7,NULL,NULL,7.7,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(8,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(9,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),NULL,to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test values(10,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),NULL,NULL);

ClickHouse:

create table TDBA_TEST
(
  TID          UInt32,
  TSN          String,
  TNO          UInt16,
  TAMT         Decimal(15,2),
  CREATE_DATE  Date,
  UPDATE_DATE  Date,
  UPDATE_TIME  DateTime
) ENGINE = MergeTree(CREATE_DATE, (TID), 8192);
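
If clickhouse-client is available on the server, one way to create the table is to save the DDL above to a file and pipe it in (a sketch; create_tdba_test.sql is an assumed file name and the default user is assumed to connect without a password at this point):

clickhouse-client --multiquery < create_tdba_test.sql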

3. Download Flume

Download Flume and unpack it:
http://www.apache.org/dyn/closer.lua/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz

4. Flume source

Source code: https://github.com/keedio/flume-ng-sql-source.git
Build the project and copy the resulting jar into Flume's lib directory.
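
A minimal build-and-deploy sketch (the exact artifact name under target/ depends on that project's pom.xml, so the wildcard below is an assumption):

git clone https://github.com/keedio/flume-ng-sql-source.git
cd flume-ng-sql-source
mvn clean package -DskipTests
# copy the built source jar into Flume's lib directory
cp target/flume-ng-sql-source-*.jar /usr/local/apache-flume-1.9.0-bin/lib/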

5. Flume sink

Development flow: create a class that implements the Configurable interface and extends AbstractSink, then override the configure, start, stop and process methods. There is not much to explain; running it once makes the flow easy to follow.
The source is fairly simple; build it and copy the jar into Flume's lib directory.

package org.ips.demo;
/*
creation_time     created_by        update_time       updated_by        Description
----------------  ----------------  ----------------  ----------------  ------------------------------------------------------------
202005            xlzhu@ips.com                                         flume sink for clickhouse database

*/
public class ClickHouseSinkConstants {
    public static final String HOST = "host";
    public static final String PORT = "port";
    public static final String BATCH_SIZE = "batchSize";
    public static final String USER = "user";
    public static final String PASSWORD = "password";
    public static final String DATABASE = "database";
    public static final String TABLE = "table";
    public static final String DEFAULT_PORT = "8123";
    public static final int DEFAULT_BATCH_SIZE = 10000;
    public static final String DEFAULT_USER = "";
    public static final String DEFAULT_PASSWORD = "";
	public static final String SQL_INSERT="INSERT INTO %s.%s FORMAT CSV";
	public static final String JDBC_DRIVER_CLASS="cc.blynk.clickhouse.ClickHouseDriver";
}
package org.ips.demo;
/*
creation_time     created_by        update_time       updated_by        Description
----------------  ----------------  ----------------  ----------------  ------------------------------------------------------------
202005            xlzhu@ips.com                                         flume sink for clickhouse database using Third-party drivers
                                                                        references:https://clickhouse.tech/docs/en/interfaces/jdbc/
																		           https://github.com/blynkkk/clickhouse4j

*/
import com.google.common.base.Preconditions;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import cc.blynk.clickhouse.copy.*;
import cc.blynk.clickhouse.ClickHouseDataSource;
import cc.blynk.clickhouse.ClickHouseConnectionImpl;
import java.sql.*;
import java.nio.charset.StandardCharsets;


import static org.ips.demo.ClickHouseSinkConstants.*;

public class ClickHouseSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(ClickHouseSink.class);
	private ClickHouseDataSource dataSource = null;
    private SinkCounter sinkCounter = null;
    private String host = null;
    private String port = null;
    private String user = null;
    private String password = null;
    private String database = null;
    private String table = null;
    private int batchSize;

    @Override
    public void configure(Context context) {
        logger.debug("------######begin configure...");
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }

        Preconditions.checkArgument(context.getString(HOST) != null && context.getString(HOST).length() > 0, "ClickHouse host must be specified!");
        this.host = context.getString(HOST);
        if (!this.host.startsWith("jdbc:clickhouse://")) {
            this.host = "jdbc:clickhouse://" + this.host;
        }

        Preconditions.checkArgument(context.getString(DATABASE) != null && context.getString(DATABASE).length() > 0, "ClickHouse database must be specified!");
        this.database = context.getString(DATABASE);
        Preconditions.checkArgument(context.getString(TABLE) != null && context.getString(TABLE).length() > 0, "ClickHouse table must be specified!");
        this.table = context.getString(TABLE);
        this.port = context.getString(PORT, DEFAULT_PORT);
        this.user = context.getString(USER, DEFAULT_USER);
        this.password = context.getString(PASSWORD, DEFAULT_PASSWORD);
        this.batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
		logger.debug("------######end configure...");
    }

    @Override
    public void start() {
		logger.debug("------######begin start...");
		//String path = getClass().getProtectionDomain().getCodeSource().getLocation().getPath();
		//logger.debug("------######jar path:"+path);
        String jdbcUrl = String.format("%s:%s/%s", this.host, this.port, this.database);
		try{
		   this.dataSource = new ClickHouseDataSource(jdbcUrl);
		   logger.debug("------######getDataSource ok");
		} catch (Exception e) {            
		   e.printStackTrace();        
		}
        sinkCounter.start();
        super.start();
		logger.debug("------######end start...");
    }


    @Override
    public void stop() {
        logger.debug("------######begin stop");     
        sinkCounter.incrementConnectionClosedCount();
        sinkCounter.stop();
        super.stop();
		logger.debug("------######end stop");
    }


    @Override
    public Status process() throws EventDeliveryException {
		logger.debug("------######begin process");
        Status status = null;
        // Start transaction
        Channel ch = getChannel();
        Transaction txn = ch.getTransaction();
        txn.begin();
        try {
            int count;
            StringBuilder batch = new StringBuilder();
            // Drain up to batchSize events from the channel; each event body is one CSV row
            for (count = 0; count < batchSize; ++count) {
                Event event = ch.take();
                if (event == null) {
                    break;
                }
                batch.append(new String(event.getBody(), StandardCharsets.UTF_8)).append("\n");
            }
			logger.debug("------######data from channel:\n"+batch.toString()+"EOF");
            if (count <= 0) {
                sinkCounter.incrementBatchEmptyCount();
                txn.commit();
                return Status.BACKOFF;
            } else if (count < batchSize) {
                sinkCounter.incrementBatchUnderflowCount();
            } else {
                sinkCounter.incrementBatchCompleteCount();
            }
            sinkCounter.addToEventDrainAttemptCount(count);
			// Stream the accumulated CSV batch into ClickHouse via clickhouse4j's CopyManager
			CopyManager copyManager = CopyManagerFactory.create((ClickHouseConnectionImpl) this.dataSource.getConnection(this.user, this.password));
			logger.debug("------######copyManager ok");
			logger.debug("------######SQL:"+String.format(SQL_INSERT, this.database, this.table));
            copyManager.copyToDb(String.format(SQL_INSERT, this.database, this.table), new ByteArrayInputStream(batch.toString().getBytes(StandardCharsets.UTF_8))); 
            logger.debug("------######copyTodb ok");			
			sinkCounter.incrementEventDrainSuccessCount();
			status = Status.READY;
            txn.commit();
            logger.debug("------######commit ok");
        } catch (Throwable t) {
            txn.rollback();
            logger.error(t.getMessage(), t);
            status = Status.BACKOFF;
            // re-throw all Errors
            if (t instanceof Error) {
                throw (Error) t;
            }
        } finally {
            txn.close();
        }
		logger.debug("------######end process");
        return status;
    }
}

pom.xml:

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.ips</groupId>
  <artifactId>demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <name>demo</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
		<version>1.9.0</version>
    </dependency>

    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>19.0</version>
    </dependency>

    <dependency>
        <groupId>ru.yandex.clickhouse</groupId>
        <artifactId>clickhouse-jdbc</artifactId>
        <version>0.2.3</version>
    </dependency>
	<dependency>
        <groupId>cc.blynk.clickhouse</groupId>
        <artifactId>clickhouse4j</artifactId>
        <version>1.4.4</version>
    </dependency>
	<dependency>  
       <groupId>log4j</groupId>  
       <artifactId>log4j</artifactId>  
       <version>1.2.15</version>  
       <exclusions>  
         <exclusion>  
           <groupId>javax.mail</groupId>  
           <artifactId>mail</artifactId>  
         </exclusion>  
         <exclusion>  
           <groupId>javax.jms</groupId>  
           <artifactId>jms</artifactId>  
         </exclusion>  
         <exclusion>  
           <groupId>com.sun.jdmk</groupId>  
           <artifactId>jmxtools</artifactId>  
         </exclusion>  
         <exclusion>  
           <groupId>com.sun.jmx</groupId>  
           <artifactId>jmxri</artifactId>  
         </exclusion>  
       </exclusions>  
       <scope>compile</scope>  
     </dependency> 
  </dependencies>

 <build>
        <finalName>flume-clickhouse-sink</finalName>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                    <manifest>
                    <mainClass>org.ips.demo.ClickHouseSink</mainClass>
                    </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
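
To build and deploy the sink, something along these lines should work (a sketch: the assembly plugin above is not bound to a lifecycle phase, so assembly:single is invoked explicitly, which yields flume-clickhouse-sink-jar-with-dependencies.jar; the Flume path is the one assumed elsewhere in this article):

mvn clean package assembly:single
# the fat jar bundles clickhouse4j, so no extra driver jars are needed for the sink
cp target/flume-clickhouse-sink-jar-with-dependencies.jar /usr/local/apache-flume-1.9.0-bin/lib/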

6. ojdbc6.jar

Because data is read from Oracle, the Oracle JDBC jar must also be copied into Flume's lib directory. It can be downloaded from the web or taken from a database installation under $ORACLE_HOME/jdbc/lib/.
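
For example, assuming the jar is taken from a local database installation and Flume lives under /usr/local:

cp $ORACLE_HOME/jdbc/lib/ojdbc6.jar /usr/local/apache-flume-1.9.0-bin/lib/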

7. Configuration

Edit the Flume configuration file. The key setting is custom.query (using the table property is not recommended): a custom SQL statement can hide some of the differences between the two databases, for example by mapping NULL dates to fixed placeholder values.
vi conf/flume-sql.conf
agentTest.channels = channel_db
agentTest.sources = oracle
agentTest.sinks = clickhouse
###########sql source#################
####### For each Test of the sources, the type is defined
agentTest.sources.oracle.type = org.keedio.flume.source.SQLSource
agentTest.sources.oracle.hibernate.connection.url = jdbc:oracle:thin:@192.168.xxx.xxx:1522:xxx

####### Hibernate Database connection properties
agentTest.sources.oracle.hibernate.connection.user = xxx
agentTest.sources.oracle.hibernate.connection.password = xxx
agentTest.sources.oracle.hibernate.connection.autocommit = true
agentTest.sources.oracle.hibernate.dialect = org.hibernate.dialect.Oracle10gDialect
agentTest.sources.oracle.hibernate.connection.driver_class = oracle.jdbc.driver.OracleDriver
agentTest.sources.oracle.run.query.delay=10000
agentTest.sources.oracle.enclose.by.quotes = true

agentTest.sources.oracle.status.file.path = /usr/local/apache-flume-1.9.0-bin
agentTest.sources.oracle.status.file.name = agentTest.sqlSource.status

#######Custom query
#agentTest.sources.oracle.table = TDBA_TEST
agentTest.sources.oracle.start.from = 1
agentTest.sources.oracle.custom.query = select tid,tsn,tno,tamt,to_char(create_date,'yyyy-mm-dd') as create_date,nvl(to_char(update_date,'yyyy-mm-dd'),'0000-00-00') as update_date,nvl(to_char(update_time,'yyyy-mm-dd hh24:mi:ss'),'0000-00-00 00:00:00') from tdba_test where tid>=$@$

agentTest.sources.oracle.batch.size = 1000
agentTest.sources.oracle.max.rows = 1000
agentTest.sources.oracle.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
agentTest.sources.oracle.hibernate.c3p0.min_size = 1
agentTest.sources.oracle.hibernate.c3p0.max_size = 10
agentTest.sources.oracle.channels = channel_db

##############################
agentTest.channels.channel_db.type = memory
agentTest.channels.channel_db.capacity = 1000
agentTest.channels.channel_db.transactionCapacity = 1000
agentTest.channels.channel_db.byteCapacityBufferPercentage = 20
agentTest.channels.channel_db.byteCapacity = 1600000

agentTest.sinks.clickhouse.type = org.ips.demo.ClickHouseSink
agentTest.sinks.clickhouse.channel = channel_db
agentTest.sinks.clickhouse.host = 127.0.0.1
agentTest.sinks.clickhouse.port = 8123
agentTest.sinks.clickhouse.database = default
agentTest.sinks.clickhouse.table = TDBA_TEST
agentTest.sinks.clickhouse.batchSize = 10000
agentTest.sinks.clickhouse.user = default
agentTest.sinks.clickhouse.password = test#1234

8. Run

./bin/flume-ng agent --conf conf --conf-file conf/flume-sql.conf --name agentTest -Dflume.root.logger=DEBUG,console

9. Verification

ClickHouse: (screenshot of the query result omitted)
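
In place of the screenshot, the load can be checked on the ClickHouse server with a query such as the following (clickhouse-client plus the user, password and database from the sink configuration above); it should return the ten rows inserted into Oracle:

clickhouse-client --user default --password 'test#1234' --query "SELECT * FROM default.TDBA_TEST ORDER BY TID"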
Status file:
[root@ipshis apache-flume-1.9.0-bin]# more agentTest.sqlSource.status
{"SourceName":"oracle","URL":"jdbc:oracle:thin:@192.168.xxx.xxx:1522:xxx","LastIndex":"11","Query":"select tid,tsn,tno,tamt,to_char(create_date,'yyyy-mm-dd') as create_date,nvl(to_char(update_date,'yyyy-mm-dd'),'0000-00-00') as update_date,nvl(to_char(update_time,'yyyy-mm-dd hh24:mi:ss'),'0000-00-00 00:00:00') from TDBA_TEST where tid>=$@$"}
[root@ipshis apache-flume-1.9.0-bin]#
