一.前言
最近研究flume发现网络上的资料多少都有些问题,针对clickhouse引用最多的flume sink是:https://reviews.apache.org/r/50692/diff/1#2,这个源码的pom.xml存在问题,无法编译;其次clickhouse官网的jdbc驱动效率不高,鉴于此,本文介绍利用官网推荐的第三方jdbc重写flume sink,环境基于RHEL6,相关过程见下文。
二.数据库环境准备
oracle:建表列类型&Insert的数据(包含特殊字符&null)尽可能接近生产环境。
-- Source-side test table; column types chosen to mirror production
-- (numeric, string, date and datetime columns that must survive the
-- Oracle -> Flume -> ClickHouse pipeline).
create table tdba_test
(
    TID         number          constraint tdba_test_pk primary key,  -- incrementing id used by flume-ng-sql-source as the offset column
    TSN         varchar2(40),                                         -- nullable string, exercised with quotes/commas below
    TNO         number,
    TAMT        number(15,2),                                         -- exact decimal; maps to ClickHouse Decimal(15,2)
    CREATE_DATE date,
    UPDATE_DATE date,
    UPDATE_TIME date                                                  -- Oracle DATE carries time; maps to ClickHouse DateTime
);
-- Fixture rows. Explicit column lists so the statements keep working if the
-- table ever gains columns. Rows 4-5 exercise embedded quotes/commas (CSV
-- escaping in the sink); rows 6-10 exercise NULL in progressively more columns.
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (1,'SN001',1,1.1,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (2,'SN002',2,2.2,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (3,'SN003',3,3.3,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
-- embedded single quotes: stored value is SN','004
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (4,'SN'',''004',4,4.4,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
-- embedded double quotes and commas: stored value is SN","005,
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (5,'SN","005,',5,5.5,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (6,NULL,6,6.6,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (7,NULL,NULL,7.7,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (8,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),to_date('2020-01-02','yyyy-mm-dd'),to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (9,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),NULL,to_date('2020-01-02 01:01:01','yyyy-mm-dd hh24:mi:ss'));
insert into tdba_test (TID, TSN, TNO, TAMT, CREATE_DATE, UPDATE_DATE, UPDATE_TIME) values (10,NULL,NULL,NULL,to_date('2020-01-01','yyyy-mm-dd'),NULL,NULL);
clickhouse:
-- Target table. The original used the deprecated legacy engine syntax
-- MergeTree(CREATE_DATE, (TID), 8192); the modern equivalent below partitions
-- by month of CREATE_DATE and orders by TID, with the same index granularity.
-- Columns are non-Nullable on purpose: the source query NVLs NULLs away.
create table TDBA_TEST
(
    TID         UInt32,
    TSN         String,
    TNO         UInt16,
    TAMT        Decimal(15,2),
    CREATE_DATE Date,
    UPDATE_DATE Date,
    UPDATE_TIME DateTime
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(CREATE_DATE)
ORDER BY TID
SETTINGS index_granularity = 8192;
三.下载flume
下载flume,解压即可
http://www.apache.org/dyn/closer.lua/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz
四.flume source
源码地址:https://github.com/keedio/flume-ng-sql-source.git
编译打包后把jar包放入flume的lib目录即可。
五.flume sink
开发流程:新建一个类,实现Configurable接口,继承AbstractSink类。重写configure、start、stop、process方法,不多介绍,实际运行下很容易了解它的运作流程。
源码比较简单,编译打包后jar放入flume lib目录即可。
package org.ips.demo;
/*
creation_time created_by update_time updated_by Description
---------------- ---------------- ---------------- ---------------- ------------------------------------------------------------
202005 xlzhu@ips.com flume sink for clickhouse database
*/
/**
 * Configuration keys and defaults for {@code ClickHouseSink}.
 *
 * Pure constants holder: declared {@code final} with a private constructor
 * so it cannot be instantiated or subclassed.
 */
public final class ClickHouseSinkConstants {
    /** Flume context key: ClickHouse host (with or without the jdbc: prefix). */
    public static final String HOST = "host";
    /** Flume context key: ClickHouse HTTP port. */
    public static final String PORT = "port";
    /** Flume context key: max events drained from the channel per transaction. */
    public static final String BATCH_SIZE = "batchSize";
    public static final String USER = "user";
    public static final String PASSWORD = "password";
    public static final String DATABASE = "database";
    public static final String TABLE = "table";

    /** ClickHouse default HTTP interface port. */
    public static final String DEFAULT_PORT = "8123";
    public static final int DEFAULT_BATCH_SIZE = 10000;
    public static final String DEFAULT_USER = "";
    public static final String DEFAULT_PASSWORD = "";

    /** Bulk-load statement template: %s placeholders are database and table. */
    public static final String SQL_INSERT = "INSERT INTO %s.%s FORMAT CSV";
    /** clickhouse4j driver class (third-party driver recommended by ClickHouse docs). */
    public static final String JDBC_DRIVER_CLASS = "cc.blynk.clickhouse.ClickHouseDriver";

    private ClickHouseSinkConstants() {
        // no instances
    }
}
package org.ips.demo;
/*
creation_time created_by update_time updated_by Description
---------------- ---------------- ---------------- ---------------- ------------------------------------------------------------
202005 xlzhu@ips.com flume sink for clickhouse database using Third-party drivers
references:https://clickhouse.tech/docs/en/interfaces/jdbc/
https://github.com/blynkkk/clickhouse4j
*/
import com.google.common.base.Preconditions;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import cc.blynk.clickhouse.copy.*;
import cc.blynk.clickhouse.ClickHouseDataSource;
import cc.blynk.clickhouse.ClickHouseConnectionImpl;
import java.sql.*;
import java.nio.charset.StandardCharsets;
import static org.ips.demo.ClickHouseSinkConstants.*;
/**
 * Flume sink that drains CSV-formatted events from a channel and bulk-loads
 * them into ClickHouse via the clickhouse4j CopyManager
 * ({@code INSERT INTO db.table FORMAT CSV} over the HTTP interface).
 *
 * Lifecycle: {@link #configure} reads connection settings from the Flume
 * context, {@link #start} builds the data source, {@link #process} moves one
 * batch per call inside a channel transaction.
 */
public class ClickHouseSink extends AbstractSink implements Configurable {
    private static final Logger logger = LoggerFactory.getLogger(ClickHouseSink.class);
    private ClickHouseDataSource dataSource = null;
    private SinkCounter sinkCounter = null;
    private String host = null;
    private String port = null;
    private String user = null;
    private String password = null;
    private String database = null;
    private String table = null;
    private int batchSize;

    /**
     * Reads sink settings from the agent configuration.
     * host, database and table are mandatory; port, user, password and
     * batchSize fall back to the defaults in {@link ClickHouseSinkConstants}.
     */
    @Override
    public void configure(Context context) {
        logger.debug("------######begin configure...");
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }
        String hostValue = context.getString(HOST);
        Preconditions.checkArgument(hostValue != null && hostValue.length() > 0,
                "ClickHouse host must be specified!");
        this.host = hostValue;
        // Accept either a bare host[:port] or a full jdbc URL.
        if (!this.host.startsWith("jdbc:clickhouse://")) {
            this.host = "jdbc:clickhouse://" + this.host;
        }
        String databaseValue = context.getString(DATABASE);
        Preconditions.checkArgument(databaseValue != null && databaseValue.length() > 0,
                "ClickHouse database must be specified!");
        this.database = databaseValue;
        String tableValue = context.getString(TABLE);
        Preconditions.checkArgument(tableValue != null && tableValue.length() > 0,
                "ClickHouse table must be specified!");
        this.table = tableValue;
        this.port = context.getString(PORT, DEFAULT_PORT);
        this.user = context.getString(USER, DEFAULT_USER);
        this.password = context.getString(PASSWORD, DEFAULT_PASSWORD);
        this.batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
        logger.debug("------######end configure...");
    }

    /**
     * Builds the ClickHouse data source. Fails fast (instead of the original
     * printStackTrace-and-continue, which left {@code dataSource} null and
     * caused an NPE on the first {@link #process} call) so a bad URL is
     * visible at agent startup.
     */
    @Override
    public void start() {
        logger.debug("------######begin start...");
        String jdbcUrl = String.format("%s:%s/%s", this.host, this.port, this.database);
        try {
            this.dataSource = new ClickHouseDataSource(jdbcUrl);
            logger.debug("------######getDataSource ok");
        } catch (Exception e) {
            logger.error("Unable to create ClickHouse data source for url " + jdbcUrl, e);
            throw new RuntimeException("ClickHouseSink start failed for url " + jdbcUrl, e);
        }
        sinkCounter.start();
        super.start();
        logger.debug("------######end start...");
    }

    @Override
    public void stop() {
        logger.debug("------######begin stop");
        sinkCounter.incrementConnectionClosedCount();
        sinkCounter.stop();
        super.stop();
        logger.debug("------######end stop");
    }

    /**
     * Drains up to {@code batchSize} events from the channel, concatenates
     * their bodies into one newline-separated CSV payload and streams it to
     * ClickHouse. Commits the channel transaction only after a successful
     * load; any failure rolls back so events are redelivered.
     *
     * @return READY when a batch was delivered, BACKOFF when the channel was
     *         empty or the load failed.
     */
    @Override
    public Status process() throws EventDeliveryException {
        logger.debug("------######begin process");
        Status status = null;
        // Start transaction
        Channel ch = getChannel();
        Transaction txn = ch.getTransaction();
        txn.begin();
        try {
            int count;
            StringBuilder batch = new StringBuilder();
            for (count = 0; count < batchSize; ++count) {
                Event event = ch.take();
                if (event == null) {
                    break;
                }
                batch.append(new String(event.getBody(), StandardCharsets.UTF_8)).append("\n");
            }
            logger.debug("------######data from channel:\n" + batch.toString() + "EOF");
            if (count <= 0) {
                sinkCounter.incrementBatchEmptyCount();
                txn.commit();
                return Status.BACKOFF;
            } else if (count < batchSize) {
                sinkCounter.incrementBatchUnderflowCount();
            } else {
                sinkCounter.incrementBatchCompleteCount();
            }
            sinkCounter.addToEventDrainAttemptCount(count);
            // try-with-resources: the original never closed this connection,
            // leaking one HTTP connection per delivered batch.
            try (Connection connection = this.dataSource.getConnection(this.user, this.password)) {
                CopyManager copyManager = CopyManagerFactory.create((ClickHouseConnectionImpl) connection);
                logger.debug("------######copyManager ok");
                logger.debug("------######SQL:" + String.format(SQL_INSERT, this.database, this.table));
                copyManager.copyToDb(String.format(SQL_INSERT, this.database, this.table),
                        new ByteArrayInputStream(batch.toString().getBytes(StandardCharsets.UTF_8)));
                logger.debug("------######copyTodb ok");
            }
            sinkCounter.incrementEventDrainSuccessCount();
            status = Status.READY;
            txn.commit();
            logger.debug("------######commit ok");
        } catch (Throwable t) {
            txn.rollback();
            logger.error(t.getMessage(), t);
            status = Status.BACKOFF;
            // re-throw all Errors
            if (t instanceof Error) {
                throw (Error) t;
            }
        } finally {
            txn.close();
        }
        logger.debug("------######end process");
        return status;
    }
}
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.ips</groupId>
  <artifactId>demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>demo</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>
  <dependencies>
    <!-- provided: the Flume runtime already ships flume-ng-core in its lib
         directory; bundling it into the jar-with-dependencies would risk
         classpath conflicts and bloats the artifact. -->
    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-core</artifactId>
      <version>1.9.0</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>19.0</version>
    </dependency>
    <!-- NOTE: ru.yandex.clickhouse:clickhouse-jdbc was removed — the sink
         uses only the clickhouse4j driver below, and the official driver
         only bloated the shaded jar. -->
    <dependency>
      <groupId>cc.blynk.clickhouse</groupId>
      <artifactId>clickhouse4j</artifactId>
      <version>1.4.4</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.15</version>
      <exclusions>
        <exclusion>
          <groupId>javax.mail</groupId>
          <artifactId>mail</artifactId>
        </exclusion>
        <exclusion>
          <groupId>javax.jms</groupId>
          <artifactId>jms</artifactId>
        </exclusion>
        <exclusion>
          <groupId>com.sun.jdmk</groupId>
          <artifactId>jmxtools</artifactId>
        </exclusion>
        <exclusion>
          <groupId>com.sun.jmx</groupId>
          <artifactId>jmxri</artifactId>
        </exclusion>
      </exclusions>
      <scope>compile</scope>
    </dependency>
  </dependencies>
  <build>
    <finalName>flume-clickhouse-sink</finalName>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>org.ips.demo.ClickHouseSink</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
六.ojdbc6.jar
由于要从oracle读取数据,需要把oracle的jdbc jar包拷入flume的lib目录;jar可以从网络下载或oracle数据库安装目录$ORACLE_HOME/jdbc/lib/
七.配置
编辑flume配置文件,重点在custom.query(不建议使用table),可以通过定制SQL屏蔽部分数据库差异导致的问题(比如null问题)
vi conf/flume-sql.conf
agentTest.channels = channel_db
agentTest.sources = oracle
agentTest.sinks = clickhouse
###########sql source#################
####### For each Test of the sources, the type is defined
agentTest.sources.oracle.type = org.keedio.flume.source.SQLSource
agentTest.sources.oracle.hibernate.connection.url = jdbc:oracle:thin:@192.168.xxx.xxx:1522:xxx
####### Hibernate Database connection properties
agentTest.sources.oracle.hibernate.connection.user = xxx
agentTest.sources.oracle.hibernate.connection.password = xxx
agentTest.sources.oracle.hibernate.connection.autocommit = true
agentTest.sources.oracle.hibernate.dialect = org.hibernate.dialect.Oracle10gDialect
agentTest.sources.oracle.hibernate.connection.driver_class = oracle.jdbc.driver.OracleDriver
agentTest.sources.oracle.run.query.delay=10000
agentTest.sources.oracle.enclose.by.quotes =true
agentTest.sources.oracle.status.file.path = /usr/local/apache-flume-1.9.0-bin
agentTest.sources.oracle.status.file.name = agentTest.sqlSource.status
#######Custom query
#agentTest.sources.oracle.table = TDBA_TEST
agentTest.sources.oracle.start.from = 1
# NVL masks NULL dates so the CSV rows always match the non-Nullable
# ClickHouse columns. Quotes must be plain ASCII single quotes — the
# original used typographic quotes, which Oracle rejects (ORA-00911).
agentTest.sources.oracle.custom.query = select tid,tsn,tno,tamt,to_char(create_date,'yyyy-mm-dd') as create_date,nvl(to_char(update_date,'yyyy-mm-dd'),'0000-00-00') as update_date,nvl(to_char(update_time,'yyyy-mm-dd hh24:mi:ss'),'0000-00-00 00:00:00') from tdba_test where tid>=$@$
agentTest.sources.oracle.batch.size = 1000
agentTest.sources.oracle.max.rows = 1000
agentTest.sources.oracle.hibernate.connection.provider_class = org.hibernate.connection.C3P0ConnectionProvider
agentTest.sources.oracle.hibernate.c3p0.min_size = 1
agentTest.sources.oracle.hibernate.c3p0.max_size = 10
agentTest.sources.oracle.channels = channel_db
##############################
agentTest.channels.channel_db.type = memory
agentTest.channels.channel_db.capacity = 1000
agentTest.channels.channel_db.transactionCapacity = 1000
agentTest.channels.channel_db.byteCapacityBufferPercentage = 20
agentTest.channels.channel_db.byteCapacity = 1600000
agentTest.sinks.clickhouse.type = org.ips.demo.ClickHouseSink
agentTest.sinks.clickhouse.channel = channel_db
agentTest.sinks.clickhouse.host = 127.0.0.1
agentTest.sinks.clickhouse.port = 8123
agentTest.sinks.clickhouse.database = default
agentTest.sinks.clickhouse.table = TDBA_TEST
agentTest.sinks.clickhouse.batchSize = 10000
agentTest.sinks.clickhouse.user = default
agentTest.sinks.clickhouse.password = test#1234
八.运行
./bin/flume-ng agent --conf conf --conf-file conf/flume-sql.conf --name agentTest -Dflume.root.logger=DEBUG,console
九.验证
clickhouse:
status文件:
[root@ipshis apache-flume-1.9.0-bin]# more agentTest.sqlSource.status
{"SourceName":"oracle","URL":"jdbc:oracle:thin:@192.168.xxx.xxx:1522:xxx","LastIndex":"11","Query":"select tid,tsn,tno,tamt,to_char(create_date,'yyyy-mm-dd') as create_date,nvl(to_char(update_date,'yyyy-mm-dd'),'0000-00-00') as updat
e_date,nvl(to_char(update_time,'yyyy-mm-dd hh24:mi:ss'),'0000-00-00 00:00:00') from TDBA_TEST where tid>=$@$"}
[root@ipshis apache-flume-1.9.0-bin]#