Purpose
- Synchronize an entire MySQL database into the lake, creating the table schemas automatically
- Read Iceberg from Flink to build a near-real-time data warehouse
Whole-Database Ingestion
Data Preparation
The MySQL tables are defined as follows:
create table cdc_tb_1 (
id int primary key auto_increment,
name varchar(20)
);
create table cdc_tb_3 (
id int primary key auto_increment,
name varchar(20)
);
create table cdc_tb_2 (
id int primary key auto_increment,
age int
);
-- Insert sample data
insert into cdc_tb_3(name) values('z1'),('z2'),('z3');
insert into cdc_tb_2(age) values(11),(22),(33);
insert into cdc_tb_1(name) values('z1'),('z2'),('z3');
Write Workflow
-- Create the catalog
CREATE CATALOG iceberg WITH (
'type'='iceberg',
'catalog-type'='hive',
'uri'='thrift://xxxxx:9083',
'clients'='5',
'property-version'='1',
'warehouse'='/warehouse/tablespace/managed/iceberg'
);
-- Create the database
CREATE DATABASE IF NOT EXISTS iceberg.flink_iceberg;
-- Create the Iceberg table (format v2 with upsert enabled)
CREATE TABLE `iceberg`.`flink_iceberg`.`cdc_test` (
`id` INT COMMENT 'unique id',
`name` STRING NOT NULL,
PRIMARY KEY(`id`) NOT ENFORCED
) WITH ('format-version'='2', 'write.upsert.enabled'='true');
-- Read the MySQL table via the CDC connector
CREATE TABLE cdc_test (
id INT,
name STRING,
PRIMARY KEY(id) NOT ENFORCED
) WITH (
'connector' = 'mysql-cdc',
'hostname' = 'xxxxx',
'port' = '3306',
'scan.startup.mode' = 'initial',
'username' = 'root',
'password' = 'xxxxxx',
'database-name' = 'xxx',
'table-name' = 'xxxx');
-- Stream the CDC table into Iceberg
insert into `iceberg`.`flink_iceberg`.`cdc_test` select * from cdc_test;
This approach only ingests one source table into one Iceberg table at a time, which is inefficient in real production use. To speed up ingestion, we explore a whole-database implementation with two planned features:
- whole-database ingestion in a single job
- automatic creation of the Iceberg tables (from the source table schemas)
Implementation:
- Fetch the source tables' metadata via JDBC: table names, table schemas, columns, primary keys, and so on; for example:
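The metadata can be read through java.sql.DatabaseMetaData. loadSchemas below is a hypothetical helper sketching this step; its outputs feed the `list` and `schemaHashMap` variables used in the next snippet (host and credentials are the masked placeholders used throughout):

import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical helper: fills `tableList` with "db.table" names (the format
// tableList() expects) and returns per-table column/primary-key metadata.
static Map<String, List<String>> loadSchemas(String db, List<String> tableList) throws SQLException {
    Map<String, List<String>> schemaMap = new HashMap<>();
    try (Connection conn = DriverManager.getConnection(
            "jdbc:mysql://xxx:3306/" + db, "root", "xxxx")) {
        DatabaseMetaData meta = conn.getMetaData();
        try (ResultSet tables = meta.getTables(db, null, "%", new String[]{"TABLE"})) {
            while (tables.next()) {
                String table = tables.getString("TABLE_NAME");
                tableList.add(db + "." + table);
                List<String> columns = new ArrayList<>();
                try (ResultSet cols = meta.getColumns(db, null, table, "%")) {
                    while (cols.next()) {
                        columns.add(cols.getString("COLUMN_NAME") + ":" + cols.getString("TYPE_NAME"));
                    }
                }
                try (ResultSet pks = meta.getPrimaryKeys(db, null, table)) {
                    while (pks.next()) {
                        columns.add("PK:" + pks.getString("COLUMN_NAME"));
                    }
                }
                schemaMap.put(table, columns);
            }
        }
    }
    return schemaMap;
}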
- Subscribe to the whole database's binlog via the DataStream API and Flink CDC:
// Subscribe to the whole database (initial snapshot, then binlog).
import com.ververica.cdc.connectors.mysql.source.MySqlSource;
import com.ververica.cdc.connectors.mysql.table.StartupOptions;
import org.apache.flink.table.data.RowData;

MySqlSource<RowData> source = MySqlSource.<RowData>builder()
        .hostname("xxx")
        .port(3306)
        .username("xxx")
        .password("xxxx")
        .startupOptions(StartupOptions.initial())   // snapshot first, then stream the binlog
        .databaseList("flink_iceberg")
        .tableList(list.toArray(new String[]{}))    // "db.table" names collected via JDBC
        .deserializer(new CustomDebeziumDeserializationSchema(schemaHashMap))
        .build();
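CustomDebeziumDeserializationSchema is user code that the post does not show. The following is a minimal sketch of the idea, assuming flink-cdc's DebeziumDeserializationSchema interface and INT/STRING columns only; treat it as one possible shape, not the author's exact implementation:

import java.util.List;
import java.util.Map;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.types.RowKind;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

// Sketch: convert Debezium change events into RowData with the right RowKind.
public class CustomDebeziumDeserializationSchema
        implements DebeziumDeserializationSchema<RowData> {

    private final Map<String, List<String>> schemaHashMap;  // table -> column metadata (for typing)

    public CustomDebeziumDeserializationSchema(Map<String, List<String>> schemaHashMap) {
        this.schemaHashMap = schemaHashMap;
    }

    @Override
    public void deserialize(SourceRecord record, Collector<RowData> out) {
        // record.topic() is "<server>.<db>.<table>"; the production version keeps
        // the table name (e.g. in a Tuple2) so the routing step can use it.
        Struct value = (Struct) record.value();
        String op = value.getString("op");  // "c"=insert, "u"=update, "d"=delete, "r"=snapshot
        Struct payload = "d".equals(op) ? value.getStruct("before") : value.getStruct("after");
        GenericRowData row = new GenericRowData(payload.schema().fields().size());
        int i = 0;
        for (Field f : payload.schema().fields()) {
            Object v = payload.get(f);
            row.setField(i++, v instanceof String ? StringData.fromString((String) v) : v);
        }
        // With write.upsert.enabled, INSERT covers both inserts and updates.
        row.setRowKind("d".equals(op) ? RowKind.DELETE : RowKind.INSERT);
        out.collect(row);
    }

    @Override
    public TypeInformation<RowData> getProducedType() {
        return TypeInformation.of(RowData.class);
    }
}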
- Automatically create the corresponding Iceberg tables from that metadata; for example:
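A minimal sketch of this step with the Iceberg Java API, reusing the Hive catalog settings from the SQL section. Column types are hard-coded here for brevity; the real version would map them from the JDBC metadata:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.types.Types;

// Connect to the same Hive catalog the Flink SQL snippet above points at.
HiveCatalog catalog = new HiveCatalog();
catalog.setConf(new Configuration());
Map<String, String> props = new HashMap<>();
props.put("uri", "thrift://xxxxx:9083");
props.put("warehouse", "/warehouse/tablespace/managed/iceberg");
catalog.initialize("iceberg", props);

// Register `id` as an identifier field so upsert writes can match rows
// (identifier fields must be required columns).
Types.NestedField id = Types.NestedField.required(1, "id", Types.IntegerType.get());
Types.NestedField name = Types.NestedField.optional(2, "name", Types.StringType.get());
Schema schema = new Schema(List.of(id, name), Set.of(id.fieldId()));

Map<String, String> tableProps = new HashMap<>();
tableProps.put("format-version", "2");
tableProps.put("write.upsert.enabled", "true");
TableIdentifier ident = TableIdentifier.of("flink_iceberg", "cdc_tb_1");
if (!catalog.tableExists(ident)) {
    catalog.createTable(ident, schema, PartitionSpec.unpartitioned(), tableProps);
}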
- Split the stream by table name and write each table to Iceberg (a routing sketch follows the snippet below):
// Write one table's stream with the DataStream API sink.
// Upsert mode matches rows on the Iceberg table's identifier fields (the primary key).
FlinkSink.forRowData(dataStream)
        .distributionMode(DistributionMode.HASH)  // cluster rows so the same key hits the same writer
        .upsert(true)                             // emit v2 equality deletes for updates
        .tableLoader(tableLoader)
        .append();
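The post does not show the routing code itself. The sketch below is one workable variant, not the author's exact implementation: it assumes the deserializer is adjusted to emit Tuple2<table name, RowData> (taggedSource), and that tableNames, hadoopConf, and catalogProps carry the metadata and catalog settings from earlier. Each table gets its own side output and FlinkSink:

import java.util.HashMap;
import java.util.Map;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.CatalogLoader;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(60_000L);  // Iceberg commits data files on checkpoints

// One side-output tag per table (anonymous subclass keeps the type info).
Map<String, OutputTag<RowData>> tags = new HashMap<>();
for (String table : tableNames) {              // plain names: cdc_tb_1, cdc_tb_2, ...
    tags.put(table, new OutputTag<RowData>(table) {});
}

SingleOutputStreamOperator<RowData> routed = env
        .fromSource(taggedSource, WatermarkStrategy.noWatermarks(), "mysql-cdc")
        .process(new ProcessFunction<Tuple2<String, RowData>, RowData>() {
            @Override
            public void processElement(Tuple2<String, RowData> in, Context ctx,
                                       Collector<RowData> out) {
                ctx.output(tags.get(in.f0), in.f1);  // route by table name
            }
        });

// One upsert sink per table, loading tables from the same Hive catalog.
CatalogLoader catalogLoader = CatalogLoader.hive("iceberg", hadoopConf, catalogProps);
for (String table : tableNames) {
    TableLoader loader = TableLoader.fromCatalog(
            catalogLoader, TableIdentifier.of("flink_iceberg", table));
    FlinkSink.forRowData(routed.getSideOutput(tags.get(table)))
            .distributionMode(DistributionMode.HASH)
            .upsert(true)
            .tableLoader(loader)
            .append();
}
env.execute("mysql-to-iceberg");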
Final results:
Querying through the Spark integration:
- The Iceberg tables were created successfully
- Query data (select * from hive.flink_iceberg.cdc_tb_1;)
- Query data (select * from hive.flink_iceberg.cdc_tb_2;)
- Query data (select * from hive.flink_iceberg.cdc_tb_3;)
The results are consistent with the data in MySQL.
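For reference, the Spark side only needs an Iceberg catalog named hive pointing at the same metastore (with the matching iceberg-spark-runtime jar on the classpath); a minimal spark-sql invocation using the placeholder values from above:

spark-sql \
  --conf spark.sql.catalog.hive=org.apache.iceberg.spark.SparkCatalog \
  --conf spark.sql.catalog.hive.type=hive \
  --conf spark.sql.catalog.hive.uri=thrift://xxxxx:9083 \
  --conf spark.sql.catalog.hive.warehouse=/warehouse/tablespace/managed/iceberg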
Integration JARs
Flink 1.17.1, Iceberg 1.3.0 (built from the master branch)
connect-api-2.7.0.jar
flink-cep-1.17.1.jar
flink-connector-files-1.17.1.jar
flink-connector-jdbc-3.1.0-1.17.jar
flink-connector-mysql-cdc-2.4.0.jar_bk (renamed backup, not on the classpath; the sql-connector uber jar below is used instead)
flink-csv-1.17.1.jar
flink-dist-1.17.1.jar
flink-json-1.17.1.jar
flink-scala_2.12-1.17.1.jar
flink-shaded-hadoop-3-uber-3.1.1.7.1.1.0-565-9.0.jar
flink-sql-connector-mysql-cdc-2.4.0.jar
flink-table-api-java-uber-1.17.1.jar
flink-table-planner-loader-1.17.1.jar
flink-table-runtime-1.17.1.jar
hive-exec-3.1.0.3.1.0.0-78.jar
iceberg-flink-runtime-1.17-1.3.0-SNAPSHOT.jar
iceberg-flink-runtime-1.17-1.4.0-SNAPSHOT.jar (note: only one iceberg-flink-runtime jar should be on the classpath at a time)
kafka-clients-2.7.0.jar
libfb303-0.9.3.jar
libthrift-0.13.0.jar
log4j-1.2-api-2.17.1.jar
log4j-api-2.17.1.jar
log4j-core-2.17.1.jar
log4j-slf4j-impl-2.17.1.jar
mysql-connector-java-5.1.47.jar