系列文章目录
一、DataX详解和架构介绍
二、DataX源码分析 JobContainer
三、DataX源码分析 TaskGroupContainer
四、DataX源码分析 TaskExecutor
五、DataX源码分析 reader
六、DataX源码分析 writer
七、DataX源码分析 Channel
八、DataX源码分析-插件机制
TaskExecutor
在DataX中,TaskExecutor是任务执行器,它负责具体的数据抽取、转换和加载工作。每个TaskExecutor都会对应一个数据同步任务,并且会根据配置的数据源和目标源信息,创建相应的读取器(Reader)和写入器(Writer),然后启动读取和写入线程,开始数据的同步工作。
具体来说,TaskExecutor的启动过程主要做了以下几件事情:
创建了Reader和Writer的线程任务,Reader和Writer共用一个Channel。这个Channel是DataX中用于数据传输的通道,Reader从数据源读取数据后,会通过Channel将数据发送给Writer;Writer则从Channel中获取数据,并写入到目标源中。
先启动Writer线程,再启动Reader线程。这是因为在数据同步过程中,Writer需要等待Reader发送数据过来才能进行写入操作,所以需要先启动Writer线程,让它处于等待状态;然后再启动Reader线程,开始读取数据并通过Channel发送给Writer。
至此,同步数据的Task任务已经启动了。在任务执行过程中,TaskExecutor会监控Reader和Writer的运行状态,并根据需要进行相应的处理,比如处理读取或写入异常、记录同步的进度信息等。
需要注意的是,一个TaskExecutor是对应一个数据同步任务的,如果需要同时执行多个数据同步任务,就需要创建多个TaskExecutor。另外,TaskExecutor在执行完任务后会自动关闭相关的资源,比如关闭读取器和写入器的连接等。
初始化
- 创建channel实例
- 创建writerRunner,在WriterRunner核心run方法中,主要进行了对Writer插件各个生命周期的调用和每个阶段的耗时统计。
- 创建writerThread线程,通过设置thread的contextClassLoader,使插件线程使用与主程序不同的类加载器。
-
- 创建readerRunner,在ReaderRunner核心run方法中,主要进行了对Reader插件各个生命周期的调用和每个阶段的耗时统计,其中重要的方法:taskReader.startRead(recordSender);
- 创建readerThread线程,设置thread的contextClassLoader
doStart
- 启动writerThread线程
run方法主要执行流程
//获取Writer插件实例
Writer.Task taskWriter = (Writer.Task) this.getPlugin();
//Writer插件各个生命周期的调用
taskWriter.init();
taskWriter.prepare();
taskWriter.startWrite(recordReceiver);
taskWriter.post();
super.markSuccess();
- 启动readerThread线程
run方法主要执行流程
//获取Reader插件实例
Reader.Task taskReader = (Reader.Task) this.getPlugin();
//Reader插件各个生命周期的调用
taskReader.init();
taskReader.prepare();
taskReader.startRead(recordSender);
taskReader.post();
TaskExecutor源码
/**
 * Executor for one complete task.
 * It pairs exactly one reader with one writer (1:1) and runs each on its own thread.
 */
class TaskExecutor {
    private Configuration taskConfig;

    private int taskId;

    private int attemptCount;

    private Channel channel;

    private Thread readerThread;

    private Thread writerThread;

    private ReaderRunner readerRunner;

    private WriterRunner writerRunner;

    /**
     * The Communication of this task. It is shared by:
     * 1. the channel
     * 2. readerRunner and writerRunner
     * 3. the taskPluginCollector of the reader and of the writer
     */
    private Communication taskCommunication;

    /**
     * Builds the channel, both runners and both threads for one task. Nothing is started here.
     *
     * @param taskConf     configuration of this task; must contain reader and writer sections
     * @param attemptCount attempt counter stored for this executor
     */
    public TaskExecutor(Configuration taskConf, int attemptCount) {
        // Keep the configuration of this executor and make sure both plugins are configured.
        this.taskConfig = taskConf;
        final boolean pluginsConfigured =
                this.taskConfig.getConfiguration(CoreConstant.JOB_READER) != null
                        && this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER) != null;
        Validate.isTrue(pluginsConfigured, "[reader|writer]的插件参数不能为空!");

        // Resolve the task id.
        this.taskId = this.taskConfig.getInt(CoreConstant.TASK_ID);
        this.attemptCount = attemptCount;

        // Look up this executor's Communication by task id. It is handed to the
        // readerRunner and writerRunner, and to the channel for statistics.
        this.taskCommunication = containerCommunicator.getCommunication(taskId);
        Validate.notNull(this.taskCommunication,
                String.format("taskId[%d]的Communication没有注册过", taskId));

        this.channel = ClassUtil.instantiate(channelClazz, Channel.class, configuration);
        this.channel.setCommunication(this.taskCommunication);

        // Read the transformer settings of this task.
        List<TransformerExecution> transformers = TransformerUtil.buildTransformerInfo(taskConfig);

        // Build the writer thread.
        this.writerRunner = (WriterRunner) generateRunner(PluginType.WRITER);
        final String writerThreadName =
                String.format("%d-%d-%d-writer", jobId, taskGroupId, this.taskId);
        this.writerThread = new Thread(this.writerRunner, writerThreadName);
        // Setting the thread's contextClassLoader gives the plugin a class loader
        // that differs from the main program's.
        this.writerThread.setContextClassLoader(LoadUtil.getJarLoader(
                PluginType.WRITER,
                this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME)));

        // Build the reader thread.
        this.readerRunner = (ReaderRunner) generateRunner(PluginType.READER, transformers);
        final String readerThreadName =
                String.format("%d-%d-%d-reader", jobId, taskGroupId, this.taskId);
        this.readerThread = new Thread(this.readerRunner, readerThreadName);
        // Same class-loader arrangement as for the writer thread.
        this.readerThread.setContextClassLoader(LoadUtil.getJarLoader(
                PluginType.READER,
                this.taskConfig.getString(CoreConstant.JOB_READER_NAME)));
    }

    /**
     * Starts the writer thread first and the reader thread second, raising a runtime
     * error right away if either side is already known to have failed.
     */
    public void doStart() {
        this.writerThread.start();

        // The writer cannot have finished while the reader has not been started yet.
        final boolean writerHealthy = this.writerThread.isAlive()
                && this.taskCommunication.getState() != State.FAILED;
        if (!writerHealthy) {
            throw DataXException.asDataXException(
                    FrameworkErrorCode.RUNTIME_ERROR,
                    this.taskCommunication.getThrowable());
        }

        this.readerThread.start();

        // The reader may finish very quickly. If its thread died immediately after
        // start-up and the task is marked failed, report the error at once.
        final boolean readerDiedOnStart = !this.readerThread.isAlive()
                && this.taskCommunication.getState() == State.FAILED;
        if (readerDiedOnStart) {
            throw DataXException.asDataXException(
                    FrameworkErrorCode.RUNTIME_ERROR,
                    this.taskCommunication.getThrowable());
        }
    }

    /**
     * Creates a runner for the given plugin type without any transformers.
     */
    private AbstractRunner generateRunner(PluginType pluginType) {
        return generateRunner(pluginType, null);
    }

    /**
     * Creates and wires the runner for the given plugin type.
     *
     * @param pluginType           READER or WRITER; any other value is rejected
     * @param transformerInfoExecs transformers for the reader side; may be null or empty
     * @return the runner with task group id, task id and communication set
     */
    private AbstractRunner generateRunner(PluginType pluginType, List<TransformerExecution> transformerInfoExecs) {
        final AbstractRunner runner;
        switch (pluginType) {
            case READER:
                runner = buildReaderRunner(transformerInfoExecs);
                break;
            case WRITER:
                runner = buildWriterRunner();
                break;
            default:
                throw DataXException.asDataXException(FrameworkErrorCode.ARGUMENT_ERROR, "Cant generateRunner for:" + pluginType);
        }

        runner.setTaskGroupId(taskGroupId);
        runner.setTaskId(this.taskId);
        runner.setRunnerCommunication(this.taskCommunication);
        return runner;
    }

    /**
     * Loads the reader plugin runner and attaches its job configuration, record sender
     * and plugin collector.
     */
    private AbstractRunner buildReaderRunner(List<TransformerExecution> transformerInfoExecs) {
        AbstractRunner runner = LoadUtil.loadPluginRunner(PluginType.READER,
                this.taskConfig.getString(CoreConstant.JOB_READER_NAME));
        runner.setJobConf(this.taskConfig.getConfiguration(CoreConstant.JOB_READER_PARAMETER));

        TaskPluginCollector collector = ClassUtil.instantiate(
                taskCollectorClass, AbstractTaskPluginCollector.class,
                configuration, this.taskCommunication, PluginType.READER);

        // Use the transformer-aware exchanger only when transformers are configured.
        final boolean hasTransformers =
                transformerInfoExecs != null && !transformerInfoExecs.isEmpty();
        RecordSender sender = hasTransformers
                ? new BufferedRecordTransformerExchanger(taskGroupId, this.taskId, this.channel,
                        this.taskCommunication, collector, transformerInfoExecs)
                : new BufferedRecordExchanger(this.channel, collector);
        ((ReaderRunner) runner).setRecordSender(sender);

        // The task plugin collector handles dirty data and job/task communication.
        runner.setTaskPluginCollector(collector);
        return runner;
    }

    /**
     * Loads the writer plugin runner and attaches its job configuration, record receiver
     * and plugin collector.
     */
    private AbstractRunner buildWriterRunner() {
        AbstractRunner runner = LoadUtil.loadPluginRunner(PluginType.WRITER,
                this.taskConfig.getString(CoreConstant.JOB_WRITER_NAME));
        runner.setJobConf(this.taskConfig.getConfiguration(CoreConstant.JOB_WRITER_PARAMETER));

        TaskPluginCollector collector = ClassUtil.instantiate(
                taskCollectorClass, AbstractTaskPluginCollector.class,
                configuration, this.taskCommunication, PluginType.WRITER);

        ((WriterRunner) runner).setRecordReceiver(
                new BufferedRecordExchanger(this.channel, collector));

        // The task plugin collector handles dirty data and job/task communication.
        runner.setTaskPluginCollector(collector);
        return runner;
    }

    /**
     * Tells whether the task has finished: both threads must have stopped and the
     * task communication must exist and report itself finished.
     */
    private boolean isTaskFinished() {
        return !readerThread.isAlive()
                && !writerThread.isAlive()
                && taskCommunication != null
                && taskCommunication.isFinished();
    }

    private int getTaskId() {
        return this.taskId;
    }

    /**
     * Returns the timestamp held by the task communication.
     */
    private long getTimeStamp() {
        return this.taskCommunication.getTimestamp();
    }

    private int getAttemptCount() {
        return this.attemptCount;
    }

    /**
     * Delegates to the writer runner to decide whether fail-over is supported.
     */
    private boolean supportFailOver() {
        return this.writerRunner.supportFailOver();
    }

    /**
     * Shuts down both runners (writer first), then interrupts whichever threads
     * are still alive (writer first).
     */
    private void shutdown() {
        this.writerRunner.shutdown();
        this.readerRunner.shutdown();
        interruptIfAlive(this.writerThread);
        interruptIfAlive(this.readerThread);
    }

    /**
     * Interrupts the given thread only when it is still alive.
     */
    private void interruptIfAlive(Thread worker) {
        if (worker.isAlive()) {
            worker.interrupt();
        }
    }

    /**
     * Tells whether both the reader thread and the writer thread have stopped.
     */
    private boolean isShutdown() {
        return !(readerThread.isAlive() || writerThread.isAlive());
    }
}
}