Kafka源码解析（二）-消费者

执迷迷

已于 2022-06-12 21:08:28 修改

阅读量2.2k

点赞数 1

CC 4.0 BY-SA版权

文章标签： kafka zookeeper 分布式

于 2022-06-12 20:48:50 首次发布

本文链接：https://blog.csdn.net/qq_40344505/article/details/125246822

本文深入解析了Apache Kafka消费者的工作流程，包括初始化过程、消息拉取机制以及offset提交的同步和异步方式。在初始化阶段，详细介绍了消费者如何配置元数据、设置拦截器和反序列化器等。在消息拉取部分，讲解了消费者如何订阅主题、触发分区分配以及如何通过poll方法获取消息。最后，讨论了同步和异步提交offset的实现细节，包括重试和错误处理策略。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Consumer初始化

Consumer消息拉取

Consumer的offset提交

Consumer初始化

org.apache.kafka.clients.consumer.KafkaConsumer#KafkaConsumer(org.apache.kafka.clients.consumer.ConsumerConfig, org.apache.kafka.common.serialization.Deserializer<K>, org.apache.kafka.common.serialization.Deserializer<V>)

    /**
     * Builds a consumer from an already-parsed {@link ConsumerConfig}.
     *
     * <p>Wires up, in order: group/rebalance settings, logging context, metrics, interceptors,
     * key/value deserializers, subscription state, metadata, the network clients, the partition
     * assignors, the group coordinator (only when a group id is present) and the fetcher.
     *
     * @param config            parsed consumer configuration
     * @param keyDeserializer   deserializer for record keys; when {@code null} an instance of the class
     *                          named by {@code ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG} is created
     *                          and configured
     * @param valueDeserializer deserializer for record values; when {@code null} an instance of the
     *                          class named by {@code ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG} is
     *                          created and configured
     * @throws KafkaException if any step of the construction fails; internals that were already
     *                        created are closed first
     */
    KafkaConsumer(ConsumerConfig config, Deserializer<K> keyDeserializer, Deserializer<V> valueDeserializer) {
        try {
            // Group membership / rebalance related settings
            GroupRebalanceConfig groupRebalanceConfig = new GroupRebalanceConfig(config,
                    GroupRebalanceConfig.ProtocolType.CONSUMER);
            // Consumer group id (may be absent)
            this.groupId = Optional.ofNullable(groupRebalanceConfig.groupId);
            // Client id
            this.clientId = config.getString(CommonClientConfigs.CLIENT_ID_CONFIG);

            LogContext logContext;

            // If group.instance.id is set, we will append it to the log context.
            if (groupRebalanceConfig.groupInstanceId.isPresent()) {
                logContext = new LogContext("[Consumer instanceId=" + groupRebalanceConfig.groupInstanceId.get() +
                        ", clientId=" + clientId + ", groupId=" + groupId.orElse("null") + "] ");
            } else {
                logContext = new LogContext("[Consumer clientId=" + clientId + ", groupId=" + groupId.orElse("null") + "] ");
            }

            this.log = logContext.logger(getClass());
            boolean enableAutoCommit = config.maybeOverrideEnableAutoCommit();
            groupId.ifPresent(groupIdStr -> {
                if (groupIdStr.isEmpty()) {
                    log.warn("Support for using the empty group id by consumers is deprecated and will be removed in the next major release.");
                }
            });

            log.debug("Initializing the Kafka consumer");
            // Maximum time the client waits for a request's response
            // (30 s by default according to the Kafka consumer configuration docs)
            this.requestTimeoutMs = config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG);
            this.defaultApiTimeoutMs = config.getInt(ConsumerConfig.DEFAULT_API_TIMEOUT_MS_CONFIG);
            this.time = Time.SYSTEM;
            // Metrics setup (buildMetrics is defined elsewhere; presumably it also wires the JMX reporter)
            this.metrics = buildMetrics(config, time, clientId);
            // Back-off between retries
            this.retryBackoffMs = config.getLong(ConsumerConfig.RETRY_BACKOFF_MS_CONFIG);
            // Interceptors configured by the user
            List<ConsumerInterceptor<K, V>> interceptorList = (List) config.getConfiguredInstances(
                    ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG,
                    ConsumerInterceptor.class,
                    Collections.singletonMap(ConsumerConfig.CLIENT_ID_CONFIG, clientId));
            this.interceptors = new ConsumerInterceptors<>(interceptorList);
            // Key deserializer: instantiate from config unless the caller supplied one
            if (keyDeserializer == null) {
                this.keyDeserializer = config.getConfiguredInstance(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, Deserializer.class);
                this.keyDeserializer.configure(config.originals(Collections.singletonMap(ConsumerConfig.CLIENT_ID_CONFIG, clientId)), true);
            } else {
                config.ignore(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG);
                this.keyDeserializer = keyDeserializer;
            }
            // Value deserializer: same rule as for the key deserializer
            if (valueDeserializer == null) {
                this.valueDeserializer = config.getConfiguredInstance(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, Deserializer.class);
                this.valueDeserializer.configure(config.originals(Collections.singletonMap(ConsumerConfig.CLIENT_ID_CONFIG, clientId)), false);
            } else {
                config.ignore(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG);
                this.valueDeserializer = valueDeserializer;
            }
            // Offset reset strategy (latest, earliest, none)
            OffsetResetStrategy offsetResetStrategy = OffsetResetStrategy.valueOf(config.getString(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG).toUpperCase(Locale.ROOT));
            this.subscriptions = new SubscriptionState(logContext, offsetResetStrategy);
            // Pass the resolved deserializer fields rather than the constructor parameters: the
            // parameters are null when the deserializers were instantiated from configuration, and a
            // config-created deserializer would otherwise never be offered to the listener registry.
            ClusterResourceListeners clusterResourceListeners = configureClusterResourceListeners(this.keyDeserializer,
                    this.valueDeserializer, metrics.reporters(), interceptorList);
            // Metadata holder. The third argument is the negation of exclude.internal.topics and the
            // fourth is allow.auto.create.topics (both documented as true by default).
            this.metadata = new ConsumerMetadata(retryBackoffMs,
                    config.getLong(ConsumerConfig.METADATA_MAX_AGE_CONFIG),
                    !config.getBoolean(ConsumerConfig.EXCLUDE_INTERNAL_TOPICS_CONFIG),
                    config.getBoolean(ConsumerConfig.ALLOW_AUTO_CREATE_TOPICS_CONFIG),
                    subscriptions, logContext, clusterResourceListeners);
            // Bootstrap addresses of the Kafka cluster
            List<InetSocketAddress> addresses = ClientUtils.parseAndValidateAddresses(
                    config.getList(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG), config.getString(ConsumerConfig.CLIENT_DNS_LOOKUP_CONFIG));
            this.metadata.bootstrap(addresses);
            String metricGrpPrefix = "consumer";

            FetcherMetricsRegistry metricsRegistry = new FetcherMetricsRegistry(Collections.singleton(CLIENT_ID_METRIC_TAG), metricGrpPrefix);
            ChannelBuilder channelBuilder = ClientUtils.createChannelBuilder(config, time, logContext);
            this.isolationLevel = IsolationLevel.valueOf(
                    config.getString(ConsumerConfig.ISOLATION_LEVEL_CONFIG).toUpperCase(Locale.ROOT));
            Sensor throttleTimeSensor = Fetcher.throttleTimeSensor(metrics, metricsRegistry);
            // Heartbeat interval (3 s by default according to the Kafka consumer configuration docs)
            int heartbeatIntervalMs = config.getInt(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG);

            ApiVersions apiVersions = new ApiVersions();
            // Create the low-level NetworkClient. Documented defaults of the settings read below:
            //   reconnect back-off:      50 ms
            //   max reconnect back-off:  1 s
            //   socket send buffer:      128 KB
            //   socket receive buffer:   64 KB
            NetworkClient netClient = new NetworkClient(
                    new Selector(config.getLong(ConsumerConfig.CONNECTIONS_MAX_IDLE_MS_CONFIG), metrics, time, metricGrpPrefix, channelBuilder, logContext),
                    this.metadata,
                    clientId,
                    100, // a fixed large enough value will suffice for max in-flight requests
                    config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG),
                    config.getLong(ConsumerConfig.RECONNECT_BACKOFF_MAX_MS_CONFIG),
                    config.getInt(ConsumerConfig.SEND_BUFFER_CONFIG),
                    config.getInt(ConsumerConfig.RECEIVE_BUFFER_CONFIG),
                    config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG),
                    config.getLong(ConsumerConfig.SOCKET_CONNECTION_SETUP_TIMEOUT_MS_CONFIG),
                    config.getLong(ConsumerConfig.SOCKET_CONNECTION_SETUP_TIMEOUT_MAX_MS_CONFIG),
                    time,
                    true,
                    apiVersions,
                    throttleTimeSensor,
                    logContext);
            // Wrap it in the consumer-specific ConsumerNetworkClient
            this.client = new ConsumerNetworkClient(
                    logContext,
                    netClient,
                    metadata,
                    time,
                    retryBackoffMs,
                    config.getInt(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG),
                    heartbeatIntervalMs); //Will avoid blocking an extended period of time to prevent heartbeat thread starvation
            // Partition assignment strategies configured for this consumer
            this.assignors = ConsumerPartitionAssignor.getAssignorInstances(
                    config.getList(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG),
                    config.originals(Collections.singletonMap(ConsumerConfig.CLIENT_ID_CONFIG, clientId))
            );

            // no coordinator will be constructed for the default (null) group id
            // Coordinator used for consumer-group membership and offset commits
            this.coordinator = !groupId.isPresent() ? null :
                new ConsumerCoordinator(groupRebalanceConfig,
                        logContext,
                        this.client,
                        assignors,
                        this.metadata,
                        this.subscriptions,
                        metrics,
                        metricGrpPrefix,
                        this.time,
                        enableAutoCommit,
                        config.getInt(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG),
                        this.interceptors,
                        config.getBoolean(ConsumerConfig.THROW_ON_FETCH_STABLE_OFFSET_UNSUPPORTED));
            // The fetcher is what the consumer pulls records through; the arguments are its fetch settings
            this.fetcher = new Fetcher<>(
                    logContext,
                    this.client,
                    config.getInt(ConsumerConfig.FETCH_MIN_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.FETCH_MAX_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG),
                    config.getInt(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG),
                    config.getInt(ConsumerConfig.MAX_POLL_RECORDS_CONFIG),
                    config.getBoolean(ConsumerConfig.CHECK_CRCS_CONFIG),
                    config.getString(ConsumerConfig.CLIENT_RACK_CONFIG),
                    this.keyDeserializer,
                    this.valueDeserializer,
                    this.metadata,
                    this.subscriptions,
                    metrics,
                    metricsRegistry,
                    this.time,
                    this.retryBackoffMs,
                    this.requestTimeoutMs,
                    isolationLevel,
                    apiVersions);

            this.kafkaConsumerMetrics = new KafkaConsumerMetrics(metrics, metricGrpPrefix);

            config.logUnused();
            AppInfoParser.registerAppInfo(JMX_PREFIX, clientId, metrics, time.milliseconds());
            log.debug("Kafka consumer initialized");
        } catch (Throwable t) {
            // call close methods if internal objects are already constructed; this is to prevent resource leak. see KAFKA-2121
            // we do not need to call `close` at all when `log` is null, which means no internal objects were initialized.
            if (this.log != null) {
                close(0, true);
            }
            // now propagate the exception
            throw new KafkaException("Failed to construct kafka consumer", t);
        }
    }

Consumer消息拉取

在拉取消息之前，消费者首先要声明自己订阅的主题，具体源码位置为：

org.apache.kafka.clients.consumer.KafkaConsumer#subscribe(java.util.Collection<java.lang.String>, org.apache.kafka.clients.consumer.ConsumerRebalanceListener)

    /**
     * Subscribes this consumer to the given topics and registers a rebalance listener.
     *
     * <p>An empty collection is treated exactly like a call to {@code unsubscribe()}.
     *
     * @param topics   topics to subscribe to; must not be {@code null} and must not contain blank entries
     * @param listener rebalance listener handed to the subscription state
     * @throws IllegalArgumentException if {@code topics} is {@code null} or contains a blank topic
     */
    public void subscribe(Collection<String> topics, ConsumerRebalanceListener listener) {
        acquireAndEnsureOpen();
        try {
            maybeThrowInvalidGroupIdException();

            // A null collection is a caller error
            if (topics == null) {
                throw new IllegalArgumentException("Topic collection to subscribe to cannot be null");
            }

            // treat subscribing to empty topic list as the same as unsubscribing
            if (topics.isEmpty()) {
                this.unsubscribe();
                return;
            }

            // Reject the request as soon as a blank topic name is found
            if (topics.stream().anyMatch(Utils::isBlank)) {
                throw new IllegalArgumentException("Topic collection to subscribe to cannot contain null or empty topic");
            }

            throwIfNoAssignorsConfigured();
            // Drop buffered data for topics that are not part of the new subscription
            fetcher.clearBufferedDataForUnassignedTopics(topics);
            log.info("Subscribed to topic(s): {}", Utils.join(topics, ", "));

            // Register the subscription (and listener); refresh metadata only if the subscription changed
            final boolean subscriptionChanged = this.subscriptions.subscribe(new HashSet<>(topics), listener);
            if (subscriptionChanged) {
                metadata.requestUpdateForNewTopics();
            }
        } finally {
            release();
        }
    }

在设置完订阅的主题后，可以通过poll方法进入消息拉取的流程。

org.apache.kafka.clients.consumer.KafkaConsumer#poll(org.apache.kafka.common.utils.Timer, boolean)

    /**
     * Core poll loop: keeps the assignment metadata up to date and fetches records until either
     * some records are available or the timer expires.
     *
     * @param timer                     bounds the overall time spent in this call
     * @param includeMetadataInTimeout  when {@code true} the metadata update shares {@code timer};
     *                                  when {@code false} the metadata update is retried without a
     *                                  time limit before fetching
     * @return the fetched records after interceptor processing, or an empty batch on timeout
     * @throws IllegalStateException if the consumer has neither a subscription nor an assignment
     */
    private ConsumerRecords<K, V> poll(final Timer timer, final boolean includeMetadataInTimeout) {
        acquireAndEnsureOpen();
        try {
            this.kafkaConsumerMetrics.recordPollStart(timer.currentTimeMs());

            if (this.subscriptions.hasNoSubscriptionOrUserAssignment()) {
                throw new IllegalStateException("Consumer is not subscribed to any topics or assigned any partitions");
            }

            do {
                client.maybeTriggerWakeup();

                // Partition assignment / group membership handling
                if (!includeMetadataInTimeout) {
                    while (!updateAssignmentMetadataIfNeeded(time.timer(Long.MAX_VALUE), true)) {
                        log.warn("Still waiting for metadata");
                    }
                } else {
                    // try to update assignment metadata BUT do not need to block on the timer for join group
                    updateAssignmentMetadataIfNeeded(timer, false);
                }

                // Fetch records
                final Map<TopicPartition, List<ConsumerRecord<K, V>>> fetched = pollForFetches(timer);
                if (fetched.isEmpty()) {
                    continue;
                }

                // before returning the fetched records, we can send off the next round of fetches
                // and avoid block waiting for their responses to enable pipelining while the user
                // is handling the fetched records.
                //
                // NOTE: since the consumed position has already been updated, we must not allow
                // wakeups or any other errors to be triggered prior to returning the fetched records.
                final boolean sentNewFetches = fetcher.sendFetches() > 0;
                if (sentNewFetches || client.hasPendingRequests()) {
                    client.transmitSends();
                }

                // Let the interceptor chain process the records before handing them to the caller
                return this.interceptors.onConsume(new ConsumerRecords<>(fetched));
            } while (timer.notExpired());

            return ConsumerRecords.empty();
        } finally {
            release();
            this.kafkaConsumerMetrics.recordPollEnd(timer.currentTimeMs());
        }
    }

updateAssignmentMetadataIfNeeded方法的核心是调用 ConsumerCoordinator的poll方法，具体看下源码：

org.apache.kafka.clients.consumer.internals.ConsumerCoordinator#poll(org.apache.kafka.common.utils.Timer, boolean)

    /**
     * Coordinator-side part of a consumer poll: runs completed commit callbacks, and for
     * auto-assigned subscriptions makes sure the coordinator is known and the consumer is an
     * active group member; finally triggers an asynchronous auto-commit check.
     *
     * @param timer            bounds the time spent looking up the coordinator, refreshing metadata
     *                         and (when {@code waitForJoinGroup} is true) joining the group
     * @param waitForJoinGroup when {@code false} the join-group step is attempted with a zero timer
     * @return {@code false} if the coordinator could not be made ready, metadata could not be
     *         refreshed, or the group could not be (re)joined in time; {@code true} otherwise
     * @throws IllegalStateException if partitions are auto-assigned but {@code protocol} is null
     */
    public boolean poll(Timer timer, boolean waitForJoinGroup) {
        // Pick up the latest metadata for the subscription (helper defined elsewhere)
        maybeUpdateSubscriptionMetadata();

        invokeCompletedOffsetCommitCallbacks();
        // Group-managed path: only taken when partitions are auto-assigned. Manually assigned
        // partitions take the else branch below and need no rebalance.
        if (subscriptions.hasAutoAssignedPartitions()) {
            // Fail fast when no partition assignment strategy was configured
            if (protocol == null) {
                throw new IllegalStateException("User configured " + ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY_CONFIG +
                    " to empty while trying to subscribe for group protocol to auto assign partitions");
            }
            // Always update the heartbeat last poll time so that the heartbeat thread does not leave the
            // group proactively due to application inactivity even if (say) the coordinator cannot be found.
            // NOTE(review): this call records the poll time for the heartbeat logic; it does not itself
            // send a heartbeat -- presumably the heartbeat thread does that at heartbeat.interval.ms.
            pollHeartbeat(timer.currentTimeMs());
            // Look up the coordinator if it is not known yet.
            // Per the article author's reading of ensureCoordinatorReady (not visible here): it loops
            // while the coordinator is unknown and the timer has not expired, picks a node, sends a
            // FindCoordinator request through ConsumerNetworkClient.send, and collects the response
            // through ConsumerNetworkClient.poll.
            if (coordinatorUnknown() && !ensureCoordinatorReady(timer)) {
                return false;
            }
            // Decide whether a (re)join / partition assignment round is required
            if (rejoinNeededOrPending()) {
                // due to a race condition between the initial metadata fetch and the initial rebalance,
                // we need to ensure that the metadata is fresh before joining initially. This ensures
                // that we have matched the pattern against the cluster's topics at least once before joining.
                if (subscriptions.hasPatternSubscription()) {
                    // For consumer group that uses pattern-based subscription, after a topic is created,
                    // any consumer that discovers the topic after metadata refresh can trigger rebalance
                    // across the entire consumer group. Multiple rebalances can be triggered after one topic
                    // creation if consumers refresh metadata at vastly different times. We can significantly
                    // reduce the number of rebalances caused by single topic creation by asking consumer to
                    // refresh metadata before re-joining the group as long as the refresh backoff time has
                    // passed.
                    if (this.metadata.timeToAllowUpdate(timer.currentTimeMs()) == 0) {
                        this.metadata.requestUpdate();
                    }

                    if (!client.ensureFreshMetadata(timer)) {
                        return false;
                    }

                    maybeUpdateSubscriptionMetadata();
                }

                // if not wait for join group, we would just use a timer of 0
                // Join the consumer group and obtain the partition assignment
                if (!ensureActiveGroup(waitForJoinGroup ? timer : time.timer(0L))) {
                    // since we may use a different timer in the callee, we'd still need
                    // to update the original timer's current time after the call
                    timer.update(time.milliseconds());

                    return false;
                }
            }
        } else {
            // For manually assigned partitions, if there are no ready nodes, await metadata.
            // If connections to all nodes fail, wakeups triggered while attempting to send fetch
            // requests result in polls returning immediately, causing a tight loop of polls. Without
            // the wakeup, poll() with no channels would block for the timeout, delaying re-connection.
            // awaitMetadataUpdate() initiates new connections with configured backoff and avoids the busy loop.
            // When group management is used, metadata wait is already performed for this scenario as
            // coordinator is unknown, hence this check is not required.
            // Manual assignment: block on a metadata update when one is requested and no node is ready
            if (metadata.updateRequested() && !client.hasReadyNodes(timer.currentTimeMs())) {
                client.awaitMetadataUpdate(timer);
            }
        }

        maybeAutoCommitOffsetsAsync(timer.currentTimeMs());
        return true;
    }

当我们使用 AUTO_TOPICS 或 AUTO_PATTERN 模式订阅 Kafka topic 时，并不需要关心当前消费者具体消费哪个分区，Kafka 会依据分区分配策略为消费者分配一个或多个分区进行消费（在同一个消费者组内，一个分区至多被一个消费者消费，不允许组内多个消费者同时消费同一个分区）。但是消费者可能会中途加入，也可能会中途退出，并且 topic 的分区数目也是允许改变的，此时就需要为组内的消费者重新分配分区。

分区再分配操作分为 3 个阶段，并且是一个与集群交互联动的过程，这里我们以客户端视角，当消费者检测到需要重新分配分区时会触发执行：

1. 向负载最小的可用节点发送 FindCoordinatorRequest（旧版本中名为 GroupCoordinatorRequest）请求，获取目标 GroupCoordinator 实例所在的 broker 节点，并尝试与该节点建立连接
2. 向 GroupCoordinator 实例所在节点发送 JoinGroupRequest 请求申请加入目标 group，GroupCoordinator 实例会在既定时间范围内等待消费者的申请加入请求，如果提前检测到已经接收到 group 名下所有消费者的申请，或者等待时间超时，则会返回 JoinGroupResponse 响应，主要目的是告知谁是新的 Group Leader 消费者，以及最终确定的分区分配策略
3. Group Leader 依据指定的分区分配策略为当前 group 名下的消费者分配分区，并向目标 GroupCoordinator 实例所在节点发送 SyncGroupRequest 请求以告知最终的分区分配结果（其他成员同样会发送 SyncGroupRequest 请求，以获取各自的分配结果）。

判定需要分区分配的方法为rejoinNeededOrPending()，具体源码如下：

org.apache.kafka.clients.consumer.internals.ConsumerCoordinator#rejoinNeededOrPending

    /**
     * Tells whether this consumer has to (re)join its group.
     *
     * <p>Checks, in order: whether partitions are auto-assigned at all, whether the metadata
     * snapshot differs from the one used for the last assignment, whether the subscription differs
     * from the one used for the last join, and finally defers to the superclass check.
     *
     * @return {@code true} if a rejoin is needed or already pending
     */
    public boolean rejoinNeededOrPending() {
        // Manually assigned partitions never take part in a rebalance
        if (!subscriptions.hasAutoAssignedPartitions()) {
            return false;
        }

        // we need to rejoin if we performed the assignment and metadata has changed;
        // also for those owned-but-no-longer-existed partitions we should drop them as lost
        final boolean metadataChanged = assignmentSnapshot != null && !assignmentSnapshot.matches(metadataSnapshot);
        if (metadataChanged) {
            requestRejoin(String.format(
                "cached metadata has changed from %s at the beginning of the rebalance to %s",
                assignmentSnapshot, metadataSnapshot));
            return true;
        }

        // we need to join if our subscription has changed since the last join
        final boolean subscriptionChanged =
            joinedSubscription != null && !joinedSubscription.equals(subscriptions.subscription());
        if (subscriptionChanged) {
            requestRejoin(String.format(
                "subscription has changed from %s at the beginning of the rebalance to %s",
                joinedSubscription, subscriptions.subscription()));
            return true;
        }

        // Fall back to the generic check of the superclass
        return super.rejoinNeededOrPending();
    }

如果判定需要执行分区再分配操作，接下来会调用 AbstractCoordinator 的 ensureActiveGroup 方法，先确认所属 group 对应的目标 GroupCoordinator 实例所在节点是否已准备好接收请求；如果对应节点不可用，则会向负载较小且可用的节点发送 FindCoordinatorRequest（旧版本中名为 GroupCoordinatorRequest）请求，重新查找 GroupCoordinator 所在节点并与之建立连接。接着会调用 AbstractCoordinator 的 joinGroupIfNeeded 方法开始执行分区再分配流程，源码如下：

org.apache.kafka.clients.consumer.internals.AbstractCoordinator#joinGroupIfNeeded

    /**
     * Joins the group if needed, looping until {@code rejoinNeededOrPending()} reports that no
     * further join is required.
     *
     * @param timer bounds the time spent waiting for the coordinator and for the join result
     * @return {@code true} once no rejoin is needed or pending; {@code false} if the coordinator
     *         was not ready or the join future did not complete before the timer ran out
     * @throws RuntimeException the failure of the join future, when it is neither one of the
     *         exception types that trigger an immediate retry nor retriable
     */
    boolean joinGroupIfNeeded(final Timer timer) {
        // Keep going for as long as a (re)join is required
        while (rejoinNeededOrPending()) {
            // Make sure the coordinator is ready before talking to it
            if (!ensureCoordinatorReady(timer)) {
                return false;
            }

            // call onJoinPrepare if needed. We set a flag to make sure that we do not call it a second
            // time if the client is woken up before a pending rebalance completes. This must be called
            // on each iteration of the loop because an event requiring a rebalance (such as a metadata
            // refresh which changes the matched subscription set) can occur while another rebalance is
            // still in progress.
            if (needsJoinPrepare) {
                // need to set the flag before calling onJoinPrepare since the user callback may throw
                // exception, in which case upon retry we should not retry onJoinPrepare either.
                needsJoinPrepare = false;
                onJoinPrepare(generation.generationId, generation.memberId);
            }
            // Start (or reuse) the join-group attempt; presumably this builds and sends the JoinGroup request
            final RequestFuture<ByteBuffer> future = initiateJoinGroup();
            // Wait for the join result, bounded by the timer
            client.poll(future, timer);
            if (!future.isDone()) {
                // we ran out of time
                return false;
            }
            // The join future completed successfully
            if (future.succeeded()) {
                Generation generationSnapshot;
                MemberState stateSnapshot;

                // Generation data maybe concurrently cleared by Heartbeat thread.
                // Can't use synchronized for {@code onJoinComplete}, because it can be long enough
                // and shouldn't block heartbeat thread.
                // See {@link PlaintextConsumerTest#testMaxPollIntervalMsDelayInAssignment}
                synchronized (AbstractCoordinator.this) {
                    generationSnapshot = this.generation;
                    stateSnapshot = this.state;
                }

                if (!generationSnapshot.equals(Generation.NO_GENERATION) && stateSnapshot == MemberState.STABLE) {
                    // Duplicate the buffer in case `onJoinComplete` does not complete and needs to be retried.
                    ByteBuffer memberAssignment = future.value().duplicate();

                    onJoinComplete(generationSnapshot.generationId, generationSnapshot.memberId, generationSnapshot.protocolName, memberAssignment);

                    // Generally speaking we should always resetJoinGroupFuture once the future is done, but here
                    // we can only reset the join group future after the completion callback returns. This ensures
                    // that if the callback is woken up, we will retry it on the next joinGroupIfNeeded.
                    // And because of that we should explicitly trigger resetJoinGroupFuture in other conditions below.
                    resetJoinGroupFuture();
                    needsJoinPrepare = true;
                } else {
                    final String reason = String.format("rebalance failed since the generation/state was " +
                            "modified by heartbeat thread to %s/%s before the rebalance callback triggered",
                            generationSnapshot, stateSnapshot);

                    resetStateAndRejoin(reason);
                    resetJoinGroupFuture();
                }
            } else {
                // The join failed: decide from the exception type whether to retry
                final RuntimeException exception = future.exception();

                resetJoinGroupFuture();
                rejoinNeeded = true;

                // These four exception types retry immediately, skipping the back-off sleep below;
                // any other non-retriable failure is rethrown to the caller.
                if (exception instanceof UnknownMemberIdException ||
                    exception instanceof IllegalGenerationException ||
                    exception instanceof RebalanceInProgressException ||
                    exception instanceof MemberIdRequiredException)
                    continue;
                else if (!future.isRetriable())
                    throw exception;

                timer.sleep(rebalanceConfig.retryBackoffMs);
            }
        }
        return true;
    }

在执行完updateAssignmentMetadataIfNeeded方法后，执行拉取消息的方法为pollForFetches方法，具体源码如下：

org.apache.kafka.clients.consumer.KafkaConsumer#pollForFetches

    /**
     * Returns records that are already buffered, or sends fetch requests and polls the network
     * client once before looking at the buffer again.
     *
     * @param timer the caller's poll timer; it is advanced by the time spent polling the network
     * @return the fetched records per partition (possibly empty)
     */
    private Map<TopicPartition, List<ConsumerRecord<K, V>>> pollForFetches(Timer timer) {
        // Never block past the caller's deadline nor, with a coordinator, past its next required poll
        long pollTimeout;
        if (coordinator == null) {
            pollTimeout = timer.remainingMs();
        } else {
            pollTimeout = Math.min(coordinator.timeToNextPoll(timer.currentTimeMs()), timer.remainingMs());
        }

        // if data is available already, return it immediately
        final Map<TopicPartition, List<ConsumerRecord<K, V>>> buffered = fetcher.fetchedRecords();
        if (!buffered.isEmpty()) {
            return buffered;
        }

        // send any new fetches (won't resend pending fetches)
        // Per the article author's reading of sendFetches (not visible here): it builds
        // FetchRequest.Builder instances, sends them through ConsumerNetworkClient.send, and a
        // listener on each returned future adds the response data to completedFetches.
        fetcher.sendFetches();

        // We do not want to be stuck blocking in poll if we are missing some positions
        // since the offset lookup may be backing off after a failure

        // NOTE: the use of cachedSubscriptionHashAllFetchPositions means we MUST call
        // updateAssignmentMetadataIfNeeded before this method.
        if (!cachedSubscriptionHashAllFetchPositions) {
            pollTimeout = Math.min(pollTimeout, retryBackoffMs);
        }

        log.trace("Polling for fetches with timeout {}", pollTimeout);

        final Timer pollTimer = time.timer(pollTimeout);
        // since a fetch might be completed by the background thread, we need this poll condition
        // to ensure that we do not block unnecessarily in poll()
        client.poll(pollTimer, () -> !fetcher.hasAvailableFetches());
        timer.update(pollTimer.currentTimeMs());

        // Look at the buffer a second time now that the network has been polled
        return fetcher.fetchedRecords();
    }

Consumer的offset提交

offset提交分为同步提交和异步提交。

同步提交的源码如下：

org.apache.kafka.clients.consumer.internals.ConsumerCoordinator#commitOffsetsSync

    /**
     * Commits the given offsets synchronously, retrying retriable failures until the timer expires.
     *
     * @param offsets offsets to commit; an empty map is reported as success without any request
     * @param timer   bounds the total time spent, including coordinator lookup and retry back-off
     * @return {@code true} if the commit succeeded (or there was nothing to commit); {@code false}
     *         if the coordinator was not ready or the timer expired first
     * @throws RuntimeException the commit failure, when it is not retriable
     */
    public boolean commitOffsetsSync(Map<TopicPartition, OffsetAndMetadata> offsets, Timer timer) {
        invokeCompletedOffsetCommitCallbacks();

        if (offsets.isEmpty()) {
            return true;
        }

        do {
            if (coordinatorUnknown() && !ensureCoordinatorReady(timer)) {
                return false;
            }

            // Send the commit request and wait for its outcome, bounded by the timer
            final RequestFuture<Void> commitFuture = sendOffsetCommitRequest(offsets);
            client.poll(commitFuture, timer);

            // We may have had in-flight offset commits when the synchronous commit began. If so, ensure that
            // the corresponding callbacks are invoked prior to returning in order to preserve the order that
            // the offset commits were applied.
            invokeCompletedOffsetCommitCallbacks();

            // Success: notify interceptors (if any) and report it
            if (commitFuture.succeeded()) {
                if (interceptors != null) {
                    interceptors.onCommit(offsets);
                }
                return true;
            }

            // A non-retriable failure is handed straight to the caller
            if (commitFuture.failed() && !commitFuture.isRetriable()) {
                throw commitFuture.exception();
            }

            timer.sleep(rebalanceConfig.retryBackoffMs);
        } while (timer.notExpired()); // keep retrying until the timer runs out

        return false;
    }

从源码中可以看出，同步提交的方式就是通过一个 do-while 循环进行提交：提交成功则返回 true；遇到不可重试的异常则直接抛出；否则在退避等待（retryBackoffMs）后重试，直到超时返回 false。

异步提交的源码如下：

org.apache.kafka.clients.consumer.internals.ConsumerCoordinator#commitOffsetsAsync

    /**
     * Commits the given offsets asynchronously.
     *
     * <p>With a known coordinator the commit is issued right away. Otherwise a coordinator lookup
     * is started and the commit is chained to it: issued on success, or completed with a
     * {@code RetriableCommitFailedException} on failure.
     *
     * @param offsets  offsets to commit
     * @param callback callback associated with this commit
     */
    public void commitOffsetsAsync(final Map<TopicPartition, OffsetAndMetadata> offsets, final OffsetCommitCallback callback) {
        invokeCompletedOffsetCommitCallbacks();

        if (coordinatorUnknown()) {
            // we don't know the current coordinator, so try to find it and then send the commit
            // or fail (we don't want recursive retries which can cause offset commits to arrive
            // out of order). Note that there may be multiple offset commits chained to the same
            // coordinator lookup request. This is fine because the listeners will be invoked in
            // the same order that they were added. Note also that AbstractCoordinator prevents
            // multiple concurrent coordinator lookup requests.
            pendingAsyncCommits.incrementAndGet();

            final RequestFutureListener<Void> commitAfterLookup = new RequestFutureListener<Void>() {
                @Override
                public void onSuccess(Void value) {
                    // Coordinator found: the commit is no longer pending on the lookup, send it now
                    pendingAsyncCommits.decrementAndGet();
                    doCommitOffsetsAsync(offsets, callback);
                    client.pollNoWakeup();
                }

                @Override
                public void onFailure(RuntimeException e) {
                    // Lookup failed: record the commit as completed with a retriable failure
                    pendingAsyncCommits.decrementAndGet();
                    final RetriableCommitFailedException failure = new RetriableCommitFailedException(e);
                    completedOffsetCommits.add(new OffsetCommitCompletion(callback, offsets, failure));
                }
            };
            lookupCoordinator().addListener(commitAfterLookup);
        } else {
            // Coordinator is known: issue the commit directly
            doCommitOffsetsAsync(offsets, callback);
        }

        // ensure the commit has a chance to be transmitted (without blocking on its completion).
        // Note that commits are treated as heartbeats by the coordinator, so there is no need to
        // explicitly allow heartbeats through delayed task execution.
        client.pollNoWakeup();
    }