-
Notifications
You must be signed in to change notification settings - Fork 15.1k
KAFKA-9274: Gracefully handle timeout exception #8060
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| import org.apache.kafka.clients.consumer.Consumer; | ||
| import org.apache.kafka.clients.consumer.OffsetAndMetadata; | ||
| import org.apache.kafka.clients.producer.Producer; | ||
| import org.apache.kafka.clients.producer.ProducerConfig; | ||
| import org.apache.kafka.clients.producer.ProducerRecord; | ||
| import org.apache.kafka.common.KafkaException; | ||
| import org.apache.kafka.common.PartitionInfo; | ||
|
|
@@ -70,6 +71,7 @@ public class RecordCollectorImpl implements RecordCollector { | |
|
|
||
| // used when eosEnabled is true only | ||
| private boolean transactionInFlight = false; | ||
| private boolean transactionInitialized = false; | ||
| private Producer<byte[], byte[]> producer; | ||
| private volatile KafkaException sendException; | ||
|
|
||
|
|
@@ -95,24 +97,30 @@ public RecordCollectorImpl(final TaskId taskId, | |
| this.droppedRecordsSensor = TaskMetrics.droppedRecordsSensorOrSkippedRecordsSensor(threadId, taskId.toString(), streamsMetrics); | ||
|
|
||
| producer = producerSupplier.get(taskId); | ||
| } | ||
|
|
||
| @Override | ||
| public void initialize() { | ||
| maybeInitTxns(); | ||
| } | ||
|
|
||
| private void maybeInitTxns() { | ||
| if (eosEnabled) { | ||
| if (eosEnabled && !transactionInitialized) { | ||
| // initialize transactions if eos is turned on, which will block if the previous transaction has not | ||
| // completed yet; do not start the first transaction until the topology has been initialized later | ||
| try { | ||
| producer.initTransactions(); | ||
|
|
||
| transactionInitialized = true; | ||
| } catch (final TimeoutException exception) { | ||
| final String errorMessage = "Timeout exception caught when initializing transactions for task " + taskId + ". " + | ||
| log.warn("Timeout exception caught when initializing transactions for task {}. " + | ||
| "\nThe broker is either slow or in bad state (like not having enough replicas) in responding to the request, " + | ||
| "or the connection to broker was interrupted sending the request or receiving the response. " + | ||
| "\n Consider overwriting `max.block.ms` to a larger value to avoid timeout errors"; | ||
| "Would retry initializing the task in the next loop." + | ||
| "\nConsider overwriting producer config {} to a larger value to avoid timeout errors", | ||
| ProducerConfig.MAX_BLOCK_MS_CONFIG, taskId); | ||
|
|
||
| // TODO K9113: we do NOT try to handle timeout exception here but throw it as a fatal error | ||
| throw new StreamsException(errorMessage, exception); | ||
| throw exception; | ||
| } catch (final KafkaException exception) { | ||
| throw new StreamsException("Error encountered while initializing transactions for task " + taskId, exception); | ||
| } | ||
|
|
@@ -163,7 +171,7 @@ public void commit(final Map<TopicPartition, OffsetAndMetadata> offsets) { | |
| } catch (final ProducerFencedException error) { | ||
| throw new TaskMigratedException(taskId, "Producer get fenced trying to commit a transaction", error); | ||
| } catch (final TimeoutException error) { | ||
| // TODO K9113: currently handle timeout exception as a fatal error, should discuss whether we want to handle it | ||
| // TODO KIP-447: we can consider treating it as non-fatal and retry on the thread level | ||
| throw new StreamsException("Timed out while committing transaction via producer for task " + taskId, error); | ||
| } catch (final KafkaException error) { | ||
| throw new StreamsException("Error encountered sending offsets and committing transaction " + | ||
|
|
@@ -176,7 +184,7 @@ public void commit(final Map<TopicPartition, OffsetAndMetadata> offsets) { | |
| throw new TaskMigratedException(taskId, "Consumer committing offsets failed, " + | ||
| "indicating the corresponding thread is no longer part of the group.", error); | ||
| } catch (final TimeoutException error) { | ||
| // TODO K9113: currently handle timeout exception as a fatal error | ||
| // TODO KIP-447: we can consider treating it as non-fatal and retry on the thread level | ||
| throw new StreamsException("Timed out while committing offsets via consumer for task " + taskId, error); | ||
| } catch (final KafkaException error) { | ||
| throw new StreamsException("Error encountered committing offsets via consumer for task " + taskId, error); | ||
|
|
@@ -244,9 +252,16 @@ public <K, V> void send(final String topic, | |
| final StreamPartitioner<? super K, ? super V> partitioner) { | ||
| final Integer partition; | ||
|
|
||
| // TODO K9113: we need to decide how to handle exceptions from partitionsFor | ||
| if (partitioner != null) { | ||
| final List<PartitionInfo> partitions = producer.partitionsFor(topic); | ||
| final List<PartitionInfo> partitions; | ||
| try { | ||
| partitions = producer.partitionsFor(topic); | ||
| } catch (final KafkaException e) { | ||
| // here we cannot drop the message on the floor even if it is a transient timeout exception, | ||
| // so we treat everything the same as a fatal exception | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well -- we could also "buffer" the record and try to send it later? In the meantime we would need to pause the corresponding task though to not process more input records (of course, we would need to let the task finish processing the current input record, which might lead to more output records that we would need to buffer, too). -- This is just a wild thought and we could also handle this case later if required.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought about the buffering mechanism here, but decided it may not be worth it since we've not seen timeout from |
||
| throw new StreamsException("Could not determine the number of partitions for topic '" + topic + | ||
| "' for task " + taskId + " due to " + e.toString()); | ||
| } | ||
| if (partitions.size() > 0) { | ||
| partition = partitioner.partition(topic, key, value, partitions.size()); | ||
| } else { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| package org.apache.kafka.streams.processor.internals; | ||
|
|
||
| import org.apache.kafka.clients.consumer.Consumer; | ||
| import org.apache.kafka.clients.consumer.ConsumerConfig; | ||
| import org.apache.kafka.clients.consumer.ConsumerRecord; | ||
| import org.apache.kafka.clients.consumer.OffsetAndMetadata; | ||
| import org.apache.kafka.common.KafkaException; | ||
|
|
@@ -186,11 +187,14 @@ public boolean isActive() { | |
|
|
||
| /** | ||
| * @throws LockException could happen when multi-threads within the single instance, could retry | ||
| * @throws TimeoutException if initializing record collector timed out | ||
| * @throws StreamsException fatal error, should close the thread | ||
| */ | ||
| @Override | ||
| public void initializeIfNeeded() { | ||
| if (state() == State.CREATED) { | ||
| recordCollector.initialize(); | ||
|
|
||
| StateManagerUtil.registerStateStores(log, logPrefix, topology, stateMgr, stateDirectory, processorContext); | ||
|
|
||
| transitionTo(State.RESTORING); | ||
|
|
@@ -199,6 +203,9 @@ public void initializeIfNeeded() { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * @throws TimeoutException if fetching committed offsets timed out | ||
| */ | ||
| @Override | ||
| public void completeRestoration() { | ||
| if (state() == State.RESTORING) { | ||
|
|
@@ -612,6 +619,12 @@ private void initializeMetadata() { | |
| .filter(e -> e.getValue() != null) | ||
| .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); | ||
| initializeTaskTime(offsetsAndMetadata); | ||
| } catch (final TimeoutException e) { | ||
| log.warn("Encountered {} while trying to fetch committed offsets, will retry initializing the metadata in the next loop." + | ||
| "\nConsider overwriting consumer config {} to a larger value to avoid timeout errors", | ||
| ConsumerConfig.DEFAULT_API_TIMEOUT_MS_CONFIG); | ||
|
|
||
| throw e; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we throw
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because on the caller |
||
| } catch (final KafkaException e) { | ||
| throw new StreamsException(format("task [%s] Failed to initialize offsets for %s", id, partitions), e); | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just to verify that initTransactions can indeed be retried. cc @hachikuji