-
Notifications
You must be signed in to change notification settings - Fork 15.1k
KAFKA-12352: Make sure all rejoin group and reset state has a reason #10232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b17fc78
b4646ae
f2cf3e8
c36c6a7
ac15c59
66a116d
2bdb62c
c8cb7ab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -455,31 +455,27 @@ boolean joinGroupIfNeeded(final Timer timer) { | |
| resetJoinGroupFuture(); | ||
| needsJoinPrepare = true; | ||
| } else { | ||
| log.info("Generation data was cleared by heartbeat thread to {} and state is now {} before " + | ||
| "the rebalance callback is triggered, marking this rebalance as failed and retry", | ||
| generationSnapshot, stateSnapshot); | ||
| resetStateAndRejoin(); | ||
| final String reason = String.format("rebalance failed since the generation/state was " + | ||
| "modified by heartbeat thread to %s/%s before the rebalance callback triggered", | ||
| generationSnapshot, stateSnapshot); | ||
|
|
||
| resetStateAndRejoin(reason); | ||
| resetJoinGroupFuture(); | ||
| } | ||
| } else { | ||
| final RuntimeException exception = future.exception(); | ||
|
|
||
| // we do not need to log error for memberId required, | ||
| // since it is not really an error and is transient | ||
| if (!(exception instanceof MemberIdRequiredException)) { | ||
| log.info("Rebalance failed.", exception); | ||
| } | ||
|
|
||
| resetJoinGroupFuture(); | ||
|
|
||
| if (exception instanceof UnknownMemberIdException || | ||
| exception instanceof RebalanceInProgressException || | ||
| exception instanceof IllegalGenerationException || | ||
| exception instanceof RebalanceInProgressException || | ||
| exception instanceof MemberIdRequiredException) | ||
| continue; | ||
| else if (!future.isRetriable()) | ||
| throw exception; | ||
|
|
||
| resetStateAndRejoin(); | ||
| resetStateAndRejoin(String.format("rebalance failed with retriable error %s", exception)); | ||
| timer.sleep(rebalanceConfig.retryBackoffMs); | ||
| } | ||
| } | ||
|
|
@@ -655,6 +651,8 @@ public void handle(JoinGroupResponse joinResponse, RequestFuture<ByteBuffer> fut | |
| synchronized (AbstractCoordinator.this) { | ||
| AbstractCoordinator.this.generation = new Generation(OffsetCommitRequest.DEFAULT_GENERATION_ID, memberId, null); | ||
| } | ||
| requestRejoin("need to re-join with the given member-id"); | ||
|
|
||
| future.raise(error); | ||
| } else if (error == Errors.REBALANCE_IN_PROGRESS) { | ||
| log.info("JoinGroup failed due to non-fatal error: REBALANCE_IN_PROGRESS, " + | ||
|
|
@@ -775,8 +773,6 @@ public void handle(SyncGroupResponse syncResponse, | |
| } | ||
| } | ||
| } else { | ||
| requestRejoin(); | ||
|
|
||
| if (error == Errors.GROUP_AUTHORIZATION_FAILED) { | ||
| future.raise(GroupAuthorizationException.forGroupId(rebalanceConfig.groupId)); | ||
| } else if (error == Errors.REBALANCE_IN_PROGRESS) { | ||
|
|
@@ -860,7 +856,7 @@ public void onSuccess(ClientResponse resp, RequestFuture<Void> future) { | |
|
|
||
| @Override | ||
| public void onFailure(RuntimeException e, RequestFuture<Void> future) { | ||
| log.debug("FindCoordinator request failed due to {}", e); | ||
| log.debug("FindCoordinator request failed due to {}", e.toString()); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor cleanup, we only need to print the error message but not the stack trace. |
||
|
|
||
| if (!(e instanceof RetriableException)) { | ||
| // Remember the exception if fatal so we can ensure it gets thrown by the main thread | ||
|
|
@@ -961,29 +957,29 @@ protected synchronized String memberId() { | |
| return generation.memberId; | ||
| } | ||
|
|
||
| private synchronized void resetState() { | ||
| private synchronized void resetStateAndGeneration(final String reason) { | ||
| log.info("Resetting generation due to: {}", reason); | ||
|
|
||
| state = MemberState.UNJOINED; | ||
| generation = Generation.NO_GENERATION; | ||
| } | ||
|
|
||
| private synchronized void resetStateAndRejoin() { | ||
| resetState(); | ||
| rejoinNeeded = true; | ||
| private synchronized void resetStateAndRejoin(final String reason) { | ||
| resetStateAndGeneration(reason); | ||
| requestRejoin(reason); | ||
| } | ||
|
|
||
| synchronized void resetGenerationOnResponseError(ApiKeys api, Errors error) { | ||
| log.debug("Resetting generation after encountering {} from {} response and requesting re-join", error, api); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that I intentionally bumped up the log level from debug to info here since I think this is necessarily a message that users should pay attention to in production, where they mostly use INFO. Open for counter suggestions though.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SGTM. If we find it flooding the logs and not helpful we can reconsider |
||
|
|
||
| resetStateAndRejoin(); | ||
| final String reason = String.format("encountered %s from %s response", error, api); | ||
| resetStateAndRejoin(reason); | ||
| } | ||
|
|
||
| synchronized void resetGenerationOnLeaveGroup() { | ||
| log.debug("Resetting generation due to consumer pro-actively leaving the group"); | ||
|
|
||
| resetStateAndRejoin(); | ||
| resetStateAndRejoin("consumer pro-actively leaving the group"); | ||
| } | ||
|
|
||
| public synchronized void requestRejoin() { | ||
| public synchronized void requestRejoin(final String reason) { | ||
| log.info("Request joining group due to: {}", reason); | ||
| this.rejoinNeeded = true; | ||
| } | ||
|
|
||
|
|
@@ -1120,8 +1116,7 @@ public void handle(HeartbeatResponse heartbeatResponse, RequestFuture<Void> futu | |
| // since we may be sending the request during rebalance, we should check | ||
| // this case and ignore the REBALANCE_IN_PROGRESS error | ||
| if (state == MemberState.STABLE) { | ||
| log.info("Attempt to heartbeat failed since group is rebalancing"); | ||
| requestRejoin(); | ||
| requestRejoin("group is already rebalancing"); | ||
| future.raise(error); | ||
| } else { | ||
| log.debug("Ignoring heartbeat response with error {} during {} state", error, state); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can remove this since it is a bit redundant now as we call for each case if necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to clarify, you mean we don't need to rejoin here since we will always raise an error, and always rejoin (if necessary) when checking that error?
Or are you referring to the
requestRejoinOnResponseErrorcalls you added to the two last cases in the below if/else?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant the latter: we call that inside the conditions already -- for those fatal errors, we do not need to call this anyways since the consumer will throw and crash.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@guozhangwang I think something may have been messed up during a merge/rebase: I no longer see
requestRejoinOnResponseErrorbeing invoked anywhereThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added that function for sync group handler that handles retriable
COORDINATOR_NOT_AVAILABLE / NOT_COORDINATORand any unexpected error. After the refactoring PR they are not all fall into thejoinGroupIfNeededinThis is part of the principle I mentioned:
But I forgot to remove this function as part of the second pass; will remove.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok cool, thanks. One last question then: after this refactoring, since we no longer call
requestRejoinOnResponseErrorbelow, should we re-add therequestRejoin()call here? Or add arequestRejointo the specific cases in the SyncGroup handler, egThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we do not need to, since it would be called on
resetStateAndRejoin(String.format("rebalance failed with retriable error %s", exception));--- previously we are calling rejoin double times.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm...but
resetStateAndRejoin(String.format("rebalance failed with retriable error %s", exception));is only called injoinGroupIfNeededwhich is only called inensureActiveGroup, which is in turn only invoked inConsumerCoordinator#poll.That said, inside
SyncGroupResponseHandler#handlewe would already haverejoinNeeded = trueand only set it to false if the SyncGroup succeeds. So for that reason I guess we don't need therequestRejoinanywhere inside the SyncGroup handler