From 3a5bc010127e993ac8c1637535b74f0dbd4c8ef3 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Mon, 11 Sep 2023 16:44:01 -0700 Subject: [PATCH 01/10] HBASE-28050 RSProcedureDispatcher to fail-fast for SaslException --- .../procedure/RSProcedureDispatcher.java | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index af22fba27290..7b92345d5139 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; +import javax.security.sasl.SaslException; import org.apache.hadoop.hbase.CallQueueTooBigException; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.ServerName; @@ -306,6 +307,10 @@ private boolean scheduleForRetry(IOException e) { serverName, e.toString(), numberOfAttemptsSoFar); return false; } + if (isSaslError(e)) { + LOG.warn("{} is not reachable; give up", serverName, e); + return false; + } if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) { // A better way is to return true here to let the upper layer quit, and then schedule a // background task to check whether the region server is dead. And if it is dead, call @@ -330,6 +335,44 @@ private boolean scheduleForRetry(IOException e) { return true; } + private boolean isSaslError(IOException e) { + if (e instanceof SaslException) { + return true; + } + // check 4 level of cause + Throwable cause = e.getCause(); + if (cause == null) { + return false; + } + if (isSaslError(cause)) { + return true; + } + cause = cause.getCause(); + if (cause == null) { + return false; + } + if (isSaslError(cause)) { + return true; + } + cause = cause.getCause(); + if (cause == null) { + return false; + } + if (isSaslError(cause)) { + return true; + } + cause = cause.getCause(); + if (cause == null) { + return false; + } + return isSaslError(cause); + } + + private boolean isSaslError(Throwable cause) { + return cause instanceof IOException && unwrapException( + (IOException) cause) instanceof SaslException; + } + private long getMaxWaitTime() { if (this.maxWaitTime < 0) { // This is the max attempts, not retries, so it should be at least 1. From a61c49fd631d4b7a8539147f68c5cd6032f3cb23 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Tue, 12 Sep 2023 09:47:22 -0700 Subject: [PATCH 02/10] addendum --- .../hbase/master/procedure/RSProcedureDispatcher.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index 7b92345d5139..e36a34133419 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -307,8 +307,8 @@ private boolean scheduleForRetry(IOException e) { serverName, e.toString(), numberOfAttemptsSoFar); return false; } - if (isSaslError(e)) { - LOG.warn("{} is not reachable; give up", serverName, e); + if (isSaslError(e) && numberOfAttemptsSoFar == 0) { + LOG.warn("{} is not reachable; give up after first attempt", serverName, e); return false; } if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) { @@ -369,8 +369,8 @@ private boolean isSaslError(IOException e) { } private boolean isSaslError(Throwable cause) { - return cause instanceof IOException && unwrapException( - (IOException) cause) instanceof SaslException; + return cause instanceof IOException + && unwrapException((IOException) cause) instanceof SaslException; } private long getMaxWaitTime() { From 4a79223a37ad60dad690ed68be3b26d3524f8aff Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Tue, 12 Sep 2023 13:00:48 -0700 Subject: [PATCH 03/10] addendum --- .../hadoop/hbase/ipc/NettyRpcConnection.java | 3 ++- .../java/org/apache/hadoop/hbase/HConstants.java | 3 +++ .../master/procedure/RSProcedureDispatcher.java | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java index 3f9a58d51263..d9f650a649f9 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java @@ -30,6 +30,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.io.crypto.tls.X509Util; import org.apache.hadoop.hbase.ipc.BufferCallBeforeInitHandler.BufferCallEvent; import org.apache.hadoop.hbase.ipc.HBaseRpcController.CancellationCallback; @@ -347,7 +348,7 @@ public void operationComplete(ChannelFuture future) throws Exception { private void sendRequest0(Call call, HBaseRpcController hrc) throws IOException { assert eventLoop.inEventLoop(); if (reloginInProgress) { - throw new IOException("Can not send request because relogin is in progress."); + throw new IOException(HConstants.RELOGIN_IS_IN_PROGRESS); } hrc.notifyOnCancel(new RpcCallback() { diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 12479979b2ba..9176237ed540 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -91,6 +91,9 @@ public final class HConstants { /** Just an array of bytes of the right size. */ public static final byte[] HFILEBLOCK_DUMMY_HEADER = new byte[HFILEBLOCK_HEADER_SIZE]; + public static final String RELOGIN_IS_IN_PROGRESS = + "Can not send request because relogin is in progress."; + // End HFileBlockConstants. /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index e36a34133419..dac85728acd4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -25,6 +25,7 @@ import javax.security.sasl.SaslException; import org.apache.hadoop.hbase.CallQueueTooBigException; import org.apache.hadoop.hbase.DoNotRetryIOException; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin; import org.apache.hadoop.hbase.client.RegionInfo; @@ -336,7 +337,10 @@ private boolean scheduleForRetry(IOException e) { } private boolean isSaslError(IOException e) { - if (e instanceof SaslException) { + if ( + e instanceof SaslException + || (e.getMessage() != null && e.getMessage().contains(HConstants.RELOGIN_IS_IN_PROGRESS)) + ) { return true; } // check 4 level of cause @@ -369,8 +373,13 @@ private boolean isSaslError(IOException e) { } private boolean isSaslError(Throwable cause) { - return cause instanceof IOException - && unwrapException((IOException) cause) instanceof SaslException; + if (cause instanceof IOException) { + IOException unwrappedException = unwrapException((IOException) cause); + return unwrappedException instanceof SaslException + || (unwrappedException.getMessage() != null + && unwrappedException.getMessage().contains(HConstants.RELOGIN_IS_IN_PROGRESS)); + } + return false; } private long getMaxWaitTime() { From 85495058fb01b533c57d349b03ef4ce161cea603 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Wed, 13 Sep 2023 12:41:28 -0700 Subject: [PATCH 04/10] addendum --- .../procedure/RSProcedureDispatcher.java | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index dac85728acd4..b1974a00a332 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -289,17 +289,7 @@ private boolean scheduleForRetry(IOException e) { numberOfAttemptsSoFar); return false; } - // This exception is thrown in the rpc framework, where we can make sure that the call has not - // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd - // better choose another region server. - // Notice that, it is safe to quit only if this is the first time we send request to region - // server. Maybe the region server has accepted our request the first time, and then there is - // a network error which prevents we receive the response, and the second time we hit a - // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a - // double assign... - if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) { - LOG.warn("request to {} failed due to {}, try={}, this usually because" - + " server is overloaded, give up", serverName, e.toString(), numberOfAttemptsSoFar); + if (unableToConnectToServerInFirstAttempt(e)) { return false; } // Always retry for other exception types if the region server is not dead yet. @@ -308,10 +298,6 @@ private boolean scheduleForRetry(IOException e) { serverName, e.toString(), numberOfAttemptsSoFar); return false; } - if (isSaslError(e) && numberOfAttemptsSoFar == 0) { - LOG.warn("{} is not reachable; give up after first attempt", serverName, e); - return false; - } if (e instanceof RegionServerAbortedException || e instanceof RegionServerStoppedException) { // A better way is to return true here to let the upper layer quit, and then schedule a // background task to check whether the region server is dead. And if it is dead, call @@ -336,6 +322,27 @@ private boolean scheduleForRetry(IOException e) { return true; } + private boolean unableToConnectToServerInFirstAttempt(IOException e) { + // This exception is thrown in the rpc framework, where we can make sure that the call has not + // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd + // better choose another region server. + // Notice that, it is safe to quit only if this is the first time we send request to region + // server. Maybe the region server has accepted our request the first time, and then there is + // a network error which prevents we receive the response, and the second time we hit a + // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a + // double assign... + if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) { + LOG.warn("request to {} failed due to {}, try={}, this usually because" + + " server is overloaded, give up", serverName, e, numberOfAttemptsSoFar); + return true; + } + if (isSaslError(e) && numberOfAttemptsSoFar == 0) { + LOG.warn("{} is not reachable; give up after first attempt", serverName, e); + return true; + } + return false; + } + private boolean isSaslError(IOException e) { if ( e instanceof SaslException From a8ba2eeac08895727e59ae6b3519b9f07385ba53 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Wed, 13 Sep 2023 22:53:34 -0700 Subject: [PATCH 05/10] addendum --- .../hbase/master/procedure/RSProcedureDispatcher.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index b1974a00a332..96d2edd63ea1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -289,7 +289,7 @@ private boolean scheduleForRetry(IOException e) { numberOfAttemptsSoFar); return false; } - if (unableToConnectToServerInFirstAttempt(e)) { + if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) { return false; } // Always retry for other exception types if the region server is not dead yet. @@ -322,7 +322,7 @@ private boolean scheduleForRetry(IOException e) { return true; } - private boolean unableToConnectToServerInFirstAttempt(IOException e) { + private boolean unableToConnectToServer(IOException e) { // This exception is thrown in the rpc framework, where we can make sure that the call has not // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd // better choose another region server. @@ -331,12 +331,12 @@ private boolean unableToConnectToServerInFirstAttempt(IOException e) { // a network error which prevents we receive the response, and the second time we hit a // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a // double assign... - if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) { + if (e instanceof CallQueueTooBigException) { LOG.warn("request to {} failed due to {}, try={}, this usually because" + " server is overloaded, give up", serverName, e, numberOfAttemptsSoFar); return true; } - if (isSaslError(e) && numberOfAttemptsSoFar == 0) { + if (isSaslError(e)) { LOG.warn("{} is not reachable; give up after first attempt", serverName, e); return true; } From 420fcb69a3acf3072b1cbf90b6109dbb22e8874b Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Wed, 13 Sep 2023 23:00:34 -0700 Subject: [PATCH 06/10] addendum --- .../hadoop/hbase/ipc/NettyRpcConnection.java | 3 +- .../hbase/ipc/RpcConnectionConstants.java | 31 +++++++++++++++++++ .../org/apache/hadoop/hbase/HConstants.java | 3 -- .../procedure/RSProcedureDispatcher.java | 10 +++--- 4 files changed, 37 insertions(+), 10 deletions(-) create mode 100644 hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java index d9f650a649f9..408ea347e7a3 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/NettyRpcConnection.java @@ -30,7 +30,6 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.io.crypto.tls.X509Util; import org.apache.hadoop.hbase.ipc.BufferCallBeforeInitHandler.BufferCallEvent; import org.apache.hadoop.hbase.ipc.HBaseRpcController.CancellationCallback; @@ -348,7 +347,7 @@ public void operationComplete(ChannelFuture future) throws Exception { private void sendRequest0(Call call, HBaseRpcController hrc) throws IOException { assert eventLoop.inEventLoop(); if (reloginInProgress) { - throw new IOException(HConstants.RELOGIN_IS_IN_PROGRESS); + throw new IOException(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS); } hrc.notifyOnCancel(new RpcCallback() { diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java new file mode 100644 index 000000000000..c774e8166c6c --- /dev/null +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.ipc; + +import org.apache.yetus.audience.InterfaceAudience; + +/** + * Constants to be used by RPC connection based utilities. + */ +@InterfaceAudience.Private +public class RpcConnectionConstants { + + public static final String RELOGIN_IS_IN_PROGRESS = + "Can not send request because relogin is in progress."; + +} diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 9176237ed540..12479979b2ba 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -91,9 +91,6 @@ public final class HConstants { /** Just an array of bytes of the right size. */ public static final byte[] HFILEBLOCK_DUMMY_HEADER = new byte[HFILEBLOCK_HEADER_SIZE]; - public static final String RELOGIN_IS_IN_PROGRESS = - "Can not send request because relogin is in progress."; - // End HFileBlockConstants. /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index 96d2edd63ea1..639f203ce1d2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -25,10 +25,10 @@ import javax.security.sasl.SaslException; import org.apache.hadoop.hbase.CallQueueTooBigException; import org.apache.hadoop.hbase.DoNotRetryIOException; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.ipc.RpcConnectionConstants; import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.ServerListener; @@ -345,8 +345,8 @@ private boolean unableToConnectToServer(IOException e) { private boolean isSaslError(IOException e) { if ( - e instanceof SaslException - || (e.getMessage() != null && e.getMessage().contains(HConstants.RELOGIN_IS_IN_PROGRESS)) + e instanceof SaslException || (e.getMessage() != null + && e.getMessage().contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)) ) { return true; } @@ -383,8 +383,8 @@ private boolean isSaslError(Throwable cause) { if (cause instanceof IOException) { IOException unwrappedException = unwrapException((IOException) cause); return unwrappedException instanceof SaslException - || (unwrappedException.getMessage() != null - && unwrappedException.getMessage().contains(HConstants.RELOGIN_IS_IN_PROGRESS)); + || (unwrappedException.getMessage() != null && unwrappedException.getMessage() + .contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)); } return false; } From 8b59b137c69537453dee73acc1405696012fee9f Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Wed, 13 Sep 2023 23:44:54 -0700 Subject: [PATCH 07/10] addendum --- .../org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java index c774e8166c6c..2b9853033393 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/RpcConnectionConstants.java @@ -23,7 +23,10 @@ * Constants to be used by RPC connection based utilities. */ @InterfaceAudience.Private -public class RpcConnectionConstants { +public final class RpcConnectionConstants { + + private RpcConnectionConstants() { + } public static final String RELOGIN_IS_IN_PROGRESS = "Can not send request because relogin is in progress."; From 6fbfe4d2e5dd9c5b8a6590cdbc5b213c3cf1755d Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 14 Sep 2023 23:08:44 -0700 Subject: [PATCH 08/10] addendum --- .../procedure/RSProcedureDispatcher.java | 61 ++++++++----------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index 639f203ce1d2..c3993ffbb2a7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -289,6 +289,14 @@ private boolean scheduleForRetry(IOException e) { numberOfAttemptsSoFar); return false; } + // This category of exceptions is thrown in the rpc framework, where we can make sure + // that the call has not been executed yet, so it is safe to mark it as fail. + // Especially for open a region, we'd better choose another region server. + // Notice that, it is safe to quit only if this is the first time we send request to region + // server. Maybe the region server has accepted our request the first time, and then there is + // a network error which prevents we receive the response, and the second time we hit + // this category of exceptions, obviously it is not safe to quit here, otherwise it may lead + // to a double assign... if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) { return false; } @@ -322,15 +330,15 @@ private boolean scheduleForRetry(IOException e) { return true; } + /** + * The category of exceptions where we can ensure that the request has not yet been received + * and/or processed by the target regionserver yet and hence we can determine whether it is safe + * to choose different regionserver as the target. + * @param e IOException thrown by the underlying rpc framework. + * @return true if the exception belongs to the category where the regionserver has not yet + * received the request yet. + */ private boolean unableToConnectToServer(IOException e) { - // This exception is thrown in the rpc framework, where we can make sure that the call has not - // been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd - // better choose another region server. - // Notice that, it is safe to quit only if this is the first time we send request to region - // server. Maybe the region server has accepted our request the first time, and then there is - // a network error which prevents we receive the response, and the second time we hit a - // CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a - // double assign... if (e instanceof CallQueueTooBigException) { LOG.warn("request to {} failed due to {}, try={}, this usually because" + " server is overloaded, give up", serverName, e, numberOfAttemptsSoFar); @@ -351,35 +359,20 @@ private boolean isSaslError(IOException e) { return true; } // check 4 level of cause - Throwable cause = e.getCause(); - if (cause == null) { - return false; - } - if (isSaslError(cause)) { - return true; - } - cause = cause.getCause(); - if (cause == null) { - return false; - } - if (isSaslError(cause)) { - return true; - } - cause = cause.getCause(); - if (cause == null) { - return false; - } - if (isSaslError(cause)) { - return true; - } - cause = cause.getCause(); - if (cause == null) { - return false; + Throwable cause = e; + for (int i = 0; i < 4; i++) { + cause = cause.getCause(); + if (cause == null) { + return false; + } + if (isThrowableOfTypeSasl(cause)) { + return true; + } } - return isSaslError(cause); + return false; } - private boolean isSaslError(Throwable cause) { + private boolean isThrowableOfTypeSasl(Throwable cause) { if (cause instanceof IOException) { IOException unwrappedException = unwrapException((IOException) cause); return unwrappedException instanceof SaslException From fbea61e43f0f9ffcf97da1ec7922083ad7001392 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 14 Sep 2023 23:41:25 -0700 Subject: [PATCH 09/10] addendum --- .../hadoop/hbase/master/procedure/RSProcedureDispatcher.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index c3993ffbb2a7..fd59ab2e83a2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -358,9 +358,8 @@ private boolean isSaslError(IOException e) { ) { return true; } - // check 4 level of cause Throwable cause = e; - for (int i = 0; i < 4; i++) { + while (true) { cause = cause.getCause(); if (cause == null) { return false; @@ -369,7 +368,6 @@ private boolean isSaslError(IOException e) { return true; } } - return false; } private boolean isThrowableOfTypeSasl(Throwable cause) { From a8626fa227352f8b0e21b4990cd86d6de121e88a Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Sat, 16 Sep 2023 14:29:41 -0700 Subject: [PATCH 10/10] addendum --- .../procedure/RSProcedureDispatcher.java | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index fd59ab2e83a2..abc9c575a62e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -352,34 +352,25 @@ private boolean unableToConnectToServer(IOException e) { } private boolean isSaslError(IOException e) { - if ( - e instanceof SaslException || (e.getMessage() != null - && e.getMessage().contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)) - ) { - return true; - } Throwable cause = e; while (true) { + if (cause instanceof IOException) { + IOException unwrappedCause = unwrapException((IOException) cause); + if ( + unwrappedCause instanceof SaslException + || (unwrappedCause.getMessage() != null && unwrappedCause.getMessage() + .contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)) + ) { + return true; + } + } cause = cause.getCause(); if (cause == null) { return false; } - if (isThrowableOfTypeSasl(cause)) { - return true; - } } } - private boolean isThrowableOfTypeSasl(Throwable cause) { - if (cause instanceof IOException) { - IOException unwrappedException = unwrapException((IOException) cause); - return unwrappedException instanceof SaslException - || (unwrappedException.getMessage() != null && unwrappedException.getMessage() - .contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)); - } - return false; - } - private long getMaxWaitTime() { if (this.maxWaitTime < 0) { // This is the max attempts, not retries, so it should be at least 1.