From 7cc4e8699001f33ea4c62df2d1d80cd056faee76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 10:30:37 +0100 Subject: [PATCH 1/5] chore: make internal auth backend errors retryable Spanner occasionally returns INTERNAL errors regarding the auth backend server. These errors should be regarded as retryable. --- .../spanner/IsRetryableInternalError.java | 26 +++++++------------ .../spanner/IsRetryableInternalErrorTest.java | 11 ++++++++ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java index d250c0ad6c4..57aa4dff687 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java @@ -18,31 +18,25 @@ import com.google.api.gax.rpc.InternalException; import com.google.common.base.Predicate; +import com.google.common.collect.ImmutableList; import io.grpc.Status; import io.grpc.StatusRuntimeException; public class IsRetryableInternalError implements Predicate { - private static final String HTTP2_ERROR_MESSAGE = "HTTP/2 error code: INTERNAL_ERROR"; - private static final String CONNECTION_CLOSED_ERROR_MESSAGE = - "Connection closed with unknown cause"; - private static final String EOS_ERROR_MESSAGE = - "Received unexpected EOS on DATA frame from server"; - - private static final String RST_STREAM_ERROR_MESSAGE = "stream terminated by RST_STREAM"; + private static final ImmutableList RETRYABLE_ERROR_MESSAGES = + ImmutableList.of( + "HTTP/2 error code: INTERNAL_ERROR", + "Connection closed with unknown cause", + "Received unexpected EOS on DATA frame from server", + "stream terminated by RST_STREAM", + "Authentication backend internal server error. Please retry."); @Override public boolean apply(Throwable cause) { if (isInternalError(cause)) { - if (cause.getMessage().contains(HTTP2_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(CONNECTION_CLOSED_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(EOS_ERROR_MESSAGE)) { - return true; - } else if (cause.getMessage().contains(RST_STREAM_ERROR_MESSAGE)) { - return true; - } + return RETRYABLE_ERROR_MESSAGES.stream() + .anyMatch(message -> cause.getMessage().contains(message)); } return false; } diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java index 63039fcd237..514b1e96b7f 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/IsRetryableInternalErrorTest.java @@ -127,6 +127,17 @@ public void rstStreamInternalExceptionIsRetryable() { assertTrue(predicate.apply(e)); } + @Test + public void testAuthenticationBackendInternalServerErrorIsRetryable() { + final StatusRuntimeException exception = + new StatusRuntimeException( + Status.fromCode(Code.INTERNAL) + .withDescription( + "INTERNAL: Authentication backend internal server error. Please retry.")); + + assertTrue(predicate.apply(exception)); + } + @Test public void genericInternalExceptionIsNotRetryable() { final InternalException e = From 4506f6fd85caefa4cd4f39ea217f18110474d6ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 14:14:03 +0100 Subject: [PATCH 2/5] fix: retry specific internal errors Some specific internal errors should be retrid. Instead of adding INTERNAL as a standard retryable error code, we use an interceptor to catch and translate those specific errors. See also b/375684610 --- .../spanner/IsRetryableInternalError.java | 18 +++-- .../spi/v1/SpannerErrorInterceptor.java | 8 ++ .../spanner/RetryableInternalErrorTest.java | 80 +++++++++++++++++++ 3 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java index 57aa4dff687..e69e1ec9d78 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/IsRetryableInternalError.java @@ -20,9 +20,11 @@ import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import io.grpc.Status; +import io.grpc.Status.Code; import io.grpc.StatusRuntimeException; public class IsRetryableInternalError implements Predicate { + public static final IsRetryableInternalError INSTANCE = new IsRetryableInternalError(); private static final ImmutableList RETRYABLE_ERROR_MESSAGES = ImmutableList.of( @@ -32,13 +34,19 @@ public class IsRetryableInternalError implements Predicate { "stream terminated by RST_STREAM", "Authentication backend internal server error. Please retry."); + public boolean isRetryableInternalError(Status status) { + return status.getCode() == Code.INTERNAL + && status.getDescription() != null + && isRetryableErrorMessage(status.getDescription()); + } + @Override public boolean apply(Throwable cause) { - if (isInternalError(cause)) { - return RETRYABLE_ERROR_MESSAGES.stream() - .anyMatch(message -> cause.getMessage().contains(message)); - } - return false; + return isInternalError(cause) && isRetryableErrorMessage(cause.getMessage()); + } + + private boolean isRetryableErrorMessage(String errorMessage) { + return RETRYABLE_ERROR_MESSAGES.stream().anyMatch(errorMessage::contains); } private boolean isInternalError(Throwable cause) { diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java index 65db088ffac..01b08188c6d 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java @@ -16,6 +16,7 @@ package com.google.cloud.spanner.spi.v1; +import com.google.cloud.spanner.IsRetryableInternalError; import com.google.rpc.BadRequest; import com.google.rpc.Help; import com.google.rpc.LocalizedMessage; @@ -32,6 +33,7 @@ import io.grpc.Metadata; import io.grpc.MethodDescriptor; import io.grpc.Status; +import io.grpc.Status.Code; import io.grpc.protobuf.ProtoUtils; import java.util.logging.Level; import java.util.logging.Logger; @@ -69,6 +71,12 @@ public void start(Listener responseListener, Metadata headers) { @Override public void onClose(Status status, Metadata trailers) { try { + // Translate INTERNAL errors that should be retried to a retryable error code. + if (status.getCode() == Code.INTERNAL + && IsRetryableInternalError.INSTANCE.isRetryableInternalError(status)) { + status = + Status.fromCode(Code.UNAVAILABLE).withDescription(status.getDescription()); + } if (trailers.containsKey(LOCALIZED_MESSAGE_KEY)) { status = Status.fromCodeValue(status.getCode().value()) diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java new file mode 100644 index 00000000000..a77535333e4 --- /dev/null +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java @@ -0,0 +1,80 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.spanner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import com.google.cloud.NoCredentials; +import com.google.cloud.spanner.MockSpannerServiceImpl.SimulatedExecutionTime; +import com.google.cloud.spanner.connection.AbstractMockServerTest; +import com.google.spanner.v1.BatchCreateSessionsRequest; +import com.google.spanner.v1.ExecuteSqlRequest; +import io.grpc.ManagedChannelBuilder; +import io.grpc.Status; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RetryableInternalErrorTest extends AbstractMockServerTest { + @Test + public void testTranslateInternalException() { + try (Spanner spanner = + SpannerOptions.newBuilder() + .setProjectId("my-project") + .setHost(String.format("http://localhost:%d", getPort())) + .setChannelConfigurator(ManagedChannelBuilder::usePlaintext) + .setCredentials(NoCredentials.getInstance()) + .setSessionPoolOption( + SessionPoolOptions.newBuilder().setMinSessions(1).setMaxSessions(1).build()) + .build() + .getService()) { + DatabaseClient client = spanner.getDatabaseClient(DatabaseId.of("p", "i", "d")); + mockSpanner.setBatchCreateSessionsExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + mockSpanner.setExecuteStreamingSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + mockSpanner.setExecuteSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + + try (ResultSet resultSet = client.singleUse().executeQuery(SELECT1_STATEMENT)) { + assertTrue(resultSet.next()); + assertFalse(resultSet.next()); + } + assertEquals( + Long.valueOf(1L), + client + .readWriteTransaction() + .run(transaction -> transaction.executeUpdate(INSERT_STATEMENT))); + + // Verify that all the RPCs were retried. + assertEquals(2, mockSpanner.countRequestsOfType(BatchCreateSessionsRequest.class)); + assertEquals(4, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + } + } +} From 5109f13aff7309aa1a415edcb2403c9debd85d60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 15:50:03 +0100 Subject: [PATCH 3/5] chore: address review comments --- .../spi/v1/SpannerErrorInterceptor.java | 3 +-- .../spanner/RetryableInternalErrorTest.java | 27 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java index 01b08188c6d..549ea18a97f 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/spi/v1/SpannerErrorInterceptor.java @@ -72,8 +72,7 @@ public void start(Listener responseListener, Metadata headers) { public void onClose(Status status, Metadata trailers) { try { // Translate INTERNAL errors that should be retried to a retryable error code. - if (status.getCode() == Code.INTERNAL - && IsRetryableInternalError.INSTANCE.isRetryableInternalError(status)) { + if (IsRetryableInternalError.INSTANCE.isRetryableInternalError(status)) { status = Status.fromCode(Code.UNAVAILABLE).withDescription(status.getDescription()); } diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java index a77535333e4..ee2cbe833fd 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java @@ -56,25 +56,34 @@ public void testTranslateInternalException() { Status.INTERNAL .withDescription("Authentication backend internal server error. Please retry.") .asRuntimeException())); - mockSpanner.setExecuteSqlExecutionTime( - SimulatedExecutionTime.ofException( - Status.INTERNAL - .withDescription("Authentication backend internal server error. Please retry.") - .asRuntimeException())); + // Execute a query. This will block until a BatchCreateSessions call has finished and then + // invoke ExecuteStreamingSql. Both of these RPCs should be retried. try (ResultSet resultSet = client.singleUse().executeQuery(SELECT1_STATEMENT)) { assertTrue(resultSet.next()); assertFalse(resultSet.next()); } + // Verify that both the BatchCreateSessions call and the ExecuteStreamingSql call were + // retried. + assertEquals(2, mockSpanner.countRequestsOfType(BatchCreateSessionsRequest.class)); + assertEquals(2, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + // Clear the requests before the next test. + mockSpanner.clearRequests(); + + // Execute a DML statement. This uses the ExecuteSql RPC. + assertEquals(0, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + mockSpanner.setExecuteSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); assertEquals( Long.valueOf(1L), client .readWriteTransaction() .run(transaction -> transaction.executeUpdate(INSERT_STATEMENT))); - - // Verify that all the RPCs were retried. - assertEquals(2, mockSpanner.countRequestsOfType(BatchCreateSessionsRequest.class)); - assertEquals(4, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); + // Verify that also this request was retried. + assertEquals(2, mockSpanner.countRequestsOfType(ExecuteSqlRequest.class)); } } } From 8b147503539a7bceec94f761a9a3959aed5a58b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 15:56:21 +0100 Subject: [PATCH 4/5] fix: wait for session pool to initialize --- .../google/cloud/spanner/RetryableInternalErrorTest.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java index ee2cbe833fd..41e9779c207 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java @@ -30,6 +30,7 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.threeten.bp.Duration; @RunWith(JUnit4.class) public class RetryableInternalErrorTest extends AbstractMockServerTest { @@ -42,7 +43,11 @@ public void testTranslateInternalException() { .setChannelConfigurator(ManagedChannelBuilder::usePlaintext) .setCredentials(NoCredentials.getInstance()) .setSessionPoolOption( - SessionPoolOptions.newBuilder().setMinSessions(1).setMaxSessions(1).build()) + SessionPoolOptions.newBuilder() + .setMinSessions(1) + .setMaxSessions(1) + .setWaitForMinSessions(Duration.ofSeconds(1)) + .build()) .build() .getService()) { DatabaseClient client = spanner.getDatabaseClient(DatabaseId.of("p", "i", "d")); From 98331e9e354d8e1f05f34b1c01eea2d519b1403e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 18 Dec 2024 16:05:47 +0100 Subject: [PATCH 5/5] fix: register errors before creating the client --- .../spanner/RetryableInternalErrorTest.java | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java index 41e9779c207..3106bd16526 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/RetryableInternalErrorTest.java @@ -36,6 +36,17 @@ public class RetryableInternalErrorTest extends AbstractMockServerTest { @Test public void testTranslateInternalException() { + mockSpanner.setBatchCreateSessionsExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + mockSpanner.setExecuteStreamingSqlExecutionTime( + SimulatedExecutionTime.ofException( + Status.INTERNAL + .withDescription("Authentication backend internal server error. Please retry.") + .asRuntimeException())); + try (Spanner spanner = SpannerOptions.newBuilder() .setProjectId("my-project") @@ -46,22 +57,12 @@ public void testTranslateInternalException() { SessionPoolOptions.newBuilder() .setMinSessions(1) .setMaxSessions(1) - .setWaitForMinSessions(Duration.ofSeconds(1)) + .setWaitForMinSessions(Duration.ofSeconds(5)) .build()) .build() .getService()) { - DatabaseClient client = spanner.getDatabaseClient(DatabaseId.of("p", "i", "d")); - mockSpanner.setBatchCreateSessionsExecutionTime( - SimulatedExecutionTime.ofException( - Status.INTERNAL - .withDescription("Authentication backend internal server error. Please retry.") - .asRuntimeException())); - mockSpanner.setExecuteStreamingSqlExecutionTime( - SimulatedExecutionTime.ofException( - Status.INTERNAL - .withDescription("Authentication backend internal server error. Please retry.") - .asRuntimeException())); + DatabaseClient client = spanner.getDatabaseClient(DatabaseId.of("p", "i", "d")); // Execute a query. This will block until a BatchCreateSessions call has finished and then // invoke ExecuteStreamingSql. Both of these RPCs should be retried. try (ResultSet resultSet = client.singleUse().executeQuery(SELECT1_STATEMENT)) {