From 9e15cf31aa4eb484ced6f24f2430725a8b163ebb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Tue, 3 Nov 2020 16:54:44 +0100 Subject: [PATCH 01/11] HDDS-4429. Create unit test for SimpleContainerDownloader --- .../SimpleContainerDownloader.java | 27 ++-- .../TestSimpleContainerDownloader.java | 120 ++++++++++++++++++ 2 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java index 9d7b5516a5c3..9d19f55e1e21 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hdds.security.x509.SecurityConfig; import org.apache.hadoop.ozone.OzoneConfigKeys; +import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -74,21 +75,13 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, for (DatanodeDetails datanode : sourceDatanodes) { try { if (result == null) { - GrpcReplicationClient grpcReplicationClient = - new GrpcReplicationClient(datanode.getIpAddress(), - datanode.getPort(Name.STANDALONE).getValue(), - workingDirectory, securityConfig, caCert); - result = grpcReplicationClient.download(containerId); + result = downloadContainer(containerId, datanode); } else { result = result.thenApply(CompletableFuture::completedFuture) .exceptionally(t -> { LOG.error("Error on replicating container: " + containerId, t); try { - GrpcReplicationClient grpcReplicationClient = - new GrpcReplicationClient(datanode.getIpAddress(), - datanode.getPort(Name.STANDALONE).getValue(), - workingDirectory, securityConfig, caCert); - return grpcReplicationClient.download(containerId); + return downloadContainer(containerId, datanode); } catch (IOException e) { LOG.error("Error on replicating container: " + containerId, t); @@ -107,6 +100,20 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, } + @VisibleForTesting + protected CompletableFuture downloadContainer( + long containerId, + DatanodeDetails datanode + ) throws IOException { + CompletableFuture result; + GrpcReplicationClient grpcReplicationClient = + new GrpcReplicationClient(datanode.getIpAddress(), + datanode.getPort(Name.STANDALONE).getValue(), + workingDirectory, securityConfig, caCert); + result = grpcReplicationClient.download(containerId); + return result; + } + @Override public void close() { // noop diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java new file mode 100644 index 000000000000..abece9072e61 --- /dev/null +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.container.replication; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.MockDatanodeDetails; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Test SimpleContainerDownloader. + */ +public class TestSimpleContainerDownloader { + + @Test + public void testGetContainerDataFromReplicasHappyPath() throws Exception { + + //GIVEN + List datanodes = createDatanodes(); + + SimpleContainerDownloader downloader = + createDownloaderWithPredefinedFailures(); + + //WHEN + final Path result = + downloader.getContainerDataFromReplicas(1L, datanodes) + .get(1L, TimeUnit.SECONDS); + + //THEN + Assert.assertEquals(datanodes.get(0).getUuidString(), result.toString()); + } + + @Test + public void testGetContainerDataFromReplicasOneFailure() throws Exception { + + //GIVEN + List datanodes = createDatanodes(); + + SimpleContainerDownloader downloader = + createDownloaderWithPredefinedFailures(datanodes.get(0)); + + //WHEN + final Path result = + downloader.getContainerDataFromReplicas(1L, datanodes) + .get(1L, TimeUnit.SECONDS); + + //THEN + //first datanode is failed, second worked + Assert.assertEquals(datanodes.get(1).getUuidString(), result.toString()); + } + + /** + * Creates downloader which fails with datanodes in the arguments. + */ + private SimpleContainerDownloader createDownloaderWithPredefinedFailures( + DatanodeDetails... failedDatanodes + ) { + + ConfigurationSource conf = new OzoneConfiguration(); + + final List datanodes = + Arrays.asList(failedDatanodes); + + return new SimpleContainerDownloader(conf, null) { + + @Override + protected CompletableFuture downloadContainer( + long containerId, + DatanodeDetails datanode + ) throws IOException { + + if (datanodes.contains(datanode)) { + throw new IOException("Unavailable datanode"); + } else { + + //path includes the dn id to make it possible to assert. + return CompletableFuture.completedFuture( + Paths.get(datanode.getUuidString())); + } + + } + }; + } + + private List createDatanodes() { + List datanodes = new ArrayList<>(); + datanodes.add(MockDatanodeDetails.randomDatanodeDetails()); + datanodes.add(MockDatanodeDetails.randomDatanodeDetails()); + datanodes.add(MockDatanodeDetails.randomDatanodeDetails()); + return datanodes; + } +} \ No newline at end of file From 1e96edd667182099017a043f90603fd06610eb14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=2C=20M=C3=A1rton?= Date: Thu, 12 Nov 2020 10:07:35 +0100 Subject: [PATCH 02/11] Print out the right exception in LOG.error Co-authored-by: Doroszlai, Attila <6454655+adoroszlai@users.noreply.github.com> --- .../ozone/container/replication/SimpleContainerDownloader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java index 9d19f55e1e21..71a1e1a2b3fe 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java @@ -84,7 +84,7 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, return downloadContainer(containerId, datanode); } catch (IOException e) { LOG.error("Error on replicating container: " + containerId, - t); + e); return null; } }).thenCompose(Function.identity()); From 17e50ae544ba2deb3b66183da9ebd3e3d8b3f854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Thu, 12 Nov 2020 10:18:14 +0100 Subject: [PATCH 03/11] test both direct and future wrapped exceptions --- .../TestSimpleContainerDownloader.java | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java index abece9072e61..9ba3dc92357b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java @@ -47,7 +47,7 @@ public void testGetContainerDataFromReplicasHappyPath() throws Exception { List datanodes = createDatanodes(); SimpleContainerDownloader downloader = - createDownloaderWithPredefinedFailures(); + createDownloaderWithPredefinedFailures(true); //WHEN final Path result = @@ -59,13 +59,33 @@ public void testGetContainerDataFromReplicasHappyPath() throws Exception { } @Test - public void testGetContainerDataFromReplicasOneFailure() throws Exception { + public void testGetContainerDataFromReplicasDirectFailure() + throws Exception { //GIVEN List datanodes = createDatanodes(); SimpleContainerDownloader downloader = - createDownloaderWithPredefinedFailures(datanodes.get(0)); + createDownloaderWithPredefinedFailures(true, datanodes.get(0)); + + //WHEN + final Path result = + downloader.getContainerDataFromReplicas(1L, datanodes) + .get(1L, TimeUnit.SECONDS); + + //THEN + //first datanode is failed, second worked + Assert.assertEquals(datanodes.get(1).getUuidString(), result.toString()); + } + + @Test + public void testGetContainerDataFromReplicasAsyncFailure() throws Exception { + + //GIVEN + List datanodes = createDatanodes(); + + SimpleContainerDownloader downloader = + createDownloaderWithPredefinedFailures(false, datanodes.get(0)); //WHEN final Path result = @@ -79,8 +99,12 @@ public void testGetContainerDataFromReplicasOneFailure() throws Exception { /** * Creates downloader which fails with datanodes in the arguments. + * + * @param directException if false the exception will be wrapped in the + * returning future. */ private SimpleContainerDownloader createDownloaderWithPredefinedFailures( + boolean directException, DatanodeDetails... failedDatanodes ) { @@ -98,7 +122,13 @@ protected CompletableFuture downloadContainer( ) throws IOException { if (datanodes.contains(datanode)) { - throw new IOException("Unavailable datanode"); + if (directException) { + throw new IOException("Unavailable datanode"); + } else { + return CompletableFuture.supplyAsync(() -> { + throw new RuntimeException("Unavailable datanode"); + }); + } } else { //path includes the dn id to make it possible to assert. From c149ce443e5c757ba451de7f837f5a829f12b220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Thu, 12 Nov 2020 10:32:59 +0100 Subject: [PATCH 04/11] test both direct and future wrapped exceptions --- .../TestSimpleContainerDownloader.java | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java index abece9072e61..9ba3dc92357b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java @@ -47,7 +47,7 @@ public void testGetContainerDataFromReplicasHappyPath() throws Exception { List datanodes = createDatanodes(); SimpleContainerDownloader downloader = - createDownloaderWithPredefinedFailures(); + createDownloaderWithPredefinedFailures(true); //WHEN final Path result = @@ -59,13 +59,33 @@ public void testGetContainerDataFromReplicasHappyPath() throws Exception { } @Test - public void testGetContainerDataFromReplicasOneFailure() throws Exception { + public void testGetContainerDataFromReplicasDirectFailure() + throws Exception { //GIVEN List datanodes = createDatanodes(); SimpleContainerDownloader downloader = - createDownloaderWithPredefinedFailures(datanodes.get(0)); + createDownloaderWithPredefinedFailures(true, datanodes.get(0)); + + //WHEN + final Path result = + downloader.getContainerDataFromReplicas(1L, datanodes) + .get(1L, TimeUnit.SECONDS); + + //THEN + //first datanode is failed, second worked + Assert.assertEquals(datanodes.get(1).getUuidString(), result.toString()); + } + + @Test + public void testGetContainerDataFromReplicasAsyncFailure() throws Exception { + + //GIVEN + List datanodes = createDatanodes(); + + SimpleContainerDownloader downloader = + createDownloaderWithPredefinedFailures(false, datanodes.get(0)); //WHEN final Path result = @@ -79,8 +99,12 @@ public void testGetContainerDataFromReplicasOneFailure() throws Exception { /** * Creates downloader which fails with datanodes in the arguments. + * + * @param directException if false the exception will be wrapped in the + * returning future. */ private SimpleContainerDownloader createDownloaderWithPredefinedFailures( + boolean directException, DatanodeDetails... failedDatanodes ) { @@ -98,7 +122,13 @@ protected CompletableFuture downloadContainer( ) throws IOException { if (datanodes.contains(datanode)) { - throw new IOException("Unavailable datanode"); + if (directException) { + throw new IOException("Unavailable datanode"); + } else { + return CompletableFuture.supplyAsync(() -> { + throw new RuntimeException("Unavailable datanode"); + }); + } } else { //path includes the dn id to make it possible to assert. From 9adbce2ae1c96b455f46d88955c785565cd20114 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Thu, 12 Nov 2020 20:14:03 +0100 Subject: [PATCH 05/11] trigger new CI check From 62c34d7608b78cf417f1d70b07322764550425e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Mon, 23 Nov 2020 13:31:29 +0100 Subject: [PATCH 06/11] fix retry test after HDDS-4453 --- .../SimpleContainerDownloader.java | 25 ++++++++++++++----- .../TestSimpleContainerDownloader.java | 9 +++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java index e95adae21e81..e0586c2b772b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java @@ -34,6 +34,7 @@ import org.apache.hadoop.ozone.OzoneConfigKeys; import com.google.common.annotations.VisibleForTesting; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,12 +76,8 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, CompletableFuture result = null; - //There is a chance for the download is successful but import is failed, - //due to data corruption. We need a random selected datanode to have a - //chance to succeed next time. - final ArrayList shuffledDatanodes = - new ArrayList<>(sourceDatanodes); - Collections.shuffle(shuffledDatanodes); + final List shuffledDatanodes = + shuffleDatanodes(sourceDatanodes); for (DatanodeDetails datanode : shuffledDatanodes) { try { @@ -109,6 +106,22 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, } + //There is a chance for the download is successful but import is failed, + //due to data corruption. We need a random selected datanode to have a + //chance to succeed next time. + @NotNull + protected List shuffleDatanodes( + List sourceDatanodes + ) { + + final ArrayList shuffledDatanodes = + new ArrayList<>(sourceDatanodes); + + Collections.shuffle(shuffledDatanodes); + + return shuffledDatanodes; + } + @VisibleForTesting protected CompletableFuture downloadContainer( long containerId, diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java index e63c5ce02f41..e871562786c8 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java @@ -156,6 +156,15 @@ private SimpleContainerDownloader createDownloaderWithPredefinedFailures( return new SimpleContainerDownloader(conf, null) { + //for retry testing we use predictable list of datanodes. + @Override + protected List shuffleDatanodes( + List sourceDatanodes + ) { + //turn off randomization + return sourceDatanodes; + } + @Override protected CompletableFuture downloadContainer( long containerId, From c54424b120dfb6c26ea2d70f370f789475b82e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Wed, 25 Nov 2020 16:16:47 +0100 Subject: [PATCH 07/11] increased timeout --- .../container/replication/TestSimpleContainerDownloader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java index e871562786c8..f29b1579198a 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestSimpleContainerDownloader.java @@ -102,7 +102,7 @@ public void testGetContainerDataFromReplicasAsyncFailure() throws Exception { /** * Test if different datanode is used for each download attempt. */ - @Test(timeout = 1000L) + @Test(timeout = 10_000L) public void testRandomSelection() throws ExecutionException, InterruptedException { From 38eee8a799db4a6b3bcd52a1d58ded105085787b Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Wed, 25 Nov 2020 18:05:27 +0100 Subject: [PATCH 08/11] trigger new CI check From e5ee7b62f05807b8132ae0522badeacbf8b715d3 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Wed, 25 Nov 2020 22:17:18 +0100 Subject: [PATCH 09/11] trigger new CI check From 2b66437098ba77ef65ad6ede3b64ebb109697f1c Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Thu, 26 Nov 2020 15:51:04 +0100 Subject: [PATCH 10/11] trigger new CI check From a280682b6a9301f73d8a5d2d2f832790b2dccd62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Mon, 30 Nov 2020 09:23:35 +0100 Subject: [PATCH 11/11] fix unit test and error handling --- .../SimpleContainerDownloader.java | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java index e0586c2b772b..5d8a86bc930a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/SimpleContainerDownloader.java @@ -25,7 +25,6 @@ import java.util.Collections; import java.util.List; import java.util.concurrent.CompletableFuture; -import java.util.function.Function; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; @@ -54,8 +53,10 @@ public class SimpleContainerDownloader implements ContainerDownloader { private final SecurityConfig securityConfig; private final X509Certificate caCert; - public SimpleContainerDownloader(ConfigurationSource conf, - X509Certificate caCert) { + public SimpleContainerDownloader( + ConfigurationSource conf, + X509Certificate caCert + ) { String workDirString = conf.get(OzoneConfigKeys.OZONE_CONTAINER_COPY_WORKDIR); @@ -71,8 +72,10 @@ public SimpleContainerDownloader(ConfigurationSource conf, } @Override - public CompletableFuture getContainerDataFromReplicas(long containerId, - List sourceDatanodes) { + public CompletableFuture getContainerDataFromReplicas( + long containerId, + List sourceDatanodes + ) { CompletableFuture result = null; @@ -84,17 +87,16 @@ public CompletableFuture getContainerDataFromReplicas(long containerId, if (result == null) { result = downloadContainer(containerId, datanode); } else { - result = result.thenApply(CompletableFuture::completedFuture) - .exceptionally(t -> { - LOG.error("Error on replicating container: " + containerId, t); - try { - return downloadContainer(containerId, datanode); - } catch (Exception e) { - LOG.error("Error on replicating container: " + containerId, - e); - return null; - } - }).thenCompose(Function.identity()); + result = result.exceptionally(t -> { + LOG.error("Error on replicating container: " + containerId, t); + try { + return downloadContainer(containerId, datanode).join(); + } catch (Exception e) { + LOG.error("Error on replicating container: " + containerId, + e); + return null; + } + }); } } catch (Exception ex) { LOG.error(String.format( @@ -128,12 +130,20 @@ protected CompletableFuture downloadContainer( DatanodeDetails datanode ) throws Exception { CompletableFuture result; - try (GrpcReplicationClient grpcReplicationClient = + GrpcReplicationClient grpcReplicationClient = new GrpcReplicationClient(datanode.getIpAddress(), datanode.getPort(Name.STANDALONE).getValue(), - workingDirectory, securityConfig, caCert)) { - result = grpcReplicationClient.download(containerId); - } + workingDirectory, securityConfig, caCert); + result = grpcReplicationClient.download(containerId) + .thenApply(r -> { + try { + grpcReplicationClient.close(); + } catch (Exception e) { + LOG.error("Couldn't close Grpc replication client", e); + } + return r; + }); + return result; }