From b58be97410880254fe358a722590253dd37020d6 Mon Sep 17 00:00:00 2001
From: Stephen O'Donnell
Date: Fri, 12 Apr 2024 09:44:18 +0100
Subject: [PATCH 1/3] HDDS-10681. EC Reconstruction does not issue put block
 to data index if it is unused (#6514)

(cherry picked from commit cba8c85e22435e2c3e6aaa0e66aedc2a0631c912)

 Conflicts:
	hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
---
 .../ECReconstructionCoordinator.java          | 119 +++++++++---------
 .../scm/storage/TestContainerCommandsEC.java  |  18 ++-
 2 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
index 90756bbc8898..3d9b5b77ee35 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
@@ -58,6 +58,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -247,24 +248,15 @@ public void reconstructECBlockGroup(BlockLocationInfo blockLocationInfo,
     int dataLocs = ECBlockInputStreamProxy
         .expectedDataLocations(repConfig, safeBlockGroupLength);
     List<Integer> toReconstructIndexes = new ArrayList<>();
+    List<Integer> notReconstructIndexes = new ArrayList<>();
     for (Integer index : missingContainerIndexes) {
       if (index <= dataLocs || index > repConfig.getData()) {
         toReconstructIndexes.add(index);
+      } else {
+        // Don't need to be reconstructed, but we do need a stream to write
+        // the block data to.
+        notReconstructIndexes.add(index);
       }
-      // else padded indexes.
-    }
-
-    // Looks like we don't need to reconstruct any missing blocks in this block
-    // group. The reason for this should be block group had only padding blocks
-    // in the missing locations.
-    if (toReconstructIndexes.size() == 0) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Skipping the reconstruction for the block: "
-            + blockLocationInfo.getBlockID() + ". In the missing locations: "
-            + missingContainerIndexes
-            + ", this block group has only padded blocks.");
-      }
-      return;
     }
 
     try (ECBlockReconstructedStripeInputStream sis
@@ -276,71 +268,78 @@ public void reconstructECBlockGroup(BlockLocationInfo blockLocationInfo,
       ECBlockOutputStream[] targetBlockStreams =
           new ECBlockOutputStream[toReconstructIndexes.size()];
+      ECBlockOutputStream[] emptyBlockStreams =
+          new ECBlockOutputStream[notReconstructIndexes.size()];
       ByteBuffer[] bufs = new ByteBuffer[toReconstructIndexes.size()];
       try {
+        // Create streams and buffers for all indexes that need reconstructed
         for (int i = 0; i < toReconstructIndexes.size(); i++) {
           int replicaIndex = toReconstructIndexes.get(i);
-          DatanodeDetails datanodeDetails =
-              targetMap.get(replicaIndex);
-          targetBlockStreams[i] = getECBlockOutputStream(blockLocationInfo,
-              datanodeDetails, repConfig, replicaIndex
-          );
+          DatanodeDetails datanodeDetails = targetMap.get(replicaIndex);
+          targetBlockStreams[i] = getECBlockOutputStream(blockLocationInfo, datanodeDetails, repConfig, replicaIndex);
           bufs[i] = byteBufferPool.getBuffer(false, repConfig.getEcChunkSize());
-          // Make sure it's clean. Don't want to reuse the erroneously returned
-          // buffers from the pool.
           bufs[i].clear();
         }
+        // Then create a stream for all indexes that don't need reconstructed, but still need a stream to
+        // write the empty block data to.
+        for (int i = 0; i < notReconstructIndexes.size(); i++) {
+          int replicaIndex = notReconstructIndexes.get(i);
+          DatanodeDetails datanodeDetails = targetMap.get(replicaIndex);
+          emptyBlockStreams[i] = getECBlockOutputStream(blockLocationInfo, datanodeDetails, repConfig, replicaIndex);
+        }
 
-        sis.setRecoveryIndexes(toReconstructIndexes.stream().map(i -> (i - 1))
-            .collect(Collectors.toSet()));
-        long length = safeBlockGroupLength;
-        while (length > 0) {
-          int readLen;
-          try {
-            readLen = sis.recoverChunks(bufs);
-            Set<Integer> failedIndexes = sis.getFailedIndexes();
-            if (!failedIndexes.isEmpty()) {
-              // There was a problem reading some of the block indexes, but we
-              // did not get an exception as there must have been spare indexes
-              // to try and recover from. Therefore we should log out the block
-              // group details in the same way as for the exception case below.
+        if (toReconstructIndexes.size() > 0) {
+          sis.setRecoveryIndexes(toReconstructIndexes.stream().map(i -> (i - 1))
+              .collect(Collectors.toSet()));
+          long length = safeBlockGroupLength;
+          while (length > 0) {
+            int readLen;
+            try {
+              readLen = sis.recoverChunks(bufs);
+              Set<Integer> failedIndexes = sis.getFailedIndexes();
+              if (!failedIndexes.isEmpty()) {
+                // There was a problem reading some of the block indexes, but we
+                // did not get an exception as there must have been spare indexes
+                // to try and recover from. Therefore we should log out the block
+                // group details in the same way as for the exception case below.
+                logBlockGroupDetails(blockLocationInfo, repConfig,
+                    blockDataGroup);
+              }
+            } catch (IOException e) {
+              // When we see exceptions here, it could be due to some transient
+              // issue that causes the block read to fail when reconstructing it,
+              // but we have seen issues where the containers don't have the
+              // blocks they appear they should have, or the block chunks are the
+              // wrong length etc. In order to debug these sort of cases, if we
+              // get an error, we will log out the details about the block group
+              // length on each source, along with their chunk list and chunk
+              // lengths etc.
               logBlockGroupDetails(blockLocationInfo, repConfig,
                   blockDataGroup);
+              throw e;
             }
-          } catch (IOException e) {
-            // When we see exceptions here, it could be due to some transient
-            // issue that causes the block read to fail when reconstructing it,
-            // but we have seen issues where the containers don't have the
-            // blocks they appear they should have, or the block chunks are the
-            // wrong length etc. In order to debug these sort of cases, if we
-            // get an error, we will log out the details about the block group
-            // length on each source, along with their chunk list and chunk
-            // lengths etc.
-            logBlockGroupDetails(blockLocationInfo, repConfig,
-                blockDataGroup);
-            throw e;
-          }
-          // TODO: can be submitted in parallel
-          for (int i = 0; i < bufs.length; i++) {
-            CompletableFuture<ContainerProtos.ContainerCommandResponseProto>
-                future = targetBlockStreams[i].write(bufs[i]);
-            checkFailures(targetBlockStreams[i], future);
-            bufs[i].clear();
+            // TODO: can be submitted in parallel
+            for (int i = 0; i < bufs.length; i++) {
+              CompletableFuture<ContainerProtos.ContainerCommandResponseProto>
+                  future = targetBlockStreams[i].write(bufs[i]);
+              checkFailures(targetBlockStreams[i], future);
+              bufs[i].clear();
+            }
+            length -= readLen;
           }
-          length -= readLen;
-        }
-
-        for (ECBlockOutputStream targetStream : targetBlockStreams) {
-          targetStream.executePutBlock(true, true,
-              blockLocationInfo.getLength(), blockDataGroup);
-          checkFailures(targetStream,
-              targetStream.getCurrentPutBlkResponseFuture());
+        }
+        List<ECBlockOutputStream> allStreams = new ArrayList<>(Arrays.asList(targetBlockStreams));
+        allStreams.addAll(Arrays.asList(emptyBlockStreams));
+        for (ECBlockOutputStream targetStream : allStreams) {
+          targetStream.executePutBlock(true, true, blockLocationInfo.getLength(), blockDataGroup);
+          checkFailures(targetStream, targetStream.getCurrentPutBlkResponseFuture());
         }
       } finally {
         for (ByteBuffer buf : bufs) {
           byteBufferPool.putBuffer(buf);
         }
         IOUtils.cleanupWithLogger(LOG, targetBlockStreams);
+        IOUtils.cleanupWithLogger(LOG, emptyBlockStreams);
       }
     }
   }
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
index f2fe3fa31a1c..5cd4ce63f891 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
@@ -614,8 +614,15 @@ void testECReconstructionCoordinatorWith(List<Integer> missingIndexes)
     testECReconstructionCoordinator(missingIndexes, 3);
   }
 
+  @ParameterizedTest
+  @MethodSource("recoverableMissingIndexes")
+  void testECReconstructionCoordinatorWithPartialStripe(List<Integer> missingIndexes)
+      throws Exception {
+    testECReconstructionCoordinator(missingIndexes, 1);
+  }
+
   @Test
-  void testECReconstructionWithPartialStripe()
+  void testECReconstructParityWithPartialStripe()
       throws Exception {
     testECReconstructionCoordinator(ImmutableList.of(4, 5), 1);
   }
@@ -895,18 +902,19 @@ private void checkBlockData(
       reconstructedBlockData) {
 
     for (int i = 0; i < blockData.length; i++) {
+      Assert.assertEquals(blockData[i].getBlockID(), reconstructedBlockData[i].getBlockID());
+      Assert.assertEquals(blockData[i].getSize(), reconstructedBlockData[i].getSize());
+      Assert.assertEquals(blockData[i].getMetadata(), reconstructedBlockData[i].getMetadata());
       List<ContainerProtos.ChunkInfo> oldBlockDataChunks = blockData[i].getChunks();
       List<ContainerProtos.ChunkInfo> newBlockDataChunks =
           reconstructedBlockData[i].getChunks();
       for (int j = 0; j < oldBlockDataChunks.size(); j++) {
         ContainerProtos.ChunkInfo chunkInfo = oldBlockDataChunks.get(j);
-        if (chunkInfo.getLen() == 0) {
-          // let's ignore the empty chunks
-          continue;
-        }
         Assert.assertEquals(chunkInfo, newBlockDataChunks.get(j));
       }
+      // Ensure there are no extra chunks in the reconstructed block
+      Assert.assertEquals(oldBlockDataChunks.size(), newBlockDataChunks.size());
     }
   }

From e453c61d0b214c1cfa4465520bc2dafb8064615a Mon Sep 17 00:00:00 2001
From: Stephen O'Donnell
Date: Fri, 12 Apr 2024 21:02:33 +0100
Subject: [PATCH 2/3] HDDS-10682. EC Reconstruction creates empty chunks at
 the end of blocks with partial stripes (#6515)

(cherry picked from commit a5fccbc1d6539cdccf5366edf26382a139897789)
---
 .../ec/reconstruction/ECReconstructionCoordinator.java | 10 +++++++---
 .../hdds/scm/storage/TestContainerCommandsEC.java      | 14 ++++++++------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
index 3d9b5b77ee35..62f96d8adf00 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java
@@ -320,9 +320,13 @@ public void reconstructECBlockGroup(BlockLocationInfo blockLocationInfo,
             }
             // TODO: can be submitted in parallel
             for (int i = 0; i < bufs.length; i++) {
-              CompletableFuture<ContainerProtos.ContainerCommandResponseProto>
-                  future = targetBlockStreams[i].write(bufs[i]);
-              checkFailures(targetBlockStreams[i], future);
+              if (bufs[i].remaining() != 0) {
+                // If the buffer is empty, we don't need to write it as it will cause
+                // an empty chunk to be added to the end of the block.
+                CompletableFuture<ContainerProtos.ContainerCommandResponseProto>
+                    future = targetBlockStreams[i].write(bufs[i]);
+                checkFailures(targetBlockStreams[i], future);
+              }
               bufs[i].clear();
             }
             length -= readLen;
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
index 5cd4ce63f891..c5f123935383 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
@@ -135,7 +135,9 @@ public class TestContainerCommandsEC {
   private static final int EC_CHUNK_SIZE = 1024 * 1024;
   private static final int STRIPE_DATA_SIZE = EC_DATA * EC_CHUNK_SIZE;
   private static final int NUM_DN = EC_DATA + EC_PARITY + 3;
-  private static byte[][] inputChunks = new byte[EC_DATA][EC_CHUNK_SIZE];
+  // Data slots are EC_DATA + 1 so we can generate enough data to have a full stripe
+  // plus one extra chunk.
+  private static byte[][] inputChunks = new byte[EC_DATA + 1][EC_CHUNK_SIZE];
 
   // Each key size will be in range [min, max), min inclusive, max exclusive
   private static final int[][] KEY_SIZE_RANGES =
@@ -621,13 +623,13 @@ void testECReconstructionCoordinatorWithPartialStripe(List<Integer> missingIndexes)
     testECReconstructionCoordinator(missingIndexes, 1);
   }
 
-  @Test
-  void testECReconstructParityWithPartialStripe()
-      throws Exception {
-    testECReconstructionCoordinator(ImmutableList.of(4, 5), 1);
+  @ParameterizedTest
+  @MethodSource("recoverableMissingIndexes")
+  void testECReconstructionCoordinatorWithFullAndPartialStripe(List<Integer> missingIndexes)
+      throws Exception {
+    testECReconstructionCoordinator(missingIndexes, 4);
   }
 
-
   static Stream<List<Integer>> recoverableMissingIndexes() {
     return Stream
         .concat(IntStream.rangeClosed(1, 5).mapToObj(ImmutableList::of), Stream
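
Taken together, PATCH 1 and PATCH 2 reshape the coordinator's write path:
every missing replica index now gets an output stream and a final putBlock,
but chunk data is only written for indexes that actually carry bytes, and
only when the recovered buffer is non-empty. A minimal sketch of that control
flow, using a hypothetical Target type standing in for ECBlockOutputStream
(writeStripes condenses the recoverChunks loop; this is not the real Ozone
API):

    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import java.util.List;

    final class WritePathSketch {
      // Hypothetical stand-in for ECBlockOutputStream.
      interface Target {
        void write(ByteBuffer buf);   // analogous to ECBlockOutputStream#write
        void putBlock();              // analogous to executePutBlock(...)
      }

      static void writeStripes(List<Target> toReconstruct,
          List<Target> paddingOnly, List<ByteBuffer> recovered) {
        // PATCH 1: run the recovery loop only when some index holds real data.
        if (!toReconstruct.isEmpty()) {
          for (int i = 0; i < toReconstruct.size(); i++) {
            ByteBuffer buf = recovered.get(i);
            // PATCH 2: skip empty buffers so a block that ends on a partial
            // stripe does not gain a zero-length chunk at its end.
            if (buf.remaining() != 0) {
              toReconstruct.get(i).write(buf);
            }
            buf.clear();
          }
        }
        // Every target, including padding-only indexes, still gets a putBlock
        // so its (possibly empty) block metadata is created on the datanode.
        List<Target> all = new ArrayList<>(toReconstruct);
        all.addAll(paddingOnly);
        for (Target t : all) {
          t.putBlock();
        }
      }
    }
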
From 1f16f430d2f4afb00280b6bd03dfca2751bfb55f Mon Sep 17 00:00:00 2001
From: Stephen O'Donnell
Date: Wed, 17 Apr 2024 11:01:06 +0100
Subject: [PATCH 3/3] HDDS-10704. Do not fail read of EC block if the last
 chunk is empty (#6540)

(cherry picked from commit 4f9b86ece1184de830d305ee8bd1e71bd1ca5d51)
(cherry picked from commit 962a72d159282a07aeb75a27c966cac7bc6a572b)
---
 .../hadoop/hdds/scm/storage/BlockInputStream.java | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockInputStream.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockInputStream.java
index 385ea6d0c3ea..3d789912a66b 100644
--- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockInputStream.java
+++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockInputStream.java
@@ -289,10 +289,18 @@ private static void validate(ContainerCommandResponseProto response)
       throw new IllegalArgumentException("Not GetBlock: response=" + response);
     }
     final GetBlockResponseProto b = response.getGetBlock();
+    final long blockLength = b.getBlockData().getSize();
     final List<ChunkInfo> chunks = b.getBlockData().getChunksList();
     for (int i = 0; i < chunks.size(); i++) {
       final ChunkInfo c = chunks.get(i);
-      if (c.getLen() <= 0) {
+      // HDDS-10682 caused an empty chunk to get written to the end of some EC blocks. Due to this
+      // validation, these blocks will not be readable. In the EC case, the empty chunk is always
+      // the last chunk and the offset is the block length. We can safely ignore this case and not fail.
+      if (c.getLen() <= 0 && i == chunks.size() - 1 && c.getOffset() == blockLength) {
+        DatanodeBlockID blockID = b.getBlockData().getBlockID();
+        LOG.warn("The last chunk is empty for container/block {}/{} with an offset of the block length. " +
+            "Likely due to HDDS-10682. This is safe to ignore.", blockID.getContainerID(), blockID.getLocalID());
+      } else if (c.getLen() <= 0) {
         throw new IOException("Failed to get chunkInfo[" + i
             + "]: len == " + c.getLen());
       }
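
The read-side rule from PATCH 3 can be stated compactly: a zero-length chunk
fails validation unless it is the final chunk of the block and its offset
equals the block length, which is exactly the artifact HDDS-10682 used to
leave behind. A minimal standalone sketch of that check, with a simplified
Chunk record standing in for ContainerProtos.ChunkInfo (the real validate()
also logs a warning rather than silently skipping):

    import java.io.IOException;
    import java.util.List;

    final class ChunkValidationSketch {
      // Simplified stand-in for ContainerProtos.ChunkInfo.
      record Chunk(long len, long offset) { }

      static void validateChunks(List<Chunk> chunks, long blockLength)
          throws IOException {
        for (int i = 0; i < chunks.size(); i++) {
          Chunk c = chunks.get(i);
          if (c.len() <= 0) {
            // Tolerate only a trailing empty chunk whose offset is the block
            // length; anything else is still treated as a corrupt chunk list.
            boolean trailingEmpty = i == chunks.size() - 1
                && c.offset() == blockLength;
            if (!trailingEmpty) {
              throw new IOException(
                  "Failed to get chunkInfo[" + i + "]: len == " + c.len());
            }
          }
        }
      }
    }
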