From ff5fff2a8625eb498fa32e356d59febb659f25ab Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 6 Feb 2024 10:59:22 +0530 Subject: [PATCH 01/10] HDDS-10293. IllegalArgumentException: containerSize Negative - Container ID #3979. --- .../recon/tasks/ContainerSizeCountTask.java | 55 ++++++++----- .../tasks/TestContainerSizeCountTask.java | 79 ++++++++++++++++++- 2 files changed, 112 insertions(+), 22 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index fb387861f0e3..864cc818b3b5 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -19,6 +19,8 @@ package org.apache.hadoop.ozone.recon.tasks; +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; @@ -41,6 +43,7 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETED; import static org.hadoop.ozone.recon.schema.tables.ContainerCountBySizeTable.CONTAINER_COUNT_BY_SIZE; @@ -122,6 +125,11 @@ private void process(ContainerInfo container, Map map) { final ContainerID id = container.containerID(); final long currentSize = container.getUsedBytes(); + if (currentSize < 0) { + LOG.error("Negative container size: {} for container: {}", currentSize, + id); + return; + } final Long previousSize = processedContainers.put(id, currentSize); if (previousSize != null) { decrementContainerSizeCount(previousSize, map); @@ -132,36 +140,40 @@ private void process(ContainerInfo container, /** * The process() function is responsible for updating the counts of * containers being tracked in a containerSizeCountMap based on the - * ContainerInfo objects in the list containers.It then iterates through + * ContainerInfo objects in the list containers. It then iterates through * the list of containers and does the following for each container: * - * 1) If the container is not present in processedContainers, - * it is a new container, so it is added to the processedContainers map - * and the count for its size in the containerSizeCountMap is incremented - * by 1 using the handlePutKeyEvent() function. - * 2) If the container is present in processedContainers but its size has - * been updated to the new size then the count for the old size in the - * containerSizeCountMap is decremented by 1 using the - * handleDeleteKeyEvent() function. The count for the new size is then - * incremented by 1 using the handlePutKeyEvent() function. - * 3) If the container is not present in containers list, it means the - * container has been deleted. - * The remaining containers inside the deletedContainers map are the ones - * that are not in the cluster and need to be deleted. Finally, the counts in - * the containerSizeCountMap are written to the database using the - * writeCountsToDB() function. + * 1) If the container's state is not "deleted," it will be processed: + * - If the container is not present in processedContainers, it is a new + * container. Therefore, it is added to the processedContainers map, and + * the count for its size in the containerSizeCountMap is incremented by + * 1 using the handlePutKeyEvent() function. + * - If the container is present in processedContainers but its size has + * been updated to a new size, the count for the old size in the + * containerSizeCountMap is decremented by 1 using the + * handleDeleteKeyEvent() function. Subsequently, the count for the new + * size is incremented by 1 using the handlePutKeyEvent() function. + * + * 2) If the container's state is "deleted," it is skipped, as deleted + * containers are not processed. + * + * After processing, the remaining containers inside the deletedContainers map + * are those that are not in the cluster and need to be deleted from the total + * size counts. Finally, the counts in the containerSizeCountMap are written + * to the database using the writeCountsToDB() function. */ public void process(List containers) { lock.writeLock().lock(); try { final Map containerSizeCountMap = new HashMap<>(); - final Map deletedContainers - = new HashMap<>(processedContainers); - + final Map deletedContainers = + new HashMap<>(processedContainers); // Loop to handle container create and size-update operations for (ContainerInfo container : containers) { - // The containers present in the cache hence it is not yet deleted + if (container.getState().equals(DELETED)) { + continue; // Skip deleted containers + } deletedContainers.remove(container.containerID()); // For New Container being created try { @@ -261,6 +273,9 @@ private void handleContainerDeleteOperations( Map containerSizeCountMap) { for (Map.Entry containerId : deletedContainers.entrySet()) { + // processedContainers will only keep a track of all containers that have + // been processed except DELETED containers. + processedContainers.remove(containerId.getKey()); long containerSize = deletedContainers.get(containerId.getKey()); decrementContainerSizeCount(containerSize, containerSizeCountMap); } diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java index eff330a796c9..67d7694f08ea 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java @@ -18,6 +18,7 @@ package org.apache.hadoop.ozone.recon.tasks; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.*; import static org.hadoop.ozone.recon.schema.tables.ContainerCountBySizeTable.CONTAINER_COUNT_BY_SIZE; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.BDDMockito.given; @@ -84,18 +85,21 @@ public void setUp() { @Test public void testProcess() { // mock a container with invalid used bytes - final ContainerInfo omContainerInfo0 = mock(ContainerInfo.class); + ContainerInfo omContainerInfo0 = mock(ContainerInfo.class); given(omContainerInfo0.containerID()).willReturn(new ContainerID(0)); given(omContainerInfo0.getUsedBytes()).willReturn(-1L); + given(omContainerInfo0.getState()).willReturn(OPEN); // Write 2 keys ContainerInfo omContainerInfo1 = mock(ContainerInfo.class); given(omContainerInfo1.containerID()).willReturn(new ContainerID(1)); given(omContainerInfo1.getUsedBytes()).willReturn(1500000000L); // 1.5GB + given(omContainerInfo1.getState()).willReturn(CLOSED); ContainerInfo omContainerInfo2 = mock(ContainerInfo.class); given(omContainerInfo2.containerID()).willReturn(new ContainerID(2)); given(omContainerInfo2.getUsedBytes()).willReturn(2500000000L); // 2.5GB + given(omContainerInfo2.getState()).willReturn(CLOSING); // mock getContainers method to return a list of containers List containers = new ArrayList<>(); @@ -124,10 +128,11 @@ public void testProcess() { containerCountBySizeDao.findById(recordToFind.value1()).getCount() .longValue()); - // Add a new key + // Add a new container ContainerInfo omContainerInfo3 = mock(ContainerInfo.class); given(omContainerInfo3.containerID()).willReturn(new ContainerID(3)); given(omContainerInfo3.getUsedBytes()).willReturn(1000000000L); // 1GB + given(omContainerInfo3.getState()).willReturn(QUASI_CLOSED); containers.add(omContainerInfo3); // Update existing key. @@ -164,4 +169,74 @@ public void testProcess() { .getCount() .longValue()); } + + @Test + public void testProcessDeletedContainers() { + // Create a list of containers, including one that is deleted + ContainerInfo omContainerInfo1 = mock(ContainerInfo.class); + given(omContainerInfo1.containerID()).willReturn(new ContainerID(1)); + given(omContainerInfo1.getUsedBytes()).willReturn(1500000000L); // 1.5GB + given(omContainerInfo1.getState()).willReturn(OPEN); + + ContainerInfo omContainerInfo2 = mock(ContainerInfo.class); + given(omContainerInfo2.containerID()).willReturn(new ContainerID(2)); + given(omContainerInfo2.getUsedBytes()).willReturn(2500000000L); // 2.5GB + given(omContainerInfo2.getState()).willReturn(CLOSED); + + ContainerInfo omContainerInfoDeleted = mock(ContainerInfo.class); + given(omContainerInfoDeleted.containerID()).willReturn(new ContainerID(3)); + given(omContainerInfoDeleted.getUsedBytes()).willReturn(1000000000L); + given(omContainerInfoDeleted.getState()).willReturn(DELETED); // 1GB + + List containers = new ArrayList<>(); + containers.add(omContainerInfo1); + containers.add(omContainerInfo2); + containers.add(omContainerInfoDeleted); + + task.process(containers); + + // Verify that the deleted container is not counted + assertEquals(2, containerCountBySizeDao.count()); + } + + @Test + public void testProcessWithNegativeSizeContainers() { + // Create a mock container with negative size + final ContainerInfo negativeSizeContainer = mock(ContainerInfo.class); + given(negativeSizeContainer.containerID()).willReturn(new ContainerID(0)); + given(negativeSizeContainer.getUsedBytes()).willReturn(-1L); + given(negativeSizeContainer.getState()).willReturn(OPEN); + + // Create a mock container with negative size and DELETE state + final ContainerInfo negativeSizeDeletedContainer = + mock(ContainerInfo.class); + given(negativeSizeDeletedContainer.containerID()).willReturn( + new ContainerID(0)); + given(negativeSizeDeletedContainer.getUsedBytes()).willReturn(-1L); + given(negativeSizeDeletedContainer.getState()).willReturn(DELETED); + + // Create a container with valid size + final ContainerInfo validSizeContainer = mock(ContainerInfo.class); + given(validSizeContainer.containerID()).willReturn(new ContainerID(1)); + given(validSizeContainer.getUsedBytes()).willReturn(1000000000L); // 1GB + given(validSizeContainer.getState()).willReturn(CLOSED); + + // Mock getContainers method to return a list of containers including + // the one with negative size + List containers = new ArrayList<>(); + containers.add(negativeSizeContainer); + containers.add(validSizeContainer); + + task.process(containers); + + // Verify that only the container with valid size has been processed + assertEquals(1, containerCountBySizeDao.count()); + + // Check whether container size upper bound for 1000000000L is correctly added + Record1 recordToFind = dslContext.newRecord( + CONTAINER_COUNT_BY_SIZE.CONTAINER_SIZE).value1(1073741824L); // 1GB + assertEquals(1L, containerCountBySizeDao.findById(recordToFind.value1()) + .getCount().longValue()); + } + } From 2ec77259269d12553b0b492b5793e274b5702bcb Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 6 Feb 2024 11:10:22 +0530 Subject: [PATCH 02/10] Merge both the added test methods together --- .../tasks/TestContainerSizeCountTask.java | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java index 67d7694f08ea..18a746262e24 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java @@ -171,7 +171,7 @@ public void testProcess() { } @Test - public void testProcessDeletedContainers() { + public void testProcessDeletedAndNegativeSizedContainers() { // Create a list of containers, including one that is deleted ContainerInfo omContainerInfo1 = mock(ContainerInfo.class); given(omContainerInfo1.containerID()).willReturn(new ContainerID(1)); @@ -188,19 +188,6 @@ public void testProcessDeletedContainers() { given(omContainerInfoDeleted.getUsedBytes()).willReturn(1000000000L); given(omContainerInfoDeleted.getState()).willReturn(DELETED); // 1GB - List containers = new ArrayList<>(); - containers.add(omContainerInfo1); - containers.add(omContainerInfo2); - containers.add(omContainerInfoDeleted); - - task.process(containers); - - // Verify that the deleted container is not counted - assertEquals(2, containerCountBySizeDao.count()); - } - - @Test - public void testProcessWithNegativeSizeContainers() { // Create a mock container with negative size final ContainerInfo negativeSizeContainer = mock(ContainerInfo.class); given(negativeSizeContainer.containerID()).willReturn(new ContainerID(0)); @@ -222,21 +209,19 @@ public void testProcessWithNegativeSizeContainers() { given(validSizeContainer.getState()).willReturn(CLOSED); // Mock getContainers method to return a list of containers including - // the one with negative size + // both valid and invalid ones List containers = new ArrayList<>(); + containers.add(omContainerInfo1); + containers.add(omContainerInfo2); + containers.add(omContainerInfoDeleted); containers.add(negativeSizeContainer); + containers.add(negativeSizeDeletedContainer); containers.add(validSizeContainer); task.process(containers); - // Verify that only the container with valid size has been processed - assertEquals(1, containerCountBySizeDao.count()); - - // Check whether container size upper bound for 1000000000L is correctly added - Record1 recordToFind = dslContext.newRecord( - CONTAINER_COUNT_BY_SIZE.CONTAINER_SIZE).value1(1073741824L); // 1GB - assertEquals(1L, containerCountBySizeDao.findById(recordToFind.value1()) - .getCount().longValue()); + // Verify that only the valid containers are counted + assertEquals(2, containerCountBySizeDao.count()); } } From b3f862003e9e7bce832a92f1e0c5187c4ec2a803 Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 6 Feb 2024 11:17:56 +0530 Subject: [PATCH 03/10] Fixed checkstyle issues --- .../hadoop/ozone/recon/tasks/ContainerSizeCountTask.java | 2 -- .../ozone/recon/tasks/TestContainerSizeCountTask.java | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 864cc818b3b5..010c068deb5a 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -19,8 +19,6 @@ package org.apache.hadoop.ozone.recon.tasks; -import com.google.common.base.Preconditions; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java index 18a746262e24..83ae9858cfda 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java @@ -18,7 +18,11 @@ package org.apache.hadoop.ozone.recon.tasks; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.*; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.QUASI_CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSING; import static org.hadoop.ozone.recon.schema.tables.ContainerCountBySizeTable.CONTAINER_COUNT_BY_SIZE; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.BDDMockito.given; From b4bb56e055a47dba66378778c70585d201ec29de Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 6 Feb 2024 12:35:37 +0530 Subject: [PATCH 04/10] Fixed failing UT --- .../java/org/apache/hadoop/ozone/recon/api/TestEndpoints.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestEndpoints.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestEndpoints.java index 05d9927d6c93..b1ae355531b3 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestEndpoints.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/api/TestEndpoints.java @@ -866,10 +866,12 @@ public void testGetContainerCounts() throws Exception { ContainerInfo omContainerInfo1 = mock(ContainerInfo.class); given(omContainerInfo1.containerID()).willReturn(new ContainerID(1)); given(omContainerInfo1.getUsedBytes()).willReturn(1500000000L); // 1.5GB + given(omContainerInfo1.getState()).willReturn(LifeCycleState.OPEN); ContainerInfo omContainerInfo2 = mock(ContainerInfo.class); given(omContainerInfo2.containerID()).willReturn(new ContainerID(2)); given(omContainerInfo2.getUsedBytes()).willReturn(2500000000L); // 2.5GB + given(omContainerInfo2.getState()).willReturn(LifeCycleState.OPEN); // Create a list of container info objects List containers = new ArrayList<>(); From 63ef581dfdee641c6609558da7f3727e70e73b3f Mon Sep 17 00:00:00 2001 From: arafat Date: Thu, 8 Feb 2024 13:23:53 +0530 Subject: [PATCH 05/10] Fixed review comments --- .../ozone/recon/tasks/ContainerSizeCountTask.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 010c068deb5a..40bc29225654 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -256,10 +256,10 @@ public String getTaskName() { /** * - * The handleContainerDeleteOperations() function loops through the entries - * in the deletedContainers map and calls the handleDeleteKeyEvent() function - * for each one. This will decrement the size counts of those containers by - * one which are no longer present in the cluster + * Handles the deletion of containers by updating the tracking of processed containers + * and adjusting the count of containers based on their sizes. When a container is deleted, + * it is removed from the tracking of processed containers, and the count of containers + * corresponding to its size is decremented in the container size count map. * * Used by process() * @@ -325,7 +325,7 @@ private static void updateContainerSizeCount(long containerSize, int delta, Map containerSizeCountMap) { ContainerSizeCountKey key = getContainerSizeCountKey(containerSize); containerSizeCountMap.compute(key, - (k, previous) -> previous != null ? previous + delta : delta); + (k, previous) -> previous != null ? previous + delta : 0); } /** From b4e2408d6f9a29590bbaa9e319f273ceecc1d668 Mon Sep 17 00:00:00 2001 From: arafat Date: Mon, 12 Feb 2024 13:55:38 +0530 Subject: [PATCH 06/10] Persisting the containers with negative sizes in the UnhealthyContainerTable --- .../recon/schema/ContainerSchemaDefinition.java | 5 +++-- .../api/types/UnhealthyContainersResponse.java | 13 +++++++++++++ .../ozone/recon/fsck/ContainerHealthTask.java | 15 +++++++++++++++ .../ozone/recon/tasks/ContainerSizeCountTask.java | 8 ++++---- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java b/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java index 43e2d728b763..8e5bfda069a9 100644 --- a/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java +++ b/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java @@ -51,8 +51,9 @@ public enum UnHealthyContainerStates { UNDER_REPLICATED, OVER_REPLICATED, MIS_REPLICATED, - ALL_REPLICAS_UNHEALTHY - } + ALL_REPLICAS_UNHEALTHY, + NEGATIVE_SIZE // Added new state to track containers with negative sizes + } private static final String CONTAINER_ID = "container_id"; private static final String CONTAINER_STATE = "container_state"; diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/UnhealthyContainersResponse.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/UnhealthyContainersResponse.java index eaf08d9ca83e..ba03ec61f145 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/UnhealthyContainersResponse.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/api/types/UnhealthyContainersResponse.java @@ -50,6 +50,12 @@ public class UnhealthyContainersResponse { @JsonProperty("misReplicatedCount") private long misReplicatedCount = 0; + /** + * Total count of containers with negative size. + */ + @JsonProperty("negativeSizeCount") + private long negativeSizeCount = 0; + /** * A collection of unhealthy containers. */ @@ -77,6 +83,9 @@ public void setSummaryCount(String state, long count) { } else if (state.equals( UnHealthyContainerStates.MIS_REPLICATED.toString())) { this.misReplicatedCount = count; + } else if (state.equals( + UnHealthyContainerStates.NEGATIVE_SIZE.toString())) { + this.negativeSizeCount = count; } } @@ -96,6 +105,10 @@ public long getMisReplicatedCount() { return misReplicatedCount; } + public long getNegativeSizeCount() { + return negativeSizeCount; + } + public Collection getContainers() { return containers; } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java index 577fb7d2bcc1..44c02d478972 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java @@ -217,6 +217,8 @@ private void initializeUnhealthyContainerStateStatsMap( UnHealthyContainerStates.OVER_REPLICATED, new HashMap<>()); unhealthyContainerStateStatsMap.put( UnHealthyContainerStates.MIS_REPLICATED, new HashMap<>()); + unhealthyContainerStateStatsMap.put( + UnHealthyContainerStates.NEGATIVE_SIZE, new HashMap<>()); } private ContainerHealthStatus setCurrentContainer(long recordId) @@ -499,6 +501,19 @@ public static List generateUnhealthyRecords( unhealthyContainerStateStatsMap); } + ContainerInfo containerInfo = container.getContainer(); + if (containerInfo.getUsedBytes() < 0) { + LOG.error("Container {} has negative size. Please visit Recon's " + + "missing container page for a list of keys (and their metadata) " + + "mapped to this container.", containerInfo.getContainerID()); + records.add( + recordForState(container, UnHealthyContainerStates.NEGATIVE_SIZE, + time)); + populateContainerStats(container, + UnHealthyContainerStates.NEGATIVE_SIZE, + unhealthyContainerStateStatsMap); + } + return records; } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 40bc29225654..b9b1da02c05d 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -25,6 +25,7 @@ import org.apache.hadoop.ozone.recon.ReconUtils; import org.apache.hadoop.ozone.recon.scm.ReconScmTask; import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider; +import org.hadoop.ozone.recon.schema.ContainerSchemaDefinition; import org.hadoop.ozone.recon.schema.UtilizationSchemaDefinition; import org.hadoop.ozone.recon.schema.tables.daos.ContainerCountBySizeDao; import org.hadoop.ozone.recon.schema.tables.daos.ReconTaskStatusDao; @@ -34,10 +35,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -61,6 +59,8 @@ public class ContainerSizeCountTask extends ReconScmTask { private ContainerCountBySizeDao containerCountBySizeDao; private DSLContext dslContext; private HashMap processedContainers = new HashMap<>(); + private Map> + unhealthyContainerStateStatsMap; private ReadWriteLock lock = new ReentrantReadWriteLock(true); public ContainerSizeCountTask( From edeea8acf91517cb01f2c0b1ccbc348a4323770a Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 13 Feb 2024 03:15:51 +0530 Subject: [PATCH 07/10] Finished unit tests for identifying containers with negative size. --- .../schema/ContainerSchemaDefinition.java | 2 +- .../ozone/recon/fsck/ContainerHealthTask.java | 49 ++++++++++----- .../recon/tasks/ContainerSizeCountTask.java | 5 +- .../recon/fsck/TestContainerHealthTask.java | 59 +++++++++++++++++++ 4 files changed, 99 insertions(+), 16 deletions(-) diff --git a/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java b/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java index 8e5bfda069a9..4d62ca886cda 100644 --- a/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java +++ b/hadoop-ozone/recon-codegen/src/main/java/org/hadoop/ozone/recon/schema/ContainerSchemaDefinition.java @@ -53,7 +53,7 @@ public enum UnHealthyContainerStates { MIS_REPLICATED, ALL_REPLICAS_UNHEALTHY, NEGATIVE_SIZE // Added new state to track containers with negative sizes - } + } private static final String CONTAINER_ID = "container_id"; private static final String CONTAINER_STATE = "container_state"; diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java index 44c02d478972..a5d259d3e939 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/fsck/ContainerHealthTask.java @@ -315,13 +315,21 @@ private long processExistingDBRecords(long currentTime, private void processContainer(ContainerInfo container, long currentTime, Map> - unhealthyContainerStateStatsMap) { + unhealthyContainerStateStatsMap) { try { Set containerReplicas = containerManager.getContainerReplicas(container.containerID()); ContainerHealthStatus h = new ContainerHealthStatus(container, containerReplicas, placementPolicy, reconContainerMetadataManager, conf); + + // Handle negative sized containers separately + if (h.getContainer().getUsedBytes() < 0) { + handleNegativeSizedContainers(h, currentTime, + unhealthyContainerStateStatsMap); + return; + } + if (h.isHealthilyReplicated() || h.isDeleted()) { return; } @@ -367,6 +375,32 @@ private boolean containerDeletedInSCM(ContainerInfo containerInfo) { return false; } + /** + * This method is used to handle containers with negative sizes. It logs an + * error message and inserts a record into the UNHEALTHY_CONTAINERS table. + * @param containerHealthStatus + * @param currentTime + * @param unhealthyContainerStateStatsMap + */ + private void handleNegativeSizedContainers( + ContainerHealthStatus containerHealthStatus, long currentTime, + Map> + unhealthyContainerStateStatsMap) { + ContainerInfo container = containerHealthStatus.getContainer(); + LOG.error( + "Container {} has negative size. Please visit Recon's unhealthy " + + "container endpoint for more details.", + container.getContainerID()); + UnhealthyContainers record = + ContainerHealthRecords.recordForState(containerHealthStatus, + UnHealthyContainerStates.NEGATIVE_SIZE, currentTime); + List records = Collections.singletonList(record); + populateContainerStats(containerHealthStatus, + UnHealthyContainerStates.NEGATIVE_SIZE, + unhealthyContainerStateStatsMap); + containerHealthSchemaManager.insertUnhealthyContainerRecords(records); + } + /** * Helper methods to generate and update the required database records for * unhealthy containers. @@ -501,19 +535,6 @@ public static List generateUnhealthyRecords( unhealthyContainerStateStatsMap); } - ContainerInfo containerInfo = container.getContainer(); - if (containerInfo.getUsedBytes() < 0) { - LOG.error("Container {} has negative size. Please visit Recon's " + - "missing container page for a list of keys (and their metadata) " + - "mapped to this container.", containerInfo.getContainerID()); - records.add( - recordForState(container, UnHealthyContainerStates.NEGATIVE_SIZE, - time)); - populateContainerStats(container, - UnHealthyContainerStates.NEGATIVE_SIZE, - unhealthyContainerStateStatsMap); - } - return records; } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index b9b1da02c05d..2cfd47173997 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -35,7 +35,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.ArrayList; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/fsck/TestContainerHealthTask.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/fsck/TestContainerHealthTask.java index 371fb6f9d675..001d44d9c203 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/fsck/TestContainerHealthTask.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/fsck/TestContainerHealthTask.java @@ -343,6 +343,65 @@ public void testDeletedContainer() throws Exception { .isGreaterThan(currentTime); } + @Test + public void testNegativeSizeContainers() throws Exception { + // Setup mock objects and test environment + UnhealthyContainersDao unhealthyContainersDao = + getDao(UnhealthyContainersDao.class); + ContainerHealthSchemaManager containerHealthSchemaManager = + new ContainerHealthSchemaManager( + getSchemaDefinition(ContainerSchemaDefinition.class), + unhealthyContainersDao); + ReconStorageContainerManagerFacade scmMock = + mock(ReconStorageContainerManagerFacade.class); + ContainerManager containerManagerMock = mock(ContainerManager.class); + StorageContainerServiceProvider scmClientMock = + mock(StorageContainerServiceProvider.class); + ReconContainerMetadataManager reconContainerMetadataManager = + mock(ReconContainerMetadataManager.class); + MockPlacementPolicy placementMock = new MockPlacementPolicy(); + + // Mock container info setup + List mockContainers = getMockContainers(3); + when(scmMock.getContainerManager()).thenReturn(containerManagerMock); + when(scmMock.getScmServiceProvider()).thenReturn(scmClientMock); + when(containerManagerMock.getContainers(any(ContainerID.class), + anyInt())).thenReturn(mockContainers); + for (ContainerInfo c : mockContainers) { + when(containerManagerMock.getContainer( + c.containerID())).thenReturn(c); + when(scmClientMock.getContainerWithPipeline( + c.getContainerID())).thenReturn(new ContainerWithPipeline(c, null)); + when(containerManagerMock.getContainer(c.containerID()) + .getUsedBytes()).thenReturn(Long.valueOf(-10)); + } + + // Verify the table is initially empty + assertThat(unhealthyContainersDao.findAll()).isEmpty(); + + // Setup and start the container health task + ReconTaskStatusDao reconTaskStatusDao = getDao(ReconTaskStatusDao.class); + ReconTaskConfig reconTaskConfig = new ReconTaskConfig(); + reconTaskConfig.setMissingContainerTaskInterval(Duration.ofSeconds(2)); + ContainerHealthTask containerHealthTask = new ContainerHealthTask( + scmMock.getContainerManager(), scmMock.getScmServiceProvider(), + reconTaskStatusDao, + containerHealthSchemaManager, placementMock, reconTaskConfig, + reconContainerMetadataManager, + new OzoneConfiguration()); + containerHealthTask.start(); + + // Wait for the task to identify unhealthy containers + LambdaTestUtils.await(6000, 1000, + () -> unhealthyContainersDao.count() == 3); + + // Assert that all unhealthy containers have been identified as NEGATIVE_SIZE states + List negativeSizeContainers = + unhealthyContainersDao.fetchByContainerState("NEGATIVE_SIZE"); + assertThat(negativeSizeContainers).hasSize(3); + } + + private Set getMockReplicas( long containerId, State...states) { Set replicas = new HashSet<>(); From 345b2aa1fdec48af2ecc179117a4dfbfdabe7127 Mon Sep 17 00:00:00 2001 From: arafat Date: Tue, 13 Feb 2024 03:35:56 +0530 Subject: [PATCH 08/10] Made review chages --- .../hadoop/ozone/recon/tasks/ContainerSizeCountTask.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 2cfd47173997..92a5406daedd 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -168,8 +168,9 @@ public void process(List containers) { try { final Map containerSizeCountMap = new HashMap<>(); - final Map deletedContainers = - new HashMap<>(processedContainers); + final Map deletedContainers + = new HashMap<>(processedContainers); + // Loop to handle container create and size-update operations for (ContainerInfo container : containers) { if (container.getState().equals(DELETED)) { @@ -328,7 +329,7 @@ private static void updateContainerSizeCount(long containerSize, int delta, Map containerSizeCountMap) { ContainerSizeCountKey key = getContainerSizeCountKey(containerSize); containerSizeCountMap.compute(key, - (k, previous) -> previous != null ? previous + delta : 0); + (k, previous) -> previous != null ? previous + delta : delta); } /** From 75e66daac2a8c3992da979dacbcbc2866b8a9bf6 Mon Sep 17 00:00:00 2001 From: arafat Date: Thu, 22 Feb 2024 00:11:54 +0530 Subject: [PATCH 09/10] Treating negative container sizes and zero --- .../ozone/recon/tasks/ContainerSizeCountTask.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 92a5406daedd..93b2d9c0c5ab 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -125,12 +125,17 @@ protected synchronized void run() { private void process(ContainerInfo container, Map map) { final ContainerID id = container.containerID(); - final long currentSize = container.getUsedBytes(); - if (currentSize < 0) { - LOG.error("Negative container size: {} for container: {}", currentSize, - id); - return; + final long usedBytes = container.getUsedBytes(); + final long currentSize; + + if (usedBytes < 0) { + LOG.warn("Negative usedBytes ({}) for container {}, treating it as 0", + usedBytes, id); + currentSize = 0; + } else { + currentSize = usedBytes; } + final Long previousSize = processedContainers.put(id, currentSize); if (previousSize != null) { decrementContainerSizeCount(previousSize, map); From 3757c65acc2fa5a8076a22c620250d8006ad24de Mon Sep 17 00:00:00 2001 From: arafat Date: Thu, 22 Feb 2024 12:37:44 +0530 Subject: [PATCH 10/10] Fixed failing UT --- .../ozone/recon/tasks/ContainerSizeCountTask.java | 11 +++++++++-- .../ozone/recon/tasks/TestContainerSizeCountTask.java | 10 +++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java index 93b2d9c0c5ab..105406f2bdf6 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ContainerSizeCountTask.java @@ -338,19 +338,26 @@ private static void updateContainerSizeCount(long containerSize, int delta, } /** - * * The purpose of this function is to categorize containers into different * size ranges, or "bins," based on their size. * The ContainerSizeCountKey object is used to store the upper bound value * for each size range, and is later used to lookup the count of containers * in that size range within a Map. * - * Used by decrementContainerSizeCount() and incrementContainerSizeCount() + * If the container size is 0, the method sets the size of + * ContainerSizeCountKey as zero without calculating the upper bound. Used by + * decrementContainerSizeCount() and incrementContainerSizeCount() * * @param containerSize to calculate the upperSizeBound */ private static ContainerSizeCountKey getContainerSizeCountKey( long containerSize) { + // If containerSize is 0, return a ContainerSizeCountKey with size 0 + if (containerSize == 0) { + return new ContainerSizeCountKey(0L); + } + + // Otherwise, calculate the upperSizeBound return new ContainerSizeCountKey( ReconUtils.getContainerSizeUpperBound(containerSize)); } diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java index 83ae9858cfda..a996f167a1bb 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/tasks/TestContainerSizeCountTask.java @@ -113,8 +113,8 @@ public void testProcess() { task.process(containers); - // Verify 2 containers are in correct bins. - assertEquals(2, containerCountBySizeDao.count()); + // Verify 3 containers are in correct bins. + assertEquals(3, containerCountBySizeDao.count()); // container size upper bound for // 1500000000L (1.5GB) is 2147483648L = 2^31 = 2GB (next highest power of 2) @@ -146,7 +146,7 @@ public void testProcess() { task.process(containers); // Total size groups added to the database - assertEquals(4, containerCountBySizeDao.count()); + assertEquals(5, containerCountBySizeDao.count()); // Check whether container size upper bound for // 50000L is 536870912L = 2^29 = 512MB (next highest power of 2) @@ -206,7 +206,7 @@ public void testProcessDeletedAndNegativeSizedContainers() { given(negativeSizeDeletedContainer.getUsedBytes()).willReturn(-1L); given(negativeSizeDeletedContainer.getState()).willReturn(DELETED); - // Create a container with valid size + // Create a mock container with id 1 and updated size of 1GB from 1.5GB final ContainerInfo validSizeContainer = mock(ContainerInfo.class); given(validSizeContainer.containerID()).willReturn(new ContainerID(1)); given(validSizeContainer.getUsedBytes()).willReturn(1000000000L); // 1GB @@ -225,7 +225,7 @@ public void testProcessDeletedAndNegativeSizedContainers() { task.process(containers); // Verify that only the valid containers are counted - assertEquals(2, containerCountBySizeDao.count()); + assertEquals(3, containerCountBySizeDao.count()); } }