From d9ddd43c4ae53de4a10d14b58cb25ad7ffedf654 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Thu, 13 Mar 2025 22:29:51 +0900 Subject: [PATCH 1/6] Add test case for disabling table with FAILED_OPEN regions --- .../TestTransitRegionStateProcedure.java | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java index 286bbbf9323a..424870c699a0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java @@ -17,23 +17,41 @@ */ package org.apache.hadoop.hbase.master.assignment; +import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.ASSIGN_MAX_ATTEMPTS; +import static org.apache.hadoop.hbase.master.assignment.RegionStateStore.getStateColumn; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.regionserver.DefaultStoreEngine; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.HStore; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; @@ -66,6 +84,7 @@ public class TestTransitRegionStateProcedure { @BeforeClass public static void setUpBeforeClass() throws Exception { UTIL.getConfiguration().setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); + UTIL.getConfiguration().setInt(ASSIGN_MAX_ATTEMPTS, 1); UTIL.startMiniCluster(3); UTIL.getAdmin().balancerSwitch(false, true); } @@ -168,4 +187,44 @@ public void testRecoveryAndDoubleExecutionUnassignAndAssign() throws Exception { // confirm that the region is successfully opened assertTrue(openSeqNum2 > openSeqNum); } + + private static class BuggyStoreEngine extends DefaultStoreEngine { + @Override + protected void createComponents(Configuration conf, HStore store, CellComparator comparator) + throws IOException { + throw new IOException("I'm broken"); + } + } + + @Test + public void testDisableFailedOpenRegions() throws Exception { + // Perform a faulty modification to put regions into FAILED_OPEN state + Admin admin = UTIL.getAdmin(); + TableDescriptor td = admin.getDescriptor(tableName); + TableDescriptorBuilder tdb = TableDescriptorBuilder.newBuilder(td); + tdb.setValue("hbase.hstore.engine.class", BuggyStoreEngine.class.getName()); + try { + admin.modifyTable(tdb.build()); + fail("Should fail to modify the table"); + } catch (IOException e) { + // expected + } + + // Table is still "ENABLED" + assertEquals(TableState.State.ENABLED, + MetaTableAccessor.getTableState(UTIL.getConnection(), tableName).getState()); + + // But the regions are in "FAILED_OPEN" state + List regions = MetaTableAccessor.getTableRegions(UTIL.getConnection(), tableName); + Set regionStates = new HashSet<>(); + for (RegionInfo region : regions) { + Result result = MetaTableAccessor.getRegionResult(UTIL.getConnection(), region); + String state = Bytes.toString( + result.getValue(HConstants.CATALOG_FAMILY, getStateColumn(region.getReplicaId()))); + regionStates.add(state); + } + assertEquals(Collections.singleton("FAILED_OPEN"), regionStates); + + // tearDown should be able to disable and delete the table successfully + } } From 42715b681d05428c63eca926b5967d5fa64eb1a0 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Wed, 12 Feb 2025 16:04:04 +0900 Subject: [PATCH 2/6] Allow disabling FAILED_OPEN regions --- .../master/assignment/AssignmentManager.java | 2 +- .../TransitRegionStateProcedure.java | 31 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index d05b1f9d3d34..7467585af165 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -2087,7 +2087,7 @@ public RegionInfo getRegionInfo(final String encodedRegionName) { State.OPEN // Retrying }; - private static final State[] STATES_EXPECTED_ON_CLOSING = { State.OPEN, // Normal case + static final State[] STATES_EXPECTED_ON_CLOSING = { State.OPEN, // Normal case State.CLOSING, // Retrying State.SPLITTING, // Offline the split parent State.MERGING // Offline the merge parents diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index e1952964a2ae..cb18ce23c8fd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -23,6 +23,7 @@ import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY; import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME; import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT; +import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.STATES_EXPECTED_ON_CLOSING; import edu.umd.cs.findbugs.annotations.Nullable; import java.io.IOException; @@ -372,6 +373,12 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode) } private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) { + // This region was in FAILED_OPEN state. No need to close it. + if (regionNode.getRegionLocation() == null) { + setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED); + return; + } + LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit, env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT), evictCache); @@ -392,10 +399,26 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) ) { return; } - if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) { - // this is the normal case - ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, - env.getAssignmentManager().regionClosing(regionNode), env, + + CompletableFuture future = null; + if (regionNode.isInState(STATES_EXPECTED_ON_CLOSING)) { + // This is the normal case + future = env.getAssignmentManager().regionClosing(regionNode); + } else if (regionNode.setState(State.CLOSED, State.FAILED_OPEN)) { + // FAILED_OPEN doesn't need further transition, immediately mark the region as closed + AssignmentManager am = env.getAssignmentManager(); + am.getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo()); + future = am.getRegionStateStore().updateRegionLocation(regionNode); + FutureUtils.addListener(future, (r, e) -> { + if (e != null) { + // Failed to persist CLOSED state to meta. Revert. + LOG.error("Failed to update meta for {} to CLOSED", regionNode.getRegionInfo(), e); + regionNode.setState(State.FAILED_OPEN); + } + }); + } + if (future != null) { + ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, future, env, () -> closeRegionAfterUpdatingMeta(env, regionNode)); } else { forceNewPlan = true; From cab6eb41543d6ca4ef57cab2d87a3a6e6796c88c Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Fri, 23 May 2025 11:38:26 +0900 Subject: [PATCH 3/6] Remove unnecessary error handler --- .../master/assignment/TransitRegionStateProcedure.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index cb18ce23c8fd..6b5aaf6975f8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -409,13 +409,6 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) AssignmentManager am = env.getAssignmentManager(); am.getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo()); future = am.getRegionStateStore().updateRegionLocation(regionNode); - FutureUtils.addListener(future, (r, e) -> { - if (e != null) { - // Failed to persist CLOSED state to meta. Revert. - LOG.error("Failed to update meta for {} to CLOSED", regionNode.getRegionInfo(), e); - regionNode.setState(State.FAILED_OPEN); - } - }); } if (future != null) { ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, future, env, From dd8716f2a2d4ec4495b2fcfeb4190361177f4cb0 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Fri, 23 May 2025 11:40:24 +0900 Subject: [PATCH 4/6] Add more steps to the test case (disable, fix, and re-enable) --- .../TestTransitRegionStateProcedure.java | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java index 424870c699a0..dc945dd18639 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java @@ -21,6 +21,7 @@ import static org.apache.hadoop.hbase.master.assignment.RegionStateStore.getStateColumn; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -196,6 +197,23 @@ protected void createComponents(Configuration conf, HStore store, CellComparator } } + private Set getRegionStates() throws IOException { + List regions = MetaTableAccessor.getTableRegions(UTIL.getConnection(), tableName); + Set regionStates = new HashSet<>(); + for (RegionInfo region : regions) { + Result result = MetaTableAccessor.getRegionResult(UTIL.getConnection(), region); + String state = Bytes.toString( + result.getValue(HConstants.CATALOG_FAMILY, getStateColumn(region.getReplicaId()))); + regionStates.add(state); + } + return regionStates; + } + + private static int getTotalRITs() throws IOException { + final AssignmentManager am = UTIL.getHBaseCluster().getMaster().getAssignmentManager(); + return am.computeRegionInTransitionStat().getTotalRITs(); + } + @Test public void testDisableFailedOpenRegions() throws Exception { // Perform a faulty modification to put regions into FAILED_OPEN state @@ -215,16 +233,26 @@ public void testDisableFailedOpenRegions() throws Exception { MetaTableAccessor.getTableState(UTIL.getConnection(), tableName).getState()); // But the regions are in "FAILED_OPEN" state - List regions = MetaTableAccessor.getTableRegions(UTIL.getConnection(), tableName); - Set regionStates = new HashSet<>(); - for (RegionInfo region : regions) { - Result result = MetaTableAccessor.getRegionResult(UTIL.getConnection(), region); - String state = Bytes.toString( - result.getValue(HConstants.CATALOG_FAMILY, getStateColumn(region.getReplicaId()))); - regionStates.add(state); - } - assertEquals(Collections.singleton("FAILED_OPEN"), regionStates); + assertEquals(Collections.singleton("FAILED_OPEN"), getRegionStates()); + + // We should be able to disable the table + assertNotEquals(0, getTotalRITs()); + UTIL.getAdmin().disableTable(tableName); + + // The number of RITs should be 0 after disabling the table + assertEquals(0, getTotalRITs()); + + // The regions are now in "CLOSED" state + assertEquals(Collections.singleton("CLOSED"), getRegionStates()); + + // Fix the error in the table descriptor + tdb = TableDescriptorBuilder.newBuilder(td); + tdb.setValue("hbase.hstore.engine.class", DefaultStoreEngine.class.getName()); + admin.modifyTable(tdb.build()); - // tearDown should be able to disable and delete the table successfully + // We can then re-enable the table + UTIL.getAdmin().enableTable(tableName); + assertEquals(Collections.singleton("OPEN"), getRegionStates()); + assertEquals(0, getTotalRITs()); } } From 718156ed93907b4f99ab9c8d55e58afff89ed121 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Sat, 24 May 2025 15:49:21 +0900 Subject: [PATCH 5/6] Use two separate ProcedureFutureUtil.suspendIfNecessary calls --- .../TransitRegionStateProcedure.java | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 6b5aaf6975f8..a4ad2e42a3ee 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -373,7 +373,8 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode) } private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) { - // This region was in FAILED_OPEN state. No need to close it. + // Absence of location indicates that this region was in FAILED_OPEN state. + // This happens when disabling a table with regions in FAILED_OPEN state. if (regionNode.getRegionLocation() == null) { setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED); return; @@ -400,19 +401,23 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) return; } - CompletableFuture future = null; if (regionNode.isInState(STATES_EXPECTED_ON_CLOSING)) { // This is the normal case - future = env.getAssignmentManager().regionClosing(regionNode); + ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, + env.getAssignmentManager().regionClosing(regionNode), env, + () -> closeRegionAfterUpdatingMeta(env, regionNode)); } else if (regionNode.setState(State.CLOSED, State.FAILED_OPEN)) { - // FAILED_OPEN doesn't need further transition, immediately mark the region as closed - AssignmentManager am = env.getAssignmentManager(); + // If a region was in FAILED_OPEN state, it was not OPEN and effectively CLOSED. + // So we should not try to close it again. We just need to update the state to CLOSED. + + // Remove the region from RIT list to prevent periodic "RITs over threshold" messages. + final AssignmentManager am = env.getAssignmentManager(); am.getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo()); - future = am.getRegionStateStore().updateRegionLocation(regionNode); - } - if (future != null) { - ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, future, env, - () -> closeRegionAfterUpdatingMeta(env, regionNode)); + + // Persistent CLOSED state to meta and proceed to the next state. + ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, + am.getRegionStateStore().updateRegionLocation(regionNode), env, + () -> setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED)); } else { forceNewPlan = true; regionNode.setRegionLocation(null); From 8bd00b5d5a21729ae0e50773d88c9f74c93fa3d3 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Sat, 24 May 2025 18:51:41 +0900 Subject: [PATCH 6/6] Revert the previous change --- .../assignment/TransitRegionStateProcedure.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index a4ad2e42a3ee..36c259253e51 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -401,11 +401,10 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) return; } + CompletableFuture future = null; if (regionNode.isInState(STATES_EXPECTED_ON_CLOSING)) { // This is the normal case - ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, - env.getAssignmentManager().regionClosing(regionNode), env, - () -> closeRegionAfterUpdatingMeta(env, regionNode)); + future = env.getAssignmentManager().regionClosing(regionNode); } else if (regionNode.setState(State.CLOSED, State.FAILED_OPEN)) { // If a region was in FAILED_OPEN state, it was not OPEN and effectively CLOSED. // So we should not try to close it again. We just need to update the state to CLOSED. @@ -415,9 +414,11 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) am.getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo()); // Persistent CLOSED state to meta and proceed to the next state. - ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, - am.getRegionStateStore().updateRegionLocation(regionNode), env, - () -> setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED)); + future = am.getRegionStateStore().updateRegionLocation(regionNode); + } + if (future != null) { + ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, future, env, + () -> closeRegionAfterUpdatingMeta(env, regionNode)); } else { forceNewPlan = true; regionNode.setRegionLocation(null);