diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index d05b1f9d3d34..7467585af165 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -2087,7 +2087,7 @@ public RegionInfo getRegionInfo(final String encodedRegionName) { State.OPEN // Retrying }; - private static final State[] STATES_EXPECTED_ON_CLOSING = { State.OPEN, // Normal case + static final State[] STATES_EXPECTED_ON_CLOSING = { State.OPEN, // Normal case State.CLOSING, // Retrying State.SPLITTING, // Offline the split parent State.MERGING // Offline the merge parents diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 12ddb2559367..e77c14544144 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -23,6 +23,7 @@ import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY; import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME; import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT; +import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.STATES_EXPECTED_ON_CLOSING; import edu.umd.cs.findbugs.annotations.Nullable; import java.io.IOException; @@ -371,6 +372,13 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode) } private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) { + // Absence of location indicates that this region was in FAILED_OPEN state. + // This happens when disabling a table with regions in FAILED_OPEN state. + if (regionNode.getRegionLocation() == null) { + setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED); + return; + } + LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit, env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT), evictCache); @@ -391,10 +399,24 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) ) { return; } - if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) { - // this is the normal case - ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, - env.getAssignmentManager().regionClosing(regionNode), env, + + CompletableFuture future = null; + if (regionNode.isInState(STATES_EXPECTED_ON_CLOSING)) { + // This is the normal case + future = env.getAssignmentManager().regionClosing(regionNode); + } else if (regionNode.setState(State.CLOSED, State.FAILED_OPEN)) { + // If a region was in FAILED_OPEN state, it was not OPEN and effectively CLOSED. + // So we should not try to close it again. We just need to update the state to CLOSED. + + // Remove the region from RIT list to prevent periodic "RITs over threshold" messages. + final AssignmentManager am = env.getAssignmentManager(); + am.getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo()); + + // Persistent CLOSED state to meta and proceed to the next state. + future = am.getRegionStateStore().updateRegionLocation(regionNode); + } + if (future != null) { + ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture, future, env, () -> closeRegionAfterUpdatingMeta(env, regionNode)); } else { forceNewPlan = true; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java index 286bbbf9323a..dc945dd18639 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTransitRegionStateProcedure.java @@ -17,23 +17,42 @@ */ package org.apache.hadoop.hbase.master.assignment; +import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.ASSIGN_MAX_ATTEMPTS; +import static org.apache.hadoop.hbase.master.assignment.RegionStateStore.getStateColumn; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.regionserver.DefaultStoreEngine; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.HStore; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; @@ -66,6 +85,7 @@ public class TestTransitRegionStateProcedure { @BeforeClass public static void setUpBeforeClass() throws Exception { UTIL.getConfiguration().setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); + UTIL.getConfiguration().setInt(ASSIGN_MAX_ATTEMPTS, 1); UTIL.startMiniCluster(3); UTIL.getAdmin().balancerSwitch(false, true); } @@ -168,4 +188,71 @@ public void testRecoveryAndDoubleExecutionUnassignAndAssign() throws Exception { // confirm that the region is successfully opened assertTrue(openSeqNum2 > openSeqNum); } + + private static class BuggyStoreEngine extends DefaultStoreEngine { + @Override + protected void createComponents(Configuration conf, HStore store, CellComparator comparator) + throws IOException { + throw new IOException("I'm broken"); + } + } + + private Set getRegionStates() throws IOException { + List regions = MetaTableAccessor.getTableRegions(UTIL.getConnection(), tableName); + Set regionStates = new HashSet<>(); + for (RegionInfo region : regions) { + Result result = MetaTableAccessor.getRegionResult(UTIL.getConnection(), region); + String state = Bytes.toString( + result.getValue(HConstants.CATALOG_FAMILY, getStateColumn(region.getReplicaId()))); + regionStates.add(state); + } + return regionStates; + } + + private static int getTotalRITs() throws IOException { + final AssignmentManager am = UTIL.getHBaseCluster().getMaster().getAssignmentManager(); + return am.computeRegionInTransitionStat().getTotalRITs(); + } + + @Test + public void testDisableFailedOpenRegions() throws Exception { + // Perform a faulty modification to put regions into FAILED_OPEN state + Admin admin = UTIL.getAdmin(); + TableDescriptor td = admin.getDescriptor(tableName); + TableDescriptorBuilder tdb = TableDescriptorBuilder.newBuilder(td); + tdb.setValue("hbase.hstore.engine.class", BuggyStoreEngine.class.getName()); + try { + admin.modifyTable(tdb.build()); + fail("Should fail to modify the table"); + } catch (IOException e) { + // expected + } + + // Table is still "ENABLED" + assertEquals(TableState.State.ENABLED, + MetaTableAccessor.getTableState(UTIL.getConnection(), tableName).getState()); + + // But the regions are in "FAILED_OPEN" state + assertEquals(Collections.singleton("FAILED_OPEN"), getRegionStates()); + + // We should be able to disable the table + assertNotEquals(0, getTotalRITs()); + UTIL.getAdmin().disableTable(tableName); + + // The number of RITs should be 0 after disabling the table + assertEquals(0, getTotalRITs()); + + // The regions are now in "CLOSED" state + assertEquals(Collections.singleton("CLOSED"), getRegionStates()); + + // Fix the error in the table descriptor + tdb = TableDescriptorBuilder.newBuilder(td); + tdb.setValue("hbase.hstore.engine.class", DefaultStoreEngine.class.getName()); + admin.modifyTable(tdb.build()); + + // We can then re-enable the table + UTIL.getAdmin().enableTable(tableName); + assertEquals(Collections.singleton("OPEN"), getRegionStates()); + assertEquals(0, getTotalRITs()); + } }