From 62267300f31edd4f4dc010c4a10ff57aa3313a1a Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Wed, 19 Mar 2025 09:10:32 -0400 Subject: [PATCH] HBASE-29202 Balancer conditionals make balancer actions more likely to be approved --- .../master/balancer/BalancerConditionals.java | 2 +- .../balancer/CandidateGeneratorTestUtil.java | 32 ++++-- ...gTableIsolationAndReplicaDistribution.java | 7 +- .../TestUnattainableBalancerCostGoal.java | 108 ++++++++++++++++++ 4 files changed, 136 insertions(+), 13 deletions(-) create mode 100644 hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java diff --git a/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java b/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java index bbd85e5c1efc..7e5bd98a0133 100644 --- a/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java +++ b/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java @@ -145,7 +145,7 @@ int getViolationCountChange(BalancerClusterState cluster, BalanceAction action) // Reset cluster cluster.doAction(undoAction); - if (isViolatingPre && isViolatingPost) { + if (isViolatingPre == isViolatingPost) { return 0; } else if (!isViolatingPre && isViolatingPost) { return 1; diff --git a/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java index b7e7b5fa8445..870c97a1f49c 100644 --- a/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java +++ b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java @@ -46,16 +46,22 @@ public final class CandidateGeneratorTestUtil { private CandidateGeneratorTestUtil() { } + enum ExhaustionType { + COST_GOAL_ACHIEVED, + NO_MORE_MOVES; + } + static void runBalancerToExhaustion(Configuration conf, Map> serverToRegions, Set> expectations, float targetMaxBalancerCost) { - runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000); + runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000, + ExhaustionType.COST_GOAL_ACHIEVED); } static void runBalancerToExhaustion(Configuration conf, Map> serverToRegions, Set> expectations, float targetMaxBalancerCost, - long maxRunningTime) { + long maxRunningTime, ExhaustionType exhaustionType) { // Do the full plan. We're testing with a lot of regions conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true); conf.setLong(MAX_RUNNING_TIME_KEY, maxRunningTime); @@ -71,7 +77,7 @@ static void runBalancerToExhaustion(Configuration conf, boolean isBalanced = false; while (!isBalanced) { balancerRuns++; - if (balancerRuns > 1000) { + if (balancerRuns > 10) { throw new RuntimeException("Balancer failed to find balance & meet expectations"); } long start = System.currentTimeMillis(); @@ -106,16 +112,24 @@ static void runBalancerToExhaustion(Configuration conf, } } if (isBalanced) { // Check if the balancer thinks we're done too - LOG.info("All balancer conditions passed. Checking if balancer thinks it's done."); - if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { - LOG.info("Balancer would still like to run"); - isBalanced = false; + if (exhaustionType == ExhaustionType.COST_GOAL_ACHIEVED) { + // If we expect to achieve the cost goal, then needsBalance should be false + if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { + LOG.info("Balancer cost goal is not achieved. needsBalance=true"); + isBalanced = false; + } } else { - LOG.info("Balancer is done"); + // If we anticipate running out of moves, then our last balance run should have produced + // nothing + if (regionPlans != null && !regionPlans.isEmpty()) { + LOG.info("Balancer is not out of moves. regionPlans.size()={}", regionPlans.size()); + isBalanced = false; + } } } } - LOG.info("Balancing took {}sec", Duration.ofMillis(balancingMillis).toMinutes()); + LOG.info("Balancer is done. Balancing took {}sec", + Duration.ofMillis(balancingMillis).toMinutes()); } /** diff --git a/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java index 5fec827b9bf4..0ea739faf78b 100644 --- a/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java +++ b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java @@ -104,9 +104,10 @@ public void testTableIsolationAndReplicaDistribution() { conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); DistributeReplicasTestConditional.enableConditionalReplicaDistributionForTest(conf); - runBalancerToExhaustion(conf, serverToRegions, Set.of(this::isMetaTableIsolated, - this::isSystemTableIsolated, CandidateGeneratorTestUtil::areAllReplicasDistributed), 10.0f, - 60_000); + runBalancerToExhaustion(conf, serverToRegions, + Set.of(this::isMetaTableIsolated, this::isSystemTableIsolated, + CandidateGeneratorTestUtil::areAllReplicasDistributed), + 10.0f, 60_000, CandidateGeneratorTestUtil.ExhaustionType.COST_GOAL_ACHIEVED); LOG.info("Meta table regions are successfully isolated, " + "and region replicas are appropriately distributed."); } diff --git a/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java new file mode 100644 index 000000000000..ffa2b4a78212 --- /dev/null +++ b/hbase-balancer/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.balancer; + +import static org.apache.hadoop.hbase.master.balancer.CandidateGeneratorTestUtil.isTableIsolated; +import static org.apache.hadoop.hbase.master.balancer.CandidateGeneratorTestUtil.runBalancerToExhaustion; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * If your minCostNeedsBalance is set too low, then the balancer should still eventually stop making + * moves as further cost improvements become impossible, and balancer plan calculation becomes + * wasteful. This test ensures that the balancer will not get stuck in a loop of continuously moving + * regions. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestUnattainableBalancerCostGoal { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestUnattainableBalancerCostGoal.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestUnattainableBalancerCostGoal.class); + + private static final TableName SYSTEM_TABLE_NAME = TableName.valueOf("hbase:system"); + private static final TableName NON_SYSTEM_TABLE_NAME = TableName.valueOf("userTable"); + + private static final int NUM_SERVERS = 10; + private static final int NUM_REGIONS = 1000; + private static final float UNACHIEVABLE_COST_GOAL = 0.01f; + + private static final ServerName[] servers = new ServerName[NUM_SERVERS]; + private static final Map> serverToRegions = new HashMap<>(); + + @BeforeClass + public static void setup() { + // Initialize servers + for (int i = 0; i < NUM_SERVERS; i++) { + servers[i] = ServerName.valueOf("server" + i, i, System.currentTimeMillis()); + } + + // Create regions + List allRegions = new ArrayList<>(); + for (int i = 0; i < NUM_REGIONS; i++) { + TableName tableName = i < 3 ? SYSTEM_TABLE_NAME : NON_SYSTEM_TABLE_NAME; + byte[] startKey = new byte[1]; + startKey[0] = (byte) i; + byte[] endKey = new byte[1]; + endKey[0] = (byte) (i + 1); + + RegionInfo regionInfo = + RegionInfoBuilder.newBuilder(tableName).setStartKey(startKey).setEndKey(endKey).build(); + allRegions.add(regionInfo); + } + + // Assign all regions to the first server + serverToRegions.put(servers[0], new ArrayList<>(allRegions)); + for (int i = 1; i < NUM_SERVERS; i++) { + serverToRegions.put(servers[i], new ArrayList<>()); + } + } + + @Test + public void testSystemTableIsolation() { + Configuration conf = new Configuration(false); + conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); + runBalancerToExhaustion(conf, serverToRegions, Set.of(this::isSystemTableIsolated), + UNACHIEVABLE_COST_GOAL, 10_000, CandidateGeneratorTestUtil.ExhaustionType.NO_MORE_MOVES); + LOG.info("Meta table regions are successfully isolated."); + } + + private boolean isSystemTableIsolated(BalancerClusterState cluster) { + return isTableIsolated(cluster, SYSTEM_TABLE_NAME, "System"); + } +}