From 9e3f3107c0e2f14ba0e078c106743765509fff9d Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 14 Feb 2024 18:07:16 +0100 Subject: [PATCH 1/3] Configure HMaster to reject decommissioned hosts --- .../org/apache/hadoop/hbase/HConstants.java | 14 ++ .../apache/hadoop/hbase/master/HMaster.java | 4 +- ...stIsConsideredDecommissionedException.java | 28 ++++ .../hadoop/hbase/master/ServerManager.java | 73 +++++++++- .../hbase/regionserver/HRegionServer.java | 6 + .../TestRegionServerReportForDuty.java | 132 ++++++++++++++++++ 6 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 5b53d2b2c0d3..948b17b8221b 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1622,6 +1622,20 @@ public enum OperationStatusCode { */ public final static boolean HBASE_SERVER_USEIP_ENABLED_DEFAULT = false; + /** + * Should the HMaster reject hosts of decommissioned RegionServers, bypassing matching their port + * and startcode parts of their ServerName or not? When True, the HMaster will reject a + * RegionServer's request to `reportForDuty` if it's hostname exists in the list of decommissioned + * RegionServers it maintains internally. Added in HBASE-28342. + */ + public final static String REJECT_DECOMMISSIONED_HOSTS_KEY = + "hbase.master.reject.decommissioned.hosts"; + + /** + * Default value of {@link #REJECT_DECOMMISSIONED_HOSTS_KEY} + */ + public final static boolean REJECT_DECOMMISSIONED_HOSTS_DEFAULT = false; + private HConstants() { // Can't be instantiated with this ctor. } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 88b82f01069e..ddef3e27b405 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -546,7 +546,6 @@ public HMaster(final Configuration conf) throws IOException { HConstants.DEFAULT_HBASE_MASTER_BALANCER_MAX_RIT_PERCENT); // Do we publish the status? - boolean shouldPublish = conf.getBoolean(HConstants.STATUS_PUBLISHED, HConstants.STATUS_PUBLISHED_DEFAULT); Class publisherClass = @@ -997,7 +996,10 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE masterRegion = MasterRegionFactory.create(this); rsListStorage = new MasterRegionServerList(masterRegion, this); + // Initialize the ServerManager and register it as a configuration observer this.serverManager = createServerManager(this, rsListStorage); + this.configurationManager.registerObserver(this.serverManager); + this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this); if ( !conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java new file mode 100644 index 000000000000..0a5b5a7cb0c4 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public class HostIsConsideredDecommissionedException extends HBaseIOException { + public HostIsConsideredDecommissionedException(String message) { + super(message); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 2afd48c58df5..c6b3a9531b40 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Objects; import java.util.Set; import java.util.concurrent.ConcurrentNavigableMap; import java.util.concurrent.ConcurrentSkipListMap; @@ -51,6 +52,7 @@ import org.apache.hadoop.hbase.client.AsyncClusterConnection; import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.conf.ConfigurationObserver; import org.apache.hadoop.hbase.ipc.RemoteWithExtrasException; import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; @@ -100,7 +102,7 @@ * only after the handler is fully enabled and has completed the handling. */ @InterfaceAudience.Private -public class ServerManager { +public class ServerManager implements ConfigurationObserver { public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART = "hbase.master.wait.on.regionservers.maxtostart"; @@ -172,6 +174,9 @@ public class ServerManager { /** Listeners that are called on server events. */ private List listeners = new CopyOnWriteArrayList<>(); + /** Configured value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY */ + private volatile boolean rejectDecommissionedHostsConfig; + /** * Constructor. */ @@ -183,6 +188,35 @@ public ServerManager(final MasterServices master, RegionServerList storage) { warningSkew = c.getLong("hbase.master.warningclockskew", 10000); persistFlushedSequenceId = c.getBoolean(PERSIST_FLUSHEDSEQUENCEID, PERSIST_FLUSHEDSEQUENCEID_DEFAULT); + rejectDecommissionedHostsConfig = getRejectDecommissionedHostsConfig(c); + } + + /** + * Implementation of the ConfigurationObserver interface. We are interested in live-loading the + * configuration value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY + * @param conf Server configuration instance + */ + @Override + public void onConfigurationChange(Configuration conf) { + final boolean newValue = getRejectDecommissionedHostsConfig(conf); + if (rejectDecommissionedHostsConfig == newValue) { + // no-op + return; + } + + LOG.info("Config Reload for RejectDecommissionedHosts. previous value: {}, new value: {}", + rejectDecommissionedHostsConfig, newValue); + + rejectDecommissionedHostsConfig = newValue; + } + + /** + * Reads the value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY from the config and returns it + * @param conf Configuration instance of the Master + */ + public boolean getRejectDecommissionedHostsConfig(Configuration conf) { + return conf.getBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY, + HConstants.REJECT_DECOMMISSIONED_HOSTS_DEFAULT); } /** @@ -227,11 +261,14 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu final String hostname = request.hasUseThisHostnameInstead() ? request.getUseThisHostnameInstead() : isaHostName; ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode()); + + // Check if the host should be rejected based on it's decommissioned status + checkRejectableDecommissionedStatus(sn); + checkClockSkew(sn, request.getServerCurrentTime()); checkIsDead(sn, "STARTUP"); if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) { - LOG.warn( - "THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn); + LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup could not record the server: {}", sn); } storage.started(sn); return sn; @@ -293,6 +330,36 @@ public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDea updateLastFlushedSequenceIds(sn, sl); } + /** + * Checks if the Master is configured to reject decommissioned hosts or not. When it's configured + * to do so, any RegionServer trying to join the cluster will have it's host checked against the + * list of hosts of currently decommissioned servers and potentially get prevented from reporting + * for duty; otherwise, we do nothing and we let them pass to the next check. See HBASE-28342 for + * details. + * @param sn The ServerName to check for + * @throws HostIsConsideredDecommissionedException if the Master is configured to reject + * decommissioned hosts and this host exists in + * the list of the decommissioned servers + */ + private void checkRejectableDecommissionedStatus(ServerName sn) + throws HostIsConsideredDecommissionedException { + // If the Master is not configured to reject decommissioned hosts, return early. + if (!rejectDecommissionedHostsConfig) { + return; + } + + // Look for a match for the hostname in the list of decommissioned servers + for (ServerName server : getDrainingServersList()) { + if (Objects.equals(server.getHostname(), sn.getHostname())) { + // Found a match and master is configured to reject decommissioned hosts, throw exception! + LOG.warn("Rejecting RegionServer {} from reporting for duty because Master is configured " + + "to reject decommissioned hosts and this host was marked as such in the past.", sn); + throw new HostIsConsideredDecommissionedException( + "Master is configured to reject decommissioned hosts"); + } + } + } + /** * Check is a server of same host and port already exists, if not, or the existed one got a * smaller start code, record it. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index dfb8e2a204fe..d54660405aa5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -120,6 +120,7 @@ import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; import org.apache.hadoop.hbase.ipc.ServerRpcController; import org.apache.hadoop.hbase.log.HBaseMarkers; +import org.apache.hadoop.hbase.master.HostIsConsideredDecommissionedException; import org.apache.hadoop.hbase.mob.MobFileCache; import org.apache.hadoop.hbase.mob.RSMobFileCleanerChore; import org.apache.hadoop.hbase.monitoring.TaskMonitor; @@ -2664,6 +2665,11 @@ private RegionServerStartupResponse reportForDuty() throws IOException { LOG.error(HBaseMarkers.FATAL, "Master rejected startup because clock is out of sync", ioe); // Re-throw IOE will cause RS to abort throw ioe; + } else if (ioe instanceof HostIsConsideredDecommissionedException) { + LOG.error(HBaseMarkers.FATAL, + "Master rejected startup because the host is considered decommissioned", ioe); + // Re-throw IOE will cause RS to abort + throw ioe; } else if (ioe instanceof ServerNotRunningYetException) { LOG.debug("Master is not running yet"); } else { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index 4917d6f5aefa..abdbcaedc7e0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -22,6 +22,8 @@ import java.io.IOException; import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.StringUtils; @@ -241,6 +243,136 @@ public void run() { waitForClusterOnline(master); } + /** + * Tests that the RegionServer's reportForDuty gets rejected by the master when the master is + * configured to reject decommissioned hosts and when there is a match for the joining + * RegionServer in the list of decommissioned servers. Test case for HBASE-28342. + */ + @Test + public void testReportForDutyGetsRejectedByMasterWhenConfiguredToRejectDecommissionedHosts() + throws Exception { + // Start a master and wait for it to become the active/primary master. + // Use a random unique port + cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort()); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1); + + // Set the cluster to reject decommissioned hosts + cluster.getConfiguration().setBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY, true); + + master = cluster.addMaster(); + rs = cluster.addRegionServer(); + LOG.debug("Starting master: " + master.getMaster().getServerName()); + master.start(); + rs.start(); + waitForClusterOnline(master); + + assertEquals(0, master.getMaster().listDecommissionedRegionServers().size()); + assertEquals(0, master.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); + + // Decommission the region server and tries to re-add it + List serversToDecommission = new ArrayList(); + serversToDecommission.add(rs.getRegionServer().getServerName()); + master.getMaster().decommissionRegionServers(serversToDecommission, true); + + // Assert that the server is now decommissioned + ServerName decommissionedServer = master.getMaster().listDecommissionedRegionServers().get(0); + assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); + assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); + assertEquals(rs.getRegionServer().getServerName().toString(), + decommissionedServer.getServerName()); + + // Create a second region server + cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName()); + rs2 = cluster.addRegionServer(); + rs2.start(); + waitForSecondRsStarted(); + + // Assert that the number of decommissioned and live hosts didn't change and that the hostname + // of rs2 matches that of the decommissioned server + String rs2HostName = rs2.getRegionServer().getServerName().getHostname(); + assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); + assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); + assertEquals(rs2HostName, decommissionedServer.getHostname()); + } + + /** + * Tests that the RegionServer's reportForDuty gets accepted by the master when the master is not + * configured to reject decommissioned hosts, even when there is a match for the joining + * RegionServer in the list of decommissioned servers. Test case for HBASE-28342. + */ + @Test + public void testReportForDutyGetsAcceptedByMasterWhenNotConfiguredToRejectDecommissionedHosts() + throws Exception { + // Start a master and wait for it to become the active/primary master. + // Use a random unique port + cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort()); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1); + + // Set the cluster to not reject decommissioned hosts (default behavior) + cluster.getConfiguration().setBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY, false); + + master = cluster.addMaster(); + rs = cluster.addRegionServer(); + LOG.debug("Starting master: " + master.getMaster().getServerName()); + master.start(); + rs.start(); + waitForClusterOnline(master); + + assertEquals(0, master.getMaster().listDecommissionedRegionServers().size()); + assertEquals(0, master.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); + + // Decommission the region server and tries to re-add it + List serversToDecommission = new ArrayList<>(); + serversToDecommission.add(rs.getRegionServer().getServerName()); + master.getMaster().decommissionRegionServers(serversToDecommission, true); + + // Assert that the server is now decommissioned + ServerName decommissionedServer = master.getMaster().listDecommissionedRegionServers().get(0); + assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); + assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); + assertEquals(rs.getRegionServer().getServerName().toString(), + decommissionedServer.getServerName()); + + // Create a second region server and try adding both region servers to it, it should succeed + cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName()); + rs2 = cluster.addRegionServer(); + rs2.start(); + waitForSecondRsStarted(); + + master.getMaster().stop("Stopping master"); + + // Start a new master and use another random unique port + // Also let it wait for exactly 2 region severs to report in + cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort()); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 2); + cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2); + backupMaster = cluster.addMaster(); + LOG.debug("Starting new master: " + backupMaster.getMaster().getServerName()); + backupMaster.start(); + + waitForClusterOnline(backupMaster); + + // Assert that the backup master has become active. + assertTrue(backupMaster.getMaster().isActiveMaster()); + assertTrue(backupMaster.getMaster().isInitialized()); + + // Assert that the number of decommissioned hosts is the same and that the live and online hosts + // did in fact change and rs2 is now part of the cluster even though there is a match for its + // hostname in the list of decommissioned servers + String rs2HostName = rs2.getRegionServer().getServerName().getHostname(); + assertEquals(1, backupMaster.getMaster().listDecommissionedRegionServers().size()); + assertEquals(1, backupMaster.getMaster().getServerManager().getDrainingServersList().size()); + assertEquals(2, backupMaster.getMaster().getServerManager().getOnlineServers().size()); + assertEquals(rs2HostName, decommissionedServer.getHostname()); + } + /** * Tests region sever reportForDuty with a non-default environment edge */ From b9778586d277af6b7cf67f5fb61be007622b694a Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Tue, 20 Feb 2024 17:54:41 +0100 Subject: [PATCH 2/3] Addressed PR comments and refactored the tests --- .../org/apache/hadoop/hbase/HConstants.java | 8 +- ... DecommissionedHostRejectedException.java} | 4 +- .../hadoop/hbase/master/ServerManager.java | 22 ++-- .../hbase/regionserver/HRegionServer.java | 4 +- .../TestRegionServerReportForDuty.java | 108 +++--------------- 5 files changed, 36 insertions(+), 110 deletions(-) rename hbase-server/src/main/java/org/apache/hadoop/hbase/master/{HostIsConsideredDecommissionedException.java => DecommissionedHostRejectedException.java} (87%) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 948b17b8221b..9597ec23d81a 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -1623,10 +1623,10 @@ public enum OperationStatusCode { public final static boolean HBASE_SERVER_USEIP_ENABLED_DEFAULT = false; /** - * Should the HMaster reject hosts of decommissioned RegionServers, bypassing matching their port - * and startcode parts of their ServerName or not? When True, the HMaster will reject a - * RegionServer's request to `reportForDuty` if it's hostname exists in the list of decommissioned - * RegionServers it maintains internally. Added in HBASE-28342. + * Should the HMaster reject hosts of decommissioned RegionServers, bypass matching their port and + * startcode parts of their ServerName or not? When True, the HMaster will reject a RegionServer's + * request to `reportForDuty` if it's hostname exists in the list of decommissioned RegionServers + * it maintains internally. Added in HBASE-28342. */ public final static String REJECT_DECOMMISSIONED_HOSTS_KEY = "hbase.master.reject.decommissioned.hosts"; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DecommissionedHostRejectedException.java similarity index 87% rename from hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java rename to hbase-server/src/main/java/org/apache/hadoop/hbase/master/DecommissionedHostRejectedException.java index 0a5b5a7cb0c4..3d28b1e75be8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HostIsConsideredDecommissionedException.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DecommissionedHostRejectedException.java @@ -21,8 +21,8 @@ import org.apache.yetus.audience.InterfaceAudience; @InterfaceAudience.Private -public class HostIsConsideredDecommissionedException extends HBaseIOException { - public HostIsConsideredDecommissionedException(String message) { +public class DecommissionedHostRejectedException extends HBaseIOException { + public DecommissionedHostRejectedException(String message) { super(message); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index c6b3a9531b40..178ed16e2561 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -337,12 +337,14 @@ public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDea * for duty; otherwise, we do nothing and we let them pass to the next check. See HBASE-28342 for * details. * @param sn The ServerName to check for - * @throws HostIsConsideredDecommissionedException if the Master is configured to reject - * decommissioned hosts and this host exists in - * the list of the decommissioned servers + * @throws DecommissionedHostRejectedException if the Master is configured to reject + * decommissioned hosts and this host exists in the + * list of the decommissioned servers */ private void checkRejectableDecommissionedStatus(ServerName sn) - throws HostIsConsideredDecommissionedException { + throws DecommissionedHostRejectedException { + LOG.info("Checking decommissioned status of RegionServer {}", sn.getServerName()); + // If the Master is not configured to reject decommissioned hosts, return early. if (!rejectDecommissionedHostsConfig) { return; @@ -352,10 +354,14 @@ private void checkRejectableDecommissionedStatus(ServerName sn) for (ServerName server : getDrainingServersList()) { if (Objects.equals(server.getHostname(), sn.getHostname())) { // Found a match and master is configured to reject decommissioned hosts, throw exception! - LOG.warn("Rejecting RegionServer {} from reporting for duty because Master is configured " - + "to reject decommissioned hosts and this host was marked as such in the past.", sn); - throw new HostIsConsideredDecommissionedException( - "Master is configured to reject decommissioned hosts"); + LOG.warn( + "Rejecting RegionServer {} from reporting for duty because Master is configured " + + "to reject decommissioned hosts and this host was marked as such in the past.", + sn.getServerName()); + throw new DecommissionedHostRejectedException(String.format( + "Host %s exists in the list of decommissioned servers and Master is configured to " + + "reject decommissioned hosts", + sn.getHostname())); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index d54660405aa5..c71859ee6c1e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -120,7 +120,7 @@ import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; import org.apache.hadoop.hbase.ipc.ServerRpcController; import org.apache.hadoop.hbase.log.HBaseMarkers; -import org.apache.hadoop.hbase.master.HostIsConsideredDecommissionedException; +import org.apache.hadoop.hbase.master.DecommissionedHostRejectedException; import org.apache.hadoop.hbase.mob.MobFileCache; import org.apache.hadoop.hbase.mob.RSMobFileCleanerChore; import org.apache.hadoop.hbase.monitoring.TaskMonitor; @@ -2665,7 +2665,7 @@ private RegionServerStartupResponse reportForDuty() throws IOException { LOG.error(HBaseMarkers.FATAL, "Master rejected startup because clock is out of sync", ioe); // Re-throw IOE will cause RS to abort throw ioe; - } else if (ioe instanceof HostIsConsideredDecommissionedException) { + } else if (ioe instanceof DecommissionedHostRejectedException) { LOG.error(HBaseMarkers.FATAL, "Master rejected startup because the host is considered decommissioned", ioe); // Re-throw IOE will cause RS to abort diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index abdbcaedc7e0..c879fe1b2c38 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -17,13 +17,12 @@ */ package org.apache.hadoop.hbase.regionserver; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.*; import java.io.IOException; import java.io.StringWriter; -import java.util.ArrayList; -import java.util.List; +import java.util.Collections; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.StringUtils; @@ -32,6 +31,7 @@ import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.LocalHBaseCluster; +import org.apache.hadoop.hbase.MatcherPredicate; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.SingleProcessHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; @@ -271,106 +271,26 @@ public void testReportForDutyGetsRejectedByMasterWhenConfiguredToRejectDecommiss assertEquals(0, master.getMaster().getServerManager().getDrainingServersList().size()); assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - // Decommission the region server and tries to re-add it - List serversToDecommission = new ArrayList(); - serversToDecommission.add(rs.getRegionServer().getServerName()); - master.getMaster().decommissionRegionServers(serversToDecommission, true); - - // Assert that the server is now decommissioned - ServerName decommissionedServer = master.getMaster().listDecommissionedRegionServers().get(0); - assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); - assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - assertEquals(rs.getRegionServer().getServerName().toString(), - decommissionedServer.getServerName()); + // Decommission the 1st rs + master.getMaster().decommissionRegionServers( + Collections.singletonList(rs.getRegionServer().getServerName()), false); // Create a second region server cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName()); rs2 = cluster.addRegionServer(); rs2.start(); - waitForSecondRsStarted(); - // Assert that the number of decommissioned and live hosts didn't change and that the hostname - // of rs2 matches that of the decommissioned server + // Assert that the number of decommissioned and live hosts didn't change and that rs2 is aborted String rs2HostName = rs2.getRegionServer().getServerName().getHostname(); + String decommissionedHostName = + master.getMaster().listDecommissionedRegionServers().get(0).getHostname(); assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - assertEquals(rs2HostName, decommissionedServer.getHostname()); - } - - /** - * Tests that the RegionServer's reportForDuty gets accepted by the master when the master is not - * configured to reject decommissioned hosts, even when there is a match for the joining - * RegionServer in the list of decommissioned servers. Test case for HBASE-28342. - */ - @Test - public void testReportForDutyGetsAcceptedByMasterWhenNotConfiguredToRejectDecommissionedHosts() - throws Exception { - // Start a master and wait for it to become the active/primary master. - // Use a random unique port - cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort()); - cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); - cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1); - - // Set the cluster to not reject decommissioned hosts (default behavior) - cluster.getConfiguration().setBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY, false); - - master = cluster.addMaster(); - rs = cluster.addRegionServer(); - LOG.debug("Starting master: " + master.getMaster().getServerName()); - master.start(); - rs.start(); - waitForClusterOnline(master); - - assertEquals(0, master.getMaster().listDecommissionedRegionServers().size()); - assertEquals(0, master.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - - // Decommission the region server and tries to re-add it - List serversToDecommission = new ArrayList<>(); - serversToDecommission.add(rs.getRegionServer().getServerName()); - master.getMaster().decommissionRegionServers(serversToDecommission, true); - - // Assert that the server is now decommissioned - ServerName decommissionedServer = master.getMaster().listDecommissionedRegionServers().get(0); - assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); - assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - assertEquals(rs.getRegionServer().getServerName().toString(), - decommissionedServer.getServerName()); - - // Create a second region server and try adding both region servers to it, it should succeed - cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName()); - rs2 = cluster.addRegionServer(); - rs2.start(); - waitForSecondRsStarted(); - - master.getMaster().stop("Stopping master"); - - // Start a new master and use another random unique port - // Also let it wait for exactly 2 region severs to report in - cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort()); - cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 2); - cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2); - backupMaster = cluster.addMaster(); - LOG.debug("Starting new master: " + backupMaster.getMaster().getServerName()); - backupMaster.start(); - - waitForClusterOnline(backupMaster); - - // Assert that the backup master has become active. - assertTrue(backupMaster.getMaster().isActiveMaster()); - assertTrue(backupMaster.getMaster().isInitialized()); - - // Assert that the number of decommissioned hosts is the same and that the live and online hosts - // did in fact change and rs2 is now part of the cluster even though there is a match for its - // hostname in the list of decommissioned servers - String rs2HostName = rs2.getRegionServer().getServerName().getHostname(); - assertEquals(1, backupMaster.getMaster().listDecommissionedRegionServers().size()); - assertEquals(1, backupMaster.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(2, backupMaster.getMaster().getServerManager().getOnlineServers().size()); - assertEquals(rs2HostName, decommissionedServer.getHostname()); + assertTrue(master.getMaster().getServerManager().getOnlineServers() + .containsKey(rs.getRegionServer().getServerName())); + assertEquals(rs2HostName, decommissionedHostName); + testUtil.waitFor(10000, new MatcherPredicate<>(rs2.getRegionServer()::isAborted, is(false))); } /** From d8fc20b045926a2227eaa13b5f1a60d0414921da Mon Sep 17 00:00:00 2001 From: Ahmad Alhour Date: Wed, 21 Feb 2024 12:12:03 +0100 Subject: [PATCH 3/3] Refactored the test suite based on abort status and log lines --- .../hadoop/hbase/master/ServerManager.java | 30 ++++----- .../TestRegionServerReportForDuty.java | 63 ++++++++++++------- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 178ed16e2561..a2ed4da53e39 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -720,13 +720,8 @@ public synchronized void moveFromOnlineToDeadServers(final ServerName sn) { * Remove the server from the drain list. */ public synchronized boolean removeServerFromDrainList(final ServerName sn) { - // Warn if the server (sn) is not online. ServerName is of the form: - // , , + LOG.info("Removing server {} from the draining list.", sn); - if (!this.isServerOnline(sn)) { - LOG.warn("Server " + sn + " is not currently online. " - + "Removing from draining list anyway, as requested."); - } // Remove the server from the draining servers lists. return this.drainingServers.remove(sn); } @@ -736,22 +731,23 @@ public synchronized boolean removeServerFromDrainList(final ServerName sn) { * @return True if the server is added or the server is already on the drain list. */ public synchronized boolean addServerToDrainList(final ServerName sn) { - // Warn if the server (sn) is not online. ServerName is of the form: - // , , - - if (!this.isServerOnline(sn)) { - LOG.warn("Server " + sn + " is not currently online. " - + "Ignoring request to add it to draining list."); + // If master is not rejecting decommissioned hosts, warn if the server (sn) is not online. + // However, we want to add servers even if they're not online if the master is configured + // to reject decommissioned hosts + if (!rejectDecommissionedHostsConfig && !this.isServerOnline(sn)) { + LOG.warn("Server {} is not currently online. Ignoring request to add it to draining list.", + sn); return false; } - // Add the server to the draining servers lists, if it's not already in - // it. + + // Add the server to the draining servers lists, if it's not already in it. if (this.drainingServers.contains(sn)) { - LOG.warn("Server " + sn + " is already in the draining server list." - + "Ignoring request to add it again."); + LOG.warn( + "Server {} is already in the draining server list. Ignoring request to add it again.", sn); return true; } - LOG.info("Server " + sn + " added to draining server list."); + + LOG.info("Server {} added to draining server list.", sn); return this.drainingServers.add(sn); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index c879fe1b2c38..b408229f59fa 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -17,11 +17,15 @@ */ package org.apache.hadoop.hbase.regionserver; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.is; import static org.junit.Assert.*; import java.io.IOException; import java.io.StringWriter; +import java.util.Arrays; import java.util.Collections; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -35,6 +39,7 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.SingleProcessHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; +import org.apache.hadoop.hbase.master.DecommissionedHostRejectedException; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.testclassification.LargeTests; @@ -262,35 +267,51 @@ public void testReportForDutyGetsRejectedByMasterWhenConfiguredToRejectDecommiss master = cluster.addMaster(); rs = cluster.addRegionServer(); - LOG.debug("Starting master: " + master.getMaster().getServerName()); master.start(); rs.start(); waitForClusterOnline(master); - assertEquals(0, master.getMaster().listDecommissionedRegionServers().size()); - assertEquals(0, master.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - - // Decommission the 1st rs - master.getMaster().decommissionRegionServers( - Collections.singletonList(rs.getRegionServer().getServerName()), false); + // Add a second decommissioned region server to the cluster, wait for it to fail reportForDuty + LogCapturer capturer = + new LogCapturer((org.apache.logging.log4j.core.Logger) org.apache.logging.log4j.LogManager + .getLogger(HRegionServer.class)); - // Create a second region server - cluster.getConfiguration().set(HConstants.REGION_SERVER_IMPL, MyRegionServer.class.getName()); rs2 = cluster.addRegionServer(); + master.getMaster().decommissionRegionServers( + Collections.singletonList(rs2.getRegionServer().getServerName()), false); rs2.start(); - // Assert that the number of decommissioned and live hosts didn't change and that rs2 is aborted - String rs2HostName = rs2.getRegionServer().getServerName().getHostname(); - String decommissionedHostName = - master.getMaster().listDecommissionedRegionServers().get(0).getHostname(); - assertEquals(1, master.getMaster().listDecommissionedRegionServers().size()); - assertEquals(1, master.getMaster().getServerManager().getDrainingServersList().size()); - assertEquals(1, master.getMaster().getServerManager().getOnlineServers().size()); - assertTrue(master.getMaster().getServerManager().getOnlineServers() - .containsKey(rs.getRegionServer().getServerName())); - assertEquals(rs2HostName, decommissionedHostName); - testUtil.waitFor(10000, new MatcherPredicate<>(rs2.getRegionServer()::isAborted, is(false))); + // Assert that the second regionserver has aborted + testUtil.waitFor(TimeUnit.SECONDS.toMillis(90), + new MatcherPredicate<>(() -> rs2.getRegionServer().isAborted(), is(true))); + + // Assert that the log messages for DecommissionedHostRejectedException exist in the logs + capturer.stopCapturing(); + + assertThat(capturer.getOutput(), + containsString("Master rejected startup because the host is considered decommissioned")); + + /** + * Assert that the following log message occurred (one line): + * "org.apache.hadoop.hbase.master.DecommissionedHostRejectedException: + * org.apache.hadoop.hbase.master.DecommissionedHostRejectedException: Host localhost exists in + * the list of decommissioned servers and Master is configured to reject decommissioned hosts" + */ + assertThat(Arrays.asList(capturer.getOutput().split("\n")), + hasItem(allOf(containsString(DecommissionedHostRejectedException.class.getSimpleName()), + containsString(DecommissionedHostRejectedException.class.getSimpleName()), + containsString("Host " + rs2.getRegionServer().getServerName().getHostname() + + " exists in the list of decommissioned servers and Master is configured to reject" + + " decommissioned hosts")))); + + assertThat(Arrays.asList(capturer.getOutput().split("\n")), + hasItem( + allOf(containsString("ABORTING region server " + rs2.getRegionServer().getServerName()), + containsString("Unhandled"), + containsString(DecommissionedHostRejectedException.class.getSimpleName()), + containsString("Host " + rs2.getRegionServer().getServerName().getHostname() + + " exists in the list of decommissioned servers and Master is configured to reject" + + " decommissioned hosts")))); } /**