From a44b321ca167318cb7ec217ee92de9cecabf07ac Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Wed, 19 Jun 2019 10:04:17 +0800 Subject: [PATCH 01/16] HBASE-22414 Interruption of moving regions in RSGroup will cause regions on wrong rs --- .../hbase/rsgroup/RSGroupAdminServer.java | 66 ++++-- .../hbase/rsgroup/TestRSGroupsAdmin2.java | 201 +++++++++++++++++- 2 files changed, 249 insertions(+), 18 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 2dae85356dfc..2ab4dceb091c 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -204,6 +204,7 @@ private void checkServersAndTables(Set
servers, Set tables, private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) throws IOException { boolean hasRegionsToMove; + int retry = 0; RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); Set
allSevers = new HashSet<>(servers); do { @@ -215,7 +216,12 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup if (!targetGrp.containsTable(region.getTable())) { LOG.info("Moving server region {}, which do not belong to RSGroup {}", region.getShortNameToLog(), targetGroupName); - this.master.getAssignmentManager().move(region); + try { + this.master.getAssignmentManager().move(region); + }catch (IOException ioe){ + LOG.error("Move region {} from group failed, will retry, current retry time is {}", + region.getShortNameToLog(), retry, ioe); + } if (master.getAssignmentManager().getRegionStates(). getRegionState(region).isFailedOpen()) { continue; @@ -229,13 +235,15 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup iter.remove(); } } + + retry++; try { rsGroupInfoManager.wait(1000); } catch (InterruptedException e) { LOG.warn("Sleep interrupted", e); Thread.currentThread().interrupt(); } - } while (hasRegionsToMove); + } while (hasRegionsToMove && retry <= 50); } /** @@ -247,23 +255,49 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup */ private void moveTableRegionsToGroup(Set tables, String targetGroupName) throws IOException { + boolean hasRegionsToMove; + int retry = 0; RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); - for (TableName table : tables) { - if (master.getAssignmentManager().isTableDisabled(table)) { - LOG.debug("Skipping move regions because the table {} is disabled", table); - continue; - } - LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); - for (RegionInfo region : master.getAssignmentManager().getRegionStates() - .getRegionsOfTable(table)) { - ServerName sn = - master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); - if (!targetGrp.containsServer(sn.getAddress())) { - LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); - master.getAssignmentManager().move(region); + Set allTables = new HashSet<>(tables); + do { + hasRegionsToMove = false; + for (Iterator iter = allTables.iterator(); iter.hasNext(); ) { + TableName table = iter.next(); + if (master.getAssignmentManager().isTableDisabled(table)) { + LOG.debug("Skipping move regions because the table {} is disabled", table); + continue; + } + LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); + for (RegionInfo region : master.getAssignmentManager().getRegionStates() + .getRegionsOfTable(table)) { + ServerName sn = + master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); + if (!targetGrp.containsServer(sn.getAddress())) { + LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); + try { + master.getAssignmentManager().move(region); + }catch (IOException ioe){ + LOG.error("Move region {} to group failed, will retry, current retry time is {}", + region.getShortNameToLog(), retry, ioe); + } + hasRegionsToMove = true; + } + } + + if (!hasRegionsToMove) { + LOG.info("Table {} has no more regions to move for RSGroup", table.getNameAsString()); + iter.remove(); } } - } + + retry++; + try { + rsGroupInfoManager.wait(1000); + } catch (InterruptedException e) { + LOG.warn("Sleep interrupted", e); + Thread.currentThread().interrupt(); + } + } while (hasRegionsToMove && retry <= 50); } @edu.umd.cs.findbugs.annotations.SuppressWarnings( diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index d18bb669df41..e3f56c5332c3 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.rsgroup; +import static org.apache.hadoop.hbase.util.Threads.sleep; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -29,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.hbase.ClusterMetrics.Option; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.ServerName; @@ -36,8 +38,10 @@ import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.constraint.ConstraintException; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.net.Address; -import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; import org.junit.After; import org.junit.AfterClass; @@ -52,7 +56,7 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Sets; -@Category({ MediumTests.class }) +@Category({ LargeTests.class }) public class TestRSGroupsAdmin2 extends TestRSGroupsBase { @ClassRule @@ -459,4 +463,197 @@ public boolean evaluate() throws Exception { Assert.assertEquals(null, rsGroupAdmin.getRSGroupInfo(fooGroup.getName())); } + @Test + public void testFailedMoveWhenMoveServer() throws Exception { + final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); + final byte[] familyNameBytes = Bytes.toBytes("f"); + final int tableRegionCount = 10; + // All the regions created below will be assigned to the default group. + TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() throws Exception { + List regions = getTableRegionMap().get(tableName); + if (regions == null) { + return false; + } + return getTableRegionMap().get(tableName).size() >= tableRegionCount; + } + }); + + // get target server to move, which should has more than one regions + // randomly set a region state to SPLITTING + Map> assignMap = getTableServerRegionMap().get(tableName); + String rregion = null; + ServerName toMoveServer = null; + for (ServerName server : assignMap.keySet()) { + rregion = assignMap.get(server).size() > 1 && !newGroup.containsServer(server.getAddress()) ? + assignMap.get(server).get(0) : + null; + if (rregion != null) { + toMoveServer = server; + break; + } + } + assert toMoveServer != null; + RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). + getRegionInfo(Bytes.toBytesBinary(rregion)); + RegionStateNode rsn = + TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() + .getRegionStateNode(ri); + rsn.setState(RegionState.State.SPLITTING); + + // start thread to recover region state + final ServerName movedServer = toMoveServer; + final String sregion = rregion; + AtomicBoolean changed = new AtomicBoolean(false); + Thread t1 = new Thread(() -> { + LOG.debug("thread1 start running, will recover region state"); + long current = System.currentTimeMillis(); + while (System.currentTimeMillis() - current <= 50000) { + List regions = master.getAssignmentManager().getRegionsOnServer(movedServer); + LOG.debug("server region size is:{}", regions.size()); + assert regions.size() >= 1; + // when there is exactly one region left, we can determine the move operation encountered + // exception caused by the strange region state. + if (regions.size() == 1) { + assertEquals(regions.get(0).getRegionNameAsString(), sregion); + rsn.setState(RegionState.State.OPEN); + LOG.info("set region {} state OPEN", sregion); + changed.set(true); + break; + } + sleep(5000); + } + }); + t1.start(); + + // move target server to group + Thread t2 = new Thread(() -> { + LOG.info("thread2 start running, to move regions"); + try { + rsGroupAdmin.moveServers(Sets.newHashSet(movedServer.getAddress()), newGroup.getName()); + } catch (IOException e) { + LOG.error("move server error", e); + } + }); + t2.start(); + + t1.join(); + t2.join(); + + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() { + if (changed.get()) { + return master.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 && !rsn + .getRegionLocation().equals(movedServer); + } + return false; + } + }); + } + + @Test + public void testFailedMoveWhenMoveTable() throws Exception { + final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); + final byte[] familyNameBytes = Bytes.toBytes("f"); + final int tableRegionCount = 5; + // All the regions created below will be assigned to the default group. + TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() throws Exception { + List regions = getTableRegionMap().get(tableName); + if (regions == null) { + return false; + } + return getTableRegionMap().get(tableName).size() >= tableRegionCount; + } + }); + + // randomly set a region state to SPLITTING + Map> assignMap = getTableServerRegionMap().get(tableName); + String rregion = null; + ServerName srcServer = null; + for (ServerName server : assignMap.keySet()) { + rregion = assignMap.get(server).size() >= 1 && !newGroup.containsServer(server.getAddress()) ? + assignMap.get(server).get(0) : + null; + if (rregion != null) { + srcServer = server; + break; + } + } + assert srcServer != null; + RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). + getRegionInfo(Bytes.toBytesBinary(rregion)); + RegionStateNode rsn = + TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() + .getRegionStateNode(ri); + rsn.setState(RegionState.State.SPLITTING); + + // move table to group + Thread t2 = new Thread(() -> { + LOG.info("thread2 start running, to move regions"); + try { + rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); + } catch (IOException e) { + LOG.error("move server error", e); + } + }); + t2.start(); + + // start thread to recover region state + final ServerName ss = srcServer; + final String sregion = rregion; + AtomicBoolean changed = new AtomicBoolean(false); + Thread t1 = new Thread(() -> { + LOG.info("thread1 start running, will recover region state"); + long current = System.currentTimeMillis(); + while (System.currentTimeMillis() - current <= 50000) { + List regions = master.getAssignmentManager().getRegionsOnServer(ss); + List tableRegions = new ArrayList<>(); + for (RegionInfo regionInfo : regions) { + if (regionInfo.getTable().equals(tableName)) { + tableRegions.add(regionInfo); + } + } + LOG.debug("server table region size is:{}", tableRegions.size()); + assert tableRegions.size() >= 1; + // when there is exactly one region left, we can determine the move operation encountered + // exception caused by the strange region state. + if (tableRegions.size() == 1) { + assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion); + rsn.setState(RegionState.State.OPEN); + LOG.info("set region {} state OPEN", sregion); + changed.set(true); + break; + } + sleep(5000); + } + }); + t1.start(); + + t1.join(); + t2.join(); + + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() { + if (changed.get()) { + boolean serverHasTableRegions = false; + for (RegionInfo regionInfo : master.getAssignmentManager().getRegionsOnServer(ss)) { + if (regionInfo.getTable().equals(tableName)) { + serverHasTableRegions = true; + break; + } + } + return !serverHasTableRegions && !rsn.getRegionLocation().equals(ss); + } + return false; + } + }); + } + } From 3471e7fccac8784f19441a3b56ceec260a8cc738 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Wed, 19 Jun 2019 10:04:17 +0800 Subject: [PATCH 02/16] HBASE-22414 Interruption of moving regions in RSGroup will cause regions on wrong rs --- .../hbase/rsgroup/RSGroupAdminServer.java | 163 +++++++++--------- .../hbase/rsgroup/TestRSGroupsAdmin2.java | 158 +++++++++++------ .../hbase/rsgroup/TestRSGroupsBase.java | 4 +- 3 files changed, 190 insertions(+), 135 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 2ab4dceb091c..3be2cc29b01e 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.Set; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; @@ -60,9 +61,13 @@ public class RSGroupAdminServer implements RSGroupAdmin { private MasterServices master; private final RSGroupInfoManager rsGroupInfoManager; + private String FAILED_MOVE_MAX_RETRY_KEY = "hbase.rsgroup.move.max.retry"; + private int moveMaxRetry; + public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) { this.master = master; this.rsGroupInfoManager = rsGroupInfoManager; + this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY_KEY, 50); } @Override @@ -193,103 +198,87 @@ private void checkServersAndTables(Set
servers, Set tables, } } - /** - * Move every region from servers which are currently located on these servers, - * but should not be located there. - * - * @param servers the servers that will move to new group - * @param targetGroupName the target group name - * @throws IOException if moving the server and tables fail - */ - private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) - throws IOException { - boolean hasRegionsToMove; - int retry = 0; - RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); - Set
allSevers = new HashSet<>(servers); - do { - hasRegionsToMove = false; - for (Iterator
iter = allSevers.iterator(); iter.hasNext();) { - Address rs = iter.next(); - // Get regions that are associated with this server and filter regions by group tables. - for (RegionInfo region : getRegions(rs)) { - if (!targetGrp.containsTable(region.getTable())) { - LOG.info("Moving server region {}, which do not belong to RSGroup {}", - region.getShortNameToLog(), targetGroupName); - try { - this.master.getAssignmentManager().move(region); - }catch (IOException ioe){ - LOG.error("Move region {} from group failed, will retry, current retry time is {}", - region.getShortNameToLog(), retry, ioe); - } - if (master.getAssignmentManager().getRegionStates(). - getRegionState(region).isFailedOpen()) { - continue; - } - hasRegionsToMove = true; - } - } - - if (!hasRegionsToMove) { - LOG.info("Server {} has no more regions to move for RSGroup", rs.getHostname()); - iter.remove(); - } - } - - retry++; - try { - rsGroupInfoManager.wait(1000); - } catch (InterruptedException e) { - LOG.warn("Sleep interrupted", e); - Thread.currentThread().interrupt(); - } - } while (hasRegionsToMove && retry <= 50); + private enum MoveType { + TO, FROM } /** - * Moves regions of tables which are not on target group servers. + * When move a table to a group, all regions of it must be moved to group servers (to the group). + * When move a server to a group, some regions on it which are of tables that do not belong to + * the target group must be moved (from the group). + * So MoveType.TO means TableName set, and MoveType.FROM means server Address set. * - * @param tables the tables that will move to new group - * @param targetGroupName the target group name - * @throws IOException if moving the region fails + * @param set it's a table set or a server set + * @param targetGroupName + * @param type + * @param + * @throws IOException */ - private void moveTableRegionsToGroup(Set tables, String targetGroupName) + private void moveRegionsToOrFromGroup(Set set, String targetGroupName, MoveType type) throws IOException { + Set newSet = new HashSet<>(set); boolean hasRegionsToMove; int retry = 0; + IOException toThrow = null; + Set failedRegions = new HashSet<>(); RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); - Set allTables = new HashSet<>(tables); do { hasRegionsToMove = false; - for (Iterator iter = allTables.iterator(); iter.hasNext(); ) { - TableName table = iter.next(); - if (master.getAssignmentManager().isTableDisabled(table)) { - LOG.debug("Skipping move regions because the table {} is disabled", table); - continue; + for (Iterator iter = newSet.iterator(); iter.hasNext(); ) { + T el = iter.next(); + List toMoveRegions = new ArrayList<>(); + if (type == MoveType.TO) { + // means element type of set is TableName + assert el instanceof TableName; + if (master.getAssignmentManager().isTableDisabled((TableName) el)) { + LOG.debug("Skipping move regions because the table {} is disabled", el); + }else { + // Get regions of these tables and filter regions by group servers. + for (RegionInfo region : + master.getAssignmentManager().getRegionStates().getRegionsOfTable((TableName) el)) { + ServerName sn = + master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); + if (!targetGrp.containsServer(sn.getAddress())) { + toMoveRegions.add(region); + } + } + } } - LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); - for (RegionInfo region : master.getAssignmentManager().getRegionStates() - .getRegionsOfTable(table)) { - ServerName sn = - master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); - if (!targetGrp.containsServer(sn.getAddress())) { - LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); - try { - master.getAssignmentManager().move(region); - }catch (IOException ioe){ - LOG.error("Move region {} to group failed, will retry, current retry time is {}", - region.getShortNameToLog(), retry, ioe); + if (type == MoveType.FROM) { + // means element type of set is Address + assert el instanceof Address; + // Get regions that are associated with these servers and filter regions by group tables. + for (RegionInfo region : getRegions((Address) el)) { + if (!targetGrp.containsTable(region.getTable())) { + toMoveRegions.add(region); } - hasRegionsToMove = true; } } + //move regions + for (RegionInfo region : toMoveRegions) { + LOG.info("Moving server region {}, which do not belong to RSGroup {}", + region.getShortNameToLog(), targetGroupName); + try { + this.master.getAssignmentManager().move(region); + failedRegions.remove(region.getRegionNameAsString()); + } catch (IOException ioe) { + LOG.debug("Move region {} from group failed, will retry, current retry time is {}", + region.getShortNameToLog(), retry, ioe); + toThrow = ioe; + failedRegions.add(region.getRegionNameAsString()); + } + if (master.getAssignmentManager().getRegionStates(). + getRegionState(region).isFailedOpen()) { + continue; + } + hasRegionsToMove = true; + } if (!hasRegionsToMove) { - LOG.info("Table {} has no more regions to move for RSGroup", table.getNameAsString()); + LOG.info("There are no more regions of {} to move for RSGroup {}", el, targetGroupName); iter.remove(); } } - retry++; try { rsGroupInfoManager.wait(1000); @@ -297,7 +286,17 @@ private void moveTableRegionsToGroup(Set tables, String targetGroupNa LOG.warn("Sleep interrupted", e); Thread.currentThread().interrupt(); } - } while (hasRegionsToMove && retry <= 50); + } while (hasRegionsToMove && retry <= moveMaxRetry); + + //has up to max retry time or there are no more regions to move + if (hasRegionsToMove) { + // print failed moved regions, for later process conveniently + String msg = String.format("move regions for group %s failed, failed regions: %s", + targetGroupName, failedRegions); + LOG.error(msg); + throw new DoNotRetryIOException(msg + + ", just record the last failed region's cause, more details in server log", toThrow); + } } @edu.umd.cs.findbugs.annotations.SuppressWarnings( @@ -356,7 +355,7 @@ public void moveServers(Set
servers, String targetGroupName) // MovedServers may be < passed in 'servers'. Set
movedServers = rsGroupInfoManager.moveServers(servers, srcGrp.getName(), targetGroupName); - moveServerRegionsFromGroup(movedServers, targetGroupName); + moveRegionsToOrFromGroup(movedServers, targetGroupName, MoveType.FROM); LOG.info("Move servers done: {} => {}", srcGrp.getName(), targetGroupName); } } @@ -398,7 +397,7 @@ public void moveTables(Set tables, String targetGroup) throws IOExcep // targetGroup is null when a table is being deleted. In this case no further // action is required. if (targetGroup != null) { - moveTableRegionsToGroup(tables, targetGroup); + moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); } } } @@ -530,9 +529,9 @@ public void moveServersAndTables(Set
servers, Set tables, St rsGroupInfoManager.moveServersAndTables(servers, tables, srcGroup, targetGroup); //move regions on these servers which do not belong to group tables - moveServerRegionsFromGroup(servers, targetGroup); + moveRegionsToOrFromGroup(servers, targetGroup, MoveType.FROM); //move regions of these tables which are not on group servers - moveTableRegionsToGroup(tables, targetGroup); + moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); } LOG.info("Move servers and tables done. Severs: {}, Tables: {} => {}", servers, tables, targetGroup); diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index e3f56c5332c3..3184d4f603ed 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Pair; import org.junit.After; import org.junit.AfterClass; import org.junit.Assert; @@ -464,8 +465,10 @@ public boolean evaluate() throws Exception { } @Test - public void testFailedMoveWhenMoveServer() throws Exception { - final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); + public void testFailedMoveBeforeRetryExhaustedWhenMoveServer() throws Exception { + String groupName = getGroupName(name.getMethodName()); + rsGroupAdmin.addRSGroup(groupName); + final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); final byte[] familyNameBytes = Bytes.toBytes("f"); final int tableRegionCount = 10; // All the regions created below will be assigned to the default group. @@ -481,31 +484,11 @@ public boolean evaluate() throws Exception { } }); - // get target server to move, which should has more than one regions - // randomly set a region state to SPLITTING - Map> assignMap = getTableServerRegionMap().get(tableName); - String rregion = null; - ServerName toMoveServer = null; - for (ServerName server : assignMap.keySet()) { - rregion = assignMap.get(server).size() > 1 && !newGroup.containsServer(server.getAddress()) ? - assignMap.get(server).get(0) : - null; - if (rregion != null) { - toMoveServer = server; - break; - } - } - assert toMoveServer != null; - RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). - getRegionInfo(Bytes.toBytesBinary(rregion)); - RegionStateNode rsn = - TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() - .getRegionStateNode(ri); - rsn.setState(RegionState.State.SPLITTING); + Pair gotPair = setARegionState(newGroup); // start thread to recover region state - final ServerName movedServer = toMoveServer; - final String sregion = rregion; + final ServerName movedServer = gotPair.getFirst(); + final RegionStateNode rsn = gotPair.getSecond(); AtomicBoolean changed = new AtomicBoolean(false); Thread t1 = new Thread(() -> { LOG.debug("thread1 start running, will recover region state"); @@ -517,9 +500,10 @@ public boolean evaluate() throws Exception { // when there is exactly one region left, we can determine the move operation encountered // exception caused by the strange region state. if (regions.size() == 1) { - assertEquals(regions.get(0).getRegionNameAsString(), sregion); + assertEquals(regions.get(0).getRegionNameAsString(), + rsn.getRegionInfo().getRegionNameAsString()); rsn.setState(RegionState.State.OPEN); - LOG.info("set region {} state OPEN", sregion); + LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); changed.set(true); break; } @@ -555,7 +539,7 @@ public boolean evaluate() { } @Test - public void testFailedMoveWhenMoveTable() throws Exception { + public void testFailedMoveBeforeRetryExhaustedWhenMoveTable() throws Exception { final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); final byte[] familyNameBytes = Bytes.toBytes("f"); final int tableRegionCount = 5; @@ -572,26 +556,7 @@ public boolean evaluate() throws Exception { } }); - // randomly set a region state to SPLITTING - Map> assignMap = getTableServerRegionMap().get(tableName); - String rregion = null; - ServerName srcServer = null; - for (ServerName server : assignMap.keySet()) { - rregion = assignMap.get(server).size() >= 1 && !newGroup.containsServer(server.getAddress()) ? - assignMap.get(server).get(0) : - null; - if (rregion != null) { - srcServer = server; - break; - } - } - assert srcServer != null; - RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). - getRegionInfo(Bytes.toBytesBinary(rregion)); - RegionStateNode rsn = - TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() - .getRegionStateNode(ri); - rsn.setState(RegionState.State.SPLITTING); + Pair gotPair = setARegionState(newGroup); // move table to group Thread t2 = new Thread(() -> { @@ -605,8 +570,8 @@ public boolean evaluate() throws Exception { t2.start(); // start thread to recover region state - final ServerName ss = srcServer; - final String sregion = rregion; + final ServerName ss = gotPair.getFirst(); + final RegionStateNode rsn = gotPair.getSecond(); AtomicBoolean changed = new AtomicBoolean(false); Thread t1 = new Thread(() -> { LOG.info("thread1 start running, will recover region state"); @@ -624,9 +589,10 @@ public boolean evaluate() throws Exception { // when there is exactly one region left, we can determine the move operation encountered // exception caused by the strange region state. if (tableRegions.size() == 1) { - assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion); + assertEquals(tableRegions.get(0).getRegionNameAsString(), + rsn.getRegionInfo().getRegionNameAsString()); rsn.setState(RegionState.State.OPEN); - LOG.info("set region {} state OPEN", sregion); + LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); changed.set(true); break; } @@ -656,4 +622,92 @@ public boolean evaluate() { }); } + @Test + public void testFailedMoveWhenMoveServer() throws Exception{ + String groupName = getGroupName(name.getMethodName()); + rsGroupAdmin.addRSGroup(groupName); + final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); + final byte[] familyNameBytes = Bytes.toBytes("f"); + final int tableRegionCount = 10; + // All the regions created below will be assigned to the default group. + TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() throws Exception { + List regions = getTableRegionMap().get(tableName); + if (regions == null) { + return false; + } + return getTableRegionMap().get(tableName).size() >= tableRegionCount; + } + }); + + Pair gotPair = setARegionState(newGroup); + try{ + rsGroupAdmin.moveServers(Sets.newHashSet(gotPair.getFirst().getAddress()), + newGroup.getName()); + }catch (IOException e){ + assertTrue(e.getMessage().contains( + gotPair.getSecond().getRegionInfo().getRegionNameAsString())); + } + } + + @Test + public void testFailedMoveWhenMoveTable() throws Exception{ + final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); + final byte[] familyNameBytes = Bytes.toBytes("f"); + final int tableRegionCount = 10; + // All the regions created below will be assigned to the default group. + TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); + TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { + @Override + public boolean evaluate() throws Exception { + List regions = getTableRegionMap().get(tableName); + if (regions == null) { + return false; + } + return getTableRegionMap().get(tableName).size() >= tableRegionCount; + } + }); + + Pair gotPair = setARegionState(newGroup); + try{ + rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); + }catch (IOException e){ + assertTrue(e.getMessage().contains( + gotPair.getSecond().getRegionInfo().getRegionNameAsString())); + } + } + + /** + * Randomly choose a region to set state. + * @param newGroup, target group + * @return source server of region, and region state + * @throws IOException + */ + private Pair setARegionState(RSGroupInfo newGroup) + throws IOException{ + // get target server to move, which should has more than one regions + // randomly set a region state to SPLITTING to make move fail + Map> assignMap = getTableServerRegionMap().get(tableName); + String rregion = null; + ServerName toMoveServer = null; + for (ServerName server : assignMap.keySet()) { + rregion = assignMap.get(server).size() > 1 && + !newGroup.containsServer(server.getAddress()) ? assignMap.get(server).get(0) : null; + if (rregion != null) { + toMoveServer = server; + break; + } + } + assert toMoveServer != null; + RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). + getRegionInfo(Bytes.toBytesBinary(rregion)); + RegionStateNode rsn = + TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() + .getRegionStateNode(ri); + rsn.setState(RegionState.State.SPLITTING); + return new Pair<>(toMoveServer, rsn); + } + } diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java index 3d8dd167330e..c5520cf11f1c 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java @@ -100,11 +100,13 @@ public static void setUpTestBeforeClass() throws Exception { RSGroupBasedLoadBalancer.class.getName()); TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, RSGroupAdminEndpoint.class.getName() + "," + CPMasterObserver.class.getName()); - TEST_UTIL.startMiniCluster(NUM_SLAVES_BASE - 1); TEST_UTIL.getConfiguration().setInt( ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, NUM_SLAVES_BASE - 1); TEST_UTIL.getConfiguration().setBoolean(SnapshotManager.HBASE_SNAPSHOT_ENABLED, true); + TEST_UTIL.getConfiguration().setInt("hbase.rpc.timeout", 100000); + + TEST_UTIL.startMiniCluster(NUM_SLAVES_BASE - 1); initialize(); } From 3586a3ff0f43cc6b227ac9e83f665fdadbf964c4 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 24 Jun 2019 10:05:35 +0800 Subject: [PATCH 03/16] fix checkstyle --- .../apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java | 8 ++++---- .../apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 3be2cc29b01e..7e0c1253f459 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -209,10 +209,10 @@ private enum MoveType { * So MoveType.TO means TableName set, and MoveType.FROM means server Address set. * * @param set it's a table set or a server set - * @param targetGroupName - * @param type - * @param - * @throws IOException + * @param targetGroupName target group name + * @param type type of move regions + * @param the type of elements in Set + * @throws IOException if move haven't succeed even after max number of retries */ private void moveRegionsToOrFromGroup(Set set, String targetGroupName, MoveType type) throws IOException { diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index 3184d4f603ed..665c184856ba 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -681,7 +681,7 @@ public boolean evaluate() throws Exception { /** * Randomly choose a region to set state. - * @param newGroup, target group + * @param newGroup target group * @return source server of region, and region state * @throws IOException */ From 310bc21fe9ae5ed20b0d980f0dbb78330d19695e Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 24 Jun 2019 13:57:58 +0800 Subject: [PATCH 04/16] update checkstyle of TestRSGroupsAdmin2 --- .../org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index 665c184856ba..a9f59e1c1659 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -683,7 +683,7 @@ public boolean evaluate() throws Exception { * Randomly choose a region to set state. * @param newGroup target group * @return source server of region, and region state - * @throws IOException + * @throws IOException if methods called throw */ private Pair setARegionState(RSGroupInfo newGroup) throws IOException{ From 27ecfc2f4b35e002b1d397629dbf8917344213a1 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Thu, 27 Jun 2019 10:27:46 +0800 Subject: [PATCH 05/16] Change codes according to comments. --- .../hbase/rsgroup/RSGroupAdminServer.java | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 7e0c1253f459..33a4515a725a 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -61,13 +61,13 @@ public class RSGroupAdminServer implements RSGroupAdmin { private MasterServices master; private final RSGroupInfoManager rsGroupInfoManager; - private String FAILED_MOVE_MAX_RETRY_KEY = "hbase.rsgroup.move.max.retry"; + private String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry"; private int moveMaxRetry; public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) { this.master = master; this.rsGroupInfoManager = rsGroupInfoManager; - this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY_KEY, 50); + this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY, 50); } @Override @@ -203,10 +203,12 @@ private enum MoveType { } /** - * When move a table to a group, all regions of it must be moved to group servers (to the group). - * When move a server to a group, some regions on it which are of tables that do not belong to - * the target group must be moved (from the group). - * So MoveType.TO means TableName set, and MoveType.FROM means server Address set. + * When move a table to a group, we can only move regions of it which are not on group servers + * TO the group. Because all regions of the table must be in target group. When move a server to + * a group, we can only move regions on it which do not belong to group tables FROM the group. + * And what's more, table regions or server regions that already in target group need not to be + * moved. So MoveType.TO means move regions of TABLES, and MoveType.FROM means move regions on + * SERVERS. * * @param set it's a table set or a server set * @param targetGroupName target group name @@ -225,17 +227,17 @@ private void moveRegionsToOrFromGroup(Set set, String targetGroupName, Mo do { hasRegionsToMove = false; for (Iterator iter = newSet.iterator(); iter.hasNext(); ) { - T el = iter.next(); + T element = iter.next(); List toMoveRegions = new ArrayList<>(); if (type == MoveType.TO) { // means element type of set is TableName - assert el instanceof TableName; - if (master.getAssignmentManager().isTableDisabled((TableName) el)) { - LOG.debug("Skipping move regions because the table {} is disabled", el); + assert element instanceof TableName; + if (master.getAssignmentManager().isTableDisabled((TableName) element)) { + LOG.debug("Skipping move regions because the table {} is disabled", element); }else { // Get regions of these tables and filter regions by group servers. - for (RegionInfo region : - master.getAssignmentManager().getRegionStates().getRegionsOfTable((TableName) el)) { + for (RegionInfo region : master.getAssignmentManager().getRegionStates() + .getRegionsOfTable((TableName) element)) { ServerName sn = master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); if (!targetGrp.containsServer(sn.getAddress())) { @@ -246,9 +248,9 @@ private void moveRegionsToOrFromGroup(Set set, String targetGroupName, Mo } if (type == MoveType.FROM) { // means element type of set is Address - assert el instanceof Address; + assert element instanceof Address; // Get regions that are associated with these servers and filter regions by group tables. - for (RegionInfo region : getRegions((Address) el)) { + for (RegionInfo region : getRegions((Address) element)) { if (!targetGrp.containsTable(region.getTable())) { toMoveRegions.add(region); } @@ -275,7 +277,8 @@ private void moveRegionsToOrFromGroup(Set set, String targetGroupName, Mo hasRegionsToMove = true; } if (!hasRegionsToMove) { - LOG.info("There are no more regions of {} to move for RSGroup {}", el, targetGroupName); + LOG.info("There are no more regions of {} to move for RSGroup {}", element, + targetGroupName); iter.remove(); } } From f9927e54ce35b074ab86c9cfab54462844503d4f Mon Sep 17 00:00:00 2001 From: Wellington Chevreuil Date: Mon, 1 Jul 2019 14:55:35 +0100 Subject: [PATCH 06/16] defined common skeleton for moveServerRegionsFromGroup and moveTableRegionsToGroup --- .../hbase/rsgroup/RSGroupAdminServer.java | 155 +++++++++--------- 1 file changed, 74 insertions(+), 81 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 33a4515a725a..45535a2fb798 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -27,6 +27,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; + import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; @@ -198,90 +200,91 @@ private void checkServersAndTables(Set
servers, Set tables, } } - private enum MoveType { - TO, FROM + /** + * Move every region from servers which are currently located on these servers, + * but should not be located there. + * + * @param servers the servers that will move to new group + * @param targetGroupName the target group name + * @throws IOException if moving the server and tables fail + */ + private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) + throws IOException { + moveRegionsBetweenGroups(servers, targetGroupName, + rs -> getRegions(rs), + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + return group.containsTable(info.getTable()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + rs -> rs.getHostname()); } /** - * When move a table to a group, we can only move regions of it which are not on group servers - * TO the group. Because all regions of the table must be in target group. When move a server to - * a group, we can only move regions on it which do not belong to group tables FROM the group. - * And what's more, table regions or server regions that already in target group need not to be - * moved. So MoveType.TO means move regions of TABLES, and MoveType.FROM means move regions on - * SERVERS. + * Moves regions of tables which are not on target group servers. * - * @param set it's a table set or a server set - * @param targetGroupName target group name - * @param type type of move regions - * @param the type of elements in Set - * @throws IOException if move haven't succeed even after max number of retries + * @param tables the tables that will move to new group + * @param targetGroupName the target group name + * @throws IOException if moving the region fails */ - private void moveRegionsToOrFromGroup(Set set, String targetGroupName, MoveType type) - throws IOException { - Set newSet = new HashSet<>(set); + private void moveTableRegionsToGroup(Set tables, String targetGroupName) + throws IOException { + moveRegionsBetweenGroups(tables, targetGroupName, + table -> master.getAssignmentManager().getRegionStates().getRegionsOfTable(table), + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + ServerName sn = + master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); + return group.containsServer(sn.getAddress()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + table -> table.getNameWithNamespaceInclAsString()); + } + + private void moveRegionsBetweenGroups(Set regionsOwners, String targetGroupName, + Function> getRegionsInfo, + Function validation, + Function getOwnerName) throws IOException { boolean hasRegionsToMove; int retry = 0; - IOException toThrow = null; - Set failedRegions = new HashSet<>(); - RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); + Set allOwners = new HashSet<>(regionsOwners); do { hasRegionsToMove = false; - for (Iterator iter = newSet.iterator(); iter.hasNext(); ) { - T element = iter.next(); - List toMoveRegions = new ArrayList<>(); - if (type == MoveType.TO) { - // means element type of set is TableName - assert element instanceof TableName; - if (master.getAssignmentManager().isTableDisabled((TableName) element)) { - LOG.debug("Skipping move regions because the table {} is disabled", element); - }else { - // Get regions of these tables and filter regions by group servers. - for (RegionInfo region : master.getAssignmentManager().getRegionStates() - .getRegionsOfTable((TableName) element)) { - ServerName sn = - master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); - if (!targetGrp.containsServer(sn.getAddress())) { - toMoveRegions.add(region); - } - } - } - } - if (type == MoveType.FROM) { - // means element type of set is Address - assert element instanceof Address; - // Get regions that are associated with these servers and filter regions by group tables. - for (RegionInfo region : getRegions((Address) element)) { - if (!targetGrp.containsTable(region.getTable())) { - toMoveRegions.add(region); - } - } - } - - //move regions - for (RegionInfo region : toMoveRegions) { - LOG.info("Moving server region {}, which do not belong to RSGroup {}", + for (Iterator iter = allOwners.iterator(); iter.hasNext();) { + T owner = iter.next(); + // Get regions that are associated with this server and filter regions by group tables. + for (RegionInfo region : getRegionsInfo.apply(owner)) { + if (!validation.apply(region)) { + LOG.info("Moving region {}, which do not belong to RSGroup {}", region.getShortNameToLog(), targetGroupName); - try { - this.master.getAssignmentManager().move(region); - failedRegions.remove(region.getRegionNameAsString()); - } catch (IOException ioe) { - LOG.debug("Move region {} from group failed, will retry, current retry time is {}", + try { + this.master.getAssignmentManager().move(region); + }catch (IOException ioe){ + LOG.error("Move region {} from group failed, will retry, current retry time is {}", region.getShortNameToLog(), retry, ioe); - toThrow = ioe; - failedRegions.add(region.getRegionNameAsString()); - } - if (master.getAssignmentManager().getRegionStates(). + } + if (master.getAssignmentManager().getRegionStates(). getRegionState(region).isFailedOpen()) { - continue; + continue; + } + hasRegionsToMove = true; } - hasRegionsToMove = true; } + if (!hasRegionsToMove) { - LOG.info("There are no more regions of {} to move for RSGroup {}", element, - targetGroupName); + LOG.info("No more regions to move from {} to RSGroup", getOwnerName.apply(owner)); iter.remove(); } } + retry++; try { rsGroupInfoManager.wait(1000); @@ -289,17 +292,7 @@ private void moveRegionsToOrFromGroup(Set set, String targetGroupName, Mo LOG.warn("Sleep interrupted", e); Thread.currentThread().interrupt(); } - } while (hasRegionsToMove && retry <= moveMaxRetry); - - //has up to max retry time or there are no more regions to move - if (hasRegionsToMove) { - // print failed moved regions, for later process conveniently - String msg = String.format("move regions for group %s failed, failed regions: %s", - targetGroupName, failedRegions); - LOG.error(msg); - throw new DoNotRetryIOException(msg + - ", just record the last failed region's cause, more details in server log", toThrow); - } + } while (hasRegionsToMove && retry <= 50); } @edu.umd.cs.findbugs.annotations.SuppressWarnings( @@ -358,7 +351,7 @@ public void moveServers(Set
servers, String targetGroupName) // MovedServers may be < passed in 'servers'. Set
movedServers = rsGroupInfoManager.moveServers(servers, srcGrp.getName(), targetGroupName); - moveRegionsToOrFromGroup(movedServers, targetGroupName, MoveType.FROM); + moveServerRegionsFromGroup(movedServers, targetGroupName); LOG.info("Move servers done: {} => {}", srcGrp.getName(), targetGroupName); } } @@ -400,7 +393,7 @@ public void moveTables(Set tables, String targetGroup) throws IOExcep // targetGroup is null when a table is being deleted. In this case no further // action is required. if (targetGroup != null) { - moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); + moveTableRegionsToGroup(tables, targetGroup); } } } @@ -532,9 +525,9 @@ public void moveServersAndTables(Set
servers, Set tables, St rsGroupInfoManager.moveServersAndTables(servers, tables, srcGroup, targetGroup); //move regions on these servers which do not belong to group tables - moveRegionsToOrFromGroup(servers, targetGroup, MoveType.FROM); + moveServerRegionsFromGroup(servers, targetGroup); //move regions of these tables which are not on group servers - moveRegionsToOrFromGroup(tables, targetGroup, MoveType.TO); + moveTableRegionsToGroup(tables, targetGroup); } LOG.info("Move servers and tables done. Severs: {}, Tables: {} => {}", servers, tables, targetGroup); From d12694cfd78e4f40b6575fd4379ff11d9fdcca02 Mon Sep 17 00:00:00 2001 From: Wellington Chevreuil Date: Tue, 2 Jul 2019 12:27:50 +0100 Subject: [PATCH 07/16] addressing checkstyle issue --- .../java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 45535a2fb798..4ed618b3ae50 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -30,7 +30,6 @@ import java.util.function.Function; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; From e559550196fdcc78ca266d71fb401e4976ef69aa Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 8 Jul 2019 10:58:43 +0800 Subject: [PATCH 08/16] config max retry time and throw ex when up to retry limit --- .../hbase/rsgroup/RSGroupAdminServer.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 4ed618b3ae50..cd928290be8a 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -30,6 +30,7 @@ import java.util.function.Function; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; @@ -255,6 +256,8 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro boolean hasRegionsToMove; int retry = 0; Set allOwners = new HashSet<>(regionsOwners); + Set failedRegions = new HashSet<>(); + IOException toThrow = null; do { hasRegionsToMove = false; for (Iterator iter = allOwners.iterator(); iter.hasNext();) { @@ -266,9 +269,12 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro region.getShortNameToLog(), targetGroupName); try { this.master.getAssignmentManager().move(region); + failedRegions.remove(region); }catch (IOException ioe){ - LOG.error("Move region {} from group failed, will retry, current retry time is {}", + LOG.debug("Move region {} from group failed, will retry, current retry time is {}", region.getShortNameToLog(), retry, ioe); + toThrow = ioe; + failedRegions.add(region); } if (master.getAssignmentManager().getRegionStates(). getRegionState(region).isFailedOpen()) { @@ -291,7 +297,17 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro LOG.warn("Sleep interrupted", e); Thread.currentThread().interrupt(); } - } while (hasRegionsToMove && retry <= 50); + } while (hasRegionsToMove && retry <= moveMaxRetry); + + //has up to max retry time or there are no more regions to move + if (hasRegionsToMove) { + // print failed moved regions, for later process conveniently + String msg = String.format("move regions for group %s failed, failed regions: %s", + targetGroupName, failedRegions); + LOG.error(msg); + throw new DoNotRetryIOException(msg + + ", just record the last failed region's cause, more details in server log", toThrow); + } } @edu.umd.cs.findbugs.annotations.SuppressWarnings( From cc9e0a02fb96a6f74a197ee2e0efefc32ec0312c Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 8 Jul 2019 15:19:32 +0800 Subject: [PATCH 09/16] addressing failed UT and reformatting some codes --- .../hbase/rsgroup/RSGroupAdminServer.java | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index cd928290be8a..df5cf7075f28 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -210,18 +210,15 @@ private void checkServersAndTables(Set
servers, Set tables, */ private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) throws IOException { - moveRegionsBetweenGroups(servers, targetGroupName, - rs -> getRegions(rs), - info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - return group.containsTable(info.getTable()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, - rs -> rs.getHostname()); + moveRegionsBetweenGroups(servers, targetGroupName, rs -> getRegions(rs), info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + return group.containsTable(info.getTable()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, rs -> rs.getHostname()); } /** @@ -233,26 +230,27 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup */ private void moveTableRegionsToGroup(Set tables, String targetGroupName) throws IOException { - moveRegionsBetweenGroups(tables, targetGroupName, - table -> master.getAssignmentManager().getRegionStates().getRegionsOfTable(table), - info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - ServerName sn = + moveRegionsBetweenGroups(tables, targetGroupName, table -> { + if (master.getAssignmentManager().isTableDisabled(table)) { + return new ArrayList<>(); + } + return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table); + }, info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + ServerName sn = master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); - return group.containsServer(sn.getAddress()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, - table -> table.getNameWithNamespaceInclAsString()); + return group.containsServer(sn.getAddress()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, table -> table.getNameWithNamespaceInclAsString()); } private void moveRegionsBetweenGroups(Set regionsOwners, String targetGroupName, - Function> getRegionsInfo, - Function validation, - Function getOwnerName) throws IOException { + Function> getRegionsInfo, Function validation, + Function getOwnerName) throws IOException { boolean hasRegionsToMove; int retry = 0; Set allOwners = new HashSet<>(regionsOwners); @@ -260,24 +258,24 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro IOException toThrow = null; do { hasRegionsToMove = false; - for (Iterator iter = allOwners.iterator(); iter.hasNext();) { + for (Iterator iter = allOwners.iterator(); iter.hasNext(); ) { T owner = iter.next(); // Get regions that are associated with this server and filter regions by group tables. for (RegionInfo region : getRegionsInfo.apply(owner)) { if (!validation.apply(region)) { LOG.info("Moving region {}, which do not belong to RSGroup {}", - region.getShortNameToLog(), targetGroupName); + region.getShortNameToLog(), targetGroupName); try { this.master.getAssignmentManager().move(region); failedRegions.remove(region); - }catch (IOException ioe){ + } catch (IOException ioe) { LOG.debug("Move region {} from group failed, will retry, current retry time is {}", - region.getShortNameToLog(), retry, ioe); + region.getShortNameToLog(), retry, ioe); toThrow = ioe; failedRegions.add(region); } if (master.getAssignmentManager().getRegionStates(). - getRegionState(region).isFailedOpen()) { + getRegionState(region).isFailedOpen()) { continue; } hasRegionsToMove = true; @@ -302,11 +300,13 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro //has up to max retry time or there are no more regions to move if (hasRegionsToMove) { // print failed moved regions, for later process conveniently - String msg = String.format("move regions for group %s failed, failed regions: %s", - targetGroupName, failedRegions); + String msg = String + .format("move regions for group %s failed, failed regions: %s", targetGroupName, + failedRegions); LOG.error(msg); - throw new DoNotRetryIOException(msg + - ", just record the last failed region's cause, more details in server log", toThrow); + throw new DoNotRetryIOException( + msg + ", just record the last failed region's cause, more details in server log", + toThrow); } } From ec50f4f66fae7acca596dbd47b229ebf2d34508a Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 8 Jul 2019 17:06:12 +0800 Subject: [PATCH 10/16] addressing checkstyle issue --- .../hbase/rsgroup/RSGroupAdminServer.java | 56 ++++++++++--------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index df5cf7075f28..4e966ae1252f 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -210,15 +210,18 @@ private void checkServersAndTables(Set
servers, Set tables, */ private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) throws IOException { - moveRegionsBetweenGroups(servers, targetGroupName, rs -> getRegions(rs), info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - return group.containsTable(info.getTable()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, rs -> rs.getHostname()); + moveRegionsBetweenGroups(servers, targetGroupName, + rs -> getRegions(rs), + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + return group.containsTable(info.getTable()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + rs -> rs.getHostname()); } /** @@ -230,22 +233,25 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup */ private void moveTableRegionsToGroup(Set tables, String targetGroupName) throws IOException { - moveRegionsBetweenGroups(tables, targetGroupName, table -> { - if (master.getAssignmentManager().isTableDisabled(table)) { - return new ArrayList<>(); - } - return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table); - }, info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - ServerName sn = - master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); - return group.containsServer(sn.getAddress()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, table -> table.getNameWithNamespaceInclAsString()); + moveRegionsBetweenGroups(tables, targetGroupName, + table -> { + if (master.getAssignmentManager().isTableDisabled(table)) { + return new ArrayList<>(); + } + return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table); + }, + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + ServerName sn = + master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); + return group.containsServer(sn.getAddress()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + table -> table.getNameWithNamespaceInclAsString()); } private void moveRegionsBetweenGroups(Set regionsOwners, String targetGroupName, From 670ab0ea6022521fed24d0e652027caa5adc3f08 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Tue, 9 Jul 2019 14:44:55 +0800 Subject: [PATCH 11/16] addressing checkstyle issue --- .../hbase/rsgroup/RSGroupAdminServer.java | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index 4e966ae1252f..e058e49595e1 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -211,17 +211,17 @@ private void checkServersAndTables(Set
servers, Set tables, private void moveServerRegionsFromGroup(Set
servers, String targetGroupName) throws IOException { moveRegionsBetweenGroups(servers, targetGroupName, - rs -> getRegions(rs), - info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - return group.containsTable(info.getTable()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, - rs -> rs.getHostname()); + rs -> getRegions(rs), + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + return group.containsTable(info.getTable()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + rs -> rs.getHostname()); } /** @@ -234,24 +234,24 @@ private void moveServerRegionsFromGroup(Set
servers, String targetGroup private void moveTableRegionsToGroup(Set tables, String targetGroupName) throws IOException { moveRegionsBetweenGroups(tables, targetGroupName, - table -> { - if (master.getAssignmentManager().isTableDisabled(table)) { - return new ArrayList<>(); - } - return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table); - }, - info -> { - try { - RSGroupInfo group = getRSGroupInfo(targetGroupName); - ServerName sn = - master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); - return group.containsServer(sn.getAddress()); - } catch (IOException e) { - e.printStackTrace(); - return false; - } - }, - table -> table.getNameWithNamespaceInclAsString()); + table -> { + if (master.getAssignmentManager().isTableDisabled(table)) { + return new ArrayList<>(); + } + return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table); + }, + info -> { + try { + RSGroupInfo group = getRSGroupInfo(targetGroupName); + ServerName sn = + master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info); + return group.containsServer(sn.getAddress()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } + }, + table -> table.getNameWithNamespaceInclAsString()); } private void moveRegionsBetweenGroups(Set regionsOwners, String targetGroupName, From bdb68f55f20efddd82ebc5892102d78810aeaf6c Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Wed, 10 Jul 2019 11:25:25 +0800 Subject: [PATCH 12/16] add skeleton function recoverRegionStateThread(), and change error message to contains region name. --- .../hbase/rsgroup/RSGroupAdminServer.java | 6 +- .../hbase/rsgroup/TestRSGroupsAdmin2.java | 82 +++++++++---------- 2 files changed, 40 insertions(+), 48 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index e058e49595e1..dee3dbc29b64 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -260,7 +260,7 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro boolean hasRegionsToMove; int retry = 0; Set allOwners = new HashSet<>(regionsOwners); - Set failedRegions = new HashSet<>(); + Set failedRegions = new HashSet<>(); IOException toThrow = null; do { hasRegionsToMove = false; @@ -273,12 +273,12 @@ private void moveRegionsBetweenGroups(Set regionsOwners, String targetGro region.getShortNameToLog(), targetGroupName); try { this.master.getAssignmentManager().move(region); - failedRegions.remove(region); + failedRegions.remove(region.getRegionNameAsString()); } catch (IOException ioe) { LOG.debug("Move region {} from group failed, will retry, current retry time is {}", region.getShortNameToLog(), retry, ioe); toThrow = ioe; - failedRegions.add(region); + failedRegions.add(region.getRegionNameAsString()); } if (master.getAssignmentManager().getRegionStates(). getRegionState(region).isFailedOpen()) { diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index a9f59e1c1659..b8d2a4b9ec9c 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -31,6 +31,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; import org.apache.hadoop.hbase.ClusterMetrics.Option; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.ServerName; @@ -490,26 +491,8 @@ public boolean evaluate() throws Exception { final ServerName movedServer = gotPair.getFirst(); final RegionStateNode rsn = gotPair.getSecond(); AtomicBoolean changed = new AtomicBoolean(false); - Thread t1 = new Thread(() -> { - LOG.debug("thread1 start running, will recover region state"); - long current = System.currentTimeMillis(); - while (System.currentTimeMillis() - current <= 50000) { - List regions = master.getAssignmentManager().getRegionsOnServer(movedServer); - LOG.debug("server region size is:{}", regions.size()); - assert regions.size() >= 1; - // when there is exactly one region left, we can determine the move operation encountered - // exception caused by the strange region state. - if (regions.size() == 1) { - assertEquals(regions.get(0).getRegionNameAsString(), - rsn.getRegionInfo().getRegionNameAsString()); - rsn.setState(RegionState.State.OPEN); - LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); - changed.set(true); - break; - } - sleep(5000); - } - }); + Thread t1 = recoverRegionStateThread(movedServer, + server -> master.getAssignmentManager().getRegionsOnServer(movedServer), rsn, changed); t1.start(); // move target server to group @@ -573,32 +556,17 @@ public boolean evaluate() throws Exception { final ServerName ss = gotPair.getFirst(); final RegionStateNode rsn = gotPair.getSecond(); AtomicBoolean changed = new AtomicBoolean(false); - Thread t1 = new Thread(() -> { - LOG.info("thread1 start running, will recover region state"); - long current = System.currentTimeMillis(); - while (System.currentTimeMillis() - current <= 50000) { - List regions = master.getAssignmentManager().getRegionsOnServer(ss); - List tableRegions = new ArrayList<>(); - for (RegionInfo regionInfo : regions) { - if (regionInfo.getTable().equals(tableName)) { - tableRegions.add(regionInfo); - } - } - LOG.debug("server table region size is:{}", tableRegions.size()); - assert tableRegions.size() >= 1; - // when there is exactly one region left, we can determine the move operation encountered - // exception caused by the strange region state. - if (tableRegions.size() == 1) { - assertEquals(tableRegions.get(0).getRegionNameAsString(), - rsn.getRegionInfo().getRegionNameAsString()); - rsn.setState(RegionState.State.OPEN); - LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); - changed.set(true); - break; + + Thread t1 = recoverRegionStateThread(ss, server -> { + List regions = master.getAssignmentManager().getRegionsOnServer(ss); + List tableRegions = new ArrayList<>(); + for (RegionInfo regionInfo : regions) { + if (regionInfo.getTable().equals(tableName)) { + tableRegions.add(regionInfo); } - sleep(5000); } - }); + return tableRegions; + }, rsn, changed); t1.start(); t1.join(); @@ -622,6 +590,31 @@ public boolean evaluate() { }); } + public Thread recoverRegionStateThread(T owner, Function> getRegions, + RegionStateNode rsn, AtomicBoolean changed){ + return new Thread(() -> { + LOG.info("thread1 start running, will recover region state"); + long current = System.currentTimeMillis(); + while (System.currentTimeMillis() - current <= 50000) { + List regions = getRegions.apply(owner); + LOG.debug("server table region size is:{}", regions.size()); + assert regions.size() >= 1; + // when there is exactly one region left, we can determine the move operation encountered + // exception caused by the strange region state. + if (regions.size() == 1) { + assertEquals(regions.get(0).getRegionNameAsString(), + rsn.getRegionInfo().getRegionNameAsString()); + rsn.setState(RegionState.State.OPEN); + LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString()); + changed.set(true); + break; + } + sleep(5000); + } + }); + + } + @Test public void testFailedMoveWhenMoveServer() throws Exception{ String groupName = getGroupName(name.getMethodName()); @@ -709,5 +702,4 @@ private Pair setARegionState(RSGroupInfo newGroup) rsn.setState(RegionState.State.SPLITTING); return new Pair<>(toMoveServer, rsn); } - } From 3ed96219b2aeebd7d0bf9f9f093264936af3bf1c Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Wed, 10 Jul 2019 14:04:28 +0800 Subject: [PATCH 13/16] addressing checkstyle issue --- .../org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index b8d2a4b9ec9c..582ecbbd20a7 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -492,7 +492,7 @@ public boolean evaluate() throws Exception { final RegionStateNode rsn = gotPair.getSecond(); AtomicBoolean changed = new AtomicBoolean(false); Thread t1 = recoverRegionStateThread(movedServer, - server -> master.getAssignmentManager().getRegionsOnServer(movedServer), rsn, changed); + server -> master.getAssignmentManager().getRegionsOnServer(movedServer), rsn, changed); t1.start(); // move target server to group From 9ca8b9dab760513854c99d35d8788ac837a13ce9 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Thu, 11 Jul 2019 11:13:46 +0800 Subject: [PATCH 14/16] changing according to comments --- .../hbase/rsgroup/RSGroupAdminServer.java | 14 +++- .../hbase/rsgroup/TestRSGroupsAdmin2.java | 81 +++++-------------- 2 files changed, 33 insertions(+), 62 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java index dee3dbc29b64..f240d2e003e6 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java @@ -57,19 +57,27 @@ @InterfaceAudience.Private public class RSGroupAdminServer implements RSGroupAdmin { private static final Logger LOG = LoggerFactory.getLogger(RSGroupAdminServer.class); - public static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " + + static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " + "one server in 'default' RSGroup."; private MasterServices master; private final RSGroupInfoManager rsGroupInfoManager; - private String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry"; + /** Define the config key of retries threshold when movements failed */ + //made package private for testing + static final String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry"; + + /** Define the default number of retries */ + //made package private for testing + static final int DEFAULT_MAX_RETRY_VALUE = 50; + private int moveMaxRetry; public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) { this.master = master; this.rsGroupInfoManager = rsGroupInfoManager; - this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY, 50); + this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY, + DEFAULT_MAX_RETRY_VALUE); } @Override diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index 582ecbbd20a7..a6225a081436 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.rsgroup; +import static org.apache.hadoop.hbase.rsgroup.RSGroupAdminServer.DEFAULT_MAX_RETRY_VALUE; import static org.apache.hadoop.hbase.util.Threads.sleep; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -470,22 +471,8 @@ public void testFailedMoveBeforeRetryExhaustedWhenMoveServer() throws Exception String groupName = getGroupName(name.getMethodName()); rsGroupAdmin.addRSGroup(groupName); final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); - final byte[] familyNameBytes = Bytes.toBytes("f"); - final int tableRegionCount = 10; - // All the regions created below will be assigned to the default group. - TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); - TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { - @Override - public boolean evaluate() throws Exception { - List regions = getTableRegionMap().get(tableName); - if (regions == null) { - return false; - } - return getTableRegionMap().get(tableName).size() >= tableRegionCount; - } - }); - - Pair gotPair = setARegionState(newGroup); + Pair gotPair = createTableAndSetARegionState(newGroup, + 10); // start thread to recover region state final ServerName movedServer = gotPair.getFirst(); @@ -524,22 +511,7 @@ public boolean evaluate() { @Test public void testFailedMoveBeforeRetryExhaustedWhenMoveTable() throws Exception { final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); - final byte[] familyNameBytes = Bytes.toBytes("f"); - final int tableRegionCount = 5; - // All the regions created below will be assigned to the default group. - TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); - TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { - @Override - public boolean evaluate() throws Exception { - List regions = getTableRegionMap().get(tableName); - if (regions == null) { - return false; - } - return getTableRegionMap().get(tableName).size() >= tableRegionCount; - } - }); - - Pair gotPair = setARegionState(newGroup); + Pair gotPair = createTableAndSetARegionState(newGroup, 5); // move table to group Thread t2 = new Thread(() -> { @@ -590,12 +562,12 @@ public boolean evaluate() { }); } - public Thread recoverRegionStateThread(T owner, Function> getRegions, + private Thread recoverRegionStateThread(T owner, Function> getRegions, RegionStateNode rsn, AtomicBoolean changed){ return new Thread(() -> { LOG.info("thread1 start running, will recover region state"); long current = System.currentTimeMillis(); - while (System.currentTimeMillis() - current <= 50000) { + while (System.currentTimeMillis() - current <= DEFAULT_MAX_RETRY_VALUE * 1000) { List regions = getRegions.apply(owner); LOG.debug("server table region size is:{}", regions.size()); assert regions.size() >= 1; @@ -612,7 +584,6 @@ public Thread recoverRegionStateThread(T owner, Function sleep(5000); } }); - } @Test @@ -620,25 +591,12 @@ public void testFailedMoveWhenMoveServer() throws Exception{ String groupName = getGroupName(name.getMethodName()); rsGroupAdmin.addRSGroup(groupName); final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); - final byte[] familyNameBytes = Bytes.toBytes("f"); - final int tableRegionCount = 10; - // All the regions created below will be assigned to the default group. - TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); - TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { - @Override - public boolean evaluate() throws Exception { - List regions = getTableRegionMap().get(tableName); - if (regions == null) { - return false; - } - return getTableRegionMap().get(tableName).size() >= tableRegionCount; - } - }); - - Pair gotPair = setARegionState(newGroup); + Pair gotPair = createTableAndSetARegionState(newGroup, + 10); try{ rsGroupAdmin.moveServers(Sets.newHashSet(gotPair.getFirst().getAddress()), newGroup.getName()); + fail("move servers to group should fail"); }catch (IOException e){ assertTrue(e.getMessage().contains( gotPair.getSecond().getRegionInfo().getRegionNameAsString())); @@ -648,8 +606,19 @@ public boolean evaluate() throws Exception { @Test public void testFailedMoveWhenMoveTable() throws Exception{ final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); + Pair gotPair = createTableAndSetARegionState(newGroup, 5); + try{ + rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); + fail("move tables to group should fail"); + }catch (IOException e){ + assertTrue(e.getMessage().contains( + gotPair.getSecond().getRegionInfo().getRegionNameAsString())); + } + } + + private Pair createTableAndSetARegionState(RSGroupInfo rsGroupInfo, + int tableRegionCount) throws Exception{ final byte[] familyNameBytes = Bytes.toBytes("f"); - final int tableRegionCount = 10; // All the regions created below will be assigned to the default group. TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate() { @@ -663,13 +632,7 @@ public boolean evaluate() throws Exception { } }); - Pair gotPair = setARegionState(newGroup); - try{ - rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); - }catch (IOException e){ - assertTrue(e.getMessage().contains( - gotPair.getSecond().getRegionInfo().getRegionNameAsString())); - } + return setARegionState(rsGroupInfo); } /** From 963fba65620a6c3884e655fb41d74334531f27a3 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Thu, 11 Jul 2019 15:46:20 +0800 Subject: [PATCH 15/16] add comment for using constant DEFAULT_MAX_RETRY_VALUE in UT --- .../org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index a6225a081436..2edf9394c520 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -567,6 +567,9 @@ private Thread recoverRegionStateThread(T owner, Function { LOG.info("thread1 start running, will recover region state"); long current = System.currentTimeMillis(); + // wait until there is only left the region we changed state and recover its state. + // wait time is set according to the number of max retries, all except failed regions will be + // moved in one retry, and will sleep 1s until next retry. while (System.currentTimeMillis() - current <= DEFAULT_MAX_RETRY_VALUE * 1000) { List regions = getRegions.apply(owner); LOG.debug("server table region size is:{}", regions.size()); From 8bf26f41480ebd1ea6f3c58b8a27c139bf8888e0 Mon Sep 17 00:00:00 2001 From: haxiaolin Date: Mon, 15 Jul 2019 09:58:29 +0800 Subject: [PATCH 16/16] change codes according to comments --- .../hbase/rsgroup/TestRSGroupsAdmin2.java | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java index 2edf9394c520..d9c1b10cb6a2 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsAdmin2.java @@ -471,7 +471,7 @@ public void testFailedMoveBeforeRetryExhaustedWhenMoveServer() throws Exception String groupName = getGroupName(name.getMethodName()); rsGroupAdmin.addRSGroup(groupName); final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); - Pair gotPair = createTableAndSetARegionState(newGroup, + Pair gotPair = createTableWithRegionSplitting(newGroup, 10); // start thread to recover region state @@ -511,7 +511,8 @@ public boolean evaluate() { @Test public void testFailedMoveBeforeRetryExhaustedWhenMoveTable() throws Exception { final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); - Pair gotPair = createTableAndSetARegionState(newGroup, 5); + Pair gotPair = createTableWithRegionSplitting(newGroup, + 5); // move table to group Thread t2 = new Thread(() -> { @@ -594,12 +595,13 @@ public void testFailedMoveWhenMoveServer() throws Exception{ String groupName = getGroupName(name.getMethodName()); rsGroupAdmin.addRSGroup(groupName); final RSGroupInfo newGroup = rsGroupAdmin.getRSGroupInfo(groupName); - Pair gotPair = createTableAndSetARegionState(newGroup, + Pair gotPair = createTableWithRegionSplitting(newGroup, 10); try{ rsGroupAdmin.moveServers(Sets.newHashSet(gotPair.getFirst().getAddress()), newGroup.getName()); - fail("move servers to group should fail"); + fail("should get IOException when retry exhausted but there still exists failed moved " + + "regions"); }catch (IOException e){ assertTrue(e.getMessage().contains( gotPair.getSecond().getRegionInfo().getRegionNameAsString())); @@ -609,17 +611,19 @@ public void testFailedMoveWhenMoveServer() throws Exception{ @Test public void testFailedMoveWhenMoveTable() throws Exception{ final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); - Pair gotPair = createTableAndSetARegionState(newGroup, 5); + Pair gotPair = createTableWithRegionSplitting(newGroup, + 5); try{ rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); - fail("move tables to group should fail"); + fail("should get IOException when retry exhausted but there still exists failed moved " + + "regions"); }catch (IOException e){ assertTrue(e.getMessage().contains( gotPair.getSecond().getRegionInfo().getRegionNameAsString())); } } - private Pair createTableAndSetARegionState(RSGroupInfo rsGroupInfo, + private Pair createTableWithRegionSplitting(RSGroupInfo rsGroupInfo, int tableRegionCount) throws Exception{ final byte[] familyNameBytes = Bytes.toBytes("f"); // All the regions created below will be assigned to the default group. @@ -635,7 +639,7 @@ public boolean evaluate() throws Exception { } }); - return setARegionState(rsGroupInfo); + return randomlySetOneRegionStateToSplitting(rsGroupInfo); } /** @@ -644,8 +648,8 @@ public boolean evaluate() throws Exception { * @return source server of region, and region state * @throws IOException if methods called throw */ - private Pair setARegionState(RSGroupInfo newGroup) - throws IOException{ + private Pair randomlySetOneRegionStateToSplitting( + RSGroupInfo newGroup) throws IOException{ // get target server to move, which should has more than one regions // randomly set a region state to SPLITTING to make move fail Map> assignMap = getTableServerRegionMap().get(tableName);