Skip to content

Commit 6a5e340

Browse files
committed
HBASE-28180 Review the usage of RegionStates.getOrCreateServer
1 parent 208e9b1 commit 6a5e340

File tree

7 files changed

+112
-80
lines changed

7 files changed

+112
-80
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServe
135135
.forEach(s -> LOG.error("{} has no matching ServerCrashProcedure", s));
136136
// create ServerNode for all possible live servers from wal directory and master local region
137137
liveServersBeforeRestart
138-
.forEach(sn -> server.getAssignmentManager().getRegionStates().getOrCreateServer(sn));
138+
.forEach(sn -> server.getAssignmentManager().getRegionStates().createServer(sn));
139139
ServerManager serverManager = server.getServerManager();
140140
synchronized (this) {
141141
Set<ServerName> liveServers = regionServers;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ public ServerName findServerWithSameHostnamePortWithLock(final ServerName server
426426
void recordNewServerWithLock(final ServerName serverName, final ServerMetrics sl) {
427427
LOG.info("Registering regionserver=" + serverName);
428428
this.onlineServers.put(serverName, sl);
429+
master.getAssignmentManager().getRegionStates().createServer(serverName);
429430
}
430431

431432
public ConcurrentNavigableMap<byte[], Long> getFlushedSequenceIdByRegion() {
@@ -603,6 +604,10 @@ synchronized long expireServer(final ServerName serverName, boolean force) {
603604
}
604605
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
605606
long pid = master.getAssignmentManager().submitServerCrash(serverName, true, force);
607+
if (pid == Procedure.NO_PROC_ID) {
608+
// skip later processing as we failed to submit SCP
609+
return Procedure.NO_PROC_ID;
610+
}
606611
storage.expired(serverName);
607612
// Tell our listeners that a server was removed
608613
if (!this.listeners.isEmpty()) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 74 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ public void start() throws IOException, KeeperException {
331331
regionNode.getProcedure().stateLoaded(this, regionNode);
332332
}
333333
if (regionLocation != null) {
334+
regionStates.createServer(regionLocation);
334335
regionStates.addRegionToServer(regionNode);
335336
}
336337
if (RegionReplicaUtil.isDefaultReplica(regionInfo.getReplicaId())) {
@@ -1108,7 +1109,7 @@ public void deleteTable(final TableName tableName) throws IOException {
11081109
// RS Region Transition Report helpers
11091110
// ============================================================================================
11101111
private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Builder builder,
1111-
ServerName serverName, List<RegionStateTransition> transitionList) throws IOException {
1112+
ServerStateNode serverNode, List<RegionStateTransition> transitionList) throws IOException {
11121113
for (RegionStateTransition transition : transitionList) {
11131114
switch (transition.getTransitionCode()) {
11141115
case OPENED:
@@ -1118,7 +1119,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11181119
final RegionInfo hri = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0));
11191120
long procId =
11201121
transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID;
1121-
updateRegionTransition(serverName, transition.getTransitionCode(), hri,
1122+
updateRegionTransition(serverNode, transition.getTransitionCode(), hri,
11221123
transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM, procId);
11231124
break;
11241125
case READY_TO_SPLIT:
@@ -1128,7 +1129,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11281129
final RegionInfo parent = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0));
11291130
final RegionInfo splitA = ProtobufUtil.toRegionInfo(transition.getRegionInfo(1));
11301131
final RegionInfo splitB = ProtobufUtil.toRegionInfo(transition.getRegionInfo(2));
1131-
updateRegionSplitTransition(serverName, transition.getTransitionCode(), parent, splitA,
1132+
updateRegionSplitTransition(serverNode, transition.getTransitionCode(), parent, splitA,
11321133
splitB);
11331134
break;
11341135
case READY_TO_MERGE:
@@ -1138,7 +1139,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11381139
final RegionInfo merged = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0));
11391140
final RegionInfo mergeA = ProtobufUtil.toRegionInfo(transition.getRegionInfo(1));
11401141
final RegionInfo mergeB = ProtobufUtil.toRegionInfo(transition.getRegionInfo(2));
1141-
updateRegionMergeTransition(serverName, transition.getTransitionCode(), merged, mergeA,
1142+
updateRegionMergeTransition(serverNode, transition.getTransitionCode(), merged, mergeA,
11421143
mergeB);
11431144
break;
11441145
}
@@ -1150,7 +1151,10 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11501151
ReportRegionStateTransitionResponse.Builder builder =
11511152
ReportRegionStateTransitionResponse.newBuilder();
11521153
ServerName serverName = ProtobufUtil.toServerName(req.getServer());
1153-
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
1154+
ServerStateNode serverNode = regionStates.getServerNode(serverName);
1155+
if (serverNode == null) {
1156+
1157+
}
11541158
// here we have to acquire a read lock instead of a simple exclusive lock. This is because that
11551159
// we should not block other reportRegionStateTransition call from the same region server. This
11561160
// is not only about performance, but also to prevent dead lock. Think of the meta region is
@@ -1163,7 +1167,7 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11631167
// above in submitServerCrash method and HBASE-21508 for more details.
11641168
if (serverNode.isInState(ServerState.ONLINE)) {
11651169
try {
1166-
reportRegionStateTransition(builder, serverName, req.getTransitionList());
1170+
reportRegionStateTransition(builder, serverNode, req.getTransitionList());
11671171
} catch (PleaseHoldException e) {
11681172
LOG.trace("Failed transition ", e);
11691173
throw e;
@@ -1185,21 +1189,20 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11851189
return builder.build();
11861190
}
11871191

1188-
private void updateRegionTransition(ServerName serverName, TransitionCode state,
1192+
private void updateRegionTransition(ServerStateNode serverNode, TransitionCode state,
11891193
RegionInfo regionInfo, long seqId, long procId) throws IOException {
11901194
checkMetaLoaded(regionInfo);
11911195

11921196
RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
11931197
if (regionNode == null) {
11941198
// the table/region is gone. maybe a delete, split, merge
11951199
throw new UnexpectedStateException(String.format(
1196-
"Server %s was trying to transition region %s to %s. but Region is not known.", serverName,
1197-
regionInfo, state));
1200+
"Server %s was trying to transition region %s to %s. but Region is not known.",
1201+
serverNode.getServerName(), regionInfo, state));
11981202
}
1199-
LOG.trace("Update region transition serverName={} region={} regionState={}", serverName,
1200-
regionNode, state);
1203+
LOG.trace("Update region transition serverName={} region={} regionState={}",
1204+
serverNode.getServerName(), regionNode, state);
12011205

1202-
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
12031206
regionNode.lock();
12041207
try {
12051208
if (!reportTransition(regionNode, serverNode, state, seqId, procId)) {
@@ -1216,8 +1219,8 @@ private void updateRegionTransition(ServerName serverName, TransitionCode state,
12161219
) {
12171220
LOG.info("RegionServer {} {}", state, regionNode.getRegionInfo().getEncodedName());
12181221
} else {
1219-
LOG.warn("No matching procedure found for {} transition on {} to {}", serverName,
1220-
regionNode, state);
1222+
LOG.warn("No matching procedure found for {} transition on {} to {}",
1223+
serverNode.getServerName(), regionNode, state);
12211224
}
12221225
}
12231226
} finally {
@@ -1237,8 +1240,9 @@ private boolean reportTransition(RegionStateNode regionNode, ServerStateNode ser
12371240
return true;
12381241
}
12391242

1240-
private void updateRegionSplitTransition(final ServerName serverName, final TransitionCode state,
1241-
final RegionInfo parent, final RegionInfo hriA, final RegionInfo hriB) throws IOException {
1243+
private void updateRegionSplitTransition(final ServerStateNode serverNode,
1244+
final TransitionCode state, final RegionInfo parent, final RegionInfo hriA,
1245+
final RegionInfo hriB) throws IOException {
12421246
checkMetaLoaded(parent);
12431247

12441248
if (state != TransitionCode.READY_TO_SPLIT) {
@@ -1262,8 +1266,8 @@ private void updateRegionSplitTransition(final ServerName serverName, final Tran
12621266
// Submit the Split procedure
12631267
final byte[] splitKey = hriB.getStartKey();
12641268
if (LOG.isDebugEnabled()) {
1265-
LOG.debug("Split request from " + serverName + ", parent=" + parent + " splitKey="
1266-
+ Bytes.toStringBinary(splitKey));
1269+
LOG.debug("Split request from {}, parent={}, splitKey={}", serverNode.getServerName(), parent,
1270+
Bytes.toStringBinary(splitKey));
12671271
}
12681272
// Processing this report happens asynchronously from other activities which can mutate
12691273
// the region state. For example, a split procedure may already be running for this parent.
@@ -1277,21 +1281,22 @@ private void updateRegionSplitTransition(final ServerName serverName, final Tran
12771281
if (parentState != null && parentState.isOpened()) {
12781282
master.getMasterProcedureExecutor().submitProcedure(createSplitProcedure(parent, splitKey));
12791283
} else {
1280-
LOG.info("Ignoring split request from " + serverName + ", parent=" + parent
1281-
+ " because parent is unknown or not open");
1284+
LOG.info("Ignoring split request from {}, parent={} because parent is unknown or not open",
1285+
serverNode.getServerName(), parent);
12821286
return;
12831287
}
12841288

12851289
// If the RS is < 2.0 throw an exception to abort the operation, we are handling the split
1286-
if (master.getServerManager().getVersionNumber(serverName) < 0x0200000) {
1290+
if (master.getServerManager().getVersionNumber(serverNode.getServerName()) < 0x0200000) {
12871291
throw new UnsupportedOperationException(
12881292
String.format("Split handled by the master: " + "parent=%s hriA=%s hriB=%s",
12891293
parent.getShortNameToLog(), hriA, hriB));
12901294
}
12911295
}
12921296

1293-
private void updateRegionMergeTransition(final ServerName serverName, final TransitionCode state,
1294-
final RegionInfo merged, final RegionInfo hriA, final RegionInfo hriB) throws IOException {
1297+
private void updateRegionMergeTransition(final ServerStateNode serverNode,
1298+
final TransitionCode state, final RegionInfo merged, final RegionInfo hriA,
1299+
final RegionInfo hriB) throws IOException {
12951300
checkMetaLoaded(merged);
12961301

12971302
if (state != TransitionCode.READY_TO_MERGE) {
@@ -1313,7 +1318,7 @@ private void updateRegionMergeTransition(final ServerName serverName, final Tran
13131318
master.getMasterProcedureExecutor().submitProcedure(createMergeProcedure(hriA, hriB));
13141319

13151320
// If the RS is < 2.0 throw an exception to abort the operation, we are handling the merge
1316-
if (master.getServerManager().getVersionNumber(serverName) < 0x0200000) {
1321+
if (master.getServerManager().getVersionNumber(serverNode.getServerName()) < 0x0200000) {
13171322
throw new UnsupportedOperationException(
13181323
String.format("Merge not handled yet: regionState=%s merged=%s hriA=%s hriB=%s", state,
13191324
merged, hriA, hriB));
@@ -1343,12 +1348,19 @@ public void reportOnlineRegions(ServerName serverName, Set<byte[]> regionNames)
13431348
regionNames.stream().map(Bytes::toStringBinary).collect(Collectors.toList()));
13441349
}
13451350

1346-
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
1347-
synchronized (serverNode) {
1351+
ServerStateNode serverNode = regionStates.getServerNode(serverName);
1352+
if (serverNode == null) {
1353+
LOG.warn("Got a report from server {} where its server node is null", serverName);
1354+
return;
1355+
}
1356+
serverNode.readLock().lock();
1357+
try {
13481358
if (!serverNode.isInState(ServerState.ONLINE)) {
1349-
LOG.warn("Got a report from a server result in state " + serverNode.getState());
1359+
LOG.warn("Got a report from a server result in state {}", serverNode);
13501360
return;
13511361
}
1362+
} finally {
1363+
serverNode.readLock().unlock();
13521364
}
13531365

13541366
// Track the regionserver reported online regions in memory.
@@ -1839,52 +1851,60 @@ public long submitServerCrash(ServerName serverName, boolean shouldSplitWal, boo
18391851
synchronized (rsReports) {
18401852
rsReports.remove(serverName);
18411853
}
1854+
if (serverNode == null) {
1855+
if (force) {
1856+
LOG.info("Force adding ServerCrashProcedure for {} when server node is null", serverName);
1857+
} else {
1858+
// for normal case, do not schedule SCP if ServerStateNode is null
1859+
LOG.warn("Skip adding ServerCrashProcedure for {} because server node is null", serverName);
1860+
return Procedure.NO_PROC_ID;
1861+
}
1862+
}
18421863

1864+
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
18431865
// We hold the write lock here for fencing on reportRegionStateTransition. Once we set the
18441866
// server state to CRASHED, we will no longer accept the reportRegionStateTransition call from
18451867
// this server. This is used to simplify the implementation for TRSP and SCP, where we can make
18461868
// sure that, the region list fetched by SCP will not be changed any more.
18471869
if (serverNode != null) {
18481870
serverNode.writeLock().lock();
18491871
}
1850-
boolean carryingMeta;
1851-
long pid;
18521872
try {
1853-
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
1854-
carryingMeta = isCarryingMeta(serverName);
1855-
if (!force && serverNode != null && !serverNode.isInState(ServerState.ONLINE)) {
1856-
LOG.info("Skip adding ServerCrashProcedure for {} (meta={}) -- running?", serverNode,
1857-
carryingMeta);
1858-
return Procedure.NO_PROC_ID;
1859-
} else {
1860-
MasterProcedureEnv mpe = procExec.getEnvironment();
1861-
// If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead.
1862-
// HBCKSCP scours Master in-memory state AND hbase;meta for references to
1863-
// serverName just-in-case. An SCP that is scheduled when the server is
1864-
// 'Unknown' probably originated externally with HBCK2 fix-it tool.
1865-
ServerState oldState = null;
1866-
if (serverNode != null) {
1867-
oldState = serverNode.getState();
1868-
serverNode.setState(ServerState.CRASHED);
1869-
}
18701873

1874+
boolean carryingMeta = isCarryingMeta(serverName);
1875+
if (serverNode != null && !serverNode.isInState(ServerState.ONLINE)) {
18711876
if (force) {
1872-
pid = procExec.submitProcedure(
1873-
new HBCKServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta));
1877+
LOG.info("Force adding ServerCrashProcedure for {} (meta={}) when state is not {}",
1878+
serverNode, carryingMeta, ServerState.ONLINE);
18741879
} else {
1875-
pid = procExec.submitProcedure(
1876-
new ServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta));
1880+
LOG.info("Skip adding ServerCrashProcedure for {} (meta={}) when state is not {}",
1881+
serverNode, carryingMeta, ServerState.ONLINE);
1882+
return Procedure.NO_PROC_ID;
18771883
}
1878-
LOG.info("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}.", pid,
1879-
serverName, carryingMeta,
1880-
serverNode == null ? "" : " " + serverNode.toString() + ", oldState=" + oldState);
18811884
}
1885+
MasterProcedureEnv mpe = procExec.getEnvironment();
1886+
// If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead.
1887+
// HBCKSCP scours Master in-memory state AND hbase;meta for references to
1888+
// serverName just-in-case. An SCP that is scheduled when the server is
1889+
// 'Unknown' probably originated externally with HBCK2 fix-it tool.
1890+
ServerState oldState = null;
1891+
if (serverNode != null) {
1892+
oldState = serverNode.getState();
1893+
serverNode.setState(ServerState.CRASHED);
1894+
}
1895+
ServerCrashProcedure scp = force
1896+
? new HBCKServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta)
1897+
: new ServerCrashProcedure(mpe, serverName, shouldSplitWal, carryingMeta);
1898+
long pid = procExec.submitProcedure(scp);
1899+
LOG.info("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}.", pid, serverName,
1900+
carryingMeta,
1901+
serverNode == null ? "" : " " + serverNode.toString() + ", oldState=" + oldState);
1902+
return pid;
18821903
} finally {
18831904
if (serverNode != null) {
18841905
serverNode.writeLock().unlock();
18851906
}
18861907
}
1887-
return pid;
18881908
}
18891909

18901910
public void offlineRegion(final RegionInfo regionInfo) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ private boolean include(final RegionStateNode node, final boolean offline) {
405405
// ============================================================================================
406406

407407
private void setServerState(ServerName serverName, ServerState state) {
408-
ServerStateNode serverNode = getOrCreateServer(serverName);
408+
ServerStateNode serverNode = getServerNode(serverName);
409409
synchronized (serverNode) {
410410
serverNode.setState(state);
411411
}
@@ -746,18 +746,16 @@ public List<RegionState> getRegionFailedOpen() {
746746
// ==========================================================================
747747

748748
/**
749-
* Be judicious calling this method. Do it on server register ONLY otherwise you could mess up
750-
* online server accounting. TOOD: Review usage and convert to {@link #getServerNode(ServerName)}
751-
* where we can.
749+
* Create the ServerStateNode when registering a new region server
752750
*/
753-
public ServerStateNode getOrCreateServer(final ServerName serverName) {
754-
return serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key));
751+
public void createServer(ServerName serverName) {
752+
serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key));
755753
}
756754

757755
/**
758756
* Called by SCP at end of successful processing.
759757
*/
760-
public void removeServer(final ServerName serverName) {
758+
public void removeServer(ServerName serverName) {
761759
serverMap.remove(serverName);
762760
}
763761

@@ -777,14 +775,14 @@ public double getAverageLoad() {
777775
}
778776

779777
public ServerStateNode addRegionToServer(final RegionStateNode regionNode) {
780-
ServerStateNode serverNode = getOrCreateServer(regionNode.getRegionLocation());
778+
ServerStateNode serverNode = getServerNode(regionNode.getRegionLocation());
781779
serverNode.addRegion(regionNode);
782780
return serverNode;
783781
}
784782

785783
public ServerStateNode removeRegionFromServer(final ServerName serverName,
786784
final RegionStateNode regionNode) {
787-
ServerStateNode serverNode = getOrCreateServer(serverName);
785+
ServerStateNode serverNode = getServerNode(serverName);
788786
serverNode.removeRegion(regionNode);
789787
return serverNode;
790788
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ public void testAssignmentsForBalancer() throws Exception {
7878
AssignmentManager assignmentManager = master.getAssignmentManager();
7979
RegionStates regionStates = assignmentManager.getRegionStates();
8080
ServerName sn1 = ServerName.parseServerName("asf903.gq1.ygridcore.net,52690,1517835491385");
81-
regionStates.getOrCreateServer(sn1);
81+
regionStates.createServer(sn1);
8282

8383
TableStateManager tableStateManager = master.getTableStateManager();
8484
ServerManager serverManager = master.getServerManager();

0 commit comments

Comments
 (0)