@@ -331,6 +331,7 @@ public void start() throws IOException, KeeperException {
331331 regionNode .getProcedure ().stateLoaded (this , regionNode );
332332 }
333333 if (regionLocation != null ) {
334+ regionStates .createServer (regionLocation );
334335 regionStates .addRegionToServer (regionNode );
335336 }
336337 if (RegionReplicaUtil .isDefaultReplica (regionInfo .getReplicaId ())) {
@@ -1108,7 +1109,7 @@ public void deleteTable(final TableName tableName) throws IOException {
11081109 // RS Region Transition Report helpers
11091110 // ============================================================================================
11101111 private void reportRegionStateTransition (ReportRegionStateTransitionResponse .Builder builder ,
1111- ServerName serverName , List <RegionStateTransition > transitionList ) throws IOException {
1112+ ServerStateNode serverNode , List <RegionStateTransition > transitionList ) throws IOException {
11121113 for (RegionStateTransition transition : transitionList ) {
11131114 switch (transition .getTransitionCode ()) {
11141115 case OPENED :
@@ -1118,7 +1119,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11181119 final RegionInfo hri = ProtobufUtil .toRegionInfo (transition .getRegionInfo (0 ));
11191120 long procId =
11201121 transition .getProcIdCount () > 0 ? transition .getProcId (0 ) : Procedure .NO_PROC_ID ;
1121- updateRegionTransition (serverName , transition .getTransitionCode (), hri ,
1122+ updateRegionTransition (serverNode , transition .getTransitionCode (), hri ,
11221123 transition .hasOpenSeqNum () ? transition .getOpenSeqNum () : HConstants .NO_SEQNUM , procId );
11231124 break ;
11241125 case READY_TO_SPLIT :
@@ -1128,7 +1129,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11281129 final RegionInfo parent = ProtobufUtil .toRegionInfo (transition .getRegionInfo (0 ));
11291130 final RegionInfo splitA = ProtobufUtil .toRegionInfo (transition .getRegionInfo (1 ));
11301131 final RegionInfo splitB = ProtobufUtil .toRegionInfo (transition .getRegionInfo (2 ));
1131- updateRegionSplitTransition (serverName , transition .getTransitionCode (), parent , splitA ,
1132+ updateRegionSplitTransition (serverNode , transition .getTransitionCode (), parent , splitA ,
11321133 splitB );
11331134 break ;
11341135 case READY_TO_MERGE :
@@ -1138,7 +1139,7 @@ private void reportRegionStateTransition(ReportRegionStateTransitionResponse.Bui
11381139 final RegionInfo merged = ProtobufUtil .toRegionInfo (transition .getRegionInfo (0 ));
11391140 final RegionInfo mergeA = ProtobufUtil .toRegionInfo (transition .getRegionInfo (1 ));
11401141 final RegionInfo mergeB = ProtobufUtil .toRegionInfo (transition .getRegionInfo (2 ));
1141- updateRegionMergeTransition (serverName , transition .getTransitionCode (), merged , mergeA ,
1142+ updateRegionMergeTransition (serverNode , transition .getTransitionCode (), merged , mergeA ,
11421143 mergeB );
11431144 break ;
11441145 }
@@ -1150,7 +1151,10 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11501151 ReportRegionStateTransitionResponse .Builder builder =
11511152 ReportRegionStateTransitionResponse .newBuilder ();
11521153 ServerName serverName = ProtobufUtil .toServerName (req .getServer ());
1153- ServerStateNode serverNode = regionStates .getOrCreateServer (serverName );
1154+ ServerStateNode serverNode = regionStates .getServerNode (serverName );
1155+ if (serverNode == null ) {
1156+
1157+ }
11541158 // here we have to acquire a read lock instead of a simple exclusive lock. This is because that
11551159 // we should not block other reportRegionStateTransition call from the same region server. This
11561160 // is not only about performance, but also to prevent dead lock. Think of the meta region is
@@ -1163,7 +1167,7 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11631167 // above in submitServerCrash method and HBASE-21508 for more details.
11641168 if (serverNode .isInState (ServerState .ONLINE )) {
11651169 try {
1166- reportRegionStateTransition (builder , serverName , req .getTransitionList ());
1170+ reportRegionStateTransition (builder , serverNode , req .getTransitionList ());
11671171 } catch (PleaseHoldException e ) {
11681172 LOG .trace ("Failed transition " , e );
11691173 throw e ;
@@ -1185,21 +1189,20 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(
11851189 return builder .build ();
11861190 }
11871191
1188- private void updateRegionTransition (ServerName serverName , TransitionCode state ,
1192+ private void updateRegionTransition (ServerStateNode serverNode , TransitionCode state ,
11891193 RegionInfo regionInfo , long seqId , long procId ) throws IOException {
11901194 checkMetaLoaded (regionInfo );
11911195
11921196 RegionStateNode regionNode = regionStates .getRegionStateNode (regionInfo );
11931197 if (regionNode == null ) {
11941198 // the table/region is gone. maybe a delete, split, merge
11951199 throw new UnexpectedStateException (String .format (
1196- "Server %s was trying to transition region %s to %s. but Region is not known." , serverName ,
1197- regionInfo , state ));
1200+ "Server %s was trying to transition region %s to %s. but Region is not known." ,
1201+ serverNode . getServerName (), regionInfo , state ));
11981202 }
1199- LOG .trace ("Update region transition serverName={} region={} regionState={}" , serverName ,
1200- regionNode , state );
1203+ LOG .trace ("Update region transition serverName={} region={} regionState={}" ,
1204+ serverNode . getServerName (), regionNode , state );
12011205
1202- ServerStateNode serverNode = regionStates .getOrCreateServer (serverName );
12031206 regionNode .lock ();
12041207 try {
12051208 if (!reportTransition (regionNode , serverNode , state , seqId , procId )) {
@@ -1216,8 +1219,8 @@ private void updateRegionTransition(ServerName serverName, TransitionCode state,
12161219 ) {
12171220 LOG .info ("RegionServer {} {}" , state , regionNode .getRegionInfo ().getEncodedName ());
12181221 } else {
1219- LOG .warn ("No matching procedure found for {} transition on {} to {}" , serverName ,
1220- regionNode , state );
1222+ LOG .warn ("No matching procedure found for {} transition on {} to {}" ,
1223+ serverNode . getServerName (), regionNode , state );
12211224 }
12221225 }
12231226 } finally {
@@ -1237,8 +1240,9 @@ private boolean reportTransition(RegionStateNode regionNode, ServerStateNode ser
12371240 return true ;
12381241 }
12391242
1240- private void updateRegionSplitTransition (final ServerName serverName , final TransitionCode state ,
1241- final RegionInfo parent , final RegionInfo hriA , final RegionInfo hriB ) throws IOException {
1243+ private void updateRegionSplitTransition (final ServerStateNode serverNode ,
1244+ final TransitionCode state , final RegionInfo parent , final RegionInfo hriA ,
1245+ final RegionInfo hriB ) throws IOException {
12421246 checkMetaLoaded (parent );
12431247
12441248 if (state != TransitionCode .READY_TO_SPLIT ) {
@@ -1262,8 +1266,8 @@ private void updateRegionSplitTransition(final ServerName serverName, final Tran
12621266 // Submit the Split procedure
12631267 final byte [] splitKey = hriB .getStartKey ();
12641268 if (LOG .isDebugEnabled ()) {
1265- LOG .debug ("Split request from " + serverName + " , parent=" + parent + " splitKey="
1266- + Bytes .toStringBinary (splitKey ));
1269+ LOG .debug ("Split request from {} , parent={}, splitKey={}" , serverNode . getServerName (), parent ,
1270+ Bytes .toStringBinary (splitKey ));
12671271 }
12681272 // Processing this report happens asynchronously from other activities which can mutate
12691273 // the region state. For example, a split procedure may already be running for this parent.
@@ -1277,21 +1281,22 @@ private void updateRegionSplitTransition(final ServerName serverName, final Tran
12771281 if (parentState != null && parentState .isOpened ()) {
12781282 master .getMasterProcedureExecutor ().submitProcedure (createSplitProcedure (parent , splitKey ));
12791283 } else {
1280- LOG .info ("Ignoring split request from " + serverName + " , parent=" + parent
1281- + " because parent is unknown or not open" );
1284+ LOG .info ("Ignoring split request from {} , parent={} because parent is unknown or not open" ,
1285+ serverNode . getServerName (), parent );
12821286 return ;
12831287 }
12841288
12851289 // If the RS is < 2.0 throw an exception to abort the operation, we are handling the split
1286- if (master .getServerManager ().getVersionNumber (serverName ) < 0x0200000 ) {
1290+ if (master .getServerManager ().getVersionNumber (serverNode . getServerName () ) < 0x0200000 ) {
12871291 throw new UnsupportedOperationException (
12881292 String .format ("Split handled by the master: " + "parent=%s hriA=%s hriB=%s" ,
12891293 parent .getShortNameToLog (), hriA , hriB ));
12901294 }
12911295 }
12921296
1293- private void updateRegionMergeTransition (final ServerName serverName , final TransitionCode state ,
1294- final RegionInfo merged , final RegionInfo hriA , final RegionInfo hriB ) throws IOException {
1297+ private void updateRegionMergeTransition (final ServerStateNode serverNode ,
1298+ final TransitionCode state , final RegionInfo merged , final RegionInfo hriA ,
1299+ final RegionInfo hriB ) throws IOException {
12951300 checkMetaLoaded (merged );
12961301
12971302 if (state != TransitionCode .READY_TO_MERGE ) {
@@ -1313,7 +1318,7 @@ private void updateRegionMergeTransition(final ServerName serverName, final Tran
13131318 master .getMasterProcedureExecutor ().submitProcedure (createMergeProcedure (hriA , hriB ));
13141319
13151320 // If the RS is < 2.0 throw an exception to abort the operation, we are handling the merge
1316- if (master .getServerManager ().getVersionNumber (serverName ) < 0x0200000 ) {
1321+ if (master .getServerManager ().getVersionNumber (serverNode . getServerName () ) < 0x0200000 ) {
13171322 throw new UnsupportedOperationException (
13181323 String .format ("Merge not handled yet: regionState=%s merged=%s hriA=%s hriB=%s" , state ,
13191324 merged , hriA , hriB ));
@@ -1343,12 +1348,19 @@ public void reportOnlineRegions(ServerName serverName, Set<byte[]> regionNames)
13431348 regionNames .stream ().map (Bytes ::toStringBinary ).collect (Collectors .toList ()));
13441349 }
13451350
1346- ServerStateNode serverNode = regionStates .getOrCreateServer (serverName );
1347- synchronized (serverNode ) {
1351+ ServerStateNode serverNode = regionStates .getServerNode (serverName );
1352+ if (serverNode == null ) {
1353+ LOG .warn ("Got a report from server {} where its server node is null" , serverName );
1354+ return ;
1355+ }
1356+ serverNode .readLock ().lock ();
1357+ try {
13481358 if (!serverNode .isInState (ServerState .ONLINE )) {
1349- LOG .warn ("Got a report from a server result in state " + serverNode . getState () );
1359+ LOG .warn ("Got a report from a server result in state {}" , serverNode );
13501360 return ;
13511361 }
1362+ } finally {
1363+ serverNode .readLock ().unlock ();
13521364 }
13531365
13541366 // Track the regionserver reported online regions in memory.
@@ -1839,52 +1851,60 @@ public long submitServerCrash(ServerName serverName, boolean shouldSplitWal, boo
18391851 synchronized (rsReports ) {
18401852 rsReports .remove (serverName );
18411853 }
1854+ if (serverNode == null ) {
1855+ if (force ) {
1856+ LOG .info ("Force adding ServerCrashProcedure for {} when server node is null" , serverName );
1857+ } else {
1858+ // for normal case, do not schedule SCP if ServerStateNode is null
1859+ LOG .warn ("Skip adding ServerCrashProcedure for {} because server node is null" , serverName );
1860+ return Procedure .NO_PROC_ID ;
1861+ }
1862+ }
18421863
1864+ ProcedureExecutor <MasterProcedureEnv > procExec = this .master .getMasterProcedureExecutor ();
18431865 // We hold the write lock here for fencing on reportRegionStateTransition. Once we set the
18441866 // server state to CRASHED, we will no longer accept the reportRegionStateTransition call from
18451867 // this server. This is used to simplify the implementation for TRSP and SCP, where we can make
18461868 // sure that, the region list fetched by SCP will not be changed any more.
18471869 if (serverNode != null ) {
18481870 serverNode .writeLock ().lock ();
18491871 }
1850- boolean carryingMeta ;
1851- long pid ;
18521872 try {
1853- ProcedureExecutor <MasterProcedureEnv > procExec = this .master .getMasterProcedureExecutor ();
1854- carryingMeta = isCarryingMeta (serverName );
1855- if (!force && serverNode != null && !serverNode .isInState (ServerState .ONLINE )) {
1856- LOG .info ("Skip adding ServerCrashProcedure for {} (meta={}) -- running?" , serverNode ,
1857- carryingMeta );
1858- return Procedure .NO_PROC_ID ;
1859- } else {
1860- MasterProcedureEnv mpe = procExec .getEnvironment ();
1861- // If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead.
1862- // HBCKSCP scours Master in-memory state AND hbase;meta for references to
1863- // serverName just-in-case. An SCP that is scheduled when the server is
1864- // 'Unknown' probably originated externally with HBCK2 fix-it tool.
1865- ServerState oldState = null ;
1866- if (serverNode != null ) {
1867- oldState = serverNode .getState ();
1868- serverNode .setState (ServerState .CRASHED );
1869- }
18701873
1874+ boolean carryingMeta = isCarryingMeta (serverName );
1875+ if (serverNode != null && !serverNode .isInState (ServerState .ONLINE )) {
18711876 if (force ) {
1872- pid = procExec . submitProcedure (
1873- new HBCKServerCrashProcedure ( mpe , serverName , shouldSplitWal , carryingMeta ) );
1877+ LOG . info ( "Force adding ServerCrashProcedure for {} (meta={}) when state is not {}" ,
1878+ serverNode , carryingMeta , ServerState . ONLINE );
18741879 } else {
1875- pid = procExec .submitProcedure (
1876- new ServerCrashProcedure (mpe , serverName , shouldSplitWal , carryingMeta ));
1880+ LOG .info ("Skip adding ServerCrashProcedure for {} (meta={}) when state is not {}" ,
1881+ serverNode , carryingMeta , ServerState .ONLINE );
1882+ return Procedure .NO_PROC_ID ;
18771883 }
1878- LOG .info ("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}." , pid ,
1879- serverName , carryingMeta ,
1880- serverNode == null ? "" : " " + serverNode .toString () + ", oldState=" + oldState );
18811884 }
1885+ MasterProcedureEnv mpe = procExec .getEnvironment ();
1886+ // If serverNode == null, then 'Unknown Server'. Schedule HBCKSCP instead.
1887+ // HBCKSCP scours Master in-memory state AND hbase;meta for references to
1888+ // serverName just-in-case. An SCP that is scheduled when the server is
1889+ // 'Unknown' probably originated externally with HBCK2 fix-it tool.
1890+ ServerState oldState = null ;
1891+ if (serverNode != null ) {
1892+ oldState = serverNode .getState ();
1893+ serverNode .setState (ServerState .CRASHED );
1894+ }
1895+ ServerCrashProcedure scp = force
1896+ ? new HBCKServerCrashProcedure (mpe , serverName , shouldSplitWal , carryingMeta )
1897+ : new ServerCrashProcedure (mpe , serverName , shouldSplitWal , carryingMeta );
1898+ long pid = procExec .submitProcedure (scp );
1899+ LOG .info ("Scheduled ServerCrashProcedure pid={} for {} (carryingMeta={}){}." , pid , serverName ,
1900+ carryingMeta ,
1901+ serverNode == null ? "" : " " + serverNode .toString () + ", oldState=" + oldState );
1902+ return pid ;
18821903 } finally {
18831904 if (serverNode != null ) {
18841905 serverNode .writeLock ().unlock ();
18851906 }
18861907 }
1887- return pid ;
18881908 }
18891909
18901910 public void offlineRegion (final RegionInfo regionInfo ) {
0 commit comments