From 91406d03ef335e865db2c2c23e442050b59f5ac5 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Mon, 15 Aug 2022 10:46:27 +0800 Subject: [PATCH 01/16] HBASE-27212 Implement a new table based replication queue storage and make the minimum replication system work (#4672) Signed-off-by: Xin Sun --- .../server/master/MasterProcedure.proto | 17 +- .../replication/ReplicationGroupOffset.java | 57 ++ .../replication/ReplicationQueueData.java | 47 + .../hbase/replication/ReplicationQueueId.java | 141 +++ .../replication/ReplicationQueueStorage.java | 154 ++- .../ReplicationStorageFactory.java | 43 +- .../hbase/replication/ReplicationUtils.java | 15 - .../TableReplicationQueueStorage.java | 535 ++++++++++ .../ZKReplicationQueueStorage.java | 689 ------------- .../TestReplicationStateBasic.java | 243 +---- .../TestReplicationStateZKImpl.java | 1 - .../TestZKReplicationQueueStorage.java | 341 ------- .../apache/hadoop/hbase/master/HMaster.java | 3 +- .../procedure/ServerCrashProcedure.java | 26 +- .../AssignReplicationQueuesProcedure.java | 204 ++++ .../ClaimReplicationQueueRemoteProcedure.java | 44 +- .../ClaimReplicationQueuesProcedure.java | 44 +- .../replication/ReplicationPeerManager.java | 128 ++- .../hbase/regionserver/wal/AbstractFSWAL.java | 6 +- .../master/ReplicationHFileCleaner.java | 71 +- .../master/ReplicationLogCleaner.java | 25 +- .../ClaimReplicationQueueCallable.java | 17 +- .../regionserver/DumpReplicationQueues.java | 62 +- .../regionserver/PeerProcedureHandler.java | 5 +- .../PeerProcedureHandlerImpl.java | 6 +- .../RecoveredReplicationSource.java | 138 +-- .../RecoveredReplicationSourceShipper.java | 64 +- .../replication/regionserver/Replication.java | 2 +- .../regionserver/ReplicationLoad.java | 4 +- .../regionserver/ReplicationSource.java | 70 +- .../ReplicationSourceFactory.java | 7 +- .../ReplicationSourceInterface.java | 26 +- .../ReplicationSourceLogQueue.java | 4 +- .../ReplicationSourceManager.java | 302 +++--- .../ReplicationSourceShipper.java | 6 +- .../ReplicationSourceWALActionListener.java | 5 - .../regionserver/ReplicationSyncUp.java | 23 +- .../apache/hadoop/hbase/util/HBaseFsck.java | 4 +- .../hbase/util/hbck/ReplicationChecker.java | 47 +- .../hbase/wal/AbstractFSWALProvider.java | 40 + .../client/TestAsyncReplicationAdminApi.java | 10 +- .../master/assignment/MockMasterServices.java | 2 +- .../hbase/master/cleaner/TestLogsCleaner.java | 42 +- .../cleaner/TestReplicationHFileCleaner.java | 93 +- .../TestBulkLoadReplicationHFileRefs.java | 2 +- ...upWithLegacyRegionReplicationEndpoint.java | 35 +- .../replication/ReplicationSourceDummy.java | 13 +- .../TestAddToSerialReplicationPeer.java | 5 +- .../TestClaimReplicationQueue.java | 8 +- ...amespaceReplicationWithBulkLoadedData.java | 4 +- .../TestReplicationEmptyWALRecovery.java | 1 - .../TestReplicationSyncUpTool.java | 3 + ...plicationSyncUpToolWithBulkLoadedData.java | 3 + .../TestSerialReplicationFailover.java | 3 + .../TestTableReplicationQueueStorage.java | 423 ++++++++ ...icationSyncUpToolWithMultipleAsyncWAL.java | 3 + ...tReplicationSyncUpToolWithMultipleWAL.java | 3 + .../TestDrainReplicationQueuesForStandBy.java | 3 + .../TestDumpReplicationQueues.java | 3 + .../regionserver/TestReplicationSource.java | 82 +- .../TestReplicationSourceManager.java | 933 ++++-------------- .../TestReplicationSourceManagerZkImpl.java | 107 -- .../regionserver/TestReplicationWALEdits.java | 147 +++ .../TestSerialReplicationChecker.java | 17 +- .../regionserver/TestWALEntrySinkFilter.java | 23 - ...TestHBaseFsckCleanReplicationBarriers.java | 20 +- .../hbase/util/TestHBaseFsckEncryption.java | 3 + .../hadoop/hbase/util/TestHBaseFsckMOB.java | 3 + .../hbase/util/TestHBaseFsckReplication.java | 103 +- 69 files changed, 2708 insertions(+), 3055 deletions(-) create mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationGroupOffset.java create mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueData.java create mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueId.java create mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java delete mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java delete mode 100644 hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java delete mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationWALEdits.java diff --git a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto index 59bb031589af..2e0da0deb842 100644 --- a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto @@ -679,16 +679,13 @@ message ClaimReplicationQueueRemoteStateData { required ServerName crashed_server = 1; required string queue = 2; required ServerName target_server = 3; + optional ServerName source_server = 4; } message ClaimReplicationQueueRemoteParameter { required ServerName crashed_server = 1; required string queue = 2; -} - -enum ClaimReplicationQueuesState { - CLAIM_REPLICATION_QUEUES_DISPATCH = 1; - CLAIM_REPLICATION_QUEUES_FINISH = 2; + optional ServerName source_server = 3; } enum ModifyTableDescriptorState { @@ -715,3 +712,13 @@ message ModifyStoreFileTrackerStateData { message ModifyColumnFamilyStoreFileTrackerStateData { required bytes family = 1; } + +enum AssignReplicationQueuesState { + ASSIGN_REPLICATION_QUEUES_PRE_CHECK = 1; + ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 2; + ASSIGN_REPLICATION_QUEUES_CLAIM = 3; +} + +message AssignReplicationQueuesStateData { + required ServerName crashed_server = 1; +} diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationGroupOffset.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationGroupOffset.java new file mode 100644 index 000000000000..bd13594b99a0 --- /dev/null +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationGroupOffset.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public class ReplicationGroupOffset { + + public static final ReplicationGroupOffset BEGIN = new ReplicationGroupOffset("", 0L); + + private final String wal; + + private final long offset; + + public ReplicationGroupOffset(String wal, long offset) { + this.wal = wal; + this.offset = offset; + } + + public String getWal() { + return wal; + } + + /** + * A negative value means this file has already been fully replicated out + */ + public long getOffset() { + return offset; + } + + @Override + public String toString() { + return wal + ":" + offset; + } + + public static ReplicationGroupOffset parse(String str) { + int index = str.lastIndexOf(':'); + return new ReplicationGroupOffset(str.substring(0, index), + Long.parseLong(str.substring(index + 1))); + } +} diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueData.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueData.java new file mode 100644 index 000000000000..794ae9d3a558 --- /dev/null +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueData.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import org.apache.yetus.audience.InterfaceAudience; + +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + +/** + * Representing all the information for a replication queue. + */ +@InterfaceAudience.Private +public class ReplicationQueueData { + + private final ReplicationQueueId id; + + private final ImmutableMap offsets; + + public ReplicationQueueData(ReplicationQueueId id, + ImmutableMap offsets) { + this.id = id; + this.offsets = offsets; + } + + public ReplicationQueueId getId() { + return id; + } + + public ImmutableMap getOffsets() { + return offsets; + } +} diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueId.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueId.java new file mode 100644 index 000000000000..73633dda9a4a --- /dev/null +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueId.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import java.util.Objects; +import java.util.Optional; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public class ReplicationQueueId { + + private final ServerName serverName; + + private final String peerId; + + private final Optional sourceServerName; + + // we do not allow '-' in peer names so it is safe to use it as the separator for peer id and + // server name + private static final char PEER_ID_SEPARATOR = '-'; + + // The '/' character is not valid for a hostname or a nodename(FQDN, so it is safe to use it as + // the separator for server names) + private static final char SERVER_NAME_SEPARATOR = '/'; + + public ReplicationQueueId(ServerName serverName, String peerId) { + this.serverName = Objects.requireNonNull(serverName); + this.peerId = Objects.requireNonNull(peerId); + this.sourceServerName = Optional.empty(); + } + + public ReplicationQueueId(ServerName serverName, String peerId, ServerName sourceServerName) { + this.serverName = Objects.requireNonNull(serverName); + this.peerId = Objects.requireNonNull(peerId); + this.sourceServerName = Optional.of(sourceServerName); + } + + public ServerName getServerName() { + return serverName; + } + + public String getPeerId() { + return peerId; + } + + public Optional getSourceServerName() { + return sourceServerName; + } + + public ServerName getServerWALsBelongTo() { + return sourceServerName.orElse(serverName); + } + + public boolean isRecovered() { + return sourceServerName.isPresent(); + } + + public ReplicationQueueId claim(ServerName targetServerName) { + ServerName newSourceServerName = sourceServerName.orElse(serverName); + return new ReplicationQueueId(targetServerName, peerId, newSourceServerName); + } + + @Override + public int hashCode() { + return Objects.hash(peerId, serverName, sourceServerName); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof ReplicationQueueId)) { + return false; + } + ReplicationQueueId other = (ReplicationQueueId) obj; + return Objects.equals(peerId, other.peerId) && Objects.equals(serverName, other.serverName) + && Objects.equals(sourceServerName, other.sourceServerName); + } + + @Override + public String toString() { + StringBuilder sb = + new StringBuilder().append(peerId).append(PEER_ID_SEPARATOR).append(serverName); + sourceServerName.ifPresent(s -> sb.append(SERVER_NAME_SEPARATOR).append(s.toString())); + return sb.toString(); + } + + public static ReplicationQueueId parse(String str) { + int dashIndex = str.indexOf(PEER_ID_SEPARATOR); + String peerId = str.substring(0, dashIndex); + int slashIndex = str.indexOf(SERVER_NAME_SEPARATOR, dashIndex + 1); + if (slashIndex < 0) { + String serverName = str.substring(dashIndex + 1); + return new ReplicationQueueId(ServerName.valueOf(serverName), peerId); + } else { + String serverName = str.substring(dashIndex + 1, slashIndex); + String sourceServerName = str.substring(slashIndex + 1); + return new ReplicationQueueId(ServerName.valueOf(serverName), peerId, + ServerName.valueOf(sourceServerName)); + } + } + + public static String getPeerId(String str) { + int dashIndex = str.indexOf(PEER_ID_SEPARATOR); + return str.substring(0, dashIndex); + } + + public static byte[] getScanPrefix(ServerName serverName, String peerId) { + return Bytes.toBytes(peerId + PEER_ID_SEPARATOR + serverName.toString()); + } + + public static byte[] getScanPrefix(String peerId) { + return Bytes.toBytes(peerId + PEER_ID_SEPARATOR); + } + + private static char getNextChar(char c) { + return (char) ((int) c + 1); + } + + public static byte[] getScanStartRowForNextPeerId(String peerId) { + return Bytes.toBytes(peerId + getNextChar(PEER_ID_SEPARATOR)); + } +} diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java index 0f95c04b2542..c4204f0e8c45 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java @@ -20,7 +20,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.SortedSet; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.util.Pair; @@ -33,40 +32,79 @@ public interface ReplicationQueueStorage { /** - * Remove a replication queue for a given regionserver. - * @param serverName the name of the regionserver - * @param queueId a String that identifies the queue. + * Set the current offset for a specific WAL group in a given queue. + * @param queueId the id of the queue + * @param walGroup the group of the WAL, can be empty if multi wal is not enabled + * @param offset the current offset of replication progress + * @param lastSeqIds map with {encodedRegionName, sequenceId} pairs for serial replication. + */ + void setOffset(ReplicationQueueId queueId, String walGroup, ReplicationGroupOffset offset, + Map lastSeqIds) throws ReplicationException; + + /** + * Get the current offset of all the WAL groups for a queue + * @param queueId the id of the queue + * @return a map of all offsets of the WAL groups. The key the is WAL group and the value is the + * position. + */ + Map getOffsets(ReplicationQueueId queueId) + throws ReplicationException; + + /** + * Get a list of all queues for the specific peer. + * @param peerId the id of the peer + * @return a list of queueIds */ - void removeQueue(ServerName serverName, String queueId) throws ReplicationException; + List listAllQueueIds(String peerId) throws ReplicationException; /** - * Add a new WAL file to the given queue for a given regionserver. If the queue does not exist it - * is created. - * @param serverName the name of the regionserver - * @param queueId a String that identifies the queue. - * @param fileName name of the WAL + * Get a list of all queues for the specific region server. + * @param serverName the server name of the region server that owns the set of queues + * @return a list of queueIds */ - void addWAL(ServerName serverName, String queueId, String fileName) throws ReplicationException; + List listAllQueueIds(ServerName serverName) throws ReplicationException; /** - * Remove an WAL file from the given queue for a given regionserver. - * @param serverName the name of the regionserver - * @param queueId a String that identifies the queue. - * @param fileName name of the WAL + * Get a list of all queues for the specific region server and the specific peer + * @param peerId the id of the peer + * @param serverName the server name of the region server that owns the set of queues + * @return a list of queueIds */ - void removeWAL(ServerName serverName, String queueId, String fileName) + List listAllQueueIds(String peerId, ServerName serverName) throws ReplicationException; /** - * Set the current position for a specific WAL in a given queue for a given regionserver. - * @param serverName the name of the regionserver - * @param queueId a String that identifies the queue - * @param fileName name of the WAL - * @param position the current position in the file. Will ignore if less than or equal to 0. - * @param lastSeqIds map with {encodedRegionName, sequenceId} pairs for serial replication. + * Get a list of all queues and the offsets. */ - void setWALPosition(ServerName serverName, String queueId, String fileName, long position, - Map lastSeqIds) throws ReplicationException; + List listAllQueues() throws ReplicationException; + + /** + * Get a list of all region servers that have outstanding replication queues. These servers could + * be alive, dead or from a previous run of the cluster. + * @return a list of server names + */ + List listAllReplicators() throws ReplicationException; + + /** + * Change ownership for the queue identified by queueId and belongs to a dead region server. + * @param queueId the id of the queue + * @param targetServerName the name of the target region server + * @return the new PeerId and A SortedSet of WALs in its queue + */ + Map claimQueue(ReplicationQueueId queueId, + ServerName targetServerName) throws ReplicationException; + + /** + * Remove a replication queue + * @param queueId the id of the queue to remove + */ + void removeQueue(ReplicationQueueId queueId) throws ReplicationException; + + /** + * Remove all the replication queues for the given peer. Usually used when removing a peer. + * @param peerId the id of the peer + */ + void removeAllQueues(String peerId) throws ReplicationException; /** * Read the max sequence id of the specific region for a given peer. For serial replication, we @@ -99,67 +137,6 @@ void setWALPosition(ServerName serverName, String queueId, String fileName, long void removeLastSequenceIds(String peerId, List encodedRegionNames) throws ReplicationException; - /** - * Get the current position for a specific WAL in a given queue for a given regionserver. - * @param serverName the name of the regionserver - * @param queueId a String that identifies the queue - * @param fileName name of the WAL - * @return the current position in the file - */ - long getWALPosition(ServerName serverName, String queueId, String fileName) - throws ReplicationException; - - /** - * Get a list of all WALs in the given queue on the given region server. - * @param serverName the server name of the region server that owns the queue - * @param queueId a String that identifies the queue - * @return a list of WALs - */ - List getWALsInQueue(ServerName serverName, String queueId) throws ReplicationException; - - /** - * Get a list of all queues for the specified region server. - * @param serverName the server name of the region server that owns the set of queues - * @return a list of queueIds - */ - List getAllQueues(ServerName serverName) throws ReplicationException; - - /** - * Change ownership for the queue identified by queueId and belongs to a dead region server. - * @param sourceServerName the name of the dead region server - * @param destServerName the name of the target region server - * @param queueId the id of the queue - * @return the new PeerId and A SortedSet of WALs in its queue - */ - Pair> claimQueue(ServerName sourceServerName, String queueId, - ServerName destServerName) throws ReplicationException; - - /** - * Remove the record of region server if the queue is empty. - */ - void removeReplicatorIfQueueIsEmpty(ServerName serverName) throws ReplicationException; - - /** - * Get a list of all region servers that have outstanding replication queues. These servers could - * be alive, dead or from a previous run of the cluster. - * @return a list of server names - */ - List getListOfReplicators() throws ReplicationException; - - /** - * Load all wals in all replication queues. This method guarantees to return a snapshot which - * contains all WALs at the start of this call even there is concurrent queue failover. However, - * some newly created WALs during the call may not be included. - */ - Set getAllWALs() throws ReplicationException; - - /** - * Add a peer to hfile reference queue if peer does not exist. - * @param peerId peer cluster id to be added - * @throws ReplicationException if fails to add a peer id to hfile reference queue - */ - void addPeerToHFileRefs(String peerId) throws ReplicationException; - /** * Remove a peer from hfile reference queue. * @param peerId peer cluster id to be removed @@ -201,11 +178,4 @@ Pair> claimQueue(ServerName sourceServerName, String q * created hfile references during the call may not be included. */ Set getAllHFileRefs() throws ReplicationException; - - /** - * Get full znode name for given region server - * @param serverName the name of the region server - * @return full znode name - */ - String getRsNode(ServerName serverName); } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java index 0124dbdd113d..d0c204f99349 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java @@ -17,9 +17,18 @@ */ package org.apache.hadoop.hbase.replication; +import java.io.IOException; import java.lang.reflect.Constructor; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hbase.Coprocessor; +import org.apache.hadoop.hbase.NamespaceDescriptor; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.util.ReflectionUtils; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; @@ -37,6 +46,27 @@ public final class ReplicationStorageFactory { public static final ReplicationPeerStorageType DEFAULT_REPLICATION_PEER_STORAGE_IMPL = ReplicationPeerStorageType.ZOOKEEPER; + public static final String REPLICATION_QUEUE_TABLE_NAME = "hbase.replication.queue.table.name"; + + public static final TableName REPLICATION_QUEUE_TABLE_NAME_DEFAULT = + TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "replication"); + + public static TableDescriptor createReplicationQueueTableDescriptor(TableName tableName) + throws IOException { + return TableDescriptorBuilder.newBuilder(tableName) + .setColumnFamily(ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.QUEUE_FAMILY)) + .setColumnFamily( + ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.LAST_SEQUENCE_ID_FAMILY)) + .setColumnFamily( + ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.HFILE_REF_FAMILY)) + .setValue("hbase.regionserver.region.split_restriction.type", "DelimitedKeyPrefix") + .setValue("hbase.regionserver.region.split_restriction.delimiter", "-") + .setCoprocessor(CoprocessorDescriptorBuilder + .newBuilder("org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint") + .setPriority(Coprocessor.PRIORITY_SYSTEM).build()) + .build(); + } + private ReplicationStorageFactory() { } @@ -76,8 +106,17 @@ public static ReplicationPeerStorage getReplicationPeerStorage(FileSystem fs, ZK /** * Create a new {@link ReplicationQueueStorage}. */ - public static ReplicationQueueStorage getReplicationQueueStorage(ZKWatcher zk, + public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn, Configuration conf) { - return new ZKReplicationQueueStorage(zk, conf); + return getReplicationQueueStorage(conn, TableName.valueOf(conf.get(REPLICATION_QUEUE_TABLE_NAME, + REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString()))); + } + + /** + * Create a new {@link ReplicationQueueStorage}. + */ + public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn, + TableName tableName) { + return new TableReplicationQueueStorage(conn, tableName); } } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationUtils.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationUtils.java index d1bca8b4b042..ae78781a3133 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationUtils.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationUtils.java @@ -28,7 +28,6 @@ import org.apache.hadoop.hbase.CompoundConfiguration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -81,20 +80,6 @@ public static Configuration getPeerClusterConfiguration(ReplicationPeerConfig pe return otherConf; } - public static void removeAllQueues(ReplicationQueueStorage queueStorage, String peerId) - throws ReplicationException { - for (ServerName replicator : queueStorage.getListOfReplicators()) { - List queueIds = queueStorage.getAllQueues(replicator); - for (String queueId : queueIds) { - ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); - if (queueInfo.getPeerId().equals(peerId)) { - queueStorage.removeQueue(replicator, queueId); - } - } - queueStorage.removeReplicatorIfQueueIsEmpty(replicator); - } - } - private static boolean isCollectionEqual(Collection c1, Collection c2) { if (c1 == null) { return c2 == null; diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java new file mode 100644 index 000000000000..0c9553f4fd89 --- /dev/null +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java @@ -0,0 +1,535 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.function.Supplier; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.CellScanner; +import org.apache.hadoop.hbase.CellUtil; +import org.apache.hadoop.hbase.CompareOperator; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.AsyncTable; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Scan.ReadType; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.filter.KeyOnlyFilter; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.FutureUtils; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.yetus.audience.InterfaceAudience; + +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.MutationProto.MutationType; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MultiRowMutationProtos; + +/** + * HBase table based replication queue storage. + */ +@InterfaceAudience.Private +public class TableReplicationQueueStorage implements ReplicationQueueStorage { + + public static final byte[] QUEUE_FAMILY = Bytes.toBytes("queue"); + + public static final byte[] LAST_SEQUENCE_ID_FAMILY = Bytes.toBytes("sid"); + + public static final byte[] HFILE_REF_FAMILY = Bytes.toBytes("hfileref"); + + private final Connection conn; + + private final TableName tableName; + + @FunctionalInterface + private interface TableCreator { + + void create() throws IOException; + } + + public TableReplicationQueueStorage(Connection conn, TableName tableName) { + this.conn = conn; + this.tableName = tableName; + } + + private void addLastSeqIdsPut(MultiRowMutationProtos.MutateRowsRequest.Builder builder, + String peerId, Map lastSeqIds, AsyncTable table) throws IOException { + // get the previous sequence ids first + byte[] row = Bytes.toBytes(peerId); + Get get = new Get(row); + lastSeqIds.keySet().forEach(encodedRegionName -> get.addColumn(LAST_SEQUENCE_ID_FAMILY, + Bytes.toBytes(encodedRegionName))); + Result result = FutureUtils.get(table.get(get)); + Put put = new Put(row); + for (Map.Entry entry : lastSeqIds.entrySet()) { + String encodedRegionName = entry.getKey(); + long lastSeqId = entry.getValue(); + byte[] encodedRegionNameAsBytes = Bytes.toBytes(encodedRegionName); + byte[] previousLastSeqIdAsBytes = + result.getValue(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes); + if (previousLastSeqIdAsBytes != null) { + long previousLastSeqId = Bytes.toLong(previousLastSeqIdAsBytes); + if (lastSeqId > previousLastSeqId) { + // update last seq id when it is greater, and use CAS to make sure we do not overwrite + // other's value. + put.addColumn(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes, + Bytes.toBytes(lastSeqId)); + builder.addCondition(ProtobufUtil.toCondition(row, LAST_SEQUENCE_ID_FAMILY, + encodedRegionNameAsBytes, CompareOperator.EQUAL, previousLastSeqIdAsBytes, null)); + } + } else { + // also update last seq id when there is no value yet, and use CAS to make sure we do not + // overwrite + // other's value. + put.addColumn(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes, Bytes.toBytes(lastSeqId)); + builder.addCondition(ProtobufUtil.toCondition(row, LAST_SEQUENCE_ID_FAMILY, + encodedRegionNameAsBytes, CompareOperator.EQUAL, null, null)); + } + } + if (!put.isEmpty()) { + builder.addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put)); + } + } + + @Override + public void setOffset(ReplicationQueueId queueId, String walGroup, ReplicationGroupOffset offset, + Map lastSeqIds) throws ReplicationException { + Put put = new Put(Bytes.toBytes(queueId.toString())).addColumn(QUEUE_FAMILY, + Bytes.toBytes(walGroup), Bytes.toBytes(offset.toString())); + AsyncTable asyncTable = conn.toAsyncConnection().getTable(tableName); + try { + if (lastSeqIds.isEmpty()) { + FutureUtils.get(asyncTable.put(put)); + } else { + for (;;) { + MultiRowMutationProtos.MutateRowsRequest.Builder builder = + MultiRowMutationProtos.MutateRowsRequest.newBuilder(); + addLastSeqIdsPut(builder, queueId.getPeerId(), lastSeqIds, asyncTable); + if (builder.getMutationRequestCount() > 0) { + // use MultiRowMutationService to atomically update offset and last sequence ids + MultiRowMutationProtos.MutateRowsRequest request = + builder.addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put)).build(); + MultiRowMutationProtos.MutateRowsResponse responose = + FutureUtils.get(asyncTable. coprocessorService( + MultiRowMutationProtos.MultiRowMutationService::newStub, + (stub, controller, done) -> stub.mutateRows(controller, request, done), + put.getRow())); + if (responose.getProcessed()) { + break; + } + } else { + // we do not need to update last seq id, fallback to single put + FutureUtils.get(asyncTable.put(put)); + break; + } + } + } + } catch (IOException e) { + throw new ReplicationException("failed to setOffset, queueId=" + queueId + ", walGroup=" + + walGroup + ", offset=" + offset + ", lastSeqIds=" + lastSeqIds, e); + } + } + + private ImmutableMap parseOffsets(Result result) { + ImmutableMap.Builder builder = + ImmutableMap.builderWithExpectedSize(result.size()); + NavigableMap map = result.getFamilyMap(QUEUE_FAMILY); + if (map != null) { + map.forEach((k, v) -> { + String walGroup = Bytes.toString(k); + ReplicationGroupOffset offset = ReplicationGroupOffset.parse(Bytes.toString(v)); + builder.put(walGroup, offset); + }); + } + return builder.build(); + } + + private Map getOffsets0(Table table, ReplicationQueueId queueId) + throws IOException { + Result result = table.get(new Get(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY)); + return parseOffsets(result); + } + + @Override + public Map getOffsets(ReplicationQueueId queueId) + throws ReplicationException { + try (Table table = conn.getTable(tableName)) { + return getOffsets0(table, queueId); + } catch (IOException e) { + throw new ReplicationException("failed to getOffsets, queueId=" + queueId, e); + } + } + + private void listAllQueueIds(Table table, Scan scan, List queueIds) + throws IOException { + try (ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow())); + queueIds.add(queueId); + } + } + } + + private void listAllQueueIds(Table table, String peerId, ServerName serverName, + List queueIds) throws IOException { + listAllQueueIds(table, + new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(serverName, peerId)) + .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter()), + queueIds); + } + + @Override + public List listAllQueueIds(String peerId) throws ReplicationException { + Scan scan = new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(peerId)) + .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter()); + List queueIds = new ArrayList<>(); + try (Table table = conn.getTable(tableName)) { + listAllQueueIds(table, scan, queueIds); + } catch (IOException e) { + throw new ReplicationException("failed to listAllQueueIds, peerId=" + peerId, e); + } + return queueIds; + } + + @Override + public List listAllQueueIds(ServerName serverName) + throws ReplicationException { + List queueIds = new ArrayList<>(); + try (Table table = conn.getTable(tableName)) { + KeyOnlyFilter keyOnlyFilter = new KeyOnlyFilter(); + String previousPeerId = null; + for (;;) { + // first, get the next peerId + Scan peerScan = + new Scan().addFamily(QUEUE_FAMILY).setOneRowLimit().setFilter(keyOnlyFilter); + if (previousPeerId != null) { + peerScan.withStartRow(ReplicationQueueId.getScanStartRowForNextPeerId(previousPeerId)); + } + String peerId; + try (ResultScanner scanner = table.getScanner(peerScan)) { + Result result = scanner.next(); + if (result == null) { + // no more peers, break + break; + } + peerId = ReplicationQueueId.getPeerId(Bytes.toString(result.getRow())); + } + listAllQueueIds(table, peerId, serverName, queueIds); + previousPeerId = peerId; + } + } catch (IOException e) { + throw new ReplicationException("failed to listAllQueueIds, serverName=" + serverName, e); + } + return queueIds; + } + + @Override + public List listAllQueueIds(String peerId, ServerName serverName) + throws ReplicationException { + List queueIds = new ArrayList<>(); + try (Table table = conn.getTable(tableName)) { + listAllQueueIds(table, peerId, serverName, queueIds); + } catch (IOException e) { + throw new ReplicationException( + "failed to listAllQueueIds, peerId=" + peerId + ", serverName=" + serverName, e); + } + return queueIds; + } + + @Override + public List listAllQueues() throws ReplicationException { + List queues = new ArrayList<>(); + Scan scan = new Scan().addFamily(QUEUE_FAMILY).setReadType(ReadType.STREAM); + try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow())); + ReplicationQueueData queueData = new ReplicationQueueData(queueId, parseOffsets(result)); + queues.add(queueData); + } + } catch (IOException e) { + throw new ReplicationException("failed to listAllQueues", e); + } + return queues; + } + + @Override + public List listAllReplicators() throws ReplicationException { + Set replicators = new HashSet<>(); + Scan scan = new Scan().addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter()) + .setReadType(ReadType.STREAM); + try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow())); + replicators.add(queueId.getServerName()); + } + } catch (IOException e) { + throw new ReplicationException("failed to listAllReplicators", e); + } + return new ArrayList<>(replicators); + } + + @Override + public Map claimQueue(ReplicationQueueId queueId, + ServerName targetServerName) throws ReplicationException { + ReplicationQueueId newQueueId = queueId.claim(targetServerName); + byte[] coprocessorRow = ReplicationQueueId.getScanPrefix(queueId.getPeerId()); + AsyncTable asyncTable = conn.toAsyncConnection().getTable(tableName); + try (Table table = conn.getTable(tableName)) { + for (;;) { + Map offsets = getOffsets0(table, queueId); + if (offsets.isEmpty()) { + return Collections.emptyMap(); + } + Map.Entry entry = offsets.entrySet().iterator().next(); + ClientProtos.Condition condition = ProtobufUtil.toCondition( + Bytes.toBytes(queueId.toString()), QUEUE_FAMILY, Bytes.toBytes(entry.getKey()), + CompareOperator.EQUAL, Bytes.toBytes(entry.getValue().toString()), null); + Delete delete = new Delete(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY); + Put put = new Put(Bytes.toBytes(newQueueId.toString())); + offsets.forEach((walGroup, offset) -> put.addColumn(QUEUE_FAMILY, Bytes.toBytes(walGroup), + Bytes.toBytes(offset.toString()))); + MultiRowMutationProtos.MutateRowsRequest request = + MultiRowMutationProtos.MutateRowsRequest.newBuilder().addCondition(condition) + .addMutationRequest(ProtobufUtil.toMutation(MutationType.DELETE, delete)) + .addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put)).build(); + MultiRowMutationProtos.MutateRowsResponse resp = + FutureUtils.get(asyncTable. coprocessorService( + MultiRowMutationProtos.MultiRowMutationService::newStub, + (stub, controller, done) -> stub.mutateRows(controller, request, done), + coprocessorRow)); + if (resp.getProcessed()) { + return offsets; + } + // if the multi is not processed, which usually the queue has already been claimed by + // others, for safety, let's try claiming again, usually the next get operation above will + // return an empty map and we will quit the loop. + } + } catch (IOException e) { + throw new ReplicationException( + "failed to claimQueue, queueId=" + queueId + ", targetServerName=" + targetServerName, e); + } + } + + @Override + public void removeQueue(ReplicationQueueId queueId) throws ReplicationException { + try (Table table = conn.getTable(tableName)) { + table.delete(new Delete(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY)); + } catch (IOException e) { + throw new ReplicationException("failed to removeQueue, queueId=" + queueId, e); + } + } + + @Override + public void removeAllQueues(String peerId) throws ReplicationException { + Scan scan = new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(peerId)) + .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter()); + try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + table.delete(new Delete(result.getRow())); + } + } catch (IOException e) { + throw new ReplicationException("failed to listAllQueueIds, peerId=" + peerId, e); + } + } + + @Override + public long getLastSequenceId(String encodedRegionName, String peerId) + throws ReplicationException { + byte[] qual = Bytes.toBytes(encodedRegionName); + try (Table table = conn.getTable(tableName)) { + Result result = + table.get(new Get(Bytes.toBytes(peerId)).addColumn(LAST_SEQUENCE_ID_FAMILY, qual)); + byte[] lastSeqId = result.getValue(LAST_SEQUENCE_ID_FAMILY, qual); + return lastSeqId != null ? Bytes.toLong(lastSeqId) : HConstants.NO_SEQNUM; + } catch (IOException e) { + throw new ReplicationException("failed to getLastSequenceId, encodedRegionName=" + + encodedRegionName + ", peerId=" + peerId, e); + } + } + + @Override + public void setLastSequenceIds(String peerId, Map lastSeqIds) + throws ReplicationException { + // No need CAS and retry here, because it'll call setLastSequenceIds() for disabled peers + // only, so no conflict happen. + Put put = new Put(Bytes.toBytes(peerId)); + lastSeqIds.forEach((encodedRegionName, lastSeqId) -> put.addColumn(LAST_SEQUENCE_ID_FAMILY, + Bytes.toBytes(encodedRegionName), Bytes.toBytes(lastSeqId))); + try (Table table = conn.getTable(tableName)) { + table.put(put); + } catch (IOException e) { + throw new ReplicationException( + "failed to setLastSequenceIds, peerId=" + peerId + ", lastSeqIds=" + lastSeqIds, e); + } + } + + @Override + public void removeLastSequenceIds(String peerId) throws ReplicationException { + Delete delete = new Delete(Bytes.toBytes(peerId)).addFamily(LAST_SEQUENCE_ID_FAMILY); + try (Table table = conn.getTable(tableName)) { + table.delete(delete); + } catch (IOException e) { + throw new ReplicationException("failed to removeLastSequenceIds, peerId=" + peerId, e); + } + } + + @Override + public void removeLastSequenceIds(String peerId, List encodedRegionNames) + throws ReplicationException { + Delete delete = new Delete(Bytes.toBytes(peerId)); + encodedRegionNames.forEach(n -> delete.addColumns(LAST_SEQUENCE_ID_FAMILY, Bytes.toBytes(n))); + try (Table table = conn.getTable(tableName)) { + table.delete(delete); + } catch (IOException e) { + throw new ReplicationException("failed to removeLastSequenceIds, peerId=" + peerId + + ", encodedRegionNames=" + encodedRegionNames, e); + } + } + + @Override + public void removePeerFromHFileRefs(String peerId) throws ReplicationException { + try (Table table = conn.getTable(tableName)) { + table.delete(new Delete(Bytes.toBytes(peerId)).addFamily(HFILE_REF_FAMILY)); + } catch (IOException e) { + throw new ReplicationException("failed to removePeerFromHFileRefs, peerId=" + peerId, e); + } + } + + @Override + public void addHFileRefs(String peerId, List> pairs) + throws ReplicationException { + Put put = new Put(Bytes.toBytes(peerId)); + pairs.forEach(p -> put.addColumn(HFILE_REF_FAMILY, Bytes.toBytes(p.getSecond().getName()), + HConstants.EMPTY_BYTE_ARRAY)); + try (Table table = conn.getTable(tableName)) { + table.put(put); + } catch (IOException e) { + throw new ReplicationException( + "failed to addHFileRefs, peerId=" + peerId + ", pairs=" + pairs, e); + } + } + + @Override + public void removeHFileRefs(String peerId, List files) throws ReplicationException { + Delete delete = new Delete(Bytes.toBytes(peerId)); + files.forEach(f -> delete.addColumns(HFILE_REF_FAMILY, Bytes.toBytes(f))); + try (Table table = conn.getTable(tableName)) { + table.delete(delete); + } catch (IOException e) { + throw new ReplicationException( + "failed to removeHFileRefs, peerId=" + peerId + ", files=" + files, e); + } + } + + @Override + public List getAllPeersFromHFileRefsQueue() throws ReplicationException { + List peerIds = new ArrayList<>(); + Scan scan = new Scan().addFamily(HFILE_REF_FAMILY).setReadType(ReadType.STREAM) + .setFilter(new KeyOnlyFilter()); + try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + peerIds.add(Bytes.toString(result.getRow())); + } + } catch (IOException e) { + throw new ReplicationException("failed to getAllPeersFromHFileRefsQueue", e); + } + return peerIds; + } + + private > T scanHFiles(Scan scan, Supplier creator) + throws IOException { + T files = creator.get(); + try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + CellScanner cellScanner = result.cellScanner(); + while (cellScanner.advance()) { + Cell cell = cellScanner.current(); + files.add(Bytes.toString(CellUtil.cloneQualifier(cell))); + } + } + } + return files; + } + + @Override + public List getReplicableHFiles(String peerId) throws ReplicationException { + // use scan to avoid getting a too large row one time, which may cause a very huge memory usage. + Scan scan = new Scan().addFamily(HFILE_REF_FAMILY) + .setStartStopRowForPrefixScan(Bytes.toBytes(peerId)).setAllowPartialResults(true); + try { + return scanHFiles(scan, ArrayList::new); + } catch (IOException e) { + throw new ReplicationException("failed to getReplicableHFiles, peerId=" + peerId, e); + } + } + + @Override + public Set getAllHFileRefs() throws ReplicationException { + Scan scan = new Scan().addFamily(HFILE_REF_FAMILY).setReadType(ReadType.STREAM) + .setAllowPartialResults(true); + try { + return scanHFiles(scan, HashSet::new); + } catch (IOException e) { + throw new ReplicationException("failed to getAllHFileRefs", e); + } + } +} diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java deleted file mode 100644 index f3506ad3555a..000000000000 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java +++ /dev/null @@ -1,689 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.replication; - -import static java.util.stream.Collectors.toList; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; -import java.util.stream.Collectors; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.exceptions.DeserializationException; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; -import org.apache.hadoop.hbase.zookeeper.ZNodePaths; -import org.apache.yetus.audience.InterfaceAudience; -import org.apache.zookeeper.KeeperException; -import org.apache.zookeeper.KeeperException.BadVersionException; -import org.apache.zookeeper.KeeperException.NoNodeException; -import org.apache.zookeeper.KeeperException.NodeExistsException; -import org.apache.zookeeper.KeeperException.NotEmptyException; -import org.apache.zookeeper.data.Stat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils; - -/** - * ZK based replication queue storage. - *

- * The base znode for each regionserver is the regionserver name. For example: - * - *

- * /hbase/replication/rs/hostname.example.org,6020,1234
- * 
- * - * Within this znode, the region server maintains a set of WAL replication queues. These queues are - * represented by child znodes named using there give queue id. For example: - * - *
- * /hbase/replication/rs/hostname.example.org,6020,1234/1
- * /hbase/replication/rs/hostname.example.org,6020,1234/2
- * 
- * - * Each queue has one child znode for every WAL that still needs to be replicated. The value of - * these WAL child znodes is the latest position that has been replicated. This position is updated - * every time a WAL entry is replicated. For example: - * - *
- * /hbase/replication/rs/hostname.example.org,6020,1234/1/23522342.23422 [VALUE: 254]
- * 
- */ -@InterfaceAudience.Private -class ZKReplicationQueueStorage extends ZKReplicationStorageBase - implements ReplicationQueueStorage { - - private static final Logger LOG = LoggerFactory.getLogger(ZKReplicationQueueStorage.class); - - public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY = - "zookeeper.znode.replication.hfile.refs"; - public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT = "hfile-refs"; - - public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY = - "zookeeper.znode.replication.regions"; - public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT = "regions"; - - /** - * The name of the znode that contains all replication queues - */ - private final String queuesZNode; - - /** - * The name of the znode that contains queues of hfile references to be replicated - */ - private final String hfileRefsZNode; - - final String regionsZNode; - - public ZKReplicationQueueStorage(ZKWatcher zookeeper, Configuration conf) { - super(zookeeper, conf); - - String queuesZNodeName = conf.get("zookeeper.znode.replication.rs", "rs"); - String hfileRefsZNodeName = conf.get(ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY, - ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT); - this.queuesZNode = ZNodePaths.joinZNode(replicationZNode, queuesZNodeName); - this.hfileRefsZNode = ZNodePaths.joinZNode(replicationZNode, hfileRefsZNodeName); - this.regionsZNode = ZNodePaths.joinZNode(replicationZNode, conf - .get(ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY, ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT)); - } - - @Override - public String getRsNode(ServerName serverName) { - return ZNodePaths.joinZNode(queuesZNode, serverName.getServerName()); - } - - private String getQueueNode(ServerName serverName, String queueId) { - return ZNodePaths.joinZNode(getRsNode(serverName), queueId); - } - - private String getFileNode(String queueNode, String fileName) { - return ZNodePaths.joinZNode(queueNode, fileName); - } - - private String getFileNode(ServerName serverName, String queueId, String fileName) { - return getFileNode(getQueueNode(serverName, queueId), fileName); - } - - /** - *

- * Put all regions under /hbase/replication/regions znode will lead to too many children because - * of the huge number of regions in real production environment. So here we will distribute the - * znodes to multiple directories. - *

- *

- * So the final znode path will be format like this: - * - *

-   * /hbase/replication/regions/dd/04/e76a6966d4ffa908ed0586764767-100
-   * 
- * - * Here the full encoded region name is dd04e76a6966d4ffa908ed0586764767, and we use the first two - * characters 'dd' as the first level directory name, and use the next two characters '04' as the - * second level directory name, and the rest part as the prefix of the znode, and the suffix '100' - * is the peer id. - *

- * @param encodedRegionName the encoded region name. - * @param peerId peer id for replication. - * @return ZNode path to persist the max sequence id that we've pushed for the given region and - * peer. - */ - String getSerialReplicationRegionPeerNode(String encodedRegionName, String peerId) { - if (encodedRegionName == null || encodedRegionName.length() != RegionInfo.MD5_HEX_LENGTH) { - throw new IllegalArgumentException( - "Invalid encoded region name: " + encodedRegionName + ", length should be 32."); - } - return new StringBuilder(regionsZNode).append(ZNodePaths.ZNODE_PATH_SEPARATOR) - .append(encodedRegionName, 0, 2).append(ZNodePaths.ZNODE_PATH_SEPARATOR) - .append(encodedRegionName, 2, 4).append(ZNodePaths.ZNODE_PATH_SEPARATOR) - .append(encodedRegionName, 4, encodedRegionName.length()).append("-").append(peerId) - .toString(); - } - - @Override - public void removeQueue(ServerName serverName, String queueId) throws ReplicationException { - try { - ZKUtil.deleteNodeRecursively(zookeeper, getQueueNode(serverName, queueId)); - } catch (KeeperException e) { - throw new ReplicationException( - "Failed to delete queue (serverName=" + serverName + ", queueId=" + queueId + ")", e); - } - } - - @Override - public void addWAL(ServerName serverName, String queueId, String fileName) - throws ReplicationException { - try { - ZKUtil.createWithParents(zookeeper, getFileNode(serverName, queueId, fileName)); - } catch (KeeperException e) { - throw new ReplicationException("Failed to add wal to queue (serverName=" + serverName - + ", queueId=" + queueId + ", fileName=" + fileName + ")", e); - } - } - - @Override - public void removeWAL(ServerName serverName, String queueId, String fileName) - throws ReplicationException { - String fileNode = getFileNode(serverName, queueId, fileName); - try { - ZKUtil.deleteNode(zookeeper, fileNode); - } catch (NoNodeException e) { - LOG.warn("{} already deleted when removing log", fileNode); - } catch (KeeperException e) { - throw new ReplicationException("Failed to remove wal from queue (serverName=" + serverName - + ", queueId=" + queueId + ", fileName=" + fileName + ")", e); - } - } - - private void addLastSeqIdsToOps(String queueId, Map lastSeqIds, - List listOfOps) throws KeeperException, ReplicationException { - String peerId = new ReplicationQueueInfo(queueId).getPeerId(); - for (Entry lastSeqEntry : lastSeqIds.entrySet()) { - String path = getSerialReplicationRegionPeerNode(lastSeqEntry.getKey(), peerId); - Pair p = getLastSequenceIdWithVersion(lastSeqEntry.getKey(), peerId); - byte[] data = ZKUtil.positionToByteArray(lastSeqEntry.getValue()); - if (p.getSecond() < 0) { // ZNode does not exist. - ZKUtil.createWithParents(zookeeper, - path.substring(0, path.lastIndexOf(ZNodePaths.ZNODE_PATH_SEPARATOR))); - listOfOps.add(ZKUtilOp.createAndFailSilent(path, data)); - continue; - } - // Perform CAS in a specific version v0 (HBASE-20138) - int v0 = p.getSecond(); - long lastPushedSeqId = p.getFirst(); - if (lastSeqEntry.getValue() <= lastPushedSeqId) { - continue; - } - listOfOps.add(ZKUtilOp.setData(path, data, v0)); - } - } - - @Override - public void setWALPosition(ServerName serverName, String queueId, String fileName, long position, - Map lastSeqIds) throws ReplicationException { - try { - for (int retry = 0;; retry++) { - List listOfOps = new ArrayList<>(); - if (position > 0) { - listOfOps.add(ZKUtilOp.setData(getFileNode(serverName, queueId, fileName), - ZKUtil.positionToByteArray(position))); - } - // Persist the max sequence id(s) of regions for serial replication atomically. - addLastSeqIdsToOps(queueId, lastSeqIds, listOfOps); - if (listOfOps.isEmpty()) { - return; - } - try { - ZKUtil.multiOrSequential(zookeeper, listOfOps, false); - return; - } catch (KeeperException.BadVersionException | KeeperException.NodeExistsException e) { - LOG.warn( - "Bad version(or node exist) when persist the last pushed sequence id to zookeeper " - + "storage, Retry = " + retry + ", serverName=" + serverName + ", queueId=" + queueId - + ", fileName=" + fileName); - } - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to set log position (serverName=" + serverName - + ", queueId=" + queueId + ", fileName=" + fileName + ", position=" + position + ")", e); - } - } - - /** - * Return the {lastPushedSequenceId, ZNodeDataVersion} pair. if ZNodeDataVersion is -1, it means - * that the ZNode does not exist. - */ - protected Pair getLastSequenceIdWithVersion(String encodedRegionName, - String peerId) throws KeeperException { - Stat stat = new Stat(); - String path = getSerialReplicationRegionPeerNode(encodedRegionName, peerId); - byte[] data = ZKUtil.getDataNoWatch(zookeeper, path, stat); - if (data == null) { - // ZNode does not exist, so just return version -1 to indicate that no node exist. - return Pair.newPair(HConstants.NO_SEQNUM, -1); - } - try { - return Pair.newPair(ZKUtil.parseWALPositionFrom(data), stat.getVersion()); - } catch (DeserializationException de) { - LOG.warn("Failed to parse log position (region=" + encodedRegionName + ", peerId=" + peerId - + "), data=" + Bytes.toStringBinary(data)); - } - return Pair.newPair(HConstants.NO_SEQNUM, stat.getVersion()); - } - - @Override - public long getLastSequenceId(String encodedRegionName, String peerId) - throws ReplicationException { - try { - return getLastSequenceIdWithVersion(encodedRegionName, peerId).getFirst(); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get last pushed sequence id (encodedRegionName=" - + encodedRegionName + ", peerId=" + peerId + ")", e); - } - } - - @Override - public void setLastSequenceIds(String peerId, Map lastSeqIds) - throws ReplicationException { - try { - // No need CAS and retry here, because it'll call setLastSequenceIds() for disabled peers - // only, so no conflict happen. - List listOfOps = new ArrayList<>(); - for (Entry lastSeqEntry : lastSeqIds.entrySet()) { - String path = getSerialReplicationRegionPeerNode(lastSeqEntry.getKey(), peerId); - ZKUtil.createWithParents(zookeeper, path); - listOfOps.add(ZKUtilOp.setData(path, ZKUtil.positionToByteArray(lastSeqEntry.getValue()))); - } - if (!listOfOps.isEmpty()) { - ZKUtil.multiOrSequential(zookeeper, listOfOps, true); - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to set last sequence ids, peerId=" + peerId - + ", size of lastSeqIds=" + lastSeqIds.size(), e); - } - } - - @Override - public void removeLastSequenceIds(String peerId) throws ReplicationException { - String suffix = "-" + peerId; - try { - StringBuilder sb = new StringBuilder(regionsZNode); - int regionsZNodeLength = regionsZNode.length(); - int levelOneLength = regionsZNodeLength + 3; - int levelTwoLength = levelOneLength + 3; - List levelOneDirs = ZKUtil.listChildrenNoWatch(zookeeper, regionsZNode); - // it is possible that levelOneDirs is null if we haven't write any last pushed sequence ids - // yet, so we need an extra check here. - if (CollectionUtils.isEmpty(levelOneDirs)) { - return; - } - for (String levelOne : levelOneDirs) { - sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(levelOne); - for (String levelTwo : ZKUtil.listChildrenNoWatch(zookeeper, sb.toString())) { - sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(levelTwo); - for (String znode : ZKUtil.listChildrenNoWatch(zookeeper, sb.toString())) { - if (znode.endsWith(suffix)) { - sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(znode); - ZKUtil.deleteNode(zookeeper, sb.toString()); - sb.setLength(levelTwoLength); - } - } - sb.setLength(levelOneLength); - } - sb.setLength(regionsZNodeLength); - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to remove all last sequence ids, peerId=" + peerId, e); - } - } - - @Override - public void removeLastSequenceIds(String peerId, List encodedRegionNames) - throws ReplicationException { - try { - List listOfOps = - encodedRegionNames.stream().map(n -> getSerialReplicationRegionPeerNode(n, peerId)) - .map(ZKUtilOp::deleteNodeFailSilent).collect(Collectors.toList()); - ZKUtil.multiOrSequential(zookeeper, listOfOps, true); - } catch (KeeperException e) { - throw new ReplicationException("Failed to remove last sequence ids, peerId=" + peerId - + ", encodedRegionNames.size=" + encodedRegionNames.size(), e); - } - } - - @Override - public long getWALPosition(ServerName serverName, String queueId, String fileName) - throws ReplicationException { - byte[] bytes; - try { - bytes = ZKUtil.getData(zookeeper, getFileNode(serverName, queueId, fileName)); - } catch (KeeperException | InterruptedException e) { - throw new ReplicationException("Failed to get log position (serverName=" + serverName - + ", queueId=" + queueId + ", fileName=" + fileName + ")", e); - } - try { - return ZKUtil.parseWALPositionFrom(bytes); - } catch (DeserializationException de) { - LOG.warn("Failed parse log position (serverName={}, queueId={}, fileName={})", serverName, - queueId, fileName); - } - // if we can not parse the position, start at the beginning of the wal file again - return 0; - } - - /** - * This implement must update the cversion of root {@link #queuesZNode}. The optimistic lock of - * the {@link #getAllWALs()} method is based on the cversion of root {@link #queuesZNode}. - * @see #getAllWALs() to show the usage of the cversion of root {@link #queuesZNode} . - */ - @Override - public Pair> claimQueue(ServerName sourceServerName, String queueId, - ServerName destServerName) throws ReplicationException { - LOG.info("Atomically moving {}/{}'s WALs to {}", sourceServerName, queueId, destServerName); - try { - ZKUtil.createWithParents(zookeeper, getRsNode(destServerName)); - } catch (KeeperException e) { - throw new ReplicationException("Claim queue queueId=" + queueId + " from " + sourceServerName - + " to " + destServerName + " failed when creating the node for " + destServerName, e); - } - String newQueueId = queueId + "-" + sourceServerName; - try { - String oldQueueNode = getQueueNode(sourceServerName, queueId); - List wals = ZKUtil.listChildrenNoWatch(zookeeper, oldQueueNode); - if (CollectionUtils.isEmpty(wals)) { - ZKUtil.deleteNodeFailSilent(zookeeper, oldQueueNode); - LOG.info("Removed empty {}/{}", sourceServerName, queueId); - return new Pair<>(newQueueId, Collections.emptySortedSet()); - } - String newQueueNode = getQueueNode(destServerName, newQueueId); - List listOfOps = new ArrayList<>(); - SortedSet logQueue = new TreeSet<>(); - // create the new cluster znode - listOfOps.add(ZKUtilOp.createAndFailSilent(newQueueNode, HConstants.EMPTY_BYTE_ARRAY)); - // get the offset of the logs and set it to new znodes - for (String wal : wals) { - String oldWalNode = getFileNode(oldQueueNode, wal); - byte[] logOffset = ZKUtil.getData(this.zookeeper, oldWalNode); - LOG.debug("Creating {} with data {}", wal, Bytes.toStringBinary(logOffset)); - String newWalNode = getFileNode(newQueueNode, wal); - listOfOps.add(ZKUtilOp.createAndFailSilent(newWalNode, logOffset)); - listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalNode)); - logQueue.add(wal); - } - // add delete op for peer - listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldQueueNode)); - // Append new queue id for prevent lock competition in zookeeper server. - String claimLockZNode = ZNodePaths.joinZNode(queuesZNode, "cversion_" + newQueueId); - // A trick for update the cversion of root queuesZNode . - // The optimistic lock of the getAllWALs() method is based on the cversion of root queuesZNode - listOfOps.add(ZKUtilOp.createAndFailSilent(claimLockZNode, HConstants.EMPTY_BYTE_ARRAY)); - listOfOps.add(ZKUtilOp.deleteNodeFailSilent(claimLockZNode)); - - LOG.trace("The multi list size is {}", listOfOps.size()); - ZKUtil.multiOrSequential(zookeeper, listOfOps, false); - - LOG.info("Atomically moved {}/{}'s WALs to {}", sourceServerName, queueId, destServerName); - return new Pair<>(newQueueId, logQueue); - } catch (NoNodeException | NodeExistsException | NotEmptyException | BadVersionException e) { - // Multi call failed; it looks like some other regionserver took away the logs. - // These exceptions mean that zk tells us the request can not be execute. So return an empty - // queue to tell the upper layer that claim nothing. For other types of exception should be - // thrown out to notify the upper layer. - LOG.info("Claim queue queueId={} from {} to {} failed with {}, someone else took the log?", - queueId, sourceServerName, destServerName, e.toString()); - return new Pair<>(newQueueId, Collections.emptySortedSet()); - } catch (KeeperException | InterruptedException e) { - throw new ReplicationException("Claim queue queueId=" + queueId + " from " + sourceServerName - + " to " + destServerName + " failed", e); - } - } - - @Override - public void removeReplicatorIfQueueIsEmpty(ServerName serverName) throws ReplicationException { - try { - ZKUtil.deleteNodeFailSilent(zookeeper, getRsNode(serverName)); - } catch (NotEmptyException e) { - // keep silence to avoid logging too much. - } catch (KeeperException e) { - throw new ReplicationException("Failed to remove replicator for " + serverName, e); - } - } - - private List getListOfReplicators0() throws KeeperException { - List children = ZKUtil.listChildrenNoWatch(zookeeper, queuesZNode); - if (children == null) { - children = Collections.emptyList(); - } - return children.stream().map(ServerName::parseServerName).collect(toList()); - } - - @Override - public List getListOfReplicators() throws ReplicationException { - try { - return getListOfReplicators0(); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get list of replicators", e); - } - } - - private List getWALsInQueue0(ServerName serverName, String queueId) - throws KeeperException { - List children = - ZKUtil.listChildrenNoWatch(zookeeper, getQueueNode(serverName, queueId)); - return children != null ? children : Collections.emptyList(); - } - - @Override - public List getWALsInQueue(ServerName serverName, String queueId) - throws ReplicationException { - try { - return getWALsInQueue0(serverName, queueId); - } catch (KeeperException e) { - throw new ReplicationException( - "Failed to get wals in queue (serverName=" + serverName + ", queueId=" + queueId + ")", e); - } - } - - private List getAllQueues0(ServerName serverName) throws KeeperException { - List children = ZKUtil.listChildrenNoWatch(zookeeper, getRsNode(serverName)); - return children != null ? children : Collections.emptyList(); - } - - @Override - public List getAllQueues(ServerName serverName) throws ReplicationException { - try { - return getAllQueues0(serverName); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get all queues (serverName=" + serverName + ")", e); - } - } - - // will be overridden in UTs - protected int getQueuesZNodeCversion() throws KeeperException { - Stat stat = new Stat(); - ZKUtil.getDataNoWatch(this.zookeeper, this.queuesZNode, stat); - return stat.getCversion(); - } - - /** - * The optimistic lock of this implement is based on the cversion of root {@link #queuesZNode}. - * Therefore, we must update the cversion of root {@link #queuesZNode} when migrate wal nodes to - * other queues. - * @see #claimQueue(ServerName, String, ServerName) as an example of updating root - * {@link #queuesZNode} cversion. - */ - @Override - public Set getAllWALs() throws ReplicationException { - try { - for (int retry = 0;; retry++) { - int v0 = getQueuesZNodeCversion(); - List rss = getListOfReplicators0(); - if (rss.isEmpty()) { - LOG.debug("Didn't find a RegionServer that replicates, won't prevent deletions."); - return Collections.emptySet(); - } - Set wals = new HashSet<>(); - for (ServerName rs : rss) { - for (String queueId : getAllQueues0(rs)) { - wals.addAll(getWALsInQueue0(rs, queueId)); - } - } - int v1 = getQueuesZNodeCversion(); - if (v0 == v1) { - return wals; - } - LOG.info("Replication queue node cversion changed from %d to %d, retry = %d", v0, v1, - retry); - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to get all wals", e); - } - } - - private String getHFileRefsPeerNode(String peerId) { - return ZNodePaths.joinZNode(hfileRefsZNode, peerId); - } - - private String getHFileNode(String peerNode, String fileName) { - return ZNodePaths.joinZNode(peerNode, fileName); - } - - @Override - public void addPeerToHFileRefs(String peerId) throws ReplicationException { - String peerNode = getHFileRefsPeerNode(peerId); - try { - if (ZKUtil.checkExists(zookeeper, peerNode) == -1) { - LOG.info("Adding peer {} to hfile reference queue.", peerId); - ZKUtil.createWithParents(zookeeper, peerNode); - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to add peer " + peerId + " to hfile reference queue.", - e); - } - } - - @Override - public void removePeerFromHFileRefs(String peerId) throws ReplicationException { - String peerNode = getHFileRefsPeerNode(peerId); - try { - if (ZKUtil.checkExists(zookeeper, peerNode) == -1) { - LOG.debug("Peer {} not found in hfile reference queue.", peerNode); - } else { - LOG.info("Removing peer {} from hfile reference queue.", peerNode); - ZKUtil.deleteNodeRecursively(zookeeper, peerNode); - } - } catch (KeeperException e) { - throw new ReplicationException( - "Failed to remove peer " + peerId + " from hfile reference queue.", e); - } - } - - @Override - public void addHFileRefs(String peerId, List> pairs) - throws ReplicationException { - String peerNode = getHFileRefsPeerNode(peerId); - LOG.debug("Adding hfile references {} in queue {}", pairs, peerNode); - List listOfOps = - pairs.stream().map(p -> p.getSecond().getName()).map(n -> getHFileNode(peerNode, n)) - .map(f -> ZKUtilOp.createAndFailSilent(f, HConstants.EMPTY_BYTE_ARRAY)).collect(toList()); - LOG.debug("The multi list size for adding hfile references in zk for node {} is {}", peerNode, - listOfOps.size()); - try { - ZKUtil.multiOrSequential(this.zookeeper, listOfOps, true); - } catch (KeeperException e) { - throw new ReplicationException("Failed to add hfile reference to peer " + peerId, e); - } - } - - @Override - public void removeHFileRefs(String peerId, List files) throws ReplicationException { - String peerNode = getHFileRefsPeerNode(peerId); - LOG.debug("Removing hfile references {} from queue {}", files, peerNode); - - List listOfOps = files.stream().map(n -> getHFileNode(peerNode, n)) - .map(ZKUtilOp::deleteNodeFailSilent).collect(toList()); - LOG.debug("The multi list size for removing hfile references in zk for node {} is {}", peerNode, - listOfOps.size()); - try { - ZKUtil.multiOrSequential(this.zookeeper, listOfOps, true); - } catch (KeeperException e) { - throw new ReplicationException("Failed to remove hfile reference from peer " + peerId, e); - } - } - - private List getAllPeersFromHFileRefsQueue0() throws KeeperException { - List children = ZKUtil.listChildrenNoWatch(zookeeper, hfileRefsZNode); - return children != null ? children : Collections.emptyList(); - } - - @Override - public List getAllPeersFromHFileRefsQueue() throws ReplicationException { - try { - return getAllPeersFromHFileRefsQueue0(); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get list of all peers in hfile references node.", - e); - } - } - - private List getReplicableHFiles0(String peerId) throws KeeperException { - List children = - ZKUtil.listChildrenNoWatch(this.zookeeper, getHFileRefsPeerNode(peerId)); - return children != null ? children : Collections.emptyList(); - } - - @Override - public List getReplicableHFiles(String peerId) throws ReplicationException { - try { - return getReplicableHFiles0(peerId); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get list of hfile references for peer " + peerId, - e); - } - } - - // will be overridden in UTs - protected int getHFileRefsZNodeCversion() throws ReplicationException { - Stat stat = new Stat(); - try { - ZKUtil.getDataNoWatch(zookeeper, hfileRefsZNode, stat); - } catch (KeeperException e) { - throw new ReplicationException("Failed to get stat of replication hfile references node.", e); - } - return stat.getCversion(); - } - - @Override - public Set getAllHFileRefs() throws ReplicationException { - try { - for (int retry = 0;; retry++) { - int v0 = getHFileRefsZNodeCversion(); - List peers = getAllPeersFromHFileRefsQueue(); - if (peers.isEmpty()) { - LOG.debug("Didn't find any peers with hfile references, won't prevent deletions."); - return Collections.emptySet(); - } - Set hfileRefs = new HashSet<>(); - for (String peer : peers) { - hfileRefs.addAll(getReplicableHFiles0(peer)); - } - int v1 = getHFileRefsZNodeCversion(); - if (v0 == v1) { - return hfileRefs; - } - LOG.debug("Replication hfile references node cversion changed from %d to %d, retry = %d", - v0, v1, retry); - } - } catch (KeeperException e) { - throw new ReplicationException("Failed to get all hfile refs", e); - } - } -} diff --git a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateBasic.java b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateBasic.java index 15cf5b1f1f64..dc46e4f1c7c8 100644 --- a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateBasic.java +++ b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateBasic.java @@ -17,30 +17,18 @@ */ package org.apache.hadoop.hbase.replication; -import static org.hamcrest.CoreMatchers.hasItems; -import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.replication.ReplicationPeer.PeerState; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster; import org.apache.hadoop.hbase.zookeeper.ZKConfig; -import org.apache.zookeeper.KeeperException; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; - /** * White box testing for replication state interfaces. Implementations should extend this class, and * initialize the interfaces properly. @@ -49,7 +37,6 @@ public abstract class TestReplicationStateBasic { private static final Logger LOG = LoggerFactory.getLogger(TestReplicationStateBasic.class); - protected ReplicationQueueStorage rqs; protected ServerName server1 = ServerName.valueOf("hostname1.example.org", 1234, 12345); protected ServerName server2 = ServerName.valueOf("hostname2.example.org", 1234, 12345); protected ServerName server3 = ServerName.valueOf("hostname3.example.org", 1234, 12345); @@ -66,161 +53,6 @@ public abstract class TestReplicationStateBasic { protected static final int ZK_MAX_COUNT = 300; protected static final int ZK_SLEEP_INTERVAL = 100; // millis - @Test - public void testReplicationQueueStorage() throws ReplicationException { - // Test methods with empty state - assertEquals(0, rqs.getListOfReplicators().size()); - assertTrue(rqs.getWALsInQueue(server1, "qId1").isEmpty()); - assertTrue(rqs.getAllQueues(server1).isEmpty()); - - /* - * Set up data Two replicators: -- server1: three queues with 0, 1 and 2 log files each -- - * server2: zero queues - */ - rqs.addWAL(server1, "qId1", "trash"); - rqs.removeWAL(server1, "qId1", "trash"); - rqs.addWAL(server1, "qId2", "filename1"); - rqs.addWAL(server1, "qId3", "filename2"); - rqs.addWAL(server1, "qId3", "filename3"); - rqs.addWAL(server2, "trash", "trash"); - rqs.removeQueue(server2, "trash"); - - List reps = rqs.getListOfReplicators(); - assertEquals(2, reps.size()); - assertTrue(server1.getServerName(), reps.contains(server1)); - assertTrue(server2.getServerName(), reps.contains(server2)); - - assertTrue(rqs.getWALsInQueue(ServerName.valueOf("bogus", 12345, 12345), "bogus").isEmpty()); - assertTrue(rqs.getWALsInQueue(server1, "bogus").isEmpty()); - assertEquals(0, rqs.getWALsInQueue(server1, "qId1").size()); - assertEquals(1, rqs.getWALsInQueue(server1, "qId2").size()); - assertEquals("filename1", rqs.getWALsInQueue(server1, "qId2").get(0)); - - assertTrue(rqs.getAllQueues(ServerName.valueOf("bogus", 12345, -1L)).isEmpty()); - assertEquals(0, rqs.getAllQueues(server2).size()); - List list = rqs.getAllQueues(server1); - assertEquals(3, list.size()); - assertTrue(list.contains("qId2")); - assertTrue(list.contains("qId3")); - } - - private void removeAllQueues(ServerName serverName) throws ReplicationException { - for (String queue : rqs.getAllQueues(serverName)) { - rqs.removeQueue(serverName, queue); - } - } - - @Test - public void testReplicationQueues() throws ReplicationException { - // Initialize ReplicationPeer so we can add peers (we don't transfer lone queues) - rp.init(); - - rqs.removeQueue(server1, "bogus"); - rqs.removeWAL(server1, "bogus", "bogus"); - removeAllQueues(server1); - assertEquals(0, rqs.getAllQueues(server1).size()); - assertEquals(0, rqs.getWALPosition(server1, "bogus", "bogus")); - assertTrue(rqs.getWALsInQueue(server1, "bogus").isEmpty()); - assertTrue(rqs.getAllQueues(ServerName.valueOf("bogus", 1234, 12345)).isEmpty()); - - populateQueues(); - - assertEquals(3, rqs.getListOfReplicators().size()); - assertEquals(0, rqs.getWALsInQueue(server2, "qId1").size()); - assertEquals(5, rqs.getWALsInQueue(server3, "qId5").size()); - assertEquals(0, rqs.getWALPosition(server3, "qId1", "filename0")); - rqs.setWALPosition(server3, "qId5", "filename4", 354L, Collections.emptyMap()); - assertEquals(354L, rqs.getWALPosition(server3, "qId5", "filename4")); - - assertEquals(5, rqs.getWALsInQueue(server3, "qId5").size()); - assertEquals(0, rqs.getWALsInQueue(server2, "qId1").size()); - assertEquals(0, rqs.getAllQueues(server1).size()); - assertEquals(1, rqs.getAllQueues(server2).size()); - assertEquals(5, rqs.getAllQueues(server3).size()); - - assertEquals(0, rqs.getAllQueues(server1).size()); - rqs.removeReplicatorIfQueueIsEmpty(server1); - assertEquals(2, rqs.getListOfReplicators().size()); - - List queues = rqs.getAllQueues(server3); - assertEquals(5, queues.size()); - for (String queue : queues) { - rqs.claimQueue(server3, queue, server2); - } - rqs.removeReplicatorIfQueueIsEmpty(server3); - assertEquals(1, rqs.getListOfReplicators().size()); - - assertEquals(6, rqs.getAllQueues(server2).size()); - removeAllQueues(server2); - rqs.removeReplicatorIfQueueIsEmpty(server2); - assertEquals(0, rqs.getListOfReplicators().size()); - } - - @Test - public void testHfileRefsReplicationQueues() throws ReplicationException, KeeperException { - rp.init(); - - List> files1 = new ArrayList<>(3); - files1.add(new Pair<>(null, new Path("file_1"))); - files1.add(new Pair<>(null, new Path("file_2"))); - files1.add(new Pair<>(null, new Path("file_3"))); - assertTrue(rqs.getReplicableHFiles(ID_ONE).isEmpty()); - assertEquals(0, rqs.getAllPeersFromHFileRefsQueue().size()); - rp.getPeerStorage().addPeer(ID_ONE, - ReplicationPeerConfig.newBuilder().setClusterKey(KEY_ONE).build(), true, - SyncReplicationState.NONE); - rqs.addPeerToHFileRefs(ID_ONE); - rqs.addHFileRefs(ID_ONE, files1); - assertEquals(1, rqs.getAllPeersFromHFileRefsQueue().size()); - assertEquals(3, rqs.getReplicableHFiles(ID_ONE).size()); - List hfiles2 = new ArrayList<>(files1.size()); - for (Pair p : files1) { - hfiles2.add(p.getSecond().getName()); - } - String removedString = hfiles2.remove(0); - rqs.removeHFileRefs(ID_ONE, hfiles2); - assertEquals(1, rqs.getReplicableHFiles(ID_ONE).size()); - hfiles2 = new ArrayList<>(1); - hfiles2.add(removedString); - rqs.removeHFileRefs(ID_ONE, hfiles2); - assertEquals(0, rqs.getReplicableHFiles(ID_ONE).size()); - rp.getPeerStorage().removePeer(ID_ONE); - } - - @Test - public void testRemovePeerForHFileRefs() throws ReplicationException, KeeperException { - rp.init(); - rp.getPeerStorage().addPeer(ID_ONE, - ReplicationPeerConfig.newBuilder().setClusterKey(KEY_ONE).build(), true, - SyncReplicationState.NONE); - rqs.addPeerToHFileRefs(ID_ONE); - rp.getPeerStorage().addPeer(ID_TWO, - ReplicationPeerConfig.newBuilder().setClusterKey(KEY_TWO).build(), true, - SyncReplicationState.NONE); - rqs.addPeerToHFileRefs(ID_TWO); - - List> files1 = new ArrayList<>(3); - files1.add(new Pair<>(null, new Path("file_1"))); - files1.add(new Pair<>(null, new Path("file_2"))); - files1.add(new Pair<>(null, new Path("file_3"))); - rqs.addHFileRefs(ID_ONE, files1); - rqs.addHFileRefs(ID_TWO, files1); - assertEquals(2, rqs.getAllPeersFromHFileRefsQueue().size()); - assertEquals(3, rqs.getReplicableHFiles(ID_ONE).size()); - assertEquals(3, rqs.getReplicableHFiles(ID_TWO).size()); - - rp.getPeerStorage().removePeer(ID_ONE); - rqs.removePeerFromHFileRefs(ID_ONE); - assertEquals(1, rqs.getAllPeersFromHFileRefsQueue().size()); - assertTrue(rqs.getReplicableHFiles(ID_ONE).isEmpty()); - assertEquals(3, rqs.getReplicableHFiles(ID_TWO).size()); - - rp.getPeerStorage().removePeer(ID_TWO); - rqs.removePeerFromHFileRefs(ID_TWO); - assertEquals(0, rqs.getAllPeersFromHFileRefsQueue().size()); - assertTrue(rqs.getReplicableHFiles(ID_TWO).isEmpty()); - } - @Test public void testReplicationPeers() throws Exception { rp.init(); @@ -286,55 +118,7 @@ public void testReplicationPeers() throws Exception { assertNumberOfPeers(2); } - private String getFileName(String base, int i) { - return String.format(base + "-%04d", i); - } - - @Test - public void testPersistLogPositionAndSeqIdAtomically() throws Exception { - ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000); - assertTrue(rqs.getAllQueues(serverName1).isEmpty()); - String queue1 = "1"; - String region0 = "6b2c8f8555335cc9af74455b94516cbe", - region1 = "6ecd2e9e010499f8ddef97ee8f70834f"; - for (int i = 0; i < 10; i++) { - rqs.addWAL(serverName1, queue1, getFileName("file1", i)); - } - List queueIds = rqs.getAllQueues(serverName1); - assertEquals(1, queueIds.size()); - assertThat(queueIds, hasItems("1")); - - List wals1 = rqs.getWALsInQueue(serverName1, queue1); - assertEquals(10, wals1.size()); - for (int i = 0; i < 10; i++) { - assertThat(wals1, hasItems(getFileName("file1", i))); - } - - for (int i = 0; i < 10; i++) { - assertEquals(0, rqs.getWALPosition(serverName1, queue1, getFileName("file1", i))); - } - assertEquals(HConstants.NO_SEQNUM, rqs.getLastSequenceId(region0, queue1)); - assertEquals(HConstants.NO_SEQNUM, rqs.getLastSequenceId(region1, queue1)); - - for (int i = 0; i < 10; i++) { - rqs.setWALPosition(serverName1, queue1, getFileName("file1", i), (i + 1) * 100, - ImmutableMap.of(region0, i * 100L, region1, (i + 1) * 100L)); - } - - for (int i = 0; i < 10; i++) { - assertEquals((i + 1) * 100, rqs.getWALPosition(serverName1, queue1, getFileName("file1", i))); - } - assertEquals(900L, rqs.getLastSequenceId(region0, queue1)); - assertEquals(1000L, rqs.getLastSequenceId(region1, queue1)); - - // Try to decrease the last pushed id by setWALPosition method. - rqs.setWALPosition(serverName1, queue1, getFileName("file1", 0), 11 * 100, - ImmutableMap.of(region0, 899L, region1, 1001L)); - assertEquals(900L, rqs.getLastSequenceId(region0, queue1)); - assertEquals(1001L, rqs.getLastSequenceId(region1, queue1)); - } - - protected void assertConnectedPeerStatus(boolean status, String peerId) throws Exception { + private void assertConnectedPeerStatus(boolean status, String peerId) throws Exception { // we can first check if the value was changed in the store, if it wasn't then fail right away if (status != rp.getPeerStorage().isPeerEnabled(peerId)) { fail("ConnectedPeerStatus was " + !status + " but expected " + status + " in ZK"); @@ -353,30 +137,7 @@ protected void assertConnectedPeerStatus(boolean status, String peerId) throws E } } - protected void assertNumberOfPeers(int total) throws ReplicationException { + private void assertNumberOfPeers(int total) throws ReplicationException { assertEquals(total, rp.getPeerStorage().listPeerIds().size()); } - - /* - * three replicators: rq1 has 0 queues, rq2 has 1 queue with no logs, rq3 has 5 queues with 1, 2, - * 3, 4, 5 log files respectively - */ - protected void populateQueues() throws ReplicationException { - rqs.addWAL(server1, "trash", "trash"); - rqs.removeQueue(server1, "trash"); - - rqs.addWAL(server2, "qId1", "trash"); - rqs.removeWAL(server2, "qId1", "trash"); - - for (int i = 1; i < 6; i++) { - for (int j = 0; j < i; j++) { - rqs.addWAL(server3, "qId" + i, "filename" + j); - } - // Add peers for the corresponding queues so they are not orphans - rp.getPeerStorage().addPeer("qId" + i, - ReplicationPeerConfig.newBuilder() - .setClusterKey(MiniZooKeeperCluster.HOST + ":2818:/bogus" + i).build(), - true, SyncReplicationState.NONE); - } - } } diff --git a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateZKImpl.java b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateZKImpl.java index d2540987906a..c5c8769282a6 100644 --- a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateZKImpl.java +++ b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationStateZKImpl.java @@ -80,7 +80,6 @@ private static String initPeerClusterState(String baseZKNode) @Before public void setUp() throws IOException { zkTimeoutCount = 0; - rqs = ReplicationStorageFactory.getReplicationQueueStorage(zkw, conf); rp = ReplicationFactory.getReplicationPeers(FileSystem.get(utility.getConfiguration()), zkw, conf); OUR_KEY = ZKConfig.getZooKeeperClusterKey(conf); diff --git a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java deleted file mode 100644 index ccd3c17f3bca..000000000000 --- a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.replication; - -import static org.hamcrest.CoreMatchers.hasItems; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.SortedSet; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HBaseClassTestRule; -import org.apache.hadoop.hbase.HBaseZKTestingUtil; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.testclassification.MediumTests; -import org.apache.hadoop.hbase.testclassification.ReplicationTests; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.MD5Hash; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.zookeeper.KeeperException; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; - -@Category({ ReplicationTests.class, MediumTests.class }) -public class TestZKReplicationQueueStorage { - - @ClassRule - public static final HBaseClassTestRule CLASS_RULE = - HBaseClassTestRule.forClass(TestZKReplicationQueueStorage.class); - - private static final HBaseZKTestingUtil UTIL = new HBaseZKTestingUtil(); - - private static ZKReplicationQueueStorage STORAGE; - - @BeforeClass - public static void setUp() throws Exception { - UTIL.startMiniZKCluster(); - STORAGE = new ZKReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); - } - - @AfterClass - public static void tearDown() throws IOException { - UTIL.shutdownMiniZKCluster(); - } - - @After - public void tearDownAfterTest() throws ReplicationException, KeeperException, IOException { - for (ServerName serverName : STORAGE.getListOfReplicators()) { - for (String queue : STORAGE.getAllQueues(serverName)) { - STORAGE.removeQueue(serverName, queue); - } - STORAGE.removeReplicatorIfQueueIsEmpty(serverName); - } - for (String peerId : STORAGE.getAllPeersFromHFileRefsQueue()) { - STORAGE.removePeerFromHFileRefs(peerId); - } - } - - private ServerName getServerName(int i) { - return ServerName.valueOf("127.0.0.1", 8000 + i, 10000 + i); - } - - @Test - public void testReplicator() throws ReplicationException { - assertTrue(STORAGE.getListOfReplicators().isEmpty()); - String queueId = "1"; - for (int i = 0; i < 10; i++) { - STORAGE.addWAL(getServerName(i), queueId, "file" + i); - } - List replicators = STORAGE.getListOfReplicators(); - assertEquals(10, replicators.size()); - for (int i = 0; i < 10; i++) { - assertThat(replicators, hasItems(getServerName(i))); - } - for (int i = 0; i < 5; i++) { - STORAGE.removeQueue(getServerName(i), queueId); - } - for (int i = 0; i < 10; i++) { - STORAGE.removeReplicatorIfQueueIsEmpty(getServerName(i)); - } - replicators = STORAGE.getListOfReplicators(); - assertEquals(5, replicators.size()); - for (int i = 5; i < 10; i++) { - assertThat(replicators, hasItems(getServerName(i))); - } - } - - private String getFileName(String base, int i) { - return String.format(base + "-%04d", i); - } - - @Test - public void testAddRemoveLog() throws ReplicationException { - ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000); - assertTrue(STORAGE.getAllQueues(serverName1).isEmpty()); - String queue1 = "1"; - String queue2 = "2"; - for (int i = 0; i < 10; i++) { - STORAGE.addWAL(serverName1, queue1, getFileName("file1", i)); - STORAGE.addWAL(serverName1, queue2, getFileName("file2", i)); - } - List queueIds = STORAGE.getAllQueues(serverName1); - assertEquals(2, queueIds.size()); - assertThat(queueIds, hasItems("1", "2")); - - List wals1 = STORAGE.getWALsInQueue(serverName1, queue1); - List wals2 = STORAGE.getWALsInQueue(serverName1, queue2); - assertEquals(10, wals1.size()); - assertEquals(10, wals2.size()); - for (int i = 0; i < 10; i++) { - assertThat(wals1, hasItems(getFileName("file1", i))); - assertThat(wals2, hasItems(getFileName("file2", i))); - } - - for (int i = 0; i < 10; i++) { - assertEquals(0, STORAGE.getWALPosition(serverName1, queue1, getFileName("file1", i))); - assertEquals(0, STORAGE.getWALPosition(serverName1, queue2, getFileName("file2", i))); - STORAGE.setWALPosition(serverName1, queue1, getFileName("file1", i), (i + 1) * 100, - Collections.emptyMap()); - STORAGE.setWALPosition(serverName1, queue2, getFileName("file2", i), (i + 1) * 100 + 10, - Collections.emptyMap()); - } - - for (int i = 0; i < 10; i++) { - assertEquals((i + 1) * 100, - STORAGE.getWALPosition(serverName1, queue1, getFileName("file1", i))); - assertEquals((i + 1) * 100 + 10, - STORAGE.getWALPosition(serverName1, queue2, getFileName("file2", i))); - } - - for (int i = 0; i < 10; i++) { - if (i % 2 == 0) { - STORAGE.removeWAL(serverName1, queue1, getFileName("file1", i)); - } else { - STORAGE.removeWAL(serverName1, queue2, getFileName("file2", i)); - } - } - - queueIds = STORAGE.getAllQueues(serverName1); - assertEquals(2, queueIds.size()); - assertThat(queueIds, hasItems("1", "2")); - - ServerName serverName2 = ServerName.valueOf("127.0.0.1", 8001, 10001); - Pair> peer1 = STORAGE.claimQueue(serverName1, "1", serverName2); - - assertEquals("1-" + serverName1.getServerName(), peer1.getFirst()); - assertEquals(5, peer1.getSecond().size()); - int i = 1; - for (String wal : peer1.getSecond()) { - assertEquals(getFileName("file1", i), wal); - assertEquals((i + 1) * 100, - STORAGE.getWALPosition(serverName2, peer1.getFirst(), getFileName("file1", i))); - i += 2; - } - - queueIds = STORAGE.getAllQueues(serverName1); - assertEquals(1, queueIds.size()); - assertThat(queueIds, hasItems("2")); - wals2 = STORAGE.getWALsInQueue(serverName1, queue2); - assertEquals(5, wals2.size()); - for (i = 0; i < 10; i += 2) { - assertThat(wals2, hasItems(getFileName("file2", i))); - } - - queueIds = STORAGE.getAllQueues(serverName2); - assertEquals(1, queueIds.size()); - assertThat(queueIds, hasItems(peer1.getFirst())); - wals1 = STORAGE.getWALsInQueue(serverName2, peer1.getFirst()); - assertEquals(5, wals1.size()); - for (i = 1; i < 10; i += 2) { - assertThat(wals1, hasItems(getFileName("file1", i))); - } - - Set allWals = STORAGE.getAllWALs(); - assertEquals(10, allWals.size()); - for (i = 0; i < 10; i++) { - assertThat(allWals, hasItems(i % 2 == 0 ? getFileName("file2", i) : getFileName("file1", i))); - } - } - - // For HBASE-12865, HBASE-26482 - @Test - public void testClaimQueueChangeCversion() throws ReplicationException, KeeperException { - ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000); - STORAGE.addWAL(serverName1, "1", "file"); - STORAGE.addWAL(serverName1, "2", "file"); - - ServerName serverName2 = ServerName.valueOf("127.0.0.1", 8001, 10001); - // Avoid claimQueue update cversion for prepare server2 rsNode. - STORAGE.addWAL(serverName2, "1", "file"); - STORAGE.addWAL(serverName2, "2", "file"); - - int v0 = STORAGE.getQueuesZNodeCversion(); - - STORAGE.claimQueue(serverName1, "1", serverName2); - int v1 = STORAGE.getQueuesZNodeCversion(); - // cversion should be increased by claimQueue method. - assertTrue(v1 > v0); - - STORAGE.claimQueue(serverName1, "2", serverName2); - int v2 = STORAGE.getQueuesZNodeCversion(); - // cversion should be increased by claimQueue method. - assertTrue(v2 > v1); - } - - private ZKReplicationQueueStorage createWithUnstableVersion() throws IOException { - return new ZKReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()) { - - private int called = 0; - private int getLastSeqIdOpIndex = 0; - - @Override - protected int getQueuesZNodeCversion() throws KeeperException { - if (called < 4) { - called++; - } - return called; - } - - @Override - protected Pair getLastSequenceIdWithVersion(String encodedRegionName, - String peerId) throws KeeperException { - Pair oldPair = super.getLastSequenceIdWithVersion(encodedRegionName, peerId); - if (getLastSeqIdOpIndex < 100) { - // Let the ZNode version increase. - String path = getSerialReplicationRegionPeerNode(encodedRegionName, peerId); - ZKUtil.createWithParents(zookeeper, path); - ZKUtil.setData(zookeeper, path, ZKUtil.positionToByteArray(100L)); - } - getLastSeqIdOpIndex++; - return oldPair; - } - }; - } - - @Test - public void testGetAllWALsCversionChange() throws IOException, ReplicationException { - ZKReplicationQueueStorage storage = createWithUnstableVersion(); - storage.addWAL(getServerName(0), "1", "file"); - // This should return eventually when cversion stabilizes - Set allWals = storage.getAllWALs(); - assertEquals(1, allWals.size()); - assertThat(allWals, hasItems("file")); - } - - // For HBASE-14621 - @Test - public void testGetAllHFileRefsCversionChange() throws IOException, ReplicationException { - ZKReplicationQueueStorage storage = createWithUnstableVersion(); - storage.addPeerToHFileRefs("1"); - Path p = new Path("/test"); - storage.addHFileRefs("1", Arrays.asList(Pair.newPair(p, p))); - // This should return eventually when cversion stabilizes - Set allHFileRefs = storage.getAllHFileRefs(); - assertEquals(1, allHFileRefs.size()); - assertThat(allHFileRefs, hasItems("test")); - } - - // For HBASE-20138 - @Test - public void testSetWALPositionBadVersion() throws IOException, ReplicationException { - ZKReplicationQueueStorage storage = createWithUnstableVersion(); - ServerName serverName1 = ServerName.valueOf("128.0.0.1", 8000, 10000); - assertTrue(storage.getAllQueues(serverName1).isEmpty()); - String queue1 = "1"; - String fileName = getFileName("file1", 0); - String encodedRegionName = "31d9792f4435b99d9fb1016f6fbc8dc6"; - storage.addWAL(serverName1, queue1, fileName); - - List wals1 = storage.getWALsInQueue(serverName1, queue1); - assertEquals(1, wals1.size()); - - assertEquals(0, storage.getWALPosition(serverName1, queue1, fileName)); - // This should return eventually when data version stabilizes - storage.setWALPosition(serverName1, queue1, fileName, 100, - ImmutableMap.of(encodedRegionName, 120L)); - - assertEquals(100, storage.getWALPosition(serverName1, queue1, fileName)); - assertEquals(120L, storage.getLastSequenceId(encodedRegionName, queue1)); - } - - @Test - public void testRegionsZNodeLayout() throws Exception { - String peerId = "1"; - String encodedRegionName = "31d9792f4435b99d9fb1016f6fbc8dc7"; - String expectedPath = "/hbase/replication/regions/31/d9/792f4435b99d9fb1016f6fbc8dc7-" + peerId; - String path = STORAGE.getSerialReplicationRegionPeerNode(encodedRegionName, peerId); - assertEquals(expectedPath, path); - } - - @Test - public void testRemoveAllLastPushedSeqIdsForPeer() throws Exception { - String peerId = "1"; - String peerIdToDelete = "2"; - for (int i = 0; i < 100; i++) { - String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); - STORAGE.setLastSequenceIds(peerId, ImmutableMap.of(encodedRegionName, (long) i)); - STORAGE.setLastSequenceIds(peerIdToDelete, ImmutableMap.of(encodedRegionName, (long) i)); - } - for (int i = 0; i < 100; i++) { - String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); - assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerId)); - assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerIdToDelete)); - } - STORAGE.removeLastSequenceIds(peerIdToDelete); - for (int i = 0; i < 100; i++) { - String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); - assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerId)); - assertEquals(HConstants.NO_SEQNUM, - STORAGE.getLastSequenceId(encodedRegionName, peerIdToDelete)); - } - } -} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index f3d4dfb292d3..896f9a5d0860 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -788,8 +788,7 @@ private void initializeZKBasedSystemTrackers() } this.rsGroupInfoManager = RSGroupInfoManager.create(this); - this.replicationPeerManager = - ReplicationPeerManager.create(fileSystemManager.getFileSystem(), zooKeeper, conf, clusterId); + this.replicationPeerManager = ReplicationPeerManager.create(this, clusterId); this.configurationManager.registerObserver(replicationPeerManager); this.replicationPeerModificationStateStore = new ReplicationPeerModificationStateStore(masterRegion); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 0bd0f3ba0c7e..487c45e5c5cb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -36,7 +36,7 @@ import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; -import org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure; +import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.procedure2.Procedure; @@ -240,15 +240,33 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state) } assignRegions(env, regionsOnCrashedServer); } - setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES); + // If there is no replication peer, we do not need to enter the claim queues stage. + // This is also very important that now we will later initialize ReplicationQueueStorage + // so if there is no replication peer added yet, the storage can not be accessed. + // And there will be no race because: + // 1. For adding replication peer, if the peer storage has not been updated yet, the crash + // region server will not have any replication queues for this peer, so it is safe to skip + // claiming. + // 2. For removing replication peer, it it has already updated the peer storage, then + // there is no way to rollback and region servers are already started to close and delete + // replication queues, so it is also safe to skip claiming. + if (env.getReplicationPeerManager().listPeers(null).isEmpty()) { + setNextState(ServerCrashState.SERVER_CRASH_FINISH); + } else { + setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES); + } break; case SERVER_CRASH_HANDLE_RIT2: // Noop. Left in place because we used to call handleRIT here for a second time // but no longer necessary since HBASE-20634. - setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES); + if (env.getReplicationPeerManager().listPeers(null).isEmpty()) { + setNextState(ServerCrashState.SERVER_CRASH_FINISH); + } else { + setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES); + } break; case SERVER_CRASH_CLAIM_REPLICATION_QUEUES: - addChildProcedure(new ClaimReplicationQueuesProcedure(serverName)); + addChildProcedure(new AssignReplicationQueuesProcedure(serverName)); setNextState(ServerCrashState.SERVER_CRASH_FINISH); break; case SERVER_CRASH_FINISH: diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java new file mode 100644 index 000000000000..e7fb5e517159 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; +import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureUtil; +import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; +import org.apache.hadoop.hbase.procedure2.StateMachineProcedure; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.AssignReplicationQueuesState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.AssignReplicationQueuesStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; + +@InterfaceAudience.Private +public class AssignReplicationQueuesProcedure + extends StateMachineProcedure + implements ServerProcedureInterface { + + private static final Logger LOG = LoggerFactory.getLogger(AssignReplicationQueuesProcedure.class); + + private ServerName crashedServer; + + private RetryCounter retryCounter; + + public AssignReplicationQueuesProcedure() { + } + + public AssignReplicationQueuesProcedure(ServerName crashedServer) { + this.crashedServer = crashedServer; + } + + @Override + public ServerName getServerName() { + return crashedServer; + } + + @Override + public boolean hasMetaTableRegion() { + return false; + } + + @Override + public ServerOperationType getServerOperationType() { + return ServerOperationType.CLAIM_REPLICATION_QUEUES; + } + + private void addMissingQueues(MasterProcedureEnv env) throws ReplicationException { + ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); + + Set existingQueuePeerIds = new HashSet<>(); + List queueIds = storage.listAllQueueIds(crashedServer); + for (Iterator iter = queueIds.iterator(); iter.hasNext();) { + ReplicationQueueId queueId = iter.next(); + if (!queueId.isRecovered()) { + existingQueuePeerIds.add(queueId.getPeerId()); + } + } + List peers = env.getReplicationPeerManager().listPeers(null); + for (ReplicationPeerDescription peer : peers) { + if (!existingQueuePeerIds.contains(peer.getPeerId())) { + ReplicationQueueId queueId = new ReplicationQueueId(crashedServer, peer.getPeerId()); + LOG.debug("Add replication queue {} for claiming", queueId); + env.getReplicationPeerManager().getQueueStorage().setOffset(queueId, + crashedServer.toString(), ReplicationGroupOffset.BEGIN, Collections.emptyMap()); + } + } + } + + private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException { + ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); + List queueIds = storage.listAllQueueIds(crashedServer); + if (queueIds.isEmpty()) { + LOG.debug("Finish claiming replication queues for {}", crashedServer); + // we are done + return Flow.NO_MORE_STATE; + } + LOG.debug("There are {} replication queues need to be claimed for {}", queueIds.size(), + crashedServer); + List targetServers = + env.getMasterServices().getServerManager().getOnlineServersList(); + if (targetServers.isEmpty()) { + throw new ReplicationException("no region server available"); + } + Collections.shuffle(targetServers); + for (int i = 0, n = Math.min(queueIds.size(), targetServers.size()); i < n; i++) { + addChildProcedure( + new ClaimReplicationQueueRemoteProcedure(queueIds.get(i), targetServers.get(i))); + } + retryCounter = null; + return Flow.HAS_MORE_STATE; + } + + @Override + protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesState state) + throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { + try { + switch (state) { + case ASSIGN_REPLICATION_QUEUES_PRE_CHECK: + // TODO: reserved for implementing the fencing logic with Add/Remove/UpdatePeerProcedure + setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES); + return Flow.HAS_MORE_STATE; + case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES: + addMissingQueues(env); + retryCounter = null; + setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM); + return Flow.HAS_MORE_STATE; + case ASSIGN_REPLICATION_QUEUES_CLAIM: + return claimQueues(env); + default: + throw new UnsupportedOperationException("unhandled state=" + state); + } + } catch (Exception e) { + if (retryCounter == null) { + retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); + } + long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.warn("Failed to claim replication queues for {}, suspend {}secs {}; {};", crashedServer, + backoff / 1000, e); + setTimeout(Math.toIntExact(backoff)); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + skipPersistence(); + throw new ProcedureSuspendedException(); + } + } + + @Override + protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { + setState(ProcedureProtos.ProcedureState.RUNNABLE); + env.getProcedureScheduler().addFront(this); + return false; + } + + @Override + protected void rollbackState(MasterProcedureEnv env, AssignReplicationQueuesState state) + throws IOException, InterruptedException { + throw new UnsupportedOperationException(); + } + + @Override + protected AssignReplicationQueuesState getState(int stateId) { + return AssignReplicationQueuesState.forNumber(stateId); + } + + @Override + protected int getStateId(AssignReplicationQueuesState state) { + return state.getNumber(); + } + + @Override + protected AssignReplicationQueuesState getInitialState() { + return AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_PRE_CHECK; + } + + @Override + protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { + super.serializeStateData(serializer); + serializer.serialize(AssignReplicationQueuesStateData.newBuilder() + .setCrashedServer(ProtobufUtil.toServerName(crashedServer)).build()); + } + + @Override + protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { + super.deserializeStateData(serializer); + AssignReplicationQueuesStateData proto = + serializer.deserialize(AssignReplicationQueuesStateData.class); + crashedServer = ProtobufUtil.toServerName(proto.getCrashedServer()); + } + +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java index 9ef97d1fff62..7b637384398a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.regionserver.ClaimReplicationQueueCallable; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -43,34 +44,33 @@ public class ClaimReplicationQueueRemoteProcedure extends ServerRemoteProcedure private static final Logger LOG = LoggerFactory.getLogger(ClaimReplicationQueueRemoteProcedure.class); - private ServerName crashedServer; - - private String queue; + private ReplicationQueueId queueId; public ClaimReplicationQueueRemoteProcedure() { } - public ClaimReplicationQueueRemoteProcedure(ServerName crashedServer, String queue, - ServerName targetServer) { - this.crashedServer = crashedServer; - this.queue = queue; + public ClaimReplicationQueueRemoteProcedure(ReplicationQueueId queueId, ServerName targetServer) { + this.queueId = queueId; this.targetServer = targetServer; } @Override public Optional remoteCallBuild(MasterProcedureEnv env, ServerName remote) { assert targetServer.equals(remote); + ClaimReplicationQueueRemoteParameter.Builder builder = ClaimReplicationQueueRemoteParameter + .newBuilder().setCrashedServer(ProtobufUtil.toServerName(queueId.getServerName())) + .setQueue(queueId.getPeerId()); + queueId.getSourceServerName() + .ifPresent(sourceServer -> builder.setSourceServer(ProtobufUtil.toServerName(sourceServer))); return Optional.of(new ServerOperation(this, getProcId(), ClaimReplicationQueueCallable.class, - ClaimReplicationQueueRemoteParameter.newBuilder() - .setCrashedServer(ProtobufUtil.toServerName(crashedServer)).setQueue(queue).build() - .toByteArray())); + builder.build().toByteArray())); } @Override public ServerName getServerName() { // return crashed server here, as we are going to recover its replication queues so we should // use its scheduler queue instead of the one for the target server. - return crashedServer; + return queueId.getServerName(); } @Override @@ -86,8 +86,7 @@ public ServerOperationType getServerOperationType() { @Override protected void complete(MasterProcedureEnv env, Throwable error) { if (error != null) { - LOG.warn("Failed to claim replication queue {} of crashed server on server {} ", queue, - crashedServer, targetServer, error); + LOG.warn("Failed to claim replication queue {} on server {} ", queueId, targetServer, error); this.succ = false; } else { this.succ = true; @@ -111,17 +110,26 @@ protected boolean waitInitialized(MasterProcedureEnv env) { @Override protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { - serializer.serialize(ClaimReplicationQueueRemoteStateData.newBuilder() - .setCrashedServer(ProtobufUtil.toServerName(crashedServer)).setQueue(queue) - .setTargetServer(ProtobufUtil.toServerName(targetServer)).build()); + ClaimReplicationQueueRemoteStateData.Builder builder = ClaimReplicationQueueRemoteStateData + .newBuilder().setCrashedServer(ProtobufUtil.toServerName(queueId.getServerName())) + .setQueue(queueId.getPeerId()).setTargetServer(ProtobufUtil.toServerName(targetServer)); + queueId.getSourceServerName() + .ifPresent(sourceServer -> builder.setSourceServer(ProtobufUtil.toServerName(sourceServer))); + serializer.serialize(builder.build()); } @Override protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { ClaimReplicationQueueRemoteStateData data = serializer.deserialize(ClaimReplicationQueueRemoteStateData.class); - crashedServer = ProtobufUtil.toServerName(data.getCrashedServer()); - queue = data.getQueue(); targetServer = ProtobufUtil.toServerName(data.getTargetServer()); + ServerName crashedServer = ProtobufUtil.toServerName(data.getCrashedServer()); + String queue = data.getQueue(); + if (data.hasSourceServer()) { + queueId = new ReplicationQueueId(crashedServer, queue, + ProtobufUtil.toServerName(data.getSourceServer())); + } else { + queueId = new ReplicationQueueId(crashedServer, queue); + } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueuesProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueuesProcedure.java index 5eb6608f4ee6..747d352d2aa5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueuesProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueuesProcedure.java @@ -19,8 +19,10 @@ import java.io.IOException; import java.util.Collections; +import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Set; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; @@ -30,7 +32,9 @@ import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; import org.apache.hadoop.hbase.replication.ReplicationException; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; @@ -44,7 +48,10 @@ /** * Used to assign the replication queues of a dead server to other region servers. + * @deprecated Use {@link AssignReplicationQueuesProcedure} instead, kept only for keeping + * compatibility. */ +@Deprecated @InterfaceAudience.Private public class ClaimReplicationQueuesProcedure extends Procedure implements ServerProcedureInterface { @@ -82,22 +89,36 @@ protected Procedure[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); try { - List queues = storage.getAllQueues(crashedServer); + List queues = storage.listAllQueueIds(crashedServer); + Set existQueuePeerIds = new HashSet<>(); // this is for upgrading to the new region replication framework, where we will delete the - // legacy region_replica_replication peer directly, without deleting the replication queues, - // as it may still be used by region servers which have not been upgraded yet. - for (Iterator iter = queues.iterator(); iter.hasNext();) { - ReplicationQueueInfo queue = new ReplicationQueueInfo(iter.next()); - if (queue.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) { + // legacy region_replica_replication peer directly, without deleting the replication queues + for (Iterator iter = queues.iterator(); iter.hasNext();) { + ReplicationQueueId queueId = iter.next(); + if (queueId.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) { LOG.info("Found replication queue {} for legacy region replication peer, " - + "skipping claiming and removing...", queue.getQueueId()); + + "skipping claiming and removing...", queueId); iter.remove(); - storage.removeQueue(crashedServer, queue.getQueueId()); + storage.removeQueue(queueId); + } else if (!queueId.isRecovered()) { + existQueuePeerIds.add(queueId.getPeerId()); + } + } + List peers = env.getReplicationPeerManager().listPeers(null); + // TODO: the implementation is not enough yet, if there are retries, we need to know whether + // the replication queue for the given peer has been claimed or not, otherwise this logic will + // introduce redundant replication queues for the same peer. Add this logic to make some UTs + // pass first. + for (ReplicationPeerDescription peer : peers) { + if (!existQueuePeerIds.contains(peer.getPeerId())) { + ReplicationQueueId queueId = new ReplicationQueueId(crashedServer, peer.getPeerId()); + env.getReplicationPeerManager().getQueueStorage().setOffset(queueId, + crashedServer.toString(), ReplicationGroupOffset.BEGIN, Collections.emptyMap()); + queues.add(queueId); } } if (queues.isEmpty()) { LOG.debug("Finish claiming replication queues for {}", crashedServer); - storage.removeReplicatorIfQueueIsEmpty(crashedServer); // we are done return null; } @@ -112,8 +133,7 @@ protected Procedure[] execute(MasterProcedureEnv env) ClaimReplicationQueueRemoteProcedure[] procs = new ClaimReplicationQueueRemoteProcedure[Math.min(queues.size(), targetServers.size())]; for (int i = 0; i < procs.length; i++) { - procs[i] = new ClaimReplicationQueueRemoteProcedure(crashedServer, queues.get(i), - targetServers.get(i)); + procs[i] = new ClaimReplicationQueueRemoteProcedure(queues.get(i), targetServers.get(i)); } return procs; } catch (ReplicationException e) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index bfb7b7c10c08..53270bcbb04e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.master.replication; +import com.google.errorprone.annotations.RestrictedApi; import java.io.IOException; import java.net.URI; import java.util.ArrayList; @@ -29,6 +30,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; @@ -38,10 +40,13 @@ import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.ReplicationPeerNotFoundException; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.replication.ReplicationPeerConfigUtil; import org.apache.hadoop.hbase.conf.ConfigurationObserver; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.replication.BaseReplicationEndpoint; import org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationEndpoint; @@ -50,11 +55,12 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; +import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.zookeeper.ZKClusterId; import org.apache.hadoop.hbase.zookeeper.ZKConfig; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; @@ -105,9 +111,20 @@ public class ReplicationPeerManager implements ConfigurationObserver { private final ZKWatcher zk; + @FunctionalInterface + private interface ReplicationQueueStorageInitializer { + + void initialize() throws IOException; + } + + private final ReplicationQueueStorageInitializer queueStorageInitializer; + + // we will mock this class in UT so leave the constructor as package private and not mark the + // class as final, since mockito can not mock a final class ReplicationPeerManager(FileSystem fs, ZKWatcher zk, ReplicationPeerStorage peerStorage, ReplicationQueueStorage queueStorage, ConcurrentMap peers, - Configuration conf, String clusterId) { + Configuration conf, String clusterId, + ReplicationQueueStorageInitializer queueStorageInitializer) { this.fs = fs; this.zk = zk; this.peerStorage = peerStorage; @@ -115,19 +132,15 @@ public class ReplicationPeerManager implements ConfigurationObserver { this.peers = peers; this.conf = conf; this.clusterId = clusterId; + this.queueStorageInitializer = queueStorageInitializer; } private void checkQueuesDeleted(String peerId) throws ReplicationException, DoNotRetryIOException { - for (ServerName replicator : queueStorage.getListOfReplicators()) { - List queueIds = queueStorage.getAllQueues(replicator); - for (String queueId : queueIds) { - ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); - if (queueInfo.getPeerId().equals(peerId)) { - throw new DoNotRetryIOException("undeleted queue for peerId: " + peerId + ", replicator: " - + replicator + ", queueId: " + queueId); - } - } + List queueIds = queueStorage.listAllQueueIds(peerId); + if (!queueIds.isEmpty()) { + throw new DoNotRetryIOException("There are still " + queueIds.size() + + " undeleted queue(s) for peerId: " + peerId + ", first is " + queueIds.get(0)); } if (queueStorage.getAllPeersFromHFileRefsQueue().contains(peerId)) { throw new DoNotRetryIOException("Undeleted queue for peer " + peerId + " in hfile-refs"); @@ -135,7 +148,7 @@ private void checkQueuesDeleted(String peerId) } void preAddPeer(String peerId, ReplicationPeerConfig peerConfig) - throws DoNotRetryIOException, ReplicationException { + throws ReplicationException, IOException { if (peerId.contains("-")) { throw new DoNotRetryIOException("Found invalid peer name: " + peerId); } @@ -146,6 +159,9 @@ void preAddPeer(String peerId, ReplicationPeerConfig peerConfig) if (peers.containsKey(peerId)) { throw new DoNotRetryIOException("Replication peer " + peerId + " already exists"); } + + // lazy create table + queueStorageInitializer.initialize(); // make sure that there is no queues with the same peer id. This may happen when we create a // peer with the same id with a old deleted peer. If the replication queues for the old peer // have not been cleaned up yet then we should not create the new peer, otherwise the old wal @@ -365,8 +381,8 @@ public void removeAllQueues(String peerId) throws ReplicationException { // claimed once after the refresh peer procedure done(as the next claim queue will just delete // it), so we can make sure that a two pass scan will finally find the queue and remove it, // unless it has already been removed by others. - ReplicationUtils.removeAllQueues(queueStorage, peerId); - ReplicationUtils.removeAllQueues(queueStorage, peerId); + queueStorage.removeAllQueues(peerId); + queueStorage.removeAllQueues(peerId); } public void removeAllQueuesAndHFileRefs(String peerId) throws ReplicationException { @@ -568,14 +584,69 @@ public List getSerialPeerIdsBelongsTo(TableName tableName) { .collect(Collectors.toList()); } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public ReplicationPeerStorage getPeerStorage() { + return peerStorage; + } + public ReplicationQueueStorage getQueueStorage() { return queueStorage; } - public static ReplicationPeerManager create(FileSystem fs, ZKWatcher zk, Configuration conf, - String clusterId) throws ReplicationException { + private static Pair + createReplicationQueueStorage(MasterServices services) throws IOException { + Configuration conf = services.getConfiguration(); + TableName replicationQueueTableName = + TableName.valueOf(conf.get(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, + ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())); + ReplicationQueueStorageInitializer initializer; + if (services.getTableDescriptors().exists(replicationQueueTableName)) { + // no need to create the table + initializer = () -> { + }; + } else { + // lazy create the replication table. + initializer = new ReplicationQueueStorageInitializer() { + + private volatile boolean created = false; + + @Override + public void initialize() throws IOException { + if (created) { + return; + } + synchronized (this) { + if (created) { + return; + } + if (services.getTableDescriptors().exists(replicationQueueTableName)) { + created = true; + return; + } + long procId = services.createSystemTable(ReplicationStorageFactory + .createReplicationQueueTableDescriptor(replicationQueueTableName)); + ProcedureExecutor procExec = services.getMasterProcedureExecutor(); + ProcedureSyncWait.waitFor(procExec.getEnvironment(), TimeUnit.MINUTES.toMillis(1), + "Creating table " + replicationQueueTableName, () -> procExec.isFinished(procId)); + } + } + }; + } + return Pair.newPair(ReplicationStorageFactory.getReplicationQueueStorage( + services.getConnection(), replicationQueueTableName), initializer); + } + + public static ReplicationPeerManager create(MasterServices services, String clusterId) + throws ReplicationException, IOException { + Configuration conf = services.getConfiguration(); + FileSystem fs = services.getMasterFileSystem().getFileSystem(); + ZKWatcher zk = services.getZooKeeper(); ReplicationPeerStorage peerStorage = ReplicationStorageFactory.getReplicationPeerStorage(fs, zk, conf); + Pair pair = + createReplicationQueueStorage(services); + ReplicationQueueStorage queueStorage = pair.getFirst(); ConcurrentMap peers = new ConcurrentHashMap<>(); for (String peerId : peerStorage.listPeerIds()) { ReplicationPeerConfig peerConfig = peerStorage.getPeerConfig(peerId); @@ -585,7 +656,24 @@ public static ReplicationPeerManager create(FileSystem fs, ZKWatcher zk, Configu ) { // we do not use this endpoint for region replication any more, see HBASE-26233 LOG.info("Legacy region replication peer found, removing: {}", peerConfig); - peerStorage.removePeer(peerId); + // do it asynchronous to not block the start up of HMaster + new Thread("Remove legacy replication peer " + peerId) { + + @Override + public void run() { + try { + // need to delete two times to make sure we delete all the queues, see the comments in + // above + // removeAllQueues method for more details. + queueStorage.removeAllQueues(peerId); + queueStorage.removeAllQueues(peerId); + // delete queue first and then peer, because we use peer as a flag. + peerStorage.removePeer(peerId); + } catch (Exception e) { + LOG.warn("Failed to delete legacy replication peer {}", peerId); + } + } + }.start(); continue; } peerConfig = ReplicationPeerConfigUtil.updateReplicationBasePeerConfigs(conf, peerConfig); @@ -594,8 +682,8 @@ public static ReplicationPeerManager create(FileSystem fs, ZKWatcher zk, Configu SyncReplicationState state = peerStorage.getPeerSyncReplicationState(peerId); peers.put(peerId, new ReplicationPeerDescription(peerId, enabled, peerConfig, state)); } - return new ReplicationPeerManager(fs, zk, peerStorage, - ReplicationStorageFactory.getReplicationQueueStorage(zk, conf), peers, conf, clusterId); + return new ReplicationPeerManager(fs, zk, peerStorage, queueStorage, peers, conf, clusterId, + pair.getSecond()); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index dc4c3a0a6bcb..8df65487c676 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -32,6 +32,7 @@ import java.io.InterruptedIOException; import java.lang.management.MemoryType; import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -441,8 +442,9 @@ protected AbstractFSWAL(final FileSystem fs, final Abortable abortable, final Pa } // If prefix is null||empty then just name it wal - this.walFilePrefix = - prefix == null || prefix.isEmpty() ? "wal" : URLEncoder.encode(prefix, "UTF8"); + this.walFilePrefix = prefix == null || prefix.isEmpty() + ? "wal" + : URLEncoder.encode(prefix, StandardCharsets.UTF_8.name()); // we only correctly differentiate suffices when numeric ones start with '.' if (suffix != null && !(suffix.isEmpty()) && !(suffix.startsWith(WAL_FILE_NAME_DELIMITER))) { throw new IllegalArgumentException("WAL suffix must start with '" + WAL_FILE_NAME_DELIMITER diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationHFileCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationHFileCleaner.java index 819e4c5e54ac..00e875f8be56 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationHFileCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationHFileCleaner.java @@ -19,23 +19,28 @@ import java.io.IOException; import java.util.Collections; +import java.util.Map; import java.util.Set; -import org.apache.hadoop.conf.Configuration; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.Server; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.cleaner.BaseHFileCleanerDelegate; import org.apache.hadoop.hbase.master.cleaner.HFileCleaner; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.common.base.Predicate; import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; +import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils; /** * Implementation of a file cleaner that checks if a hfile is still scheduled for replication before @@ -44,15 +49,20 @@ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG) public class ReplicationHFileCleaner extends BaseHFileCleanerDelegate { private static final Logger LOG = LoggerFactory.getLogger(ReplicationHFileCleaner.class); - private ZKWatcher zkw; + private Connection conn; + private boolean shareConn; private ReplicationQueueStorage rqs; private boolean stopped = false; @Override public Iterable getDeletableFiles(Iterable files) { - // all members of this class are null if replication is disabled, - // so we cannot filter the files - if (this.getConf() == null) { + if ( + !(getConf().getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, + HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT)) + ) { + LOG.warn(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY + " is not enabled. Better to remove " + + ReplicationHFileCleaner.class + " from " + HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS + + " configuration."); return files; } @@ -88,51 +98,34 @@ public boolean apply(FileStatus file) { } @Override - public void setConf(Configuration config) { - // If either replication or replication of bulk load hfiles is disabled, keep all members null - if ( - !(config.getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, - HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT)) - ) { - LOG.warn(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY + " is not enabled. Better to remove " - + ReplicationHFileCleaner.class + " from " + HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS - + " configuration."); - return; - } - // Make my own Configuration. Then I'll have my own connection to zk that - // I can close myself when time comes. - Configuration conf = new Configuration(config); + public void init(Map params) { + super.init(params); try { - setConf(conf, new ZKWatcher(conf, "replicationHFileCleaner", null)); + if (MapUtils.isNotEmpty(params)) { + Object master = params.get(HMaster.MASTER); + if (master != null && master instanceof Server) { + conn = ((Server) master).getConnection(); + shareConn = true; + } + } + if (conn == null) { + conn = ConnectionFactory.createConnection(getConf()); + } + this.rqs = ReplicationStorageFactory.getReplicationQueueStorage(conn, getConf()); } catch (IOException e) { LOG.error("Error while configuring " + this.getClass().getName(), e); } } - @InterfaceAudience.Private - public void setConf(Configuration conf, ZKWatcher zk) { - super.setConf(conf); - try { - initReplicationQueueStorage(conf, zk); - } catch (Exception e) { - LOG.error("Error while configuring " + this.getClass().getName(), e); - } - } - - private void initReplicationQueueStorage(Configuration conf, ZKWatcher zk) { - this.zkw = zk; - this.rqs = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf); - } - @Override public void stop(String why) { if (this.stopped) { return; } this.stopped = true; - if (this.zkw != null) { - LOG.info("Stopping " + this.zkw); - this.zkw.close(); + if (!shareConn && this.conn != null) { + LOG.info("Stopping " + this.conn); + IOUtils.closeQuietly(conn); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java index 54e600e09ada..7135ca9a9b20 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java @@ -26,9 +26,7 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate; -import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; @@ -56,14 +54,15 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate { @Override public void preClean() { readZKTimestamp = EnvironmentEdgeManager.currentTime(); - try { - // The concurrently created new WALs may not be included in the return list, - // but they won't be deleted because they're not in the checking set. - wals = queueStorage.getAllWALs(); - } catch (ReplicationException e) { - LOG.warn("Failed to read zookeeper, skipping checking deletable files"); - wals = null; - } + // TODO: revisit the implementation + // try { + // // The concurrently created new WALs may not be included in the return list, + // // but they won't be deleted because they're not in the checking set. + // wals = queueStorage.getAllWALs(); + // } catch (ReplicationException e) { + // LOG.warn("Failed to read zookeeper, skipping checking deletable files"); + // wals = null; + // } } @Override @@ -115,7 +114,8 @@ public void init(Map params) { if (zkw == null) { zkw = new ZKWatcher(getConf(), "replicationLogCleaner", null); } - this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); + // TODO: revisit the implementation + // this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); } catch (IOException e) { LOG.error("Error while configuring " + this.getClass().getName(), e); } @@ -126,7 +126,8 @@ public void setConf(Configuration conf, ZKWatcher zk) { super.setConf(conf); try { this.zkw = zk; - this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf); + // TODO: revisit the implementation + // this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf); } catch (Exception e) { LOG.error("Error while configuring " + this.getClass().getName(), e); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ClaimReplicationQueueCallable.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ClaimReplicationQueueCallable.java index b9a7be813af8..2b7e14f9f7aa 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ClaimReplicationQueueCallable.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ClaimReplicationQueueCallable.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.executor.EventType; import org.apache.hadoop.hbase.procedure2.BaseRSProcedureCallable; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.yetus.audience.InterfaceAudience; import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; @@ -30,9 +31,7 @@ @InterfaceAudience.Private public class ClaimReplicationQueueCallable extends BaseRSProcedureCallable { - private ServerName crashedServer; - - private String queue; + private ReplicationQueueId queueId; @Override public EventType getEventType() { @@ -42,14 +41,20 @@ public EventType getEventType() { @Override protected void doCall() throws Exception { PeerProcedureHandler handler = rs.getReplicationSourceService().getPeerProcedureHandler(); - handler.claimReplicationQueue(crashedServer, queue); + handler.claimReplicationQueue(queueId); } @Override protected void initParameter(byte[] parameter) throws InvalidProtocolBufferException { ClaimReplicationQueueRemoteParameter param = ClaimReplicationQueueRemoteParameter.parseFrom(parameter); - crashedServer = ProtobufUtil.toServerName(param.getCrashedServer()); - queue = param.getQueue(); + ServerName crashedServer = ProtobufUtil.toServerName(param.getCrashedServer()); + String queue = param.getQueue(); + if (param.hasSourceServer()) { + ServerName sourceServer = ProtobufUtil.toServerName(param.getSourceServer()); + queueId = new ReplicationQueueId(crashedServer, queue, sourceServer); + } else { + queueId = new ReplicationQueueId(crashedServer, queue); + } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java index 4636e239904a..98d0a55fbc43 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -45,10 +44,8 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.zookeeper.ZKDump; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; @@ -59,6 +56,8 @@ import org.apache.hbase.thirdparty.com.google.common.util.concurrent.AtomicLongMap; /** + * TODO: reimplement this tool + *

* Provides information about the existing states of replication, replication peers and queues. * Usage: hbase org.apache.hadoop.hbase.replication.regionserver.DumpReplicationQueues [args] * Arguments: --distributed Polls each RS to dump information about the queue --hdfs Reports HDFS @@ -299,32 +298,33 @@ public String dumpQueues(ZKWatcher zkw, Set peerIds, boolean hdfs) throw ReplicationQueueStorage queueStorage; StringBuilder sb = new StringBuilder(); - queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); - Set liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode) - .stream().map(ServerName::parseServerName).collect(Collectors.toSet()); - + // queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); + // Set liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, + // zkw.getZNodePaths().rsZNode) + // .stream().map(ServerName::parseServerName).collect(Collectors.toSet()); + // // Loops each peer on each RS and dumps the queues - List regionservers = queueStorage.getListOfReplicators(); - if (regionservers == null || regionservers.isEmpty()) { - return sb.toString(); - } - for (ServerName regionserver : regionservers) { - List queueIds = queueStorage.getAllQueues(regionserver); - if (!liveRegionServers.contains(regionserver)) { - deadRegionServers.add(regionserver.getServerName()); - } - for (String queueId : queueIds) { - ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); - List wals = queueStorage.getWALsInQueue(regionserver, queueId); - Collections.sort(wals); - if (!peerIds.contains(queueInfo.getPeerId())) { - deletedQueues.add(regionserver + "/" + queueId); - sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs)); - } else { - sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs)); - } - } - } + // List regionservers = queueStorage.getListOfReplicators(); + // if (regionservers == null || regionservers.isEmpty()) { + // return sb.toString(); + // } + // for (ServerName regionserver : regionservers) { + // List queueIds = queueStorage.getAllQueues(regionserver); + // if (!liveRegionServers.contains(regionserver)) { + // deadRegionServers.add(regionserver.getServerName()); + // } + // for (String queueId : queueIds) { + // ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); + // List wals = queueStorage.getWALsInQueue(regionserver, queueId); + // Collections.sort(wals); + // if (!peerIds.contains(queueInfo.getPeerId())) { + // deletedQueues.add(regionserver + "/" + queueId); + // sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs)); + // } else { + // sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs)); + // } + // } + // } return sb.toString(); } @@ -350,9 +350,9 @@ private String formatQueue(ServerName regionserver, ReplicationQueueStorage queu peersQueueSize.addAndGet(queueInfo.getPeerId(), wals.size()); for (String wal : wals) { - long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal); - sb.append(" Replication position for " + wal + ": " - + (position > 0 ? position : "0" + " (not started or nothing to replicate)") + "\n"); + // long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal); + // sb.append(" Replication position for " + wal + ": " + // + (position > 0 ? position : "0" + " (not started or nothing to replicate)") + "\n"); } if (hdfs) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandler.java index 3df78c1d8313..b2cffd59fd4a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandler.java @@ -18,9 +18,9 @@ package org.apache.hadoop.hbase.replication.regionserver; import java.io.IOException; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.yetus.audience.InterfaceAudience; /** @@ -42,6 +42,5 @@ public interface PeerProcedureHandler { void transitSyncReplicationPeerState(String peerId, int stage, HRegionServer rs) throws ReplicationException, IOException; - void claimReplicationQueue(ServerName crashedServer, String queue) - throws ReplicationException, IOException; + void claimReplicationQueue(ReplicationQueueId queueId) throws ReplicationException, IOException; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandlerImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandlerImpl.java index 0187de14f806..cd3db44d8fa3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandlerImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/PeerProcedureHandlerImpl.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.InterruptedIOException; import java.util.concurrent.locks.Lock; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.LogRoller; import org.apache.hadoop.hbase.replication.ReplicationException; @@ -28,6 +27,7 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerImpl; import org.apache.hadoop.hbase.replication.ReplicationPeers; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; import org.apache.hadoop.hbase.util.KeyLocker; @@ -226,8 +226,8 @@ public void transitSyncReplicationPeerState(String peerId, int stage, HRegionSer } @Override - public void claimReplicationQueue(ServerName crashedServer, String queue) + public void claimReplicationQueue(ReplicationQueueId queueId) throws ReplicationException, IOException { - replicationSourceManager.claimQueue(crashedServer, queue); + replicationSourceManager.claimQueue(queueId); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java index 024248a3f8c9..e740a01dc4f7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java @@ -17,23 +17,7 @@ */ package org.apache.hadoop.hbase.replication.regionserver; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import java.util.concurrent.PriorityBlockingQueue; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.Server; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.replication.ReplicationPeer; -import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.util.CommonFSUtils; -import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.yetus.audience.InterfaceAudience; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Class that handles the recovered source of a replication stream, which is transfered from another @@ -42,124 +26,14 @@ @InterfaceAudience.Private public class RecoveredReplicationSource extends ReplicationSource { - private static final Logger LOG = LoggerFactory.getLogger(RecoveredReplicationSource.class); - - private String actualPeerId; - - @Override - public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager, - ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server, - String peerClusterZnode, UUID clusterId, WALFileLengthProvider walFileLengthProvider, - MetricsSource metrics) throws IOException { - super.init(conf, fs, manager, queueStorage, replicationPeer, server, peerClusterZnode, - clusterId, walFileLengthProvider, metrics); - this.actualPeerId = this.replicationQueueInfo.getPeerId(); - } - @Override protected RecoveredReplicationSourceShipper createNewShipper(String walGroupId) { - return new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, this, queueStorage); - } - - public void locateRecoveredPaths(String walGroupId) throws IOException { - boolean hasPathChanged = false; - PriorityBlockingQueue queue = logQueue.getQueue(walGroupId); - PriorityBlockingQueue newPaths = new PriorityBlockingQueue(queueSizePerGroup, - new AbstractFSWALProvider.WALStartTimeComparator()); - pathsLoop: for (Path path : queue) { - if (fs.exists(path)) { // still in same location, don't need to do anything - newPaths.add(path); - continue; - } - // Path changed - try to find the right path. - hasPathChanged = true; - if (server instanceof ReplicationSyncUp.DummyServer) { - // In the case of disaster/recovery, HMaster may be shutdown/crashed before flush data - // from .logs to .oldlogs. Loop into .logs folders and check whether a match exists - Path newPath = getReplSyncUpPath(path); - newPaths.add(newPath); - continue; - } else { - // See if Path exists in the dead RS folder (there could be a chain of failures - // to look at) - List deadRegionServers = this.replicationQueueInfo.getDeadRegionServers(); - LOG.info("NB dead servers : " + deadRegionServers.size()); - final Path walDir = CommonFSUtils.getWALRootDir(conf); - for (ServerName curDeadServerName : deadRegionServers) { - final Path deadRsDirectory = new Path(walDir, - AbstractFSWALProvider.getWALDirectoryName(curDeadServerName.getServerName())); - Path[] locs = new Path[] { new Path(deadRsDirectory, path.getName()), - new Path(deadRsDirectory.suffix(AbstractFSWALProvider.SPLITTING_EXT), path.getName()) }; - for (Path possibleLogLocation : locs) { - LOG.info("Possible location " + possibleLogLocation.toUri().toString()); - if (manager.getFs().exists(possibleLogLocation)) { - // We found the right new location - LOG.info("Log " + path + " still exists at " + possibleLogLocation); - newPaths.add(possibleLogLocation); - continue pathsLoop; - } - } - } - // didn't find a new location - LOG.error( - String.format("WAL Path %s doesn't exist and couldn't find its new location", path)); - newPaths.add(path); - } - } - - if (hasPathChanged) { - if (newPaths.size() != queue.size()) { // this shouldn't happen - LOG.error("Recovery queue size is incorrect"); - throw new IOException("Recovery queue size error"); - } - // put the correct locations in the queue - // since this is a recovered queue with no new incoming logs, - // there shouldn't be any concurrency issues - logQueue.clear(walGroupId); - for (Path path : newPaths) { - logQueue.enqueueLog(path, walGroupId); - } - } - } - - // N.B. the ReplicationSyncUp tool sets the manager.getWALDir to the root of the wal - // area rather than to the wal area for a particular region server. - private Path getReplSyncUpPath(Path path) throws IOException { - FileStatus[] rss = fs.listStatus(manager.getLogDir()); - for (FileStatus rs : rss) { - Path p = rs.getPath(); - FileStatus[] logs = fs.listStatus(p); - for (FileStatus log : logs) { - p = new Path(p, log.getPath().getName()); - if (p.getName().equals(path.getName())) { - LOG.info("Log " + p.getName() + " found at " + p); - return p; + return new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, this, queueStorage, + () -> { + if (workerThreads.isEmpty()) { + this.getSourceMetrics().clear(); + manager.finishRecoveredSource(this); } - } - } - LOG.error("Didn't find path for: " + path.getName()); - return path; - } - - void tryFinish() { - if (workerThreads.isEmpty()) { - this.getSourceMetrics().clear(); - manager.finishRecoveredSource(this); - } - } - - @Override - public String getPeerId() { - return this.actualPeerId; - } - - @Override - public ServerName getServerWALsBelongTo() { - return this.replicationQueueInfo.getDeadRegionServers().get(0); - } - - @Override - public boolean isRecovered() { - return true; + }); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java index 4f2bafcf156d..2bb3a7c3591c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSourceShipper.java @@ -17,83 +17,27 @@ */ package org.apache.hadoop.hbase.replication.regionserver; -import java.io.IOException; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.util.Threads; import org.apache.yetus.audience.InterfaceAudience; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Used by a {@link RecoveredReplicationSource}. */ @InterfaceAudience.Private public class RecoveredReplicationSourceShipper extends ReplicationSourceShipper { - private static final Logger LOG = - LoggerFactory.getLogger(RecoveredReplicationSourceShipper.class); - protected final RecoveredReplicationSource source; - private final ReplicationQueueStorage replicationQueues; + private final Runnable tryFinish; public RecoveredReplicationSourceShipper(Configuration conf, String walGroupId, ReplicationSourceLogQueue logQueue, RecoveredReplicationSource source, - ReplicationQueueStorage queueStorage) { + ReplicationQueueStorage queueStorage, Runnable tryFinish) { super(conf, walGroupId, logQueue, source); - this.source = source; - this.replicationQueues = queueStorage; + this.tryFinish = tryFinish; } @Override protected void postFinish() { - source.tryFinish(); - } - - @Override - public long getStartPosition() { - long startPosition = getRecoveredQueueStartPos(); - int numRetries = 0; - while (numRetries <= maxRetriesMultiplier) { - try { - source.locateRecoveredPaths(walGroupId); - break; - } catch (IOException e) { - LOG.error("Error while locating recovered queue paths, attempt #" + numRetries, e); - numRetries++; - } - } - return startPosition; - } - - // If this is a recovered queue, the queue is already full and the first log - // normally has a position (unless the RS failed between 2 logs) - private long getRecoveredQueueStartPos() { - long startPosition = 0; - String peerClusterZNode = source.getQueueId(); - try { - startPosition = this.replicationQueues.getWALPosition(source.getServer().getServerName(), - peerClusterZNode, this.logQueue.getQueue(walGroupId).peek().getName()); - LOG.trace("Recovered queue started with log {} at position {}", - this.logQueue.getQueue(walGroupId).peek(), startPosition); - } catch (ReplicationException e) { - terminate("Couldn't get the position of this recovered queue " + peerClusterZNode, e); - } - return startPosition; - } - - private void terminate(String reason, Exception cause) { - if (cause == null) { - LOG.info("Closing worker for wal group {} because: {}", this.walGroupId, reason); - } else { - LOG.error( - "Closing worker for wal group " + this.walGroupId + " because an error occurred: " + reason, - cause); - } - entryReader.interrupt(); - Threads.shutdown(entryReader, sleepForRetries); - this.interrupt(); - Threads.shutdown(this, sleepForRetries); - LOG.info("ReplicationSourceWorker {} terminated", this.getName()); + tryFinish.run(); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/Replication.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/Replication.java index 338718bd8ea5..6279c4b9596c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/Replication.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/Replication.java @@ -100,7 +100,7 @@ public void initialize(Server server, FileSystem fs, Path logDir, Path oldLogDir try { this.queueStorage = - ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf); + ReplicationStorageFactory.getReplicationQueueStorage(server.getConnection(), conf); this.replicationPeers = ReplicationFactory.getReplicationPeers(server.getFileSystem(), server.getZooKeeper(), this.conf); this.replicationPeers.init(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLoad.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLoad.java index 00306dd1702a..7e8ca88b7303 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLoad.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLoad.java @@ -89,8 +89,8 @@ public void buildReplicationLoad(final List sources, rLoadSourceBuild.setOPsShipped(oPsShipped); if (source instanceof ReplicationSource) { ReplicationSource replSource = (ReplicationSource) source; - rLoadSourceBuild.setRecovered(replSource.getReplicationQueueInfo().isQueueRecovered()); - rLoadSourceBuild.setQueueId(replSource.getReplicationQueueInfo().getQueueId()); + rLoadSourceBuild.setRecovered(replSource.getQueueId().isRecovered()); + rLoadSourceBuild.setQueueId(replSource.getQueueId().toString()); rLoadSourceBuild.setRunning(replSource.isWorkerRunning()); rLoadSourceBuild.setEditsSinceRestart(timeStampOfNextToReplicate > 0); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index 034204456380..a49bfd7b623d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -19,6 +19,7 @@ import static org.apache.hadoop.hbase.wal.AbstractFSWALProvider.findArchivedLog; +import com.google.errorprone.annotations.RestrictedApi; import java.io.FileNotFoundException; import java.io.IOException; import java.lang.reflect.InvocationTargetException; @@ -52,8 +53,10 @@ import org.apache.hadoop.hbase.replication.ClusterMarkingEntryFilter; import org.apache.hadoop.hbase.replication.ReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeer; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter; import org.apache.hadoop.hbase.replication.WALEntryFilter; @@ -67,6 +70,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; import org.apache.hbase.thirdparty.com.google.common.collect.Lists; /** @@ -90,7 +94,6 @@ public class ReplicationSource implements ReplicationSourceInterface { protected ReplicationPeer replicationPeer; protected Configuration conf; - protected ReplicationQueueInfo replicationQueueInfo; // The manager of all sources to which we ping back our progress protected ReplicationSourceManager manager; @@ -103,8 +106,11 @@ public class ReplicationSource implements ReplicationSourceInterface { private UUID clusterId; // total number of edits we replicated private AtomicLong totalReplicatedEdits = new AtomicLong(0); - // The znode we currently play with - protected String queueId; + // The id of the replication queue + protected ReplicationQueueId queueId; + // The start offsets. Usually only recovered replication queue needs this, but probably when we + // update the peer config and restart the replication peer, we also need this? + protected ImmutableMap startOffsets; // Maximum number of retries before taking bold actions private int maxRetriesMultiplier; // Indicates if this particular source is running @@ -184,14 +190,14 @@ public class ReplicationSource implements ReplicationSourceInterface { * @param fs file system to use * @param manager replication manager to ping to * @param server the server for this region server - * @param queueId the id of our replication queue + * @param queueData the id and offsets of our replication queue * @param clusterId unique UUID for the cluster * @param metrics metrics for replication source */ @Override public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager, ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server, - String queueId, UUID clusterId, WALFileLengthProvider walFileLengthProvider, + ReplicationQueueData queueData, UUID clusterId, WALFileLengthProvider walFileLengthProvider, MetricsSource metrics) throws IOException { this.server = server; this.conf = HBaseConfiguration.create(conf); @@ -211,8 +217,8 @@ public void init(Configuration conf, FileSystem fs, ReplicationSourceManager man this.metrics = metrics; this.clusterId = clusterId; - this.queueId = queueId; - this.replicationQueueInfo = new ReplicationQueueInfo(queueId); + this.queueId = queueData.getId(); + this.startOffsets = queueData.getOffsets(); // A defaultBandwidth of '0' means no bandwidth; i.e. no throttling. defaultBandwidth = this.conf.getLong("replication.source.per.peer.node.bandwidth", 0); @@ -240,24 +246,24 @@ public void enqueueLog(Path wal) { return; } // Use WAL prefix as the WALGroupId for this peer. - String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName()); - boolean queueExists = logQueue.enqueueLog(wal, walPrefix); + String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName()); + boolean queueExists = logQueue.enqueueLog(wal, walGroupId); if (!queueExists) { if (this.isSourceActive() && this.walEntryFilter != null) { // new wal group observed after source startup, start a new worker thread to track it // notice: it's possible that wal enqueued when this.running is set but worker thread // still not launched, so it's necessary to check workerThreads before start the worker - tryStartNewShipper(walPrefix); + tryStartNewShipper(walGroupId); } } if (LOG.isTraceEnabled()) { - LOG.trace("{} Added wal {} to queue of source {}.", logPeerId(), walPrefix, - this.replicationQueueInfo.getQueueId()); + LOG.trace("{} Added wal {} to queue of source {}.", logPeerId(), walGroupId, queueId); } } - @InterfaceAudience.Private + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") public Map> getQueues() { return logQueue.getQueues(); } @@ -330,6 +336,30 @@ private void initializeWALEntryFilter(UUID peerClusterId) { this.walEntryFilter = new ChainWALEntryFilter(filters); } + private long getStartOffset(String walGroupId) { + ReplicationGroupOffset startOffset = startOffsets.get(walGroupId); + if (startOffset == null || startOffset == ReplicationGroupOffset.BEGIN) { + return 0L; + } + // this method will only be called when start new shipper, and we will only start new shipper + // when there is a new queue, so here the queue for walGroupId will never be null. + Path first = logQueue.getQueue(walGroupId).peek(); + if (!startOffset.getWal().equals(first.getName())) { + return 0L; + } + // Usually, if we arrive here, the start offset should never be -1, as it means this file has + // been fully replicated so we should have filtered it out in upper layer, usually in + // ReplicationSourceManager. Add a warn message for safety, as usually replicate more data will + // not cause big problems. + if (startOffset.getOffset() < 0) { + LOG.warn("Should have already replicated wal {}, return start offset as 0", + startOffset.getWal()); + return 0L; + } else { + return startOffset.getOffset(); + } + } + private void tryStartNewShipper(String walGroupId) { workerThreads.compute(walGroupId, (key, value) -> { if (value != null) { @@ -339,7 +369,7 @@ private void tryStartNewShipper(String walGroupId) { LOG.debug("{} starting shipping worker for walGroupId={}", logPeerId(), walGroupId); ReplicationSourceShipper worker = createNewShipper(walGroupId); ReplicationSourceWALReader walReader = - createNewWALReader(walGroupId, worker.getStartPosition()); + createNewWALReader(walGroupId, getStartOffset(walGroupId)); Threads.setDaemonThreadRunning( walReader, Thread.currentThread().getName() + ".replicationSource.wal-reader." + walGroupId + "," + queueId, @@ -568,7 +598,7 @@ private void initialize() { } LOG.info("{} queueId={} (queues={}) is replicating from cluster={} to cluster={}", logPeerId(), - this.replicationQueueInfo.getQueueId(), logQueue.getNumQueues(), clusterId, peerClusterId); + queueId, logQueue.getNumQueues(), clusterId, peerClusterId); initializeWALEntryFilter(peerClusterId); // Start workers for (String walGroupId : logQueue.getQueues().keySet()) { @@ -715,7 +745,7 @@ public void terminate(String reason, Exception cause, boolean clearMetrics, bool } @Override - public String getQueueId() { + public ReplicationQueueId getQueueId() { return this.queueId; } @@ -735,10 +765,6 @@ public boolean isSourceActive() { return !this.server.isStopped() && this.sourceRunning; } - public ReplicationQueueInfo getReplicationQueueInfo() { - return replicationQueueInfo; - } - public boolean isWorkerRunning() { for (ReplicationSourceShipper worker : this.workerThreads.values()) { if (worker.isActive()) { @@ -791,7 +817,7 @@ public WALFileLengthProvider getWALFileLengthProvider() { @Override public ServerName getServerWALsBelongTo() { - return server.getServerName(); + return queueId.getServerWALsBelongTo(); } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceFactory.java index 331f2269cf9d..225c6fd4d745 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceFactory.java @@ -18,7 +18,7 @@ package org.apache.hadoop.hbase.replication.regionserver; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,9 +34,8 @@ public final class ReplicationSourceFactory { private ReplicationSourceFactory() { } - static ReplicationSourceInterface create(Configuration conf, String queueId) { - ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(queueId); - boolean isQueueRecovered = replicationQueueInfo.isQueueRecovered(); + static ReplicationSourceInterface create(Configuration conf, ReplicationQueueId queueId) { + boolean isQueueRecovered = queueId.isRecovered(); ReplicationSourceInterface src; try { String defaultReplicationSourceImpl = isQueueRecovered diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java index fa026b919f3b..69ad2887064a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceInterface.java @@ -31,6 +31,8 @@ import org.apache.hadoop.hbase.replication.ReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeer; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.wal.WAL.Entry; @@ -43,14 +45,22 @@ public interface ReplicationSourceInterface { /** * Initializer for the source - * @param conf the configuration to use - * @param fs the file system to use - * @param manager the manager to use - * @param server the server for this region server + * @param conf the configuration to use + * @param fs the file system to use + * @param manager the manager to use + * @param queueStorage the replication queue storage + * @param replicationPeer the replication peer + * @param server the server for this region server + * @param queueData the existing replication queue data, contains the queue id and + * replication start offsets + * @param clusterId the cluster id + * @param walFileLengthProvider for getting the length of the WAL file which is currently being + * written + * @param metrics the replication metrics */ void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager, ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server, - String queueId, UUID clusterId, WALFileLengthProvider walFileLengthProvider, + ReplicationQueueData queueData, UUID clusterId, WALFileLengthProvider walFileLengthProvider, MetricsSource metrics) throws IOException; /** @@ -106,14 +116,14 @@ void addHFileRefs(TableName tableName, byte[] family, List> pai * Get the queue id that the source is replicating to * @return queue id */ - String getQueueId(); + ReplicationQueueId getQueueId(); /** * Get the id that the source is replicating to. * @return peer id */ default String getPeerId() { - return getPeer().getId(); + return getQueueId().getPeerId(); } /** @@ -183,7 +193,7 @@ default Map getWalGroupStatus() { /** Returns whether this is a replication source for recovery. */ default boolean isRecovered() { - return false; + return getQueueId().isRecovered(); } /** Returns The instance of queueStorage used by this ReplicationSource. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceLogQueue.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceLogQueue.java index a0e6f1b8d1fa..93a28b60d274 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceLogQueue.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceLogQueue.java @@ -97,7 +97,7 @@ public boolean enqueueLog(Path wal, String walGroupId) { * @param walGroupId walGroupId */ public int getQueueSize(String walGroupId) { - Queue queue = queues.get(walGroupId); + Queue queue = queues.get(walGroupId); if (queue == null) { return 0; } @@ -117,7 +117,7 @@ public Map> getQueues() { /** * Return queue for the given walGroupId Please don't add or remove elements from the returned - * queue. Use @enqueueLog and @remove methods respectively. + * queue. Use {@link #enqueueLog(Path, String)} and {@link #remove(String)} methods respectively. * @param walGroupId walGroupId */ public PriorityBlockingQueue getQueue(String walGroupId) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index eb991b7f349e..c16ba8b133c6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -17,17 +17,22 @@ */ package org.apache.hadoop.hbase.replication.regionserver; +import com.google.errorprone.annotations.RestrictedApi; import java.io.FileNotFoundException; import java.io.IOException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NavigableSet; import java.util.OptionalLong; +import java.util.PriorityQueue; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; @@ -49,12 +54,13 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeer; -import org.apache.hadoop.hbase.replication.ReplicationPeer.PeerState; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerImpl; import org.apache.hadoop.hbase.replication.ReplicationPeers; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; @@ -68,6 +74,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; import org.apache.hbase.thirdparty.com.google.common.collect.Sets; import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; @@ -90,26 +97,25 @@ * operations. *

  • Need synchronized on {@link #walsById}. There are four methods which modify it, * {@link #addPeer(String)}, {@link #removePeer(String)}, - * {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and {@link #preLogRoll(Path)}. - * {@link #walsById} is a ConcurrentHashMap and there is a Lock for peer id in - * {@link PeerProcedureHandlerImpl}. So there is no race between {@link #addPeer(String)} and + * {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and + * {@link #postLogRoll(Path)}. {@link #walsById} is a ConcurrentHashMap and there is a Lock for peer + * id in {@link PeerProcedureHandlerImpl}. So there is no race between {@link #addPeer(String)} and * {@link #removePeer(String)}. {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} * is called by {@link ReplicationSourceInterface}. So no race with {@link #addPeer(String)}. * {@link #removePeer(String)} will terminate the {@link ReplicationSourceInterface} firstly, then * remove the wals from {@link #walsById}. So no race with {@link #removePeer(String)}. The only * case need synchronized is {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and - * {@link #preLogRoll(Path)}.
  • + * {@link #postLogRoll(Path)}. *
  • No need synchronized on {@link #walsByIdRecoveredQueues}. There are three methods which * modify it, {@link #removePeer(String)} , * {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and - * {@link ReplicationSourceManager#claimQueue(ServerName, String)}. + * {@link #claimQueue(ReplicationQueueId)}. * {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} is called by * {@link ReplicationSourceInterface}. {@link #removePeer(String)} will terminate the * {@link ReplicationSourceInterface} firstly, then remove the wals from - * {@link #walsByIdRecoveredQueues}. And - * {@link ReplicationSourceManager#claimQueue(ServerName, String)} will add the wals to - * {@link #walsByIdRecoveredQueues} firstly, then start up a {@link ReplicationSourceInterface}. So - * there is no race here. For {@link ReplicationSourceManager#claimQueue(ServerName, String)} and + * {@link #walsByIdRecoveredQueues}. And {@link #claimQueue(ReplicationQueueId)} will add the wals + * to {@link #walsByIdRecoveredQueues} firstly, then start up a {@link ReplicationSourceInterface}. + * So there is no race here. For {@link #claimQueue(ReplicationQueueId)} and * {@link #removePeer(String)}, there is already synchronized on {@link #oldsources}. So no need * synchronized on {@link #walsByIdRecoveredQueues}.
  • *
  • Need synchronized on {@link #latestPaths} to avoid the new open source miss new log.
  • @@ -141,11 +147,12 @@ public class ReplicationSourceManager { // All logs we are currently tracking // Index structure of the map is: queue_id->logPrefix/logGroup->logs // For normal replication source, the peer id is same with the queue id - private final ConcurrentMap>> walsById; + private final ConcurrentMap>> walsById; // Logs for recovered sources we are currently tracking // the map is: queue_id->logPrefix/logGroup->logs // For recovered source, the queue id's format is peer_id-servername-* - private final ConcurrentMap>> walsByIdRecoveredQueues; + private final ConcurrentMap>> walsByIdRecoveredQueues; private final SyncReplicationPeerMappingManager syncReplicationPeerMappingManager; @@ -163,8 +170,6 @@ public class ReplicationSourceManager { // Homemade executer service for replication private final ThreadPoolExecutor executor; - private final boolean replicationForBulkLoadDataEnabled; - private AtomicLong totalBufferUsed = new AtomicLong(); // How long should we sleep for each retry when deleting remote wal files for sync replication @@ -219,8 +224,6 @@ public ReplicationSourceManager(ReplicationQueueStorage queueStorage, tfb.setDaemon(true); this.executor.setThreadFactory(tfb.build()); this.latestPaths = new HashMap<>(); - this.replicationForBulkLoadDataEnabled = conf.getBoolean( - HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT); this.sleepForRetries = this.conf.getLong("replication.source.sync.sleepforretries", 1000); this.maxRetriesMultiplier = this.conf.getInt("replication.source.sync.maxretriesmultiplier", 60); @@ -235,11 +238,6 @@ public ReplicationSourceManager(ReplicationQueueStorage queueStorage, void init() throws IOException { for (String id : this.replicationPeers.getAllPeerIds()) { addSource(id); - if (replicationForBulkLoadDataEnabled) { - // Check if peer exists in hfile-refs queue, if not add it. This can happen in the case - // when a peer was added before replication for bulk loaded data was enabled. - throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(id)); - } } } @@ -260,9 +258,6 @@ public void addPeer(String peerId) throws IOException { } if (added) { addSource(peerId); - if (replicationForBulkLoadDataEnabled) { - throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(peerId)); - } } } @@ -293,26 +288,17 @@ public void removePeer(String peerId) { removeRecoveredSource(src); } } - LOG - .info("Number of deleted recovered sources for " + peerId + ": " + oldSourcesToDelete.size()); + LOG.info("Number of deleted recovered sources for {}: {}", peerId, oldSourcesToDelete.size()); // Now close the normal source for this peer ReplicationSourceInterface srcToRemove = this.sources.get(peerId); if (srcToRemove != null) { srcToRemove.terminate(terminateMessage); removeSource(srcToRemove); - } else { - // This only happened in unit test TestReplicationSourceManager#testPeerRemovalCleanup - // Delete queue from storage and memory and queue id is same with peer id for normal - // source - deleteQueue(peerId); - this.walsById.remove(peerId); } ReplicationPeerConfig peerConfig = peer.getPeerConfig(); if (peerConfig.isSyncReplication()) { syncReplicationPeerMappingManager.remove(peerId, peerConfig); } - // Remove HFile Refs - abortWhenFail(() -> this.queueStorage.removePeerFromHFileRefs(peerId)); } /** @@ -320,17 +306,17 @@ public void removePeer(String peerId) { * @param queueId the id of the replication queue to associate the ReplicationSource with. * @see #createCatalogReplicationSource(RegionInfo) for creating a ReplicationSource for meta. */ - private ReplicationSourceInterface createSource(String queueId, ReplicationPeer replicationPeer) - throws IOException { - ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, queueId); + private ReplicationSourceInterface createSource(ReplicationQueueData queueData, + ReplicationPeer replicationPeer) throws IOException { + ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, queueData.getId()); // Init the just created replication source. Pass the default walProvider's wal file length // provider. Presumption is we replicate user-space Tables only. For hbase:meta region replica // replication, see #createCatalogReplicationSource(). WALFileLengthProvider walFileLengthProvider = this.walFactory.getWALProvider() != null ? this.walFactory.getWALProvider().getWALFileLengthProvider() : p -> OptionalLong.empty(); - src.init(conf, fs, this, queueStorage, replicationPeer, server, queueId, clusterId, - walFileLengthProvider, new MetricsSource(queueId)); + src.init(conf, fs, this, queueStorage, replicationPeer, server, queueData, clusterId, + walFileLengthProvider, new MetricsSource(queueData.getId().toString())); return src; } @@ -351,12 +337,14 @@ void addSource(String peerId) throws IOException { LOG.info("Legacy region replication peer found, skip adding: {}", peer.getPeerConfig()); return; } - ReplicationSourceInterface src = createSource(peerId, peer); + ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId); + ReplicationSourceInterface src = + createSource(new ReplicationQueueData(queueId, ImmutableMap.of()), peer); // synchronized on latestPaths to avoid missing the new log synchronized (this.latestPaths) { this.sources.put(peerId, src); Map> walsByGroup = new HashMap<>(); - this.walsById.put(peerId, walsByGroup); + this.walsById.put(queueId, walsByGroup); // Add the latest wal to that source's queue if (!latestPaths.isEmpty()) { for (Map.Entry walPrefixAndPath : latestPaths.entrySet()) { @@ -365,8 +353,10 @@ void addSource(String peerId) throws IOException { wals.add(walPath.getName()); walsByGroup.put(walPrefixAndPath.getKey(), wals); // Abort RS and throw exception to make add peer failed + // TODO: can record the length of the current wal file so we could replicate less data abortAndThrowIOExceptionWhenFail( - () -> this.queueStorage.addWAL(server.getServerName(), peerId, walPath.getName())); + () -> this.queueStorage.setOffset(queueId, walPrefixAndPath.getKey(), + new ReplicationGroupOffset(walPath.getName(), 0), Collections.emptyMap())); src.enqueueLog(walPath); LOG.trace("Enqueued {} to source {} during source creation.", walPath, src.getQueueId()); } @@ -399,7 +389,10 @@ public void drainSources(String peerId) throws IOException, ReplicationException + " is transiting to STANDBY. Will close the previous replication source and open a new one"; ReplicationPeer peer = replicationPeers.getPeer(peerId); assert peer.getPeerConfig().isSyncReplication(); - ReplicationSourceInterface src = createSource(peerId, peer); + ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId); + // TODO: use empty initial offsets for now, revisit when adding support for sync replication + ReplicationSourceInterface src = + createSource(new ReplicationQueueData(queueId, ImmutableMap.of()), peer); // synchronized here to avoid race with preLogRoll where we add new log to source and also // walsById. ReplicationSourceInterface toRemove; @@ -416,17 +409,18 @@ public void drainSources(String peerId) throws IOException, ReplicationException // map from walsById since later we may fail to delete them from the replication queue // storage, and when we retry next time, we can not know the wal files that need to be deleted // from the replication queue storage. - walsById.get(peerId).forEach((k, v) -> wals.put(k, new TreeSet<>(v))); + walsById.get(queueId).forEach((k, v) -> wals.put(k, new TreeSet<>(v))); } LOG.info("Startup replication source for " + src.getPeerId()); src.startup(); for (NavigableSet walsByGroup : wals.values()) { - for (String wal : walsByGroup) { - queueStorage.removeWAL(server.getServerName(), peerId, wal); - } + // TODO: just need to reset the replication offset + // for (String wal : walsByGroup) { + // queueStorage.removeWAL(server.getServerName(), peerId, wal); + // } } synchronized (walsById) { - Map> oldWals = walsById.get(peerId); + Map> oldWals = walsById.get(queueId); wals.forEach((k, v) -> { NavigableSet walsByGroup = oldWals.get(k); if (walsByGroup != null) { @@ -441,17 +435,28 @@ public void drainSources(String peerId) throws IOException, ReplicationException for (Iterator iter = oldsources.iterator(); iter.hasNext();) { ReplicationSourceInterface oldSource = iter.next(); if (oldSource.getPeerId().equals(peerId)) { - String queueId = oldSource.getQueueId(); + ReplicationQueueId oldSourceQueueId = oldSource.getQueueId(); oldSource.terminate(terminateMessage); oldSource.getSourceMetrics().clear(); - queueStorage.removeQueue(server.getServerName(), queueId); - walsByIdRecoveredQueues.remove(queueId); + queueStorage.removeQueue(oldSourceQueueId); + walsByIdRecoveredQueues.remove(oldSourceQueueId); iter.remove(); } } } } + private ReplicationSourceInterface createRefreshedSource(ReplicationQueueId queueId, + ReplicationPeer peer) throws IOException { + Map offsets; + try { + offsets = queueStorage.getOffsets(queueId); + } catch (ReplicationException e) { + throw new IOException(e); + } + return createSource(new ReplicationQueueData(queueId, ImmutableMap.copyOf(offsets)), peer); + } + /** * Close the previous replication sources of this peer id and open new sources to trigger the new * replication state changes or new replication config changes. Here we don't need to change @@ -462,6 +467,7 @@ public void refreshSources(String peerId) throws IOException { String terminateMessage = "Peer " + peerId + " state or config changed. Will close the previous replication source and open a new one"; ReplicationPeer peer = replicationPeers.getPeer(peerId); + ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId); ReplicationSourceInterface src; // synchronized on latestPaths to avoid missing the new log synchronized (this.latestPaths) { @@ -471,9 +477,9 @@ public void refreshSources(String peerId) throws IOException { // Do not clear metrics toRemove.terminate(terminateMessage, null, false); } - src = createSource(peerId, peer); + src = createRefreshedSource(queueId, peer); this.sources.put(peerId, src); - for (NavigableSet walsByGroup : walsById.get(peerId).values()) { + for (NavigableSet walsByGroup : walsById.get(queueId).values()) { walsByGroup.forEach(wal -> src.enqueueLog(new Path(this.logDir, wal))); } } @@ -483,20 +489,22 @@ public void refreshSources(String peerId) throws IOException { List toStartup = new ArrayList<>(); // synchronized on oldsources to avoid race with NodeFailoverWorker synchronized (this.oldsources) { - List previousQueueIds = new ArrayList<>(); + List oldSourceQueueIds = new ArrayList<>(); for (Iterator iter = this.oldsources.iterator(); iter .hasNext();) { ReplicationSourceInterface oldSource = iter.next(); if (oldSource.getPeerId().equals(peerId)) { - previousQueueIds.add(oldSource.getQueueId()); + oldSourceQueueIds.add(oldSource.getQueueId()); oldSource.terminate(terminateMessage); iter.remove(); } } - for (String queueId : previousQueueIds) { - ReplicationSourceInterface recoveredReplicationSource = createSource(queueId, peer); + for (ReplicationQueueId oldSourceQueueId : oldSourceQueueIds) { + ReplicationSourceInterface recoveredReplicationSource = + createRefreshedSource(oldSourceQueueId, peer); this.oldsources.add(recoveredReplicationSource); - for (SortedSet walsByGroup : walsByIdRecoveredQueues.get(queueId).values()) { + for (SortedSet walsByGroup : walsByIdRecoveredQueues.get(oldSourceQueueId) + .values()) { walsByGroup.forEach(wal -> recoveredReplicationSource.enqueueLog(new Path(wal))); } toStartup.add(recoveredReplicationSource); @@ -549,8 +557,8 @@ void removeSource(ReplicationSourceInterface src) { * Delete a complete queue of wals associated with a replication source * @param queueId the id of replication queue to delete */ - private void deleteQueue(String queueId) { - abortWhenFail(() -> this.queueStorage.removeQueue(server.getServerName(), queueId)); + private void deleteQueue(ReplicationQueueId queueId) { + abortWhenFail(() -> this.queueStorage.removeQueue(queueId)); } @FunctionalInterface @@ -616,10 +624,15 @@ private void abortAndThrowIOExceptionWhenFail(ReplicationQueueOperation op) thro */ public void logPositionAndCleanOldLogs(ReplicationSourceInterface source, WALEntryBatch entryBatch) { - String fileName = entryBatch.getLastWalPath().getName(); - interruptOrAbortWhenFail(() -> this.queueStorage.setWALPosition(server.getServerName(), - source.getQueueId(), fileName, entryBatch.getLastWalPosition(), entryBatch.getLastSeqIds())); - cleanOldLogs(fileName, entryBatch.isEndOfFile(), source); + String walName = entryBatch.getLastWalPath().getName(); + String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(walName); + // if end of file, we just set the offset to -1 so we know that this file has already been fully + // replicated, otherwise we need to compare the file length + ReplicationGroupOffset offset = new ReplicationGroupOffset(walName, + entryBatch.isEndOfFile() ? -1 : entryBatch.getLastWalPosition()); + interruptOrAbortWhenFail(() -> this.queueStorage.setOffset(source.getQueueId(), walPrefix, + offset, entryBatch.getLastSeqIds())); + cleanOldLogs(walName, entryBatch.isEndOfFile(), source); } /** @@ -644,7 +657,7 @@ void cleanOldLogs(String log, boolean inclusive, ReplicationSourceInterface sour } else { NavigableSet wals; NavigableSet walsToRemove; - // synchronized on walsById to avoid race with preLogRoll + // synchronized on walsById to avoid race with postLogRoll synchronized (this.walsById) { wals = walsById.get(source.getQueueId()).get(logPrefix); if (wals == null) { @@ -726,33 +739,21 @@ private void cleanOldLogs(NavigableSet wals, ReplicationSourceInterface } } } - String queueId = source.getQueueId(); - for (String wal : wals) { - interruptOrAbortWhenFail( - () -> this.queueStorage.removeWAL(server.getServerName(), queueId, wal)); - } } // public because of we call it in TestReplicationEmptyWALRecovery - public void preLogRoll(Path newLog) throws IOException { + public void postLogRoll(Path newLog) throws IOException { String logName = newLog.getName(); String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(logName); // synchronized on latestPaths to avoid the new open source miss the new log synchronized (this.latestPaths) { - // Add log to queue storage - for (ReplicationSourceInterface source : this.sources.values()) { - // If record log to queue storage failed, abort RS and throw exception to make log roll - // failed - abortAndThrowIOExceptionWhenFail( - () -> this.queueStorage.addWAL(server.getServerName(), source.getQueueId(), logName)); - } - // synchronized on walsById to avoid race with cleanOldLogs synchronized (this.walsById) { // Update walsById map - for (Map.Entry>> entry : this.walsById + for (Map.Entry>> entry : this.walsById .entrySet()) { - String peerId = entry.getKey(); + ReplicationQueueId queueId = entry.getKey(); + String peerId = queueId.getPeerId(); Map> walsByPrefix = entry.getValue(); boolean existingPrefix = false; for (Map.Entry> walsEntry : walsByPrefix.entrySet()) { @@ -780,10 +781,6 @@ public void preLogRoll(Path newLog) throws IOException { // Add to latestPaths latestPaths.put(logPrefix, newLog); } - } - - // public because of we call it in TestReplicationEmptyWALRecovery - public void postLogRoll(Path newLog) throws IOException { // This only updates the sources we own, not the recovered ones for (ReplicationSourceInterface source : this.sources.values()) { source.enqueueLog(newLog); @@ -792,7 +789,29 @@ public void postLogRoll(Path newLog) throws IOException { } } - void claimQueue(ServerName deadRS, String queue) { + /** + * Check whether we should replicate the given {@code wal}. + * @param wal the file name of the wal + * @return {@code true} means we should replicate the given {@code wal}, otherwise {@code false}. + */ + private boolean shouldReplicate(ReplicationGroupOffset offset, String wal) { + if (offset == null || offset == ReplicationGroupOffset.BEGIN) { + return false; + } + long walTs = AbstractFSWALProvider.getTimestamp(wal); + long startWalTs = AbstractFSWALProvider.getTimestamp(offset.getWal()); + if (walTs < startWalTs) { + return false; + } else if (walTs > startWalTs) { + return true; + } + // if the timestamp equals, usually it means we should include this wal but there is a special + // case, a negative offset means the wal has already been fully replicated, so here we should + // check the offset. + return offset.getOffset() >= 0; + } + + void claimQueue(ReplicationQueueId queueId) { // Wait a bit before transferring the queues, we may be shutting down. // This sleep may not be enough in some cases. try { @@ -807,66 +826,83 @@ void claimQueue(ServerName deadRS, String queue) { LOG.info("Not transferring queue since we are shutting down"); return; } - // After claim the queues from dead region server, wewill skip to start the + // After claim the queues from dead region server, we will skip to start the // RecoveredReplicationSource if the peer has been removed. but there's possible that remove a // peer with peerId = 2 and add a peer with peerId = 2 again during failover. So we need to get // a copy of the replication peer first to decide whether we should start the // RecoveredReplicationSource. If the latest peer is not the old peer, we should also skip to // start the RecoveredReplicationSource, Otherwise the rs will abort (See HBASE-20475). - String peerId = new ReplicationQueueInfo(queue).getPeerId(); + String peerId = queueId.getPeerId(); ReplicationPeerImpl oldPeer = replicationPeers.getPeer(peerId); if (oldPeer == null) { LOG.info("Not transferring queue since the replication peer {} for queue {} does not exist", - peerId, queue); + peerId, queueId); return; } - Pair> claimedQueue; + Map offsets; try { - claimedQueue = queueStorage.claimQueue(deadRS, queue, server.getServerName()); + offsets = queueStorage.claimQueue(queueId, server.getServerName()); } catch (ReplicationException e) { - LOG.error( - "ReplicationException: cannot claim dead region ({})'s " + "replication queue. Znode : ({})" - + " Possible solution: check if znode size exceeds jute.maxBuffer value. " - + " If so, increase it for both client and server side.", - deadRS, queueStorage.getRsNode(deadRS), e); + LOG.error("ReplicationException: cannot claim dead region ({})'s replication queue", + queueId.getServerName(), e); server.abort("Failed to claim queue from dead regionserver.", e); return; } - if (claimedQueue.getSecond().isEmpty()) { + if (offsets.isEmpty()) { + // someone else claimed the queue return; } - String queueId = claimedQueue.getFirst(); - Set walsSet = claimedQueue.getSecond(); + ServerName sourceRS = queueId.getServerWALsBelongTo(); + ReplicationQueueId claimedQueueId = queueId.claim(server.getServerName()); ReplicationPeerImpl peer = replicationPeers.getPeer(peerId); if (peer == null || peer != oldPeer) { - LOG.warn("Skipping failover for peer {} of node {}, peer is null", peerId, deadRS); - abortWhenFail(() -> queueStorage.removeQueue(server.getServerName(), queueId)); + LOG.warn("Skipping failover for peer {} of node {}, peer is null", peerId, sourceRS); + deleteQueue(claimedQueueId); return; } - if ( - server instanceof ReplicationSyncUp.DummyServer - && peer.getPeerState().equals(PeerState.DISABLED) - ) { - LOG.warn( - "Peer {} is disabled. ReplicationSyncUp tool will skip " + "replicating data to this peer.", - peerId); - return; - } - ReplicationSourceInterface src; try { - src = createSource(queueId, peer); + src = + createSource(new ReplicationQueueData(claimedQueueId, ImmutableMap.copyOf(offsets)), peer); } catch (IOException e) { - LOG.error("Can not create replication source for peer {} and queue {}", peerId, queueId, e); + LOG.error("Can not create replication source for peer {} and queue {}", peerId, + claimedQueueId, e); server.abort("Failed to create replication source after claiming queue.", e); return; } + List walFiles; + try { + walFiles = AbstractFSWALProvider.getArchivedWALFiles(conf, sourceRS, + URLEncoder.encode(sourceRS.toString(), StandardCharsets.UTF_8.name())); + } catch (IOException e) { + LOG.error("Can not list all wal files for peer {} and queue {}", peerId, queueId, e); + server.abort("Can not list all wal files after claiming queue.", e); + return; + } + PriorityQueue walFilesPQ = new PriorityQueue<>( + Comparator. comparing(p -> AbstractFSWALProvider.getTimestamp(p.getName())) + .thenComparing(Path::getName)); + // sort the wal files and also filter out replicated files + for (Path file : walFiles) { + String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getName()); + ReplicationGroupOffset groupOffset = offsets.get(walGroupId); + if (shouldReplicate(groupOffset, file.getName())) { + walFilesPQ.add(file); + } else { + LOG.debug("Skip enqueuing log {} because it is before the start offset {}", file.getName(), + groupOffset); + } + walFilesPQ.add(file); + } + // the method is a bit long, so assign it to null here to avoid later we reuse it again by + // mistake, we should use the sorted walFilesPQ instead + walFiles = null; // synchronized on oldsources to avoid adding recovered source for the to-be-removed peer synchronized (oldsources) { peer = replicationPeers.getPeer(src.getPeerId()); if (peer == null || peer != oldPeer) { src.terminate("Recovered queue doesn't belong to any current peer"); - deleteQueue(queueId); + deleteQueue(claimedQueueId); return; } // Do not setup recovered queue if a sync replication peer is in STANDBY state, or is @@ -882,26 +918,26 @@ void claimQueue(ServerName deadRS, String queue) { || stateAndNewState.getSecond().equals(SyncReplicationState.STANDBY) ) { src.terminate("Sync replication peer is in STANDBY state"); - deleteQueue(queueId); + deleteQueue(claimedQueueId); return; } } // track sources in walsByIdRecoveredQueues Map> walsByGroup = new HashMap<>(); - walsByIdRecoveredQueues.put(queueId, walsByGroup); - for (String wal : walsSet) { - String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal); + walsByIdRecoveredQueues.put(claimedQueueId, walsByGroup); + for (Path wal : walFilesPQ) { + String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName()); NavigableSet wals = walsByGroup.get(walPrefix); if (wals == null) { wals = new TreeSet<>(); walsByGroup.put(walPrefix, wals); } - wals.add(wal); + wals.add(wal.getName()); } oldsources.add(src); - LOG.info("Added source for recovered queue {}", src.getQueueId()); - for (String wal : walsSet) { - LOG.trace("Enqueueing log from recovered queue for source: " + src.getQueueId()); + LOG.info("Added source for recovered queue {}", claimedQueueId); + for (Path wal : walFilesPQ) { + LOG.debug("Enqueueing log {} from recovered queue for source: {}", wal, claimedQueueId); src.enqueueLog(new Path(oldLogDir, wal)); } src.startup(); @@ -927,7 +963,9 @@ public void join() { * Get a copy of the wals of the normal sources on this rs * @return a sorted set of wal names */ - public Map>> getWALs() { + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public Map>> getWALs() { return Collections.unmodifiableMap(walsById); } @@ -935,7 +973,9 @@ public Map>> getWALs() { * Get a copy of the wals of the recovered sources on this rs * @return a sorted set of wal names */ - Map>> getWalsByIdRecoveredQueues() { + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + Map>> getWalsByIdRecoveredQueues() { return Collections.unmodifiableMap(walsByIdRecoveredQueues); } @@ -963,16 +1003,6 @@ public ReplicationSourceInterface getSource(String peerId) { return this.sources.get(peerId); } - List getAllQueues() throws IOException { - List allQueues = Collections.emptyList(); - try { - allQueues = queueStorage.getAllQueues(server.getServerName()); - } catch (ReplicationException e) { - throw new IOException(e); - } - return allQueues; - } - int getSizeOfLatestPath() { synchronized (latestPaths) { return latestPaths.size(); @@ -1068,6 +1098,8 @@ MetricsReplicationGlobalSourceSource getGlobalMetrics() { return this.globalMetrics; } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") ReplicationQueueStorage getQueueStorage() { return queueStorage; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java index 0733c61bc206..7b863dc35ae9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java @@ -57,7 +57,7 @@ public enum WorkerState { private final Configuration conf; protected final String walGroupId; protected final ReplicationSourceLogQueue logQueue; - private final ReplicationSource source; + protected final ReplicationSource source; // Last position in the log that we sent to ZooKeeper // It will be accessed by the stats thread so make it volatile @@ -299,10 +299,6 @@ void setWALReader(ReplicationSourceWALReader entryReader) { this.entryReader = entryReader; } - long getStartPosition() { - return 0; - } - protected boolean isActive() { return source.isSourceActive() && state == WorkerState.RUNNING && !isInterrupted(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java index 7337694addbf..9380c6b63050 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALActionListener.java @@ -43,11 +43,6 @@ public ReplicationSourceWALActionListener(Configuration conf, ReplicationSourceM this.manager = manager; } - @Override - public void preLogRoll(Path oldPath, Path newPath) throws IOException { - manager.preLogRoll(newPath); - } - @Override public void postLogRoll(Path oldPath, Path newPath) throws IOException { manager.postLogRoll(newPath); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java index daf9081234d0..50ffd6df1afd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java @@ -80,17 +80,18 @@ private Set getLiveRegionServers(ZKWatcher zkw) throws KeeperExcepti // replication queues for the dead region servers first and then replicate the data out. private void claimReplicationQueues(ZKWatcher zkw, ReplicationSourceManager mgr) throws ReplicationException, KeeperException { - List replicators = mgr.getQueueStorage().getListOfReplicators(); - Set liveRegionServers = getLiveRegionServers(zkw); - for (ServerName sn : replicators) { - if (!liveRegionServers.contains(sn)) { - List replicationQueues = mgr.getQueueStorage().getAllQueues(sn); - System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues); - for (String queue : replicationQueues) { - mgr.claimQueue(sn, queue); - } - } - } + // TODO: reimplement this tool + // List replicators = mgr.getQueueStorage().getListOfReplicators(); + // Set liveRegionServers = getLiveRegionServers(zkw); + // for (ServerName sn : replicators) { + // if (!liveRegionServers.contains(sn)) { + // List replicationQueues = mgr.getQueueStorage().getAllQueues(sn); + // System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues); + // for (String queue : replicationQueues) { + // mgr.claimQueue(sn, queue); + // } + // } + // } } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 7e10fd786a45..36acffa96642 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -2558,7 +2558,7 @@ private synchronized HbckRegionInfo getOrCreateInfo(String name) { } private void checkAndFixReplication() throws ReplicationException, IOException { - ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, errors); + ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, connection, errors); checker.checkUnDeletedQueues(); if (checker.hasUnDeletedQueues() && this.fixReplication) { @@ -3831,7 +3831,7 @@ public void cleanReplicationBarrier() throws IOException { return; } ReplicationQueueStorage queueStorage = - ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); + ReplicationStorageFactory.getReplicationQueueStorage(connection, getConf()); List peerDescriptions = admin.listReplicationPeers(); if (peerDescriptions != null && peerDescriptions.size() > 0) { List peers = peerDescriptions.stream() diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/ReplicationChecker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/ReplicationChecker.java index 7e7a46573b8a..497304a31113 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/ReplicationChecker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/hbck/ReplicationChecker.java @@ -27,9 +27,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.util.HbckErrorReporter; @@ -48,18 +50,18 @@ public class ReplicationChecker { private final HbckErrorReporter errorReporter; // replicator with its queueIds for removed peers - private Map> undeletedQueueIds = new HashMap<>(); + private Map> undeletedQueueIds = new HashMap<>(); // replicator with its undeleted queueIds for removed peers in hfile-refs queue private Set undeletedHFileRefsPeerIds = new HashSet<>(); private final ReplicationPeerStorage peerStorage; private final ReplicationQueueStorage queueStorage; - public ReplicationChecker(Configuration conf, ZKWatcher zkw, HbckErrorReporter errorReporter) - throws IOException { + public ReplicationChecker(Configuration conf, ZKWatcher zkw, Connection conn, + HbckErrorReporter errorReporter) throws IOException { this.peerStorage = ReplicationStorageFactory.getReplicationPeerStorage(FileSystem.get(conf), zkw, conf); - this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, conf); + this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(conn, conf); this.errorReporter = errorReporter; } @@ -68,19 +70,19 @@ public boolean hasUnDeletedQueues() { .contains(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE); } - private Map> getUnDeletedQueues() throws ReplicationException { - Map> undeletedQueues = new HashMap<>(); + private Map> getUnDeletedQueues() + throws ReplicationException { + Map> undeletedQueues = new HashMap<>(); Set peerIds = new HashSet<>(peerStorage.listPeerIds()); - for (ServerName replicator : queueStorage.getListOfReplicators()) { - for (String queueId : queueStorage.getAllQueues(replicator)) { - ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); - if (!peerIds.contains(queueInfo.getPeerId())) { - undeletedQueues.computeIfAbsent(replicator, key -> new ArrayList<>()).add(queueId); - LOG.debug( - "Undeleted replication queue for removed peer found: " - + "[removedPeerId={}, replicator={}, queueId={}]", - queueInfo.getPeerId(), replicator, queueId); - } + for (ReplicationQueueData queueData : queueStorage.listAllQueues()) { + ReplicationQueueId queueId = queueData.getId(); + if (!peerIds.contains(queueId.getPeerId())) { + undeletedQueues.computeIfAbsent(queueId.getServerName(), key -> new ArrayList<>()) + .add(queueId); + LOG.debug( + "Undeleted replication queue for removed peer found: " + + "[removedPeerId={}, replicator={}, queueId={}]", + queueId.getPeerId(), queueId.getServerName(), queueId); } } return undeletedQueues; @@ -103,9 +105,8 @@ public void checkUnDeletedQueues() throws ReplicationException { undeletedQueueIds = getUnDeletedQueues(); undeletedQueueIds.forEach((replicator, queueIds) -> { queueIds.forEach(queueId -> { - ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); String msg = "Undeleted replication queue for removed peer found: " - + String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueInfo.getPeerId(), + + String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueId.getPeerId(), replicator, queueId); errorReporter.reportError(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE, msg); }); @@ -118,12 +119,12 @@ public void checkUnDeletedQueues() throws ReplicationException { } public void fixUnDeletedQueues() throws ReplicationException { - for (Map.Entry> replicatorAndQueueIds : undeletedQueueIds.entrySet()) { + for (Map.Entry> replicatorAndQueueIds : undeletedQueueIds + .entrySet()) { ServerName replicator = replicatorAndQueueIds.getKey(); - for (String queueId : replicatorAndQueueIds.getValue()) { - queueStorage.removeQueue(replicator, queueId); + for (ReplicationQueueId queueId : replicatorAndQueueIds.getValue()) { + queueStorage.removeQueue(queueId); } - queueStorage.removeReplicatorIfQueueIsEmpty(replicator); } for (String peerId : undeletedHFileRefsPeerIds) { queueStorage.removePeerFromHFileRefs(peerId); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java index 4ca32e83e638..db39a8ba0232 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.wal; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -28,6 +29,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Abortable; @@ -335,6 +337,44 @@ public static String getWALArchiveDirectoryName(Configuration conf, final String return dirName.toString(); } + /** + * List all the old wal files for a dead region server. + *

    + * Initially added for supporting replication, where we need to get the wal files to replicate for + * a dead region server. + */ + public static List getArchivedWALFiles(Configuration conf, ServerName serverName, + String logPrefix) throws IOException { + Path walRootDir = CommonFSUtils.getWALRootDir(conf); + FileSystem fs = walRootDir.getFileSystem(conf); + List archivedWalFiles = new ArrayList<>(); + // list both the root old wal dir and the separate old wal dir, so we will not miss any files if + // the SEPARATE_OLDLOGDIR config is changed + Path oldWalDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME); + try { + for (FileStatus status : fs.listStatus(oldWalDir, p -> p.getName().startsWith(logPrefix))) { + if (status.isFile()) { + archivedWalFiles.add(status.getPath()); + } + } + } catch (FileNotFoundException e) { + LOG.info("Old WAL dir {} not exists", oldWalDir); + return Collections.emptyList(); + } + Path separatedOldWalDir = new Path(oldWalDir, serverName.toString()); + try { + for (FileStatus status : fs.listStatus(separatedOldWalDir, + p -> p.getName().startsWith(logPrefix))) { + if (status.isFile()) { + archivedWalFiles.add(status.getPath()); + } + } + } catch (FileNotFoundException e) { + LOG.info("Seprated old WAL dir {} not exists", separatedOldWalDir); + } + return archivedWalFiles; + } + /** * Pulls a ServerName out of a Path generated according to our layout rules. In the below layouts, * this method ignores the format of the logfile component. Current format: [base directory for diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncReplicationAdminApi.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncReplicationAdminApi.java index f942f4ed99db..157277d83022 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncReplicationAdminApi.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncReplicationAdminApi.java @@ -43,12 +43,12 @@ import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ReplicationPeerNotFoundException; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.VerifyWALEntriesReplicationEndpoint; @@ -102,11 +102,9 @@ public void clearPeerAndQueues() throws IOException, ReplicationException { } catch (Exception e) { } ReplicationQueueStorage queueStorage = ReplicationStorageFactory - .getReplicationQueueStorage(TEST_UTIL.getZooKeeperWatcher(), TEST_UTIL.getConfiguration()); - for (ServerName serverName : queueStorage.getListOfReplicators()) { - for (String queue : queueStorage.getAllQueues(serverName)) { - queueStorage.removeQueue(serverName, queue); - } + .getReplicationQueueStorage(TEST_UTIL.getConnection(), TEST_UTIL.getConfiguration()); + for (ReplicationQueueData queueData : queueStorage.listAllQueues()) { + queueStorage.removeQueue(queueData.getId()); } admin.replicationPeerModificationSwitch(true).join(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java index 11f882eb45a3..c601425e5f0a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java @@ -151,7 +151,7 @@ public MultiResponse answer(InvocationOnMock invocation) throws Throwable { CommonFSUtils.setRootDir(getConfiguration(), rootdir); this.rpm = mock(ReplicationPeerManager.class); ReplicationQueueStorage rqs = mock(ReplicationQueueStorage.class); - when(rqs.getAllQueues(any())).thenReturn(Collections.emptyList()); + when(rqs.listAllQueueIds(any(ServerName.class))).thenReturn(Collections.emptyList()); when(rpm.getQueueStorage()).thenReturn(rqs); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java index e9e5f973cf8e..1a0537bcbafe 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java @@ -20,7 +20,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.spy; @@ -45,8 +44,8 @@ import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.ZooKeeperConnectionException; +import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.master.HMaster; -import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner; @@ -62,13 +61,14 @@ import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; -import org.mockito.invocation.InvocationOnMock; -import org.mockito.stubbing.Answer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// revisit later after we implement new replication log cleaner +@Ignore @Category({ MasterTests.class, MediumTests.class }) public class TestLogsCleaner { @@ -131,8 +131,8 @@ public void testLogCleaning() throws Exception { HMaster.decorateMasterConfiguration(conf); Server server = new DummyServer(); - ReplicationQueueStorage queueStorage = - ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf); + ReplicationQueueStorage queueStorage = ReplicationStorageFactory + .getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf); String fakeMachineName = URLEncoder.encode(server.getServerName().toString(), StandardCharsets.UTF_8.name()); @@ -162,7 +162,7 @@ public void testLogCleaning() throws Exception { // Case 4: put 3 WALs in ZK indicating that they are scheduled for replication so these // files would pass TimeToLiveLogCleaner but would be rejected by ReplicationLogCleaner if (i % (30 / 3) == 0) { - queueStorage.addWAL(server.getServerName(), fakeMachineName, fileName.getName()); + // queueStorage.addWAL(server.getServerName(), fakeMachineName, fileName.getName()); LOG.info("Replication log file: " + fileName); } } @@ -222,20 +222,20 @@ public void testZooKeeperRecoveryDuringGetListOfReplicators() throws Exception { try { faultyZK.init(false); - ReplicationQueueStorage queueStorage = - spy(ReplicationStorageFactory.getReplicationQueueStorage(faultyZK, conf)); - doAnswer(new Answer() { - @Override - public Object answer(InvocationOnMock invocation) throws Throwable { - try { - return invocation.callRealMethod(); - } catch (ReplicationException e) { - LOG.debug("Caught Exception", e); - getListOfReplicatorsFailed.set(true); - throw e; - } - } - }).when(queueStorage).getAllWALs(); + ReplicationQueueStorage queueStorage = spy(ReplicationStorageFactory + .getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf)); + // doAnswer(new Answer() { + // @Override + // public Object answer(InvocationOnMock invocation) throws Throwable { + // try { + // return invocation.callRealMethod(); + // } catch (ReplicationException e) { + // LOG.debug("Caught Exception", e); + // getListOfReplicatorsFailed.set(true); + // throw e; + // } + // } + // }).when(queueStorage).getAllWALs(); cleaner.setConf(conf, faultyZK, queueStorage); // should keep all files due to a ConnectionLossException getting the queues znodes diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java index 87d21e583dda..2409b081cce7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java @@ -17,14 +17,12 @@ */ package org.apache.hadoop.hbase.master.cleaner; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -32,12 +30,11 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.Server; -import org.apache.hadoop.hbase.ZooKeeperConnectionException; +import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationFactory; @@ -49,25 +46,21 @@ import org.apache.hadoop.hbase.replication.master.ReplicationHFileCleaner; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.SmallTests; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.MockServer; import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; -import org.apache.zookeeper.KeeperException; -import org.apache.zookeeper.data.Stat; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.collect.Lists; - +// TODO: revisit later +@Ignore @Category({ MasterTests.class, SmallTests.class }) public class TestReplicationHFileCleaner { @@ -87,14 +80,14 @@ public class TestReplicationHFileCleaner { @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.startMiniCluster(); server = new DummyServer(); conf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true); HMaster.decorateMasterConfiguration(conf); rp = ReplicationFactory.getReplicationPeers(server.getFileSystem(), server.getZooKeeper(), conf); rp.init(); - rq = ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf); + rq = ReplicationStorageFactory.getReplicationQueueStorage(server.getConnection(), conf); fs = FileSystem.get(conf); } @@ -109,7 +102,6 @@ public void setup() throws ReplicationException, IOException { rp.getPeerStorage().addPeer(peerId, ReplicationPeerConfig.newBuilder().setClusterKey(TEST_UTIL.getClusterKey()).build(), true, SyncReplicationState.NONE); - rq.addPeerToHFileRefs(peerId); } @After @@ -184,47 +176,6 @@ public void testGetDeletableFiles() throws Exception { assertTrue(deletableFilesIterator.next().getPath().equals(deletablefile)); } - /** - * ReplicationHFileCleaner should be able to ride over ZooKeeper errors without aborting. - */ - @Test - public void testZooKeeperAbort() throws Exception { - ReplicationHFileCleaner cleaner = new ReplicationHFileCleaner(); - - List dummyFiles = Lists.newArrayList( - new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("hfile1")), - new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("hfile2"))); - - FaultyZooKeeperWatcher faultyZK = - new FaultyZooKeeperWatcher(conf, "testZooKeeperAbort-faulty", null); - try { - faultyZK.init(); - cleaner.setConf(conf, faultyZK); - // should keep all files due to a ConnectionLossException getting the queues znodes - Iterable toDelete = cleaner.getDeletableFiles(dummyFiles); - assertFalse(toDelete.iterator().hasNext()); - assertFalse(cleaner.isStopped()); - } finally { - faultyZK.close(); - } - - // when zk is working both files should be returned - cleaner = new ReplicationHFileCleaner(); - ZKWatcher zkw = new ZKWatcher(conf, "testZooKeeperAbort-normal", null); - try { - cleaner.setConf(conf, zkw); - Iterable filesToDelete = cleaner.getDeletableFiles(dummyFiles); - Iterator iter = filesToDelete.iterator(); - assertTrue(iter.hasNext()); - assertEquals(new Path("hfile1"), iter.next().getPath()); - assertTrue(iter.hasNext()); - assertEquals(new Path("hfile2"), iter.next().getPath()); - assertFalse(iter.hasNext()); - } finally { - zkw.close(); - } - } - static class DummyServer extends MockServer { @Override @@ -233,13 +184,12 @@ public Configuration getConfiguration() { } @Override - public ZKWatcher getZooKeeper() { + public Connection getConnection() { try { - return new ZKWatcher(getConfiguration(), "dummy server", this); + return TEST_UTIL.getConnection(); } catch (IOException e) { - LOG.error("Can not get ZKWatcher", e); + throw new UncheckedIOException(e); } - return null; } @Override @@ -247,29 +197,8 @@ public FileSystem getFileSystem() { try { return TEST_UTIL.getTestFileSystem(); } catch (IOException e) { - LOG.error("Can not get FileSystem", e); + throw new UncheckedIOException(e); } - return null; - } - } - - static class FaultyZooKeeperWatcher extends ZKWatcher { - private RecoverableZooKeeper zk; - - public FaultyZooKeeperWatcher(Configuration conf, String identifier, Abortable abortable) - throws ZooKeeperConnectionException, IOException { - super(conf, identifier, abortable); - } - - public void init() throws Exception { - this.zk = spy(super.getRecoverableZooKeeper()); - doThrow(new KeeperException.ConnectionLossException()).when(zk) - .getData("/hbase/replication/hfile-refs", null, new Stat()); - } - - @Override - public RecoverableZooKeeper getRecoverableZooKeeper() { - return zk; } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBulkLoadReplicationHFileRefs.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBulkLoadReplicationHFileRefs.java index 70a6e88552bd..787784c8ec40 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBulkLoadReplicationHFileRefs.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestBulkLoadReplicationHFileRefs.java @@ -114,7 +114,7 @@ public static void setUpBeforeClass() throws Exception { admin1 = UTIL1.getConnection().getAdmin(); admin2 = UTIL2.getConnection().getAdmin(); - queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(UTIL1.getZooKeeperWatcher(), + queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(UTIL1.getConnection(), UTIL1.getConfiguration()); admin1.createNamespace(NamespaceDescriptor.create(REPLICATE_NAMESPACE).build()); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/regionreplication/TestStartupWithLegacyRegionReplicationEndpoint.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/regionreplication/TestStartupWithLegacyRegionReplicationEndpoint.java index 66eaff0493ee..5af9edb8efc2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/regionreplication/TestStartupWithLegacyRegionReplicationEndpoint.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/regionreplication/TestStartupWithLegacyRegionReplicationEndpoint.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.util.Collections; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.ReplicationPeerNotFoundException; @@ -29,9 +30,13 @@ import org.apache.hadoop.hbase.SingleProcessHBaseCluster; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager; import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationUtils; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceInterface; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.RegionServerTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; @@ -57,6 +62,10 @@ public class TestStartupWithLegacyRegionReplicationEndpoint { @BeforeClass public static void setUp() throws Exception { UTIL.startMiniCluster(1); + // add a peer to force initialize the replication storage + UTIL.getAdmin().addReplicationPeer("1", ReplicationPeerConfig.newBuilder() + .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/1").build()); + UTIL.getAdmin().removeReplicationPeer("1"); } @AfterClass @@ -66,40 +75,42 @@ public static void tearDown() throws IOException { @Test public void test() throws Exception { + String peerId = "legacy"; ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder() .setClusterKey("127.0.0.1:2181:/hbase") .setReplicationEndpointImpl(ReplicationUtils.LEGACY_REGION_REPLICATION_ENDPOINT_NAME).build(); SingleProcessHBaseCluster cluster = UTIL.getMiniHBaseCluster(); HMaster master = cluster.getMaster(); // can not use Admin.addPeer as it will fail with ClassNotFound - master.getReplicationPeerManager().addPeer("legacy", peerConfig, true); + master.getReplicationPeerManager().addPeer(peerId, peerConfig, true); // add a wal file to the queue ServerName rsName = cluster.getRegionServer(0).getServerName(); - master.getReplicationPeerManager().getQueueStorage().addWAL(rsName, - ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER, "test-wal-file"); + master.getReplicationPeerManager().getQueueStorage().setOffset( + new ReplicationQueueId(rsName, ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER), "", + new ReplicationGroupOffset("test-wal-file", 0), Collections.emptyMap()); cluster.stopRegionServer(0); RegionServerThread rst = cluster.startRegionServer(); // we should still have this peer - assertNotNull(UTIL.getAdmin().getReplicationPeerConfig("legacy")); + assertNotNull(UTIL.getAdmin().getReplicationPeerConfig(peerId)); // but at RS side, we should not have this peer loaded as replication source - assertTrue(rst.getRegionServer().getReplicationSourceService().getReplicationManager() - .getSources().isEmpty()); + assertTrue( + rst.getRegionServer().getReplicationSourceService().getReplicationManager().getSources() + .stream().map(ReplicationSourceInterface::getPeerId).noneMatch(p -> p.equals(peerId))); UTIL.shutdownMiniHBaseCluster(); UTIL.restartHBaseCluster(1); // now we should have removed the peer assertThrows(ReplicationPeerNotFoundException.class, () -> UTIL.getAdmin().getReplicationPeerConfig("legacy")); - // at rs side, we should not have the peer this time, not only for not having replication source - assertTrue(UTIL.getMiniHBaseCluster().getRegionServer(0).getReplicationSourceService() - .getReplicationManager().getReplicationPeers().getAllPeerIds().isEmpty()); - // make sure that we can finish the SCP and delete the test-wal-file + // make sure that we can finish the SCP UTIL.waitFor(15000, () -> UTIL.getMiniHBaseCluster().getMaster().getProcedures().stream() .filter(p -> p instanceof ServerCrashProcedure).map(p -> (ServerCrashProcedure) p) .allMatch(Procedure::isSuccess)); - assertTrue(UTIL.getMiniHBaseCluster().getMaster().getReplicationPeerManager().getQueueStorage() - .getAllQueues(rsName).isEmpty()); + // the deletion is async, so wait until they get deleted + ReplicationPeerManager ppm = UTIL.getMiniHBaseCluster().getMaster().getReplicationPeerManager(); + UTIL.waitFor(15000, () -> !ppm.getPeerStorage().listPeerIds().contains(peerId) + && ppm.getQueueStorage().listAllQueueIds(peerId, rsName).isEmpty()); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java index 52ea2cd60503..da0868be885f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java @@ -41,7 +41,7 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface { private ReplicationSourceManager manager; private ReplicationPeer replicationPeer; - private String peerClusterId; + private ReplicationQueueId queueId; private Path currentPath; private MetricsSource metrics; private WALFileLengthProvider walFileLengthProvider; @@ -49,11 +49,11 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface { @Override public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager, - ReplicationQueueStorage rq, ReplicationPeer rp, Server server, String peerClusterId, + ReplicationQueueStorage rq, ReplicationPeer rp, Server server, ReplicationQueueData queueData, UUID clusterId, WALFileLengthProvider walFileLengthProvider, MetricsSource metrics) throws IOException { this.manager = manager; - this.peerClusterId = peerClusterId; + this.queueId = queueData.getId(); this.metrics = metrics; this.walFileLengthProvider = walFileLengthProvider; this.replicationPeer = rp; @@ -100,14 +100,13 @@ public void terminate(String reason, Exception e, boolean clearMetrics) { } @Override - public String getQueueId() { - return peerClusterId; + public ReplicationQueueId getQueueId() { + return queueId; } @Override public String getPeerId() { - String[] parts = peerClusterId.split("-", 2); - return parts.length != 1 ? parts[0] : peerClusterId; + return queueId.getPeerId(); } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestAddToSerialReplicationPeer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestAddToSerialReplicationPeer.java index 229da6b07129..5e764ebb0ef5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestAddToSerialReplicationPeer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestAddToSerialReplicationPeer.java @@ -79,8 +79,9 @@ public boolean evaluate() throws Exception { ReplicationSourceManager manager = ((Replication) rs.getReplicationSourceService()).getReplicationManager(); // Make sure replication moves to the new file. - return (manager.getWALs().get(PEER_ID).get(logPrefix).size() == 1) - && !oldWalName.equals(manager.getWALs().get(PEER_ID).get(logPrefix).first()); + ReplicationQueueId queueId = new ReplicationQueueId(rs.getServerName(), PEER_ID); + return (manager.getWALs().get(queueId).get(logPrefix).size() == 1) + && !oldWalName.equals(manager.getWALs().get(queueId).get(logPrefix).first()); } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java index b8718f3526bc..a12081a76363 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java @@ -31,7 +31,7 @@ import org.apache.hadoop.hbase.master.RegionServerList; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; -import org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure; +import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; @@ -47,7 +47,7 @@ /** * In HBASE-26029, we reimplement the claim queue operation with proc-v2 and make it a step in SCP, - * this is a UT to make sure the {@link ClaimReplicationQueuesProcedure} works correctly. + * this is a UT to make sure the {@link AssignReplicationQueuesProcedure} works correctly. */ @Category({ ReplicationTests.class, LargeTests.class }) public class TestClaimReplicationQueue extends TestReplicationBase { @@ -77,7 +77,7 @@ public List getOnlineServersList() { // return no region server to make the procedure hang if (EMPTY) { for (StackTraceElement e : Thread.currentThread().getStackTrace()) { - if (e.getClassName().equals(ClaimReplicationQueuesProcedure.class.getName())) { + if (e.getClassName().equals(AssignReplicationQueuesProcedure.class.getName())) { return Collections.emptyList(); } } @@ -149,7 +149,7 @@ public void testClaim() throws Exception { HMaster master = UTIL1.getMiniHBaseCluster().getMaster(); UTIL1.waitFor(30000, () -> master.getProcedures().stream() - .filter(p -> p instanceof ClaimReplicationQueuesProcedure) + .filter(p -> p instanceof AssignReplicationQueuesProcedure) .anyMatch(p -> p.getState() == ProcedureState.WAITING_TIMEOUT)); hbaseAdmin.enableReplicationPeer(PEER_ID2); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplicationWithBulkLoadedData.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplicationWithBulkLoadedData.java index 180991c4a7f2..5fc48b2d7298 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplicationWithBulkLoadedData.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplicationWithBulkLoadedData.java @@ -259,8 +259,8 @@ public void testBulkLoadReplicationActiveActive() throws Exception { MiniZooKeeperCluster zkCluster = UTIL1.getZkCluster(); ZKWatcher watcher = new ZKWatcher(UTIL1.getConfiguration(), "TestZnodeHFiles-refs", null); RecoverableZooKeeper zk = RecoverableZooKeeper.connect(UTIL1.getConfiguration(), watcher); - ZKReplicationQueueStorage replicationQueueStorage = - new ZKReplicationQueueStorage(watcher, UTIL1.getConfiguration()); + ReplicationQueueStorage replicationQueueStorage = ReplicationStorageFactory + .getReplicationQueueStorage(UTIL1.getConnection(), UTIL1.getConfiguration()); Set hfiles = replicationQueueStorage.getAllHFileRefs(); assertTrue(hfiles.isEmpty()); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEmptyWALRecovery.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEmptyWALRecovery.java index 63cbfe3119c4..67546febab72 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEmptyWALRecovery.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEmptyWALRecovery.java @@ -332,7 +332,6 @@ private void injectEmptyWAL(int numRs, List emptyWalPaths) throws IOExcept for (int i = 0; i < numRs; i++) { HRegionServer hrs = UTIL1.getHBaseCluster().getRegionServer(i); Replication replicationService = (Replication) hrs.getReplicationSourceService(); - replicationService.getReplicationManager().preLogRoll(emptyWalPaths.get(i)); replicationService.getReplicationManager().postLogRoll(emptyWalPaths.get(i)); RegionInfo regionInfo = UTIL1.getHBaseCluster().getRegions(htable1.getName()).get(0).getRegionInfo(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java index bf65d4db82e2..7a89af15902e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java @@ -32,11 +32,14 @@ import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.util.Bytes; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// revisit later when we implement the new ReplicationSyncUpTool +@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpTool extends TestReplicationSyncUpToolBase { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java index fbf8ac6b3c9d..b5de8e6324fe 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java @@ -45,11 +45,14 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.HFileTestUtil; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +//revisit later when we implement the new ReplicationSyncUpTool +@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithBulkLoadedData extends TestReplicationSyncUpToolBase { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java index 1295ea14abcd..6906db4cd466 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java @@ -32,9 +32,12 @@ import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; import org.junit.Before; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; +// revisit later when we reviewing the implementation for serial replication +@Ignore @Category({ ReplicationTests.class, MediumTests.class }) public class TestSerialReplicationFailover extends SerialReplicationTestBase { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java new file mode 100644 index 000000000000..4148c1c1a2c0 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java @@ -0,0 +1,423 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.not; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.TableNameTestRule; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.ReplicationTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.MD5Hash; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.zookeeper.KeeperException; +import org.hamcrest.Matchers; +import org.hamcrest.collection.IsEmptyCollection; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; +import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; + +@Category({ ReplicationTests.class, MediumTests.class }) +public class TestTableReplicationQueueStorage { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestTableReplicationQueueStorage.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestTableReplicationQueueStorage.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + @Rule + public TableNameTestRule tableNameRule = new TableNameTestRule(); + + private TableReplicationQueueStorage storage; + + @BeforeClass + public static void setUp() throws Exception { + UTIL.startMiniCluster(); + } + + @AfterClass + public static void tearDown() throws IOException { + UTIL.shutdownMiniCluster(); + } + + @Before + public void setUpBeforeTest() throws Exception { + TableName tableName = tableNameRule.getTableName(); + TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName); + UTIL.getAdmin().createTable(td); + UTIL.waitTableAvailable(tableName); + storage = new TableReplicationQueueStorage(UTIL.getConnection(), tableName); + } + + private ServerName getServerName(int i) { + return ServerName.valueOf("127.0.0.1", 8000 + i, 10000 + i); + } + + private String getFileName(String base, int i) { + return String.format(base + "-%04d", i); + } + + @Test + public void testReplicator() throws ReplicationException { + assertTrue(storage.listAllReplicators().isEmpty()); + String peerId = "1"; + for (int i = 0; i < 10; i++) { + ReplicationQueueId queueId = new ReplicationQueueId(getServerName(i), peerId); + storage.setOffset(queueId, "group-" + i, new ReplicationGroupOffset("file-" + i, i * 100), + Collections.emptyMap()); + } + List replicators = storage.listAllReplicators(); + assertEquals(10, replicators.size()); + for (int i = 0; i < 10; i++) { + assertThat(replicators, hasItem(getServerName(i))); + } + for (int i = 0; i < 5; i++) { + ReplicationQueueId queueId = new ReplicationQueueId(getServerName(i), peerId); + storage.removeQueue(queueId); + } + replicators = storage.listAllReplicators(); + assertEquals(5, replicators.size()); + for (int i = 0; i < 5; i++) { + assertThat(replicators, not(hasItem(getServerName(i)))); + } + for (int i = 5; i < 10; i++) { + assertThat(replicators, hasItem(getServerName(i))); + } + } + + @Test + public void testGetSetOffset() { + + } + + private void assertQueueId(String peerId, ServerName serverName, ReplicationQueueId queueId) { + assertEquals(peerId, queueId.getPeerId()); + assertEquals(serverName, queueId.getServerName()); + assertFalse(queueId.getSourceServerName().isPresent()); + } + + @Test + public void testPersistLogPositionAndSeqIdAtomically() throws Exception { + ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000); + assertTrue(storage.listAllQueueIds(serverName1).isEmpty()); + String peerId1 = "1"; + String region0 = "6b2c8f8555335cc9af74455b94516cbe"; + String region1 = "6ecd2e9e010499f8ddef97ee8f70834f"; + + for (int i = 0; i < 10; i++) { + ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId1); + assertTrue(storage.getOffsets(queueId).isEmpty()); + } + assertEquals(HConstants.NO_SEQNUM, storage.getLastSequenceId(region0, peerId1)); + assertEquals(HConstants.NO_SEQNUM, storage.getLastSequenceId(region1, peerId1)); + + for (int i = 0; i < 10; i++) { + ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId1); + storage.setOffset(queueId, "group1-" + i, + new ReplicationGroupOffset(getFileName("file1", i), (i + 1) * 100), + ImmutableMap.of(region0, i * 100L, region1, (i + 1) * 100L)); + } + + List queueIds = storage.listAllQueueIds(serverName1); + assertEquals(1, queueIds.size()); + assertQueueId(peerId1, serverName1, queueIds.get(0)); + + Map offsets = + storage.getOffsets(new ReplicationQueueId(serverName1, peerId1)); + for (int i = 0; i < 10; i++) { + ReplicationGroupOffset offset = offsets.get("group1-" + i); + assertEquals(getFileName("file1", i), offset.getWal()); + assertEquals((i + 1) * 100, offset.getOffset()); + } + assertEquals(900L, storage.getLastSequenceId(region0, peerId1)); + assertEquals(1000L, storage.getLastSequenceId(region1, peerId1)); + + // Try to decrease the last pushed id by setWALPosition method. + storage.setOffset(new ReplicationQueueId(serverName1, peerId1), "group1-0", + new ReplicationGroupOffset(getFileName("file1", 0), 11 * 100), + ImmutableMap.of(region0, 899L, region1, 1001L)); + assertEquals(900L, storage.getLastSequenceId(region0, peerId1)); + assertEquals(1001L, storage.getLastSequenceId(region1, peerId1)); + } + + private void assertGroupOffset(String wal, long offset, ReplicationGroupOffset groupOffset) { + assertEquals(wal, groupOffset.getWal()); + assertEquals(offset, groupOffset.getOffset()); + } + + @Test + public void testClaimQueue() throws Exception { + String peerId = "1"; + ServerName serverName1 = getServerName(1); + ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId); + for (int i = 0; i < 10; i++) { + storage.setOffset(queueId, "group-" + i, new ReplicationGroupOffset("wal-" + i, i), + Collections.emptyMap()); + } + + ServerName serverName2 = getServerName(2); + Map offsets2 = storage.claimQueue(queueId, serverName2); + assertEquals(10, offsets2.size()); + for (int i = 0; i < 10; i++) { + assertGroupOffset("wal-" + i, i, offsets2.get("group-" + i)); + } + ReplicationQueueId claimedQueueId2 = new ReplicationQueueId(serverName2, peerId, serverName1); + assertThat(storage.listAllQueueIds(peerId, serverName1), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId, serverName2), + Matchers.> both(hasItem(claimedQueueId2)).and(hasSize(1))); + offsets2 = storage.getOffsets(claimedQueueId2); + assertEquals(10, offsets2.size()); + for (int i = 0; i < 10; i++) { + assertGroupOffset("wal-" + i, i, offsets2.get("group-" + i)); + } + + ServerName serverName3 = getServerName(3); + Map offsets3 = storage.claimQueue(claimedQueueId2, serverName3); + assertEquals(10, offsets3.size()); + for (int i = 0; i < 10; i++) { + assertGroupOffset("wal-" + i, i, offsets3.get("group-" + i)); + } + ReplicationQueueId claimedQueueId3 = new ReplicationQueueId(serverName3, peerId, serverName1); + assertThat(storage.listAllQueueIds(peerId, serverName1), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId, serverName2), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId, serverName3), + Matchers.> both(hasItem(claimedQueueId3)).and(hasSize(1))); + offsets3 = storage.getOffsets(claimedQueueId3); + assertEquals(10, offsets3.size()); + for (int i = 0; i < 10; i++) { + assertGroupOffset("wal-" + i, i, offsets3.get("group-" + i)); + } + storage.removeQueue(claimedQueueId3); + assertThat(storage.listAllQueueIds(peerId), IsEmptyCollection.empty()); + } + + @Test + public void testClaimQueueMultiThread() throws Exception { + String peerId = "3"; + String walGroup = "group"; + ReplicationGroupOffset groupOffset = new ReplicationGroupOffset("wal", 123); + ServerName sourceServerName = getServerName(100); + ReplicationQueueId queueId = new ReplicationQueueId(sourceServerName, peerId); + storage.setOffset(queueId, walGroup, groupOffset, Collections.emptyMap()); + List serverNames = + IntStream.range(0, 10).mapToObj(this::getServerName).collect(Collectors.toList()); + for (int i = 0; i < 10; i++) { + final ReplicationQueueId toClaim = queueId; + List threads = new ArrayList<>(); + Map> claimed = new ConcurrentHashMap<>(); + Set failed = ConcurrentHashMap.newKeySet(); + for (ServerName serverName : serverNames) { + if (serverName.equals(queueId.getServerName())) { + continue; + } + threads.add(new Thread("Claim-" + i + "-" + serverName) { + + @Override + public void run() { + try { + Map offsets = storage.claimQueue(toClaim, serverName); + if (!offsets.isEmpty()) { + claimed.put(serverName, offsets); + } + } catch (ReplicationException e) { + LOG.error("failed to claim queue", e); + failed.add(serverName); + } + } + }); + } + LOG.info("Claim round {}, there are {} threads to claim {}", i, threads.size(), toClaim); + for (Thread thread : threads) { + thread.start(); + } + for (Thread thread : threads) { + thread.join(30000); + assertFalse(thread.isAlive()); + } + LOG.info("Finish claim round {}, claimed={}, failed={}", i, claimed, failed); + assertThat(failed, IsEmptyCollection.empty()); + assertEquals(1, claimed.size()); + Map offsets = Iterables.getOnlyElement(claimed.values()); + assertEquals(1, offsets.size()); + assertGroupOffset("wal", 123, offsets.get("group")); + queueId = new ReplicationQueueId(Iterables.getOnlyElement(claimed.keySet()), peerId, + sourceServerName); + assertThat(storage.listAllQueueIds(peerId), + Matchers.> both(hasItem(queueId)).and(hasSize(1))); + } + } + + @Test + public void testListRemovePeerAllQueues() throws Exception { + String peerId1 = "1"; + String peerId2 = "2"; + for (int i = 0; i < 100; i++) { + ServerName serverName = getServerName(i); + String group = "group"; + ReplicationGroupOffset offset = new ReplicationGroupOffset("wal", i); + ReplicationQueueId queueId1 = new ReplicationQueueId(serverName, peerId1); + ReplicationQueueId queueId2 = new ReplicationQueueId(serverName, peerId2); + storage.setOffset(queueId1, group, offset, Collections.emptyMap()); + storage.setOffset(queueId2, group, offset, Collections.emptyMap()); + } + List queueDatas = storage.listAllQueues(); + assertThat(queueDatas, hasSize(200)); + for (int i = 0; i < 100; i++) { + ReplicationQueueData peerId1Data = queueDatas.get(i); + ReplicationQueueData peerId2Data = queueDatas.get(i + 100); + ServerName serverName = getServerName(i); + assertEquals(new ReplicationQueueId(serverName, peerId1), peerId1Data.getId()); + assertEquals(new ReplicationQueueId(serverName, peerId2), peerId2Data.getId()); + assertEquals(1, peerId1Data.getOffsets().size()); + assertEquals(1, peerId2Data.getOffsets().size()); + assertGroupOffset("wal", i, peerId1Data.getOffsets().get("group")); + assertGroupOffset("wal", i, peerId2Data.getOffsets().get("group")); + } + List queueIds1 = storage.listAllQueueIds(peerId1); + assertThat(queueIds1, hasSize(100)); + for (int i = 0; i < 100; i++) { + ServerName serverName = getServerName(i); + assertEquals(new ReplicationQueueId(serverName, peerId1), queueIds1.get(i)); + } + List queueIds2 = storage.listAllQueueIds(peerId2); + assertThat(queueIds2, hasSize(100)); + for (int i = 0; i < 100; i++) { + ServerName serverName = getServerName(i); + assertEquals(new ReplicationQueueId(serverName, peerId2), queueIds2.get(i)); + } + + storage.removeAllQueues(peerId1); + assertThat(storage.listAllQueues(), hasSize(100)); + assertThat(storage.listAllQueueIds(peerId1), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId2), hasSize(100)); + + storage.removeAllQueues(peerId2); + assertThat(storage.listAllQueues(), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId1), IsEmptyCollection.empty()); + assertThat(storage.listAllQueueIds(peerId2), IsEmptyCollection.empty()); + } + + @Test + public void testRemoveAllLastPushedSeqIdsForPeer() throws Exception { + String peerId = "1"; + String peerIdToDelete = "2"; + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + storage.setLastSequenceIds(peerId, ImmutableMap.of(encodedRegionName, (long) i)); + storage.setLastSequenceIds(peerIdToDelete, ImmutableMap.of(encodedRegionName, (long) i)); + } + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId)); + assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerIdToDelete)); + } + storage.removeLastSequenceIds(peerIdToDelete); + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId)); + assertEquals(HConstants.NO_SEQNUM, + storage.getLastSequenceId(encodedRegionName, peerIdToDelete)); + } + } + + @Test + public void testHfileRefsReplicationQueues() throws ReplicationException, KeeperException { + String peerId1 = "1"; + + List> files1 = new ArrayList<>(3); + files1.add(new Pair<>(null, new Path("file_1"))); + files1.add(new Pair<>(null, new Path("file_2"))); + files1.add(new Pair<>(null, new Path("file_3"))); + assertTrue(storage.getReplicableHFiles(peerId1).isEmpty()); + assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size()); + + storage.addHFileRefs(peerId1, files1); + assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size()); + assertEquals(3, storage.getReplicableHFiles(peerId1).size()); + List hfiles2 = new ArrayList<>(files1.size()); + for (Pair p : files1) { + hfiles2.add(p.getSecond().getName()); + } + String removedString = hfiles2.remove(0); + storage.removeHFileRefs(peerId1, hfiles2); + assertEquals(1, storage.getReplicableHFiles(peerId1).size()); + hfiles2 = new ArrayList<>(1); + hfiles2.add(removedString); + storage.removeHFileRefs(peerId1, hfiles2); + assertEquals(0, storage.getReplicableHFiles(peerId1).size()); + } + + @Test + public void testRemovePeerForHFileRefs() throws ReplicationException, KeeperException { + String peerId1 = "1"; + String peerId2 = "2"; + + List> files1 = new ArrayList<>(3); + files1.add(new Pair<>(null, new Path("file_1"))); + files1.add(new Pair<>(null, new Path("file_2"))); + files1.add(new Pair<>(null, new Path("file_3"))); + storage.addHFileRefs(peerId1, files1); + storage.addHFileRefs(peerId2, files1); + assertEquals(2, storage.getAllPeersFromHFileRefsQueue().size()); + assertEquals(3, storage.getReplicableHFiles(peerId1).size()); + assertEquals(3, storage.getReplicableHFiles(peerId2).size()); + + storage.removePeerFromHFileRefs(peerId1); + assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size()); + assertTrue(storage.getReplicableHFiles(peerId1).isEmpty()); + assertEquals(3, storage.getReplicableHFiles(peerId2).size()); + + storage.removePeerFromHFileRefs(peerId2); + assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size()); + assertTrue(storage.getReplicableHFiles(peerId2).isEmpty()); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java index 83cd41773ca8..28779be43995 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java @@ -25,8 +25,11 @@ import org.apache.hadoop.hbase.wal.RegionGroupingProvider; import org.apache.hadoop.hbase.wal.WALFactory; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.experimental.categories.Category; +//revisit later when we implement the new ReplicationSyncUpTool +@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithMultipleAsyncWAL extends TestReplicationSyncUpTool { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java index 673b841430eb..f495f433bc9b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java @@ -25,8 +25,11 @@ import org.apache.hadoop.hbase.wal.RegionGroupingProvider; import org.apache.hadoop.hbase.wal.WALFactory; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.experimental.categories.Category; +//revisit later when we implement the new ReplicationSyncUpTool +@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithMultipleWAL extends TestReplicationSyncUpTool { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java index 0189d4755754..8918f8422e1d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java @@ -35,9 +35,12 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; +// TODO: revisit later +@Ignore @Category({ ReplicationTests.class, MediumTests.class }) public class TestDrainReplicationQueuesForStandBy extends SyncReplicationTestBase { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java index d78a45ca6b9c..3475ae5c1925 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java @@ -36,12 +36,15 @@ import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.hbase.zookeeper.ZNodePaths; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; /** * Tests for DumpReplicationQueues tool */ +// TODO: reimplement +@Ignore @Category({ ReplicationTests.class, SmallTests.class }) public class TestDumpReplicationQueues { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java index e80f0ea35bad..707bab875d22 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java @@ -23,7 +23,6 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -57,7 +56,8 @@ import org.apache.hadoop.hbase.replication.ReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationPeer; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; -import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.WALEntryFilter; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; @@ -79,6 +79,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + @Category({ ReplicationTests.class, MediumTests.class }) public class TestReplicationSource { @@ -134,11 +136,13 @@ public void testDefaultSkipsMetaWAL() throws IOException { ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class); Mockito.when(manager.getGlobalMetrics()) .thenReturn(mock(MetricsReplicationGlobalSourceSource.class)); - String queueId = "qid"; + RegionServerServices rss = TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1")); - rs.init(conf, null, manager, null, mockPeer, rss, queueId, null, p -> OptionalLong.empty(), - new MetricsSource(queueId)); + ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid"); + rs.init(conf, null, manager, null, mockPeer, rss, + new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(), + new MetricsSource(queueId.toString())); try { rs.startup(); assertTrue(rs.isSourceActive()); @@ -171,11 +175,12 @@ public void testWALEntryFilter() throws IOException { .thenReturn(DoNothingReplicationEndpoint.class.getName()); Mockito.when(mockPeer.getPeerConfig()).thenReturn(peerConfig); ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class); - String queueId = "qid"; RegionServerServices rss = TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1")); - rs.init(conf, null, manager, null, mockPeer, rss, queueId, uuid, p -> OptionalLong.empty(), - new MetricsSource(queueId)); + ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid"); + rs.init(conf, null, manager, null, mockPeer, rss, + new ReplicationQueueData(queueId, ImmutableMap.of()), uuid, p -> OptionalLong.empty(), + new MetricsSource(queueId.toString())); try { rs.startup(); TEST_UTIL.waitFor(30000, () -> rs.getWalEntryFilter() != null); @@ -257,8 +262,11 @@ public void testTerminateTimeout() throws Exception { Configuration testConf = HBaseConfiguration.create(); testConf.setInt("replication.source.maxretriesmultiplier", 1); ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class); - source.init(testConf, null, manager, null, mockPeer, null, "testPeer", null, - p -> OptionalLong.empty(), null); + ReplicationQueueId queueId = + new ReplicationQueueId(ServerName.valueOf("test,123,123"), "testPeer"); + source.init(testConf, null, manager, null, mockPeer, null, + new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(), + null); ExecutorService executor = Executors.newSingleThreadExecutor(); Future future = executor.submit(() -> source.terminate("testing source termination")); long sleepForRetries = testConf.getLong("replication.source.sleepforretries", 1000); @@ -276,8 +284,11 @@ public void testTerminateClearsBuffer() throws Exception { ReplicationPeer mockPeer = mock(ReplicationPeer.class); Mockito.when(mockPeer.getPeerBandwidth()).thenReturn(0L); Configuration testConf = HBaseConfiguration.create(); - source.init(testConf, null, mockManager, null, mockPeer, Mockito.mock(Server.class), "testPeer", - null, p -> OptionalLong.empty(), mock(MetricsSource.class)); + ReplicationQueueId queueId = + new ReplicationQueueId(ServerName.valueOf("test,123,123"), "testPeer"); + source.init(testConf, null, mockManager, null, mockPeer, Mockito.mock(Server.class), + new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(), + mock(MetricsSource.class)); ReplicationSourceWALReader reader = new ReplicationSourceWALReader(null, conf, null, 0, null, source, null); ReplicationSourceShipper shipper = new ReplicationSourceShipper(conf, null, null, source); @@ -480,35 +491,6 @@ public synchronized UUID getPeerUUID() { } - /** - * Test HBASE-20497 Moved here from TestReplicationSource because doesn't need cluster. - */ - @Test - public void testRecoveredReplicationSourceShipperGetPosition() throws Exception { - String walGroupId = "fake-wal-group-id"; - ServerName serverName = ServerName.valueOf("www.example.com", 12006, 1524679704418L); - ServerName deadServer = ServerName.valueOf("www.deadServer.com", 12006, 1524679704419L); - RecoveredReplicationSource source = mock(RecoveredReplicationSource.class); - Server server = mock(Server.class); - Mockito.when(server.getServerName()).thenReturn(serverName); - Mockito.when(source.getServer()).thenReturn(server); - Mockito.when(source.getServerWALsBelongTo()).thenReturn(deadServer); - ReplicationQueueStorage storage = mock(ReplicationQueueStorage.class); - Mockito.when(storage.getWALPosition(Mockito.eq(serverName), Mockito.any(), Mockito.any())) - .thenReturn(1001L); - Mockito.when(storage.getWALPosition(Mockito.eq(deadServer), Mockito.any(), Mockito.any())) - .thenReturn(-1L); - Configuration conf = new Configuration(TEST_UTIL.getConfiguration()); - conf.setInt("replication.source.maxretriesmultiplier", -1); - MetricsSource metricsSource = mock(MetricsSource.class); - doNothing().when(metricsSource).incrSizeOfLogQueue(); - ReplicationSourceLogQueue logQueue = new ReplicationSourceLogQueue(conf, metricsSource, source); - logQueue.enqueueLog(new Path("/www/html/test"), walGroupId); - RecoveredReplicationSourceShipper shipper = - new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, source, storage); - assertEquals(1001L, shipper.getStartPosition()); - } - private RegionServerServices setupForAbortTests(ReplicationSource rs, Configuration conf, String endpointName) throws IOException { conf.setInt("replication.source.maxretriesmultiplier", 1); @@ -522,11 +504,12 @@ private RegionServerServices setupForAbortTests(ReplicationSource rs, Configurat ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class); Mockito.when(manager.getGlobalMetrics()) .thenReturn(mock(MetricsReplicationGlobalSourceSource.class)); - String queueId = "qid"; RegionServerServices rss = TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1")); - rs.init(conf, null, manager, null, mockPeer, rss, queueId, null, p -> OptionalLong.empty(), - new MetricsSource(queueId)); + ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid"); + rs.init(conf, null, manager, null, mockPeer, rss, + new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(), + new MetricsSource(queueId.toString())); return rss; } @@ -624,8 +607,8 @@ public void testAgeOfOldestWal() throws Exception { ManualEnvironmentEdge manualEdge = new ManualEnvironmentEdge(); EnvironmentEdgeManager.injectEdge(manualEdge); - String id = "1"; - MetricsSource metrics = new MetricsSource(id); + String peerId = "1"; + MetricsSource metrics = new MetricsSource(peerId); Configuration conf = new Configuration(TEST_UTIL.getConfiguration()); conf.setInt("replication.source.maxretriesmultiplier", 1); ReplicationPeer mockPeer = Mockito.mock(ReplicationPeer.class); @@ -640,16 +623,17 @@ public void testAgeOfOldestWal() throws Exception { .thenReturn(mock(MetricsReplicationGlobalSourceSource.class)); RegionServerServices rss = TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1")); - + ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), peerId); ReplicationSource source = new ReplicationSource(); - source.init(conf, null, manager, null, mockPeer, rss, id, null, p -> OptionalLong.empty(), + source.init(conf, null, manager, null, mockPeer, rss, + new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(), metrics); final Path log1 = new Path(logDir, "log-walgroup-a.8"); manualEdge.setValue(10); // Diff of current time (10) and log-walgroup-a.8 timestamp will be 2. source.enqueueLog(log1); - MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(id); + MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(peerId); assertEquals(2, metricsSource1.getOldestWalAge()); final Path log2 = new Path(logDir, "log-walgroup-b.4"); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java index c48dbc39a03d..6aba327d7917 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java @@ -17,578 +17,326 @@ */ package org.apache.hadoop.hbase.replication.regionserver; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItems; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import java.io.IOException; -import java.lang.reflect.Field; -import java.net.URLEncoder; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.Collections; import java.util.NavigableMap; -import java.util.NavigableSet; import java.util.Set; -import java.util.SortedSet; import java.util.TreeMap; -import java.util.TreeSet; -import java.util.UUID; -import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.ChoreService; -import org.apache.hadoop.hbase.ClusterId; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.CellBuilderFactory; +import org.apache.hadoop.hbase.CellBuilderType; +import org.apache.hadoop.hbase.CompatibilitySingletonFactory; import org.apache.hadoop.hbase.HBaseClassTestRule; -import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; -import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptorBuilder; -import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; -import org.apache.hadoop.hbase.regionserver.RegionServerServices; -import org.apache.hadoop.hbase.replication.ReplicationFactory; -import org.apache.hadoop.hbase.replication.ReplicationPeer; +import org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter; +import org.apache.hadoop.hbase.replication.DummyReplicationEndpoint; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder; import org.apache.hadoop.hbase.replication.ReplicationPeers; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.replication.ReplicationSourceDummy; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; -import org.apache.hadoop.hbase.replication.ZKReplicationPeerStorage; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.util.JVMClusterUtil; -import org.apache.hadoop.hbase.util.MockServer; -import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.wal.WAL; import org.apache.hadoop.hbase.wal.WALEdit; import org.apache.hadoop.hbase.wal.WALFactory; import org.apache.hadoop.hbase.wal.WALKeyImpl; -import org.apache.hadoop.hbase.zookeeper.ZKClusterId; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.hamcrest.Matchers; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.rules.TestName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.collect.Sets; -import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; -import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; -import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor; - -/** - * An abstract class that tests ReplicationSourceManager. Classes that extend this class should set - * up the proper config for this class and initialize the proper cluster using HBaseTestingUtility. - */ @Category({ ReplicationTests.class, MediumTests.class }) -public abstract class TestReplicationSourceManager { +public class TestReplicationSourceManager { @ClassRule public static final HBaseClassTestRule CLASS_RULE = HBaseClassTestRule.forClass(TestReplicationSourceManager.class); - protected static final Logger LOG = LoggerFactory.getLogger(TestReplicationSourceManager.class); - - protected static Configuration conf; + public static final class ReplicationEndpointForTest extends DummyReplicationEndpoint { - protected static HBaseTestingUtil utility; + private String clusterKey; - protected static Replication replication; - - protected static ReplicationSourceManager manager; - - protected static ReplicationSourceManager managerOfCluster; + @Override + public boolean replicate(ReplicateContext replicateContext) { + // if you want to block the replication, for example, do not want the recovered source to be + // removed + if (clusterKey.endsWith("error")) { + throw new RuntimeException("Inject error"); + } + return true; + } - protected static ZKWatcher zkw; + @Override + public void init(Context context) throws IOException { + super.init(context); + this.clusterKey = context.getReplicationPeer().getPeerConfig().getClusterKey(); + } - protected static TableDescriptor htd; + } - protected static RegionInfo hri; + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); - protected static final byte[] r1 = Bytes.toBytes("r1"); + private static Configuration CONF; - protected static final byte[] r2 = Bytes.toBytes("r2"); + private static FileSystem FS; - protected static final byte[] f1 = Bytes.toBytes("f1"); + private static final byte[] F1 = Bytes.toBytes("f1"); - protected static final byte[] f2 = Bytes.toBytes("f2"); + private static final byte[] F2 = Bytes.toBytes("f2"); - protected static final TableName test = TableName.valueOf("test"); + private static final TableName TABLE_NAME = TableName.valueOf("test"); - protected static final String slaveId = "1"; + private static TableDescriptor TD; - protected static FileSystem fs; + private static RegionInfo RI; - protected static Path oldLogDir; + private static NavigableMap SCOPES; - protected static Path logDir; + @Rule + public final TestName name = new TestName(); - protected static Path remoteLogDir; + private Path oldLogDir; - protected static CountDownLatch latch; + private Path logDir; - protected static List files = new ArrayList<>(); - protected static NavigableMap scopes; + private Path remoteLogDir; - protected static void setupZkAndReplication() throws Exception { - // The implementing class should set up the conf - assertNotNull(conf); - zkw = new ZKWatcher(conf, "test", null); - ZKUtil.createWithParents(zkw, "/hbase/replication"); - ZKUtil.createWithParents(zkw, "/hbase/replication/peers/1"); - ZKUtil.setData(zkw, "/hbase/replication/peers/1", - Bytes.toBytes(conf.get(HConstants.ZOOKEEPER_QUORUM) + ":" - + conf.get(HConstants.ZOOKEEPER_CLIENT_PORT) + ":/1")); - ZKUtil.createWithParents(zkw, "/hbase/replication/peers/1/peer-state"); - ZKUtil.setData(zkw, "/hbase/replication/peers/1/peer-state", - ZKReplicationPeerStorage.ENABLED_ZNODE_BYTES); - ZKUtil.createWithParents(zkw, "/hbase/replication/peers/1/sync-rep-state"); - ZKUtil.setData(zkw, "/hbase/replication/peers/1/sync-rep-state", - ZKReplicationPeerStorage.NONE_STATE_ZNODE_BYTES); - ZKUtil.createWithParents(zkw, "/hbase/replication/peers/1/new-sync-rep-state"); - ZKUtil.setData(zkw, "/hbase/replication/peers/1/new-sync-rep-state", - ZKReplicationPeerStorage.NONE_STATE_ZNODE_BYTES); - ZKUtil.createWithParents(zkw, "/hbase/replication/state"); - ZKUtil.setData(zkw, "/hbase/replication/state", ZKReplicationPeerStorage.ENABLED_ZNODE_BYTES); + private Server server; - ZKClusterId.setClusterId(zkw, new ClusterId()); - CommonFSUtils.setRootDir(utility.getConfiguration(), utility.getDataTestDir()); - fs = FileSystem.get(conf); - oldLogDir = utility.getDataTestDir(HConstants.HREGION_OLDLOGDIR_NAME); - logDir = utility.getDataTestDir(HConstants.HREGION_LOGDIR_NAME); - remoteLogDir = utility.getDataTestDir(ReplicationUtils.REMOTE_WAL_DIR_NAME); - replication = new Replication(); - replication.initialize(new DummyServer(), fs, logDir, oldLogDir, - new WALFactory(conf, "test", null, false)); - managerOfCluster = getManagerFromCluster(); - if (managerOfCluster != null) { - // After replication procedure, we need to add peer by hand (other than by receiving - // notification from zk) - managerOfCluster.addPeer(slaveId); - } + private Replication replication; - manager = replication.getReplicationManager(); - manager.addSource(slaveId); - if (managerOfCluster != null) { - waitPeer(slaveId, managerOfCluster, true); - } - waitPeer(slaveId, manager, true); + private ReplicationSourceManager manager; - htd = TableDescriptorBuilder.newBuilder(test) - .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(f1) + @BeforeClass + public static void setUpBeforeClass() throws Exception { + UTIL.startMiniCluster(1); + FS = UTIL.getTestFileSystem(); + CONF = new Configuration(UTIL.getConfiguration()); + CONF.setLong("replication.sleep.before.failover", 0); + TD = TableDescriptorBuilder.newBuilder(TABLE_NAME) + .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(F1) .setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build()) - .setColumnFamily(ColumnFamilyDescriptorBuilder.of(f2)).build(); - - scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); - for (byte[] fam : htd.getColumnFamilyNames()) { - scopes.put(fam, 0); - } - hri = RegionInfoBuilder.newBuilder(htd.getTableName()).setStartKey(r1).setEndKey(r2).build(); - } + .setColumnFamily(ColumnFamilyDescriptorBuilder.of(F2)).build(); - private static ReplicationSourceManager getManagerFromCluster() { - // TestReplicationSourceManagerZkImpl won't start the mini hbase cluster. - if (utility.getMiniHBaseCluster() == null) { - return null; - } - return utility.getMiniHBaseCluster().getRegionServerThreads().stream() - .map(JVMClusterUtil.RegionServerThread::getRegionServer).findAny() - .map(RegionServerServices::getReplicationSourceService).map(r -> (Replication) r) - .map(Replication::getReplicationManager).get(); + RI = RegionInfoBuilder.newBuilder(TABLE_NAME).build(); + SCOPES = new TreeMap<>(Bytes.BYTES_COMPARATOR); + SCOPES.put(F1, 1); + SCOPES.put(F2, 0); } @AfterClass - public static void tearDownAfterClass() throws Exception { - if (manager != null) { - manager.join(); - } - utility.shutdownMiniCluster(); - } - - @Rule - public TestName testName = new TestName(); - - private void cleanLogDir() throws IOException { - fs.delete(logDir, true); - fs.delete(oldLogDir, true); - fs.delete(remoteLogDir, true); + public static void tearDownAfterClass() throws IOException { + UTIL.shutdownMiniCluster(); } @Before public void setUp() throws Exception { - LOG.info("Start " + testName.getMethodName()); - cleanLogDir(); - } - - @After - public void tearDown() throws Exception { - LOG.info("End " + testName.getMethodName()); - cleanLogDir(); - List ids = manager.getSources().stream().map(ReplicationSourceInterface::getPeerId) - .collect(Collectors.toList()); - for (String id : ids) { - if (slaveId.equals(id)) { - continue; - } - removePeerAndWait(id); - } - } - - @Test - public void testLogRoll() throws Exception { - long baseline = 1000; - long time = baseline; - MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl(); - KeyValue kv = new KeyValue(r1, f1, r1); - WALEdit edit = new WALEdit(); - edit.add(kv); - - WALFactory wals = - new WALFactory(utility.getConfiguration(), URLEncoder.encode("regionserver:60020", "UTF8")); - ReplicationSourceManager replicationManager = replication.getReplicationManager(); - wals.getWALProvider() - .addWALActionsListener(new ReplicationSourceWALActionListener(conf, replicationManager)); - final WAL wal = wals.getWAL(hri); - manager.init(); - TableDescriptor htd = TableDescriptorBuilder.newBuilder(TableName.valueOf("tableame")) - .setColumnFamily(ColumnFamilyDescriptorBuilder.of(f1)).build(); - NavigableMap scopes = new TreeMap<>(Bytes.BYTES_COMPARATOR); - for (byte[] fam : htd.getColumnFamilyNames()) { - scopes.put(fam, 0); - } - // Testing normal log rolling every 20 - for (long i = 1; i < 101; i++) { - if (i > 1 && i % 20 == 0) { - wal.rollWriter(); - } - LOG.info(Long.toString(i)); - final long txid = wal.appendData(hri, new WALKeyImpl(hri.getEncodedNameAsBytes(), test, - EnvironmentEdgeManager.currentTime(), mvcc, scopes), edit); - wal.sync(txid); - } + Path rootDir = UTIL.getDataTestDirOnTestFS(name.getMethodName()); + CommonFSUtils.setRootDir(CONF, rootDir); + server = mock(Server.class); + when(server.getConfiguration()).thenReturn(CONF); + when(server.getZooKeeper()).thenReturn(UTIL.getZooKeeperWatcher()); + when(server.getConnection()).thenReturn(UTIL.getConnection()); + when(server.getServerName()).thenReturn(ServerName.valueOf("hostname.example.org", 1234, 1)); + oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); + FS.mkdirs(oldLogDir); + logDir = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); + FS.mkdirs(logDir); + remoteLogDir = new Path(rootDir, ReplicationUtils.REMOTE_WAL_DIR_NAME); + FS.mkdirs(remoteLogDir); + TableName tableName = TableName.valueOf("replication_" + name.getMethodName()); + UTIL.getAdmin() + .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName)); + CONF.set(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, tableName.getNameAsString()); - // Simulate a rapid insert that's followed - // by a report that's still not totally complete (missing last one) - LOG.info(baseline + " and " + time); - baseline += 101; - time = baseline; - LOG.info(baseline + " and " + time); - - for (int i = 0; i < 3; i++) { - wal.appendData(hri, new WALKeyImpl(hri.getEncodedNameAsBytes(), test, - EnvironmentEdgeManager.currentTime(), mvcc, scopes), edit); - } - wal.sync(); - - int logNumber = 0; - for (Map.Entry> entry : manager.getWALs().get(slaveId) - .entrySet()) { - logNumber += entry.getValue().size(); - } - assertEquals(6, logNumber); - - wal.rollWriter(); - - ReplicationSourceInterface source = mock(ReplicationSourceInterface.class); - when(source.getQueueId()).thenReturn("1"); - when(source.isRecovered()).thenReturn(false); - when(source.isSyncReplication()).thenReturn(false); - manager.logPositionAndCleanOldLogs(source, - new WALEntryBatch(0, manager.getSources().get(0).getCurrentPath())); - - wal.appendData(hri, new WALKeyImpl(hri.getEncodedNameAsBytes(), test, - EnvironmentEdgeManager.currentTime(), mvcc, scopes), edit); - wal.sync(); - - assertEquals(1, manager.getWALs().size()); - - // TODO Need a case with only 2 WALs and we only want to delete the first one - } - - @Test - public void testClaimQueues() throws Exception { - Server server = new DummyServer("hostname0.example.org"); - ReplicationQueueStorage rq = ReplicationStorageFactory - .getReplicationQueueStorage(server.getZooKeeper(), server.getConfiguration()); - // populate some znodes in the peer znode - files.add("log1"); - files.add("log2"); - for (String file : files) { - rq.addWAL(server.getServerName(), "1", file); - } - // create 3 DummyServers - Server s1 = new DummyServer("dummyserver1.example.org"); - Server s2 = new DummyServer("dummyserver2.example.org"); - Server s3 = new DummyServer("dummyserver3.example.org"); - - // create 3 DummyNodeFailoverWorkers - DummyNodeFailoverWorker w1 = new DummyNodeFailoverWorker(server.getServerName(), s1); - DummyNodeFailoverWorker w2 = new DummyNodeFailoverWorker(server.getServerName(), s2); - DummyNodeFailoverWorker w3 = new DummyNodeFailoverWorker(server.getServerName(), s3); - - latch = new CountDownLatch(3); - // start the threads - w1.start(); - w2.start(); - w3.start(); - // make sure only one is successful - int populatedMap = 0; - // wait for result now... till all the workers are done. - latch.await(); - populatedMap += - w1.isLogZnodesMapPopulated() + w2.isLogZnodesMapPopulated() + w3.isLogZnodesMapPopulated(); - assertEquals(1, populatedMap); - server.abort("", null); + replication = new Replication(); + replication.initialize(server, FS, logDir, oldLogDir, + new WALFactory(CONF, "test", null, false)); + manager = replication.getReplicationManager(); } - @Test - public void testCleanupFailoverQueues() throws Exception { - Server server = new DummyServer("hostname1.example.org"); - ReplicationQueueStorage rq = ReplicationStorageFactory - .getReplicationQueueStorage(server.getZooKeeper(), server.getConfiguration()); - // populate some znodes in the peer znode - SortedSet files = new TreeSet<>(); - String group = "testgroup"; - String file1 = group + "." + EnvironmentEdgeManager.currentTime() + ".log1"; - String file2 = group + "." + EnvironmentEdgeManager.currentTime() + ".log2"; - files.add(file1); - files.add(file2); - for (String file : files) { - rq.addWAL(server.getServerName(), "1", file); - } - Server s1 = new DummyServer("dummyserver1.example.org"); - ReplicationPeers rp1 = ReplicationFactory.getReplicationPeers(s1.getFileSystem(), - s1.getZooKeeper(), s1.getConfiguration()); - rp1.init(); - manager.claimQueue(server.getServerName(), "1"); - assertEquals(1, manager.getWalsByIdRecoveredQueues().size()); - String id = "1-" + server.getServerName().getServerName(); - assertEquals(files, manager.getWalsByIdRecoveredQueues().get(id).get(group)); - ReplicationSourceInterface source = mock(ReplicationSourceInterface.class); - when(source.getQueueId()).thenReturn(id); - when(source.isRecovered()).thenReturn(true); - when(source.isSyncReplication()).thenReturn(false); - manager.cleanOldLogs(file2, false, source); - // log1 should be deleted - assertEquals(Sets.newHashSet(file2), manager.getWalsByIdRecoveredQueues().get(id).get(group)); + @After + public void tearDown() { + replication.stopReplicationService(); } - @Test - public void testCleanupUnknownPeerZNode() throws Exception { - Server server = new DummyServer("hostname2.example.org"); - ReplicationQueueStorage rq = ReplicationStorageFactory - .getReplicationQueueStorage(server.getZooKeeper(), server.getConfiguration()); - // populate some znodes in the peer znode - // add log to an unknown peer - String group = "testgroup"; - rq.addWAL(server.getServerName(), "2", group + ".log1"); - rq.addWAL(server.getServerName(), "2", group + ".log2"); - - manager.claimQueue(server.getServerName(), "2"); - - // The log of the unknown peer should be removed from zk - for (String peer : manager.getAllQueues()) { - assertTrue(peer.startsWith("1")); - } + /** + * Add a peer and wait for it to initialize + */ + private void addPeerAndWait(String peerId, String clusterKey, boolean syncRep) + throws ReplicationException, IOException { + ReplicationPeerConfigBuilder builder = ReplicationPeerConfig.newBuilder() + .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/" + clusterKey) + .setReplicationEndpointImpl(ReplicationEndpointForTest.class.getName()); + if (syncRep) { + builder.setTableCFsMap(ImmutableMap.of(TABLE_NAME, Collections.emptyList())) + .setRemoteWALDir(FS.makeQualified(remoteLogDir).toString()); + } + + manager.getReplicationPeers().getPeerStorage().addPeer(peerId, builder.build(), true, + syncRep ? SyncReplicationState.DOWNGRADE_ACTIVE : SyncReplicationState.NONE); + manager.addPeer(peerId); + UTIL.waitFor(20000, () -> { + ReplicationSourceInterface rs = manager.getSource(peerId); + return rs != null && rs.isSourceActive(); + }); } /** - * Test for HBASE-9038, Replication.scopeWALEdits would NPE if it wasn't filtering out the - * compaction WALEdit. + * Remove a peer and wait for it to get cleaned up */ - @Test - public void testCompactionWALEdits() throws Exception { - TableName tableName = TableName.valueOf("testCompactionWALEdits"); - WALProtos.CompactionDescriptor compactionDescriptor = - WALProtos.CompactionDescriptor.getDefaultInstance(); - RegionInfo hri = RegionInfoBuilder.newBuilder(tableName).setStartKey(HConstants.EMPTY_START_ROW) - .setEndKey(HConstants.EMPTY_END_ROW).build(); - WALEdit edit = WALEdit.createCompaction(hri, compactionDescriptor); - ReplicationSourceWALActionListener.scopeWALEdits(new WALKeyImpl(), edit, conf); + private void removePeerAndWait(String peerId) throws Exception { + ReplicationPeers rp = manager.getReplicationPeers(); + rp.getPeerStorage().removePeer(peerId); + manager.removePeer(peerId); + UTIL.waitFor(20000, () -> { + if (rp.getPeer(peerId) != null) { + return false; + } + if (manager.getSource(peerId) != null) { + return false; + } + return manager.getOldSources().stream().noneMatch(rs -> rs.getPeerId().equals(peerId)); + }); } - @Test - public void testBulkLoadWALEditsWithoutBulkLoadReplicationEnabled() throws Exception { - NavigableMap scope = new TreeMap<>(Bytes.BYTES_COMPARATOR); - // 1. Get the bulk load wal edit event - WALEdit logEdit = getBulkLoadWALEdit(scope); - // 2. Create wal key - WALKeyImpl logKey = new WALKeyImpl(scope); - - // 3. Get the scopes for the key - ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, conf); - - // 4. Assert that no bulk load entry scopes are added if bulk load hfile replication is disabled - assertNull("No bulk load entries scope should be added if bulk load replication is disabled.", - logKey.getReplicationScopes()); + private void createWALFile(Path file) throws Exception { + ProtobufLogWriter writer = new ProtobufLogWriter(); + try { + writer.init(FS, file, CONF, false, FS.getDefaultBlockSize(file), null); + WALKeyImpl key = new WALKeyImpl(RI.getEncodedNameAsBytes(), TABLE_NAME, + EnvironmentEdgeManager.currentTime(), SCOPES); + WALEdit edit = new WALEdit(); + edit.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setRow(F1).setFamily(F1) + .setQualifier(F1).setType(Cell.Type.Put).setValue(F1).build()); + edit.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setRow(F2).setFamily(F2) + .setQualifier(F2).setType(Cell.Type.Put).setValue(F2).build()); + writer.append(new WAL.Entry(key, edit)); + writer.sync(false); + } finally { + writer.close(); + } } @Test - public void testBulkLoadWALEdits() throws Exception { - // 1. Get the bulk load wal edit event - NavigableMap scope = new TreeMap<>(Bytes.BYTES_COMPARATOR); - WALEdit logEdit = getBulkLoadWALEdit(scope); - // 2. Create wal key - WALKeyImpl logKey = new WALKeyImpl(scope); - // 3. Enable bulk load hfile replication - Configuration bulkLoadConf = HBaseConfiguration.create(conf); - bulkLoadConf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true); - - // 4. Get the scopes for the key - ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, bulkLoadConf); - - NavigableMap scopes = logKey.getReplicationScopes(); - // Assert family with replication scope global is present in the key scopes - assertTrue("This family scope is set to global, should be part of replication key scopes.", - scopes.containsKey(f1)); - // Assert family with replication scope local is not present in the key scopes - assertFalse("This family scope is set to local, should not be part of replication key scopes", - scopes.containsKey(f2)); + public void testClaimQueue() throws Exception { + String peerId = "1"; + addPeerAndWait(peerId, "error", false); + ServerName serverName = ServerName.valueOf("hostname0.example.org", 12345, 123); + String walName1 = serverName.toString() + ".1"; + createWALFile(new Path(oldLogDir, walName1)); + ReplicationQueueId queueId = new ReplicationQueueId(serverName, peerId); + ReplicationQueueStorage queueStorage = manager.getQueueStorage(); + queueStorage.setOffset(queueId, "", new ReplicationGroupOffset(peerId, 0), + Collections.emptyMap()); + manager.claimQueue(queueId); + assertThat(manager.getOldSources(), hasSize(1)); } - /** - * Test whether calling removePeer() on a ReplicationSourceManager that failed on initializing the - * corresponding ReplicationSourceInterface correctly cleans up the corresponding replication - * queue and ReplicationPeer. See HBASE-16096. - */ @Test - public void testPeerRemovalCleanup() throws Exception { - String replicationSourceImplName = conf.get("replication.replicationsource.implementation"); - final String peerId = "FakePeer"; - final ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder() - .setClusterKey(utility.getZkCluster().getAddress().toString() + ":/hbase").build(); - try { - DummyServer server = new DummyServer(); - ReplicationQueueStorage rq = ReplicationStorageFactory - .getReplicationQueueStorage(server.getZooKeeper(), server.getConfiguration()); - // Purposely fail ReplicationSourceManager.addSource() by causing ReplicationSourceInterface - // initialization to throw an exception. - conf.set("replication.replicationsource.implementation", - FailInitializeDummyReplicationSource.class.getName()); - manager.getReplicationPeers(); - // Set up the znode and ReplicationPeer for the fake peer - // Don't wait for replication source to initialize, we know it won't. - addPeerAndWait(peerId, peerConfig, false); - - // Sanity check - assertNull(manager.getSource(peerId)); - - // Create a replication queue for the fake peer - rq.addWAL(server.getServerName(), peerId, "FakeFile"); - // Unregister peer, this should remove the peer and clear all queues associated with it - // Need to wait for the ReplicationTracker to pick up the changes and notify listeners. - removePeerAndWait(peerId); - assertFalse(rq.getAllQueues(server.getServerName()).contains(peerId)); - } finally { - conf.set("replication.replicationsource.implementation", replicationSourceImplName); - removePeerAndWait(peerId); - } - } + public void testSameWALPrefix() throws IOException { + String walName1 = "localhost,8080,12345-45678-Peer.34567"; + String walName2 = "localhost,8080,12345.56789"; + manager.postLogRoll(new Path(walName1)); + manager.postLogRoll(new Path(walName2)); - private static MetricsReplicationSourceSource getGlobalSource() throws Exception { - ReplicationSourceInterface source = manager.getSource(slaveId); - // Retrieve the global replication metrics source - Field f = MetricsSource.class.getDeclaredField("globalSourceSource"); - f.setAccessible(true); - return (MetricsReplicationSourceSource) f.get(source.getSourceMetrics()); + Set latestWals = + manager.getLastestPath().stream().map(Path::getName).collect(Collectors.toSet()); + assertThat(latestWals, + Matchers.> both(hasSize(2)).and(hasItems(walName1, walName2))); } - private static long getSizeOfLatestPath() { - // If no mini cluster is running, there are extra replication manager influencing the metrics. - if (utility.getMiniHBaseCluster() == null) { - return 0; - } - return utility.getMiniHBaseCluster().getRegionServerThreads().stream() - .map(JVMClusterUtil.RegionServerThread::getRegionServer) - .map(RegionServerServices::getReplicationSourceService).map(r -> (Replication) r) - .map(Replication::getReplicationManager) - .mapToLong(ReplicationSourceManager::getSizeOfLatestPath).sum(); + private MetricsReplicationSourceSource getGlobalSource() { + return CompatibilitySingletonFactory.getInstance(MetricsReplicationSourceFactory.class) + .getGlobalSource(); } @Test public void testRemovePeerMetricsCleanup() throws Exception { - final String peerId = "DummyPeer"; - final ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder() - .setClusterKey(utility.getZkCluster().getAddress().toString() + ":/hbase").build(); - try { - MetricsReplicationSourceSource globalSource = getGlobalSource(); - final int globalLogQueueSizeInitial = globalSource.getSizeOfLogQueue(); - final long sizeOfLatestPath = getSizeOfLatestPath(); - addPeerAndWait(peerId, peerConfig, true); - assertEquals(sizeOfLatestPath + globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); - ReplicationSourceInterface source = manager.getSource(peerId); - // Sanity check - assertNotNull(source); - final int sizeOfSingleLogQueue = source.getSourceMetrics().getSizeOfLogQueue(); - // Enqueue log and check if metrics updated - source.enqueueLog(new Path("abc")); - assertEquals(1 + sizeOfSingleLogQueue, source.getSourceMetrics().getSizeOfLogQueue()); - assertEquals(source.getSourceMetrics().getSizeOfLogQueue() + globalLogQueueSizeInitial, - globalSource.getSizeOfLogQueue()); - - // Removing the peer should reset the global metrics - removePeerAndWait(peerId); - assertEquals(globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); - - // Adding the same peer back again should reset the single source metrics - addPeerAndWait(peerId, peerConfig, true); - source = manager.getSource(peerId); - assertNotNull(source); - assertEquals(source.getSourceMetrics().getSizeOfLogQueue() + globalLogQueueSizeInitial, - globalSource.getSizeOfLogQueue()); - } finally { - removePeerAndWait(peerId); - } + MetricsReplicationSourceSource globalSource = getGlobalSource(); + int globalLogQueueSizeInitial = globalSource.getSizeOfLogQueue(); + String peerId = "DummyPeer"; + addPeerAndWait(peerId, "hbase", false); + // there is no latestPaths so the size of log queue should not change + assertEquals(globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); + + ReplicationSourceInterface source = manager.getSource(peerId); + // Sanity check + assertNotNull(source); + int sizeOfSingleLogQueue = source.getSourceMetrics().getSizeOfLogQueue(); + // Enqueue log and check if metrics updated + Path serverLogDir = new Path(logDir, server.getServerName().toString()); + source.enqueueLog(new Path(serverLogDir, server.getServerName() + ".1")); + assertEquals(1 + sizeOfSingleLogQueue, source.getSourceMetrics().getSizeOfLogQueue()); + assertEquals(source.getSourceMetrics().getSizeOfLogQueue() + globalLogQueueSizeInitial, + globalSource.getSizeOfLogQueue()); + + // Removing the peer should reset the global metrics + removePeerAndWait(peerId); + assertEquals(globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); + + // Adding the same peer back again should reset the single source metrics + addPeerAndWait(peerId, "hbase", false); + source = manager.getSource(peerId); + assertNotNull(source); + assertEquals(source.getSourceMetrics().getSizeOfLogQueue() + globalLogQueueSizeInitial, + globalSource.getSizeOfLogQueue()); } @Test public void testDisablePeerMetricsCleanup() throws Exception { final String peerId = "DummyPeer"; - final ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder() - .setClusterKey(utility.getZkCluster().getAddress().toString() + ":/hbase").build(); try { MetricsReplicationSourceSource globalSource = getGlobalSource(); final int globalLogQueueSizeInitial = globalSource.getSizeOfLogQueue(); - final long sizeOfLatestPath = getSizeOfLatestPath(); - addPeerAndWait(peerId, peerConfig, true); - assertEquals(sizeOfLatestPath + globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); + addPeerAndWait(peerId, "hbase", false); + assertEquals(globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); ReplicationSourceInterface source = manager.getSource(peerId); // Sanity check assertNotNull(source); final int sizeOfSingleLogQueue = source.getSourceMetrics().getSizeOfLogQueue(); // Enqueue log and check if metrics updated - source.enqueueLog(new Path("abc")); + Path serverLogDir = new Path(logDir, server.getServerName().toString()); + source.enqueueLog(new Path(serverLogDir, server.getServerName() + ".1")); assertEquals(1 + sizeOfSingleLogQueue, source.getSourceMetrics().getSizeOfLogQueue()); assertEquals(source.getSourceMetrics().getSizeOfLogQueue() + globalLogQueueSizeInitial, globalSource.getSizeOfLogQueue()); @@ -607,274 +355,27 @@ public void testDisablePeerMetricsCleanup() throws Exception { } } - private ReplicationSourceInterface mockReplicationSource(String peerId) { - ReplicationSourceInterface source = mock(ReplicationSourceInterface.class); - when(source.getPeerId()).thenReturn(peerId); - when(source.getQueueId()).thenReturn(peerId); - when(source.isRecovered()).thenReturn(false); - when(source.isSyncReplication()).thenReturn(true); - ReplicationPeerConfig config = mock(ReplicationPeerConfig.class); - when(config.getRemoteWALDir()) - .thenReturn(remoteLogDir.makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString()); - ReplicationPeer peer = mock(ReplicationPeer.class); - when(peer.getPeerConfig()).thenReturn(config); - when(source.getPeer()).thenReturn(peer); - return source; - } - @Test public void testRemoveRemoteWALs() throws Exception { - String peerId2 = slaveId + "_2"; - addPeerAndWait(peerId2, - ReplicationPeerConfig.newBuilder() - .setClusterKey("localhost:" + utility.getZkCluster().getClientPort() + ":/hbase").build(), - true); - try { - // make sure that we can deal with files which does not exist - String walNameNotExists = - "remoteWAL-12345-" + slaveId + ".12345" + ReplicationUtils.SYNC_WAL_SUFFIX; - Path wal = new Path(logDir, walNameNotExists); - manager.preLogRoll(wal); - manager.postLogRoll(wal); - - Path remoteLogDirForPeer = new Path(remoteLogDir, slaveId); - fs.mkdirs(remoteLogDirForPeer); - String walName = "remoteWAL-12345-" + slaveId + ".23456" + ReplicationUtils.SYNC_WAL_SUFFIX; - Path remoteWAL = - new Path(remoteLogDirForPeer, walName).makeQualified(fs.getUri(), fs.getWorkingDirectory()); - fs.create(remoteWAL).close(); - wal = new Path(logDir, walName); - manager.preLogRoll(wal); - manager.postLogRoll(wal); - - ReplicationSourceInterface source = mockReplicationSource(peerId2); - manager.cleanOldLogs(walName, true, source); - // still there if peer id does not match - assertTrue(fs.exists(remoteWAL)); - - source = mockReplicationSource(slaveId); - manager.cleanOldLogs(walName, true, source); - assertFalse(fs.exists(remoteWAL)); - } finally { - removePeerAndWait(peerId2); - } - } - - @Test - public void testSameWALPrefix() throws IOException { - Set latestWalsBefore = - manager.getLastestPath().stream().map(Path::getName).collect(Collectors.toSet()); - String walName1 = "localhost,8080,12345-45678-Peer.34567"; - String walName2 = "localhost,8080,12345.56789"; - manager.preLogRoll(new Path(walName1)); - manager.preLogRoll(new Path(walName2)); - - Set latestWals = manager.getLastestPath().stream().map(Path::getName) - .filter(n -> !latestWalsBefore.contains(n)).collect(Collectors.toSet()); - assertEquals(2, latestWals.size()); - assertTrue(latestWals.contains(walName1)); - assertTrue(latestWals.contains(walName2)); - } - - /** - * Add a peer and wait for it to initialize - * @param waitForSource Whether to wait for replication source to initialize - */ - private void addPeerAndWait(final String peerId, final ReplicationPeerConfig peerConfig, - final boolean waitForSource) throws Exception { - final ReplicationPeers rp = manager.getReplicationPeers(); - rp.getPeerStorage().addPeer(peerId, peerConfig, true, SyncReplicationState.NONE); - try { - manager.addPeer(peerId); - } catch (Exception e) { - // ignore the failed exception, because we'll test both success & failed case. - } - waitPeer(peerId, manager, waitForSource); - if (managerOfCluster != null) { - managerOfCluster.addPeer(peerId); - waitPeer(peerId, managerOfCluster, waitForSource); - } - } - - private static void waitPeer(final String peerId, ReplicationSourceManager manager, - final boolean waitForSource) { - ReplicationPeers rp = manager.getReplicationPeers(); - Waiter.waitFor(conf, 20000, () -> { - if (waitForSource) { - ReplicationSourceInterface rs = manager.getSource(peerId); - if (rs == null) { - return false; - } - if (rs instanceof ReplicationSourceDummy) { - return ((ReplicationSourceDummy) rs).isStartup(); - } - return true; - } else { - return (rp.getPeer(peerId) != null); - } - }); - } - - /** - * Remove a peer and wait for it to get cleaned up - */ - private void removePeerAndWait(final String peerId) throws Exception { - final ReplicationPeers rp = manager.getReplicationPeers(); - if (rp.getPeerStorage().listPeerIds().contains(peerId)) { - rp.getPeerStorage().removePeer(peerId); - try { - manager.removePeer(peerId); - } catch (Exception e) { - // ignore the failed exception and continue. - } - } - Waiter.waitFor(conf, 20000, new Waiter.Predicate() { - @Override - public boolean evaluate() throws Exception { - Collection peers = rp.getPeerStorage().listPeerIds(); - return (!manager.getAllQueues().contains(peerId)) && (rp.getPeer(peerId) == null) - && (!peers.contains(peerId)) && manager.getSource(peerId) == null; - } - }); - } - - private WALEdit getBulkLoadWALEdit(NavigableMap scope) { - // 1. Create store files for the families - Map> storeFiles = new HashMap<>(1); - Map storeFilesSize = new HashMap<>(1); - List p = new ArrayList<>(1); - Path hfilePath1 = new Path(Bytes.toString(f1)); - p.add(hfilePath1); - try { - storeFilesSize.put(hfilePath1.getName(), fs.getFileStatus(hfilePath1).getLen()); - } catch (IOException e) { - LOG.debug("Failed to calculate the size of hfile " + hfilePath1); - storeFilesSize.put(hfilePath1.getName(), 0L); - } - storeFiles.put(f1, p); - scope.put(f1, 1); - p = new ArrayList<>(1); - Path hfilePath2 = new Path(Bytes.toString(f2)); - p.add(hfilePath2); - try { - storeFilesSize.put(hfilePath2.getName(), fs.getFileStatus(hfilePath2).getLen()); - } catch (IOException e) { - LOG.debug("Failed to calculate the size of hfile " + hfilePath2); - storeFilesSize.put(hfilePath2.getName(), 0L); - } - storeFiles.put(f2, p); - // 2. Create bulk load descriptor - BulkLoadDescriptor desc = ProtobufUtil.toBulkLoadDescriptor(hri.getTable(), - UnsafeByteOperations.unsafeWrap(hri.getEncodedNameAsBytes()), storeFiles, storeFilesSize, 1); - - // 3. create bulk load wal edit event - WALEdit logEdit = WALEdit.createBulkLoadEvent(hri, desc); - return logEdit; - } - - static class DummyNodeFailoverWorker extends Thread { - private Map> logZnodesMap; - Server server; - private ServerName deadRS; - ReplicationQueueStorage rq; - - public DummyNodeFailoverWorker(ServerName deadRS, Server s) throws Exception { - this.deadRS = deadRS; - this.server = s; - this.rq = ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), - server.getConfiguration()); - } - - @Override - public void run() { - try { - logZnodesMap = new HashMap<>(); - List queues = rq.getAllQueues(deadRS); - for (String queue : queues) { - Pair> pair = - rq.claimQueue(deadRS, queue, server.getServerName()); - if (pair != null) { - logZnodesMap.put(pair.getFirst(), pair.getSecond()); - } - } - server.abort("Done with testing", null); - } catch (Exception e) { - LOG.error("Got exception while running NodeFailoverWorker", e); - } finally { - latch.countDown(); - } - } - - /** Returns 1 when the map is not empty. */ - private int isLogZnodesMapPopulated() { - Collection> sets = logZnodesMap.values(); - if (sets.size() > 1) { - throw new RuntimeException("unexpected size of logZnodesMap: " + sets.size()); - } - if (sets.size() == 1) { - Set s = sets.iterator().next(); - for (String file : files) { - // at least one file was missing - if (!s.contains(file)) { - return 0; - } - } - return 1; // we found all the files - } - return 0; - } - } - - static class FailInitializeDummyReplicationSource extends ReplicationSourceDummy { - - @Override - public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager, - ReplicationQueueStorage rq, ReplicationPeer rp, Server server, String peerClusterId, - UUID clusterId, WALFileLengthProvider walFileLengthProvider, MetricsSource metrics) - throws IOException { - throw new IOException("Failing deliberately"); - } - } - - static class DummyServer extends MockServer { - String hostname; - - DummyServer() { - hostname = "hostname.example.org"; - } - - DummyServer(String hostname) { - this.hostname = hostname; - } - - @Override - public Configuration getConfiguration() { - return conf; - } - - @Override - public ZKWatcher getZooKeeper() { - return zkw; - } - - @Override - public FileSystem getFileSystem() { - return fs; - } - - @Override - public Connection getConnection() { - return null; - } - - @Override - public ChoreService getChoreService() { - return null; - } - - @Override - public ServerName getServerName() { - return ServerName.valueOf(hostname, 1234, 1L); - } + String peerId = "2"; + addPeerAndWait(peerId, "hbase", true); + // make sure that we can deal with files which does not exist + String walNameNotExists = + "remoteWAL-12345-" + peerId + ".12345" + ReplicationUtils.SYNC_WAL_SUFFIX; + Path wal = new Path(logDir, walNameNotExists); + manager.postLogRoll(wal); + + Path remoteLogDirForPeer = new Path(remoteLogDir, peerId); + FS.mkdirs(remoteLogDirForPeer); + String walName = "remoteWAL-12345-" + peerId + ".23456" + ReplicationUtils.SYNC_WAL_SUFFIX; + Path remoteWAL = + new Path(remoteLogDirForPeer, walName).makeQualified(FS.getUri(), FS.getWorkingDirectory()); + FS.create(remoteWAL).close(); + wal = new Path(logDir, walName); + manager.postLogRoll(wal); + + ReplicationSourceInterface source = manager.getSource(peerId); + manager.cleanOldLogs(walName, true, source); + assertFalse(FS.exists(remoteWAL)); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java deleted file mode 100644 index b26505a6270f..000000000000 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.replication.regionserver; - -import static org.junit.Assert.assertTrue; - -import java.util.List; -import org.apache.hadoop.hbase.HBaseClassTestRule; -import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.HBaseTestingUtil; -import org.apache.hadoop.hbase.Server; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; -import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.replication.ReplicationSourceDummy; -import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; -import org.apache.hadoop.hbase.testclassification.MediumTests; -import org.apache.hadoop.hbase.testclassification.ReplicationTests; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -/** - * Tests the ReplicationSourceManager with ReplicationQueueZkImpl's and - * ReplicationQueuesClientZkImpl. Also includes extra tests outside of those in - * TestReplicationSourceManager that test ReplicationQueueZkImpl-specific behaviors. - */ -@Category({ ReplicationTests.class, MediumTests.class }) -public class TestReplicationSourceManagerZkImpl extends TestReplicationSourceManager { - - @ClassRule - public static final HBaseClassTestRule CLASS_RULE = - HBaseClassTestRule.forClass(TestReplicationSourceManagerZkImpl.class); - - @BeforeClass - public static void setUpBeforeClass() throws Exception { - conf = HBaseConfiguration.create(); - conf.set("replication.replicationsource.implementation", - ReplicationSourceDummy.class.getCanonicalName()); - conf.setLong("replication.sleep.before.failover", 2000); - conf.setInt("replication.source.maxretriesmultiplier", 10); - utility = new HBaseTestingUtil(conf); - utility.startMiniZKCluster(); - setupZkAndReplication(); - } - - // Tests the naming convention of adopted queues for ReplicationQueuesZkImpl - @Test - public void testNodeFailoverDeadServerParsing() throws Exception { - Server server = new DummyServer("ec2-54-234-230-108.compute-1.amazonaws.com"); - ReplicationQueueStorage queueStorage = - ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf); - // populate some znodes in the peer znode - files.add("log1"); - files.add("log2"); - for (String file : files) { - queueStorage.addWAL(server.getServerName(), "1", file); - } - - // create 3 DummyServers - Server s1 = new DummyServer("ip-10-8-101-114.ec2.internal"); - Server s2 = new DummyServer("ec2-107-20-52-47.compute-1.amazonaws.com"); - Server s3 = new DummyServer("ec2-23-20-187-167.compute-1.amazonaws.com"); - - // simulate three servers fail sequentially - ServerName serverName = server.getServerName(); - List unclaimed = queueStorage.getAllQueues(serverName); - queueStorage.claimQueue(serverName, unclaimed.get(0), s1.getServerName()); - queueStorage.removeReplicatorIfQueueIsEmpty(serverName); - - serverName = s1.getServerName(); - unclaimed = queueStorage.getAllQueues(serverName); - queueStorage.claimQueue(serverName, unclaimed.get(0), s2.getServerName()); - queueStorage.removeReplicatorIfQueueIsEmpty(serverName); - - serverName = s2.getServerName(); - unclaimed = queueStorage.getAllQueues(serverName); - String queue3 = - queueStorage.claimQueue(serverName, unclaimed.get(0), s3.getServerName()).getFirst(); - queueStorage.removeReplicatorIfQueueIsEmpty(serverName); - - ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(queue3); - List result = replicationQueueInfo.getDeadRegionServers(); - // verify - assertTrue(result.contains(server.getServerName())); - assertTrue(result.contains(s1.getServerName())); - assertTrue(result.contains(s2.getServerName())); - - server.stop(""); - } -} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationWALEdits.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationWALEdits.java new file mode 100644 index 000000000000..bc885db0df54 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationWALEdits.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication.regionserver; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.testclassification.ReplicationTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.wal.WALEdit; +import org.apache.hadoop.hbase.wal.WALKeyImpl; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; + +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor; + +@Category({ ReplicationTests.class, SmallTests.class }) +public class TestReplicationWALEdits { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReplicationWALEdits.class); + + private static final Configuration CONF = HBaseConfiguration.create(); + + private static final TableName TABLE_NAME = TableName.valueOf("test"); + + private static final byte[] F1 = Bytes.toBytes("f1"); + + private static final byte[] F2 = Bytes.toBytes("f2"); + + private static final RegionInfo RI = RegionInfoBuilder.newBuilder(TABLE_NAME).build(); + + /** + * Test for HBASE-9038, Replication.scopeWALEdits would NPE if it wasn't filtering out the + * compaction WALEdit. + */ + @Test + public void testCompactionWALEdits() throws Exception { + TableName tableName = TableName.valueOf("testCompactionWALEdits"); + WALProtos.CompactionDescriptor compactionDescriptor = + WALProtos.CompactionDescriptor.getDefaultInstance(); + RegionInfo hri = RegionInfoBuilder.newBuilder(tableName).setStartKey(HConstants.EMPTY_START_ROW) + .setEndKey(HConstants.EMPTY_END_ROW).build(); + WALEdit edit = WALEdit.createCompaction(hri, compactionDescriptor); + ReplicationSourceWALActionListener.scopeWALEdits(new WALKeyImpl(), edit, CONF); + } + + private WALEdit getBulkLoadWALEdit(NavigableMap scope) { + // 1. Create store files for the families + Map> storeFiles = new HashMap<>(1); + Map storeFilesSize = new HashMap<>(1); + List p = new ArrayList<>(1); + Path hfilePath1 = new Path(Bytes.toString(F1)); + p.add(hfilePath1); + storeFilesSize.put(hfilePath1.getName(), 0L); + storeFiles.put(F1, p); + scope.put(F1, 1); + p = new ArrayList<>(1); + Path hfilePath2 = new Path(Bytes.toString(F2)); + p.add(hfilePath2); + storeFilesSize.put(hfilePath2.getName(), 0L); + storeFiles.put(F2, p); + // 2. Create bulk load descriptor + BulkLoadDescriptor desc = ProtobufUtil.toBulkLoadDescriptor(RI.getTable(), + UnsafeByteOperations.unsafeWrap(RI.getEncodedNameAsBytes()), storeFiles, storeFilesSize, 1); + + // 3. create bulk load wal edit event + WALEdit logEdit = WALEdit.createBulkLoadEvent(RI, desc); + return logEdit; + } + + @Test + public void testBulkLoadWALEditsWithoutBulkLoadReplicationEnabled() throws Exception { + NavigableMap scope = new TreeMap<>(Bytes.BYTES_COMPARATOR); + // 1. Get the bulk load wal edit event + WALEdit logEdit = getBulkLoadWALEdit(scope); + // 2. Create wal key + WALKeyImpl logKey = new WALKeyImpl(scope); + + // 3. Get the scopes for the key + ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, CONF); + + // 4. Assert that no bulk load entry scopes are added if bulk load hfile replication is disabled + assertNull("No bulk load entries scope should be added if bulk load replication is disabled.", + logKey.getReplicationScopes()); + } + + @Test + public void testBulkLoadWALEdits() throws Exception { + // 1. Get the bulk load wal edit event + NavigableMap scope = new TreeMap<>(Bytes.BYTES_COMPARATOR); + WALEdit logEdit = getBulkLoadWALEdit(scope); + // 2. Create wal key + WALKeyImpl logKey = new WALKeyImpl(scope); + // 3. Enable bulk load hfile replication + Configuration bulkLoadConf = HBaseConfiguration.create(CONF); + bulkLoadConf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true); + + // 4. Get the scopes for the key + ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, bulkLoadConf); + + NavigableMap scopes = logKey.getReplicationScopes(); + // Assert family with replication scope global is present in the key scopes + assertTrue("This family scope is set to global, should be part of replication key scopes.", + scopes.containsKey(F1)); + // Assert family with replication scope local is not present in the key scopes + assertFalse("This family scope is set to local, should not be part of replication key scopes", + scopes.containsKey(F2)); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java index 9da367694290..1544265435c7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java @@ -46,6 +46,8 @@ import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.replication.ReplicationBarrierFamilyFormat; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.testclassification.MediumTests; @@ -94,10 +96,11 @@ public class TestSerialReplicationChecker { @BeforeClass public static void setUpBeforeClass() throws Exception { UTIL.startMiniCluster(1); - QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), - UTIL.getConfiguration()); - QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_ID, - WAL_FILE_NAME); + TableName repTable = TableName.valueOf("test_serial_rep"); + UTIL.getAdmin() + .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(repTable)); + QUEUE_STORAGE = + ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), repTable); } @AfterClass @@ -174,8 +177,10 @@ private void setState(RegionInfo region, RegionState.State state) throws IOExcep } private void updatePushedSeqId(RegionInfo region, long seqId) throws ReplicationException { - QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), - PEER_ID, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId)); + ReplicationQueueId queueId = new ReplicationQueueId( + UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_ID); + QUEUE_STORAGE.setOffset(queueId, "", new ReplicationGroupOffset(WAL_FILE_NAME, 10), + ImmutableMap.of(region.getEncodedName(), seqId)); } private void addParents(RegionInfo region, List parents) throws IOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestWALEntrySinkFilter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestWALEntrySinkFilter.java index d66aef492ffe..93fa22c00fd3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestWALEntrySinkFilter.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestWALEntrySinkFilter.java @@ -25,7 +25,6 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; @@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.CellScanner; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseConfiguration; -import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.AdvancedScanResultConsumer; import org.apache.hadoop.hbase.client.AsyncClusterConnection; @@ -55,8 +53,6 @@ import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.rules.TestName; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.protobuf.ByteString; @@ -72,31 +68,12 @@ public class TestWALEntrySinkFilter { public static final HBaseClassTestRule CLASS_RULE = HBaseClassTestRule.forClass(TestWALEntrySinkFilter.class); - private static final Logger LOG = LoggerFactory.getLogger(TestReplicationSink.class); @Rule public TestName name = new TestName(); static final int BOUNDARY = 5; static final AtomicInteger UNFILTERED = new AtomicInteger(); static final AtomicInteger FILTERED = new AtomicInteger(); - /** - * Implemetentation of Stoppable to pass into ReplicationSink. - */ - private static Stoppable STOPPABLE = new Stoppable() { - private final AtomicBoolean stop = new AtomicBoolean(false); - - @Override - public boolean isStopped() { - return this.stop.get(); - } - - @Override - public void stop(String why) { - LOG.info("STOPPING BECAUSE: " + why); - this.stop.set(true); - } - }; - /** * Test filter. Filter will filter out any write time that is <= 5 (BOUNDARY). We count how many * items we filter out and we count how many cells make it through for distribution way down below diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckCleanReplicationBarriers.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckCleanReplicationBarriers.java index d4f0ec664e36..20ed3796dbd9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckCleanReplicationBarriers.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckCleanReplicationBarriers.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.MetaTableAccessor; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; import org.apache.hadoop.hbase.client.Put; @@ -43,7 +44,9 @@ import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.replication.ReplicationBarrierFamilyFormat; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.testclassification.MediumTests; @@ -78,13 +81,9 @@ public class TestHBaseFsckCleanReplicationBarriers { @BeforeClass public static void setUp() throws Exception { UTIL.startMiniCluster(1); - QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), + QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), UTIL.getConfiguration()); createPeer(); - QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_1, - WAL_FILE_NAME); - QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_2, - WAL_FILE_NAME); } @AfterClass @@ -205,9 +204,12 @@ private void addStateAndBarrier(RegionInfo region, RegionState.State state, long } private void updatePushedSeqId(RegionInfo region, long seqId) throws ReplicationException { - QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), - PEER_1, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId)); - QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), - PEER_2, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId)); + ServerName sn = UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(); + QUEUE_STORAGE.setOffset(new ReplicationQueueId(sn, PEER_1), "", + new ReplicationGroupOffset(WAL_FILE_NAME, 10), + ImmutableMap.of(region.getEncodedName(), seqId)); + QUEUE_STORAGE.setOffset(new ReplicationQueueId(sn, PEER_2), "", + new ReplicationGroupOffset(WAL_FILE_NAME, 10), + ImmutableMap.of(region.getEncodedName(), seqId)); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckEncryption.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckEncryption.java index 66fd10bd156e..aeed1a9a4837 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckEncryption.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckEncryption.java @@ -55,9 +55,12 @@ import org.junit.After; import org.junit.Before; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; +//revisit later +@Ignore @Category({ MiscTests.class, MediumTests.class }) public class TestHBaseFsckEncryption { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckMOB.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckMOB.java index 70ea559e3301..b24b721762d3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckMOB.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckMOB.java @@ -40,11 +40,14 @@ import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; +// revisit later +@Ignore @Category({ MiscTests.class, MediumTests.class }) public class TestHBaseFsckMOB extends BaseTestHBaseFsck { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java index e44e00d2d375..fdf0d2d6a250 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java @@ -17,25 +17,14 @@ */ package org.apache.hadoop.hbase.util; -import static org.junit.Assert.assertEquals; - -import java.util.List; -import java.util.stream.Stream; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; -import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; -import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; -import org.apache.hadoop.hbase.replication.SyncReplicationState; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; -import org.apache.hadoop.hbase.util.HbckErrorReporter.ERROR_CODE; -import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -59,51 +48,53 @@ public static void tearDown() throws Exception { UTIL.shutdownMiniCluster(); } + // TODO: reimplement + @Ignore @Test public void test() throws Exception { - ReplicationPeerStorage peerStorage = ReplicationStorageFactory.getReplicationPeerStorage( - UTIL.getTestFileSystem(), UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); - ReplicationQueueStorage queueStorage = ReplicationStorageFactory - .getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); - - String peerId1 = "1"; - String peerId2 = "2"; - peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), - true, SyncReplicationState.NONE); - peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), - true, SyncReplicationState.NONE); - for (int i = 0; i < 10; i++) { - queueStorage.addWAL(ServerName.valueOf("localhost", 10000 + i, 100000 + i), peerId1, - "file-" + i); - } - queueStorage.addWAL(ServerName.valueOf("localhost", 10000, 100000), peerId2, "file"); - HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); - HbckTestingUtil.assertNoErrors(fsck); - - // should not remove anything since the replication peer is still alive - assertEquals(10, queueStorage.getListOfReplicators().size()); - peerStorage.removePeer(peerId1); - // there should be orphan queues - assertEquals(10, queueStorage.getListOfReplicators().size()); - fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false); - HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { - return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; - }).limit(10).toArray(ERROR_CODE[]::new)); - - // should not delete anything when fix is false - assertEquals(10, queueStorage.getListOfReplicators().size()); - - fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); - HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { - return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; - }).limit(10).toArray(ERROR_CODE[]::new)); - - List replicators = queueStorage.getListOfReplicators(); - // should not remove the server with queue for peerId2 - assertEquals(1, replicators.size()); - assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0)); - for (String queueId : queueStorage.getAllQueues(replicators.get(0))) { - assertEquals(peerId2, queueId); - } + // ReplicationPeerStorage peerStorage = ReplicationStorageFactory + // .getReplicationPeerStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); + // ReplicationQueueStorage queueStorage = ReplicationStorageFactory + // .getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); + // + // String peerId1 = "1"; + // String peerId2 = "2"; + // peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), + // true, SyncReplicationState.NONE); + // peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), + // true, SyncReplicationState.NONE); + // for (int i = 0; i < 10; i++) { + // queueStorage.addWAL(ServerName.valueOf("localhost", 10000 + i, 100000 + i), peerId1, + // "file-" + i); + // } + // queueStorage.addWAL(ServerName.valueOf("localhost", 10000, 100000), peerId2, "file"); + // HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); + // HbckTestingUtil.assertNoErrors(fsck); + // + // // should not remove anything since the replication peer is still alive + // assertEquals(10, queueStorage.getListOfReplicators().size()); + // peerStorage.removePeer(peerId1); + // // there should be orphan queues + // assertEquals(10, queueStorage.getListOfReplicators().size()); + // fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false); + // HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { + // return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; + // }).limit(10).toArray(ERROR_CODE[]::new)); + // + // // should not delete anything when fix is false + // assertEquals(10, queueStorage.getListOfReplicators().size()); + // + // fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); + // HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { + // return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; + // }).limit(10).toArray(ERROR_CODE[]::new)); + // + // List replicators = queueStorage.getListOfReplicators(); + // // should not remove the server with queue for peerId2 + // assertEquals(1, replicators.size()); + // assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0)); + // for (String queueId : queueStorage.getAllQueues(replicators.get(0))) { + // assertEquals(peerId2, queueId); + // } } } From 254f4bc1df01371acfd8e0b35b0490de7b6a2d82 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sat, 20 Aug 2022 23:10:58 +0800 Subject: [PATCH 02/16] HBASE-27213 Add support for claim queue operation (#4708) Signed-off-by: Xin Sun --- .../server/master/MasterProcedure.proto | 6 +- .../AssignReplicationQueuesProcedure.java | 13 +- .../replication/ModifyPeerProcedure.java | 2 +- .../replication/RemovePeerProcedure.java | 41 +++- .../ReplicationSourceManager.java | 37 ++-- .../TestClaimReplicationQueue.java | 2 +- .../TestRemovePeerProcedureWaitForSCP.java | 180 ++++++++++++++++++ .../TestSerialReplicationFailover.java | 3 - 8 files changed, 258 insertions(+), 26 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestRemovePeerProcedureWaitForSCP.java diff --git a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto index 2e0da0deb842..76a1d676487a 100644 --- a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto @@ -515,6 +515,7 @@ message UpdatePeerConfigStateData { message RemovePeerStateData { optional ReplicationPeer peer_config = 1; + repeated int64 ongoing_assign_replication_queues_proc_ids = 2; } message EnablePeerStateData { @@ -714,9 +715,8 @@ message ModifyColumnFamilyStoreFileTrackerStateData { } enum AssignReplicationQueuesState { - ASSIGN_REPLICATION_QUEUES_PRE_CHECK = 1; - ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 2; - ASSIGN_REPLICATION_QUEUES_CLAIM = 3; + ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 1; + ASSIGN_REPLICATION_QUEUES_CLAIM = 2; } message AssignReplicationQueuesStateData { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java index e7fb5e517159..d33259dd4368 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java @@ -23,6 +23,7 @@ import java.util.Iterator; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; @@ -102,8 +103,12 @@ private void addMissingQueues(MasterProcedureEnv env) throws ReplicationExceptio } private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException { + Set existingPeerIds = env.getReplicationPeerManager().listPeers(null).stream() + .map(ReplicationPeerDescription::getPeerId).collect(Collectors.toSet()); ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); - List queueIds = storage.listAllQueueIds(crashedServer); + // filter out replication queue for deleted peers + List queueIds = storage.listAllQueueIds(crashedServer).stream() + .filter(q -> existingPeerIds.contains(q.getPeerId())).collect(Collectors.toList()); if (queueIds.isEmpty()) { LOG.debug("Finish claiming replication queues for {}", crashedServer); // we are done @@ -130,10 +135,6 @@ protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesS throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { try { switch (state) { - case ASSIGN_REPLICATION_QUEUES_PRE_CHECK: - // TODO: reserved for implementing the fencing logic with Add/Remove/UpdatePeerProcedure - setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES); - return Flow.HAS_MORE_STATE; case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES: addMissingQueues(env); retryCounter = null; @@ -183,7 +184,7 @@ protected int getStateId(AssignReplicationQueuesState state) { @Override protected AssignReplicationQueuesState getInitialState() { - return AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_PRE_CHECK; + return AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java index 3af902e1d8a4..50214e205192 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java @@ -74,7 +74,7 @@ protected abstract void prePeerModification(MasterProcedureEnv env) * update the peer storage. */ protected abstract void postPeerModification(MasterProcedureEnv env) - throws IOException, ReplicationException; + throws IOException, ReplicationException, ProcedureSuspendedException; protected void releaseLatch(MasterProcedureEnv env) { ProcedurePrepareLatch.releaseLatch(latch, this); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/RemovePeerProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/RemovePeerProcedure.java index 2042e8468497..2fadc3fd6642 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/RemovePeerProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/RemovePeerProcedure.java @@ -18,10 +18,17 @@ package org.apache.hadoop.hbase.master.replication; import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; import org.apache.hadoop.hbase.client.replication.ReplicationPeerConfigUtil; import org.apache.hadoop.hbase.master.MasterCoprocessorHost; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.yetus.audience.InterfaceAudience; @@ -40,6 +47,8 @@ public class RemovePeerProcedure extends ModifyPeerProcedure { private ReplicationPeerConfig peerConfig; + private List ongoingAssignReplicationQueuesProcIds = Collections.emptyList(); + public RemovePeerProcedure() { } @@ -64,15 +73,43 @@ protected void prePeerModification(MasterProcedureEnv env) throws IOException { @Override protected void updatePeerStorage(MasterProcedureEnv env) throws ReplicationException { env.getReplicationPeerManager().removePeer(peerId); + // record ongoing AssignReplicationQueuesProcedures after we update the peer storage + ongoingAssignReplicationQueuesProcIds = env.getMasterServices().getMasterProcedureExecutor() + .getProcedures().stream().filter(p -> p instanceof AssignReplicationQueuesProcedure) + .filter(p -> !p.isFinished()).map(Procedure::getProcId).collect(Collectors.toList()); } private void removeRemoteWALs(MasterProcedureEnv env) throws IOException { env.getMasterServices().getSyncReplicationReplayWALManager().removePeerRemoteWALs(peerId); } + private void checkAssignReplicationQueuesFinished(MasterProcedureEnv env) + throws ProcedureSuspendedException { + if (ongoingAssignReplicationQueuesProcIds.isEmpty()) { + LOG.info("No ongoing assign replication queues procedures when removing peer {}, move on", + peerId); + } + ProcedureExecutor procExec = + env.getMasterServices().getMasterProcedureExecutor(); + long[] unfinishedProcIds = + ongoingAssignReplicationQueuesProcIds.stream().map(procExec::getProcedure) + .filter(p -> p != null && !p.isFinished()).mapToLong(Procedure::getProcId).toArray(); + if (unfinishedProcIds.length == 0) { + LOG.info( + "All assign replication queues procedures are finished when removing peer {}, move on", + peerId); + } else { + throw suspend(env.getMasterConfiguration(), backoff -> LOG.info( + "There are still {} pending assign replication queues procedures {} when removing peer {}, sleep {} secs", + unfinishedProcIds.length, Arrays.toString(unfinishedProcIds), peerId, backoff / 1000)); + } + } + @Override protected void postPeerModification(MasterProcedureEnv env) - throws IOException, ReplicationException { + throws IOException, ReplicationException, ProcedureSuspendedException { + checkAssignReplicationQueuesFinished(env); + if (peerConfig.isSyncReplication()) { removeRemoteWALs(env); } @@ -94,6 +131,7 @@ protected void serializeStateData(ProcedureStateSerializer serializer) throws IO if (peerConfig != null) { builder.setPeerConfig(ReplicationPeerConfigUtil.convert(peerConfig)); } + builder.addAllOngoingAssignReplicationQueuesProcIds(ongoingAssignReplicationQueuesProcIds); serializer.serialize(builder.build()); } @@ -104,5 +142,6 @@ protected void deserializeStateData(ProcedureStateSerializer serializer) throws if (data.hasPeerConfig()) { this.peerConfig = ReplicationPeerConfigUtil.convert(data.getPeerConfig()); } + ongoingAssignReplicationQueuesProcIds = data.getOngoingAssignReplicationQueuesProcIdsList(); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index c16ba8b133c6..5d77600a187b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -237,7 +237,7 @@ public ReplicationSourceManager(ReplicationQueueStorage queueStorage, */ void init() throws IOException { for (String id : this.replicationPeers.getAllPeerIds()) { - addSource(id); + addSource(id, true); } } @@ -257,7 +257,7 @@ public void addPeer(String peerId) throws IOException { throw new IOException(e); } if (added) { - addSource(peerId); + addSource(peerId, false); } } @@ -323,11 +323,16 @@ private ReplicationSourceInterface createSource(ReplicationQueueData queueData, /** * Add a normal source for the given peer on this region server. Meanwhile, add new replication * queue to storage. For the newly added peer, we only need to enqueue the latest log of each wal - * group and do replication + * group and do replication. + *

    + * We add a {@code init} parameter to indicate whether this is part of the initialization process. + * If so, we should skip adding the replication queues as this may introduce dead lock on region + * server start up and hbase:replication table online. * @param peerId the id of the replication peer + * @param init whether this call is part of the initialization process * @return the source that was created */ - void addSource(String peerId) throws IOException { + void addSource(String peerId, boolean init) throws IOException { ReplicationPeer peer = replicationPeers.getPeer(peerId); if ( ReplicationUtils.LEGACY_REGION_REPLICATION_ENDPOINT_NAME @@ -352,11 +357,16 @@ void addSource(String peerId) throws IOException { NavigableSet wals = new TreeSet<>(); wals.add(walPath.getName()); walsByGroup.put(walPrefixAndPath.getKey(), wals); - // Abort RS and throw exception to make add peer failed - // TODO: can record the length of the current wal file so we could replicate less data - abortAndThrowIOExceptionWhenFail( - () -> this.queueStorage.setOffset(queueId, walPrefixAndPath.getKey(), - new ReplicationGroupOffset(walPath.getName(), 0), Collections.emptyMap())); + if (!init) { + // Abort RS and throw exception to make add peer failed + // Ideally we'd better use the current file size as offset so we can skip replicating + // the data before adding replication peer, but the problem is that the file may not end + // at a valid entry's ending, and the current WAL Reader implementation can not deal + // with reading from the middle of a WAL entry. Can improve later. + abortAndThrowIOExceptionWhenFail( + () -> this.queueStorage.setOffset(queueId, walPrefixAndPath.getKey(), + new ReplicationGroupOffset(walPath.getName(), 0), Collections.emptyMap())); + } src.enqueueLog(walPath); LOG.trace("Enqueued {} to source {} during source creation.", walPath, src.getQueueId()); } @@ -795,9 +805,15 @@ public void postLogRoll(Path newLog) throws IOException { * @return {@code true} means we should replicate the given {@code wal}, otherwise {@code false}. */ private boolean shouldReplicate(ReplicationGroupOffset offset, String wal) { - if (offset == null || offset == ReplicationGroupOffset.BEGIN) { + // skip replicating meta wals + if (AbstractFSWALProvider.isMetaFile(wal)) { return false; } + // if no offset or the offset is just a place marker, replicate + if (offset == null || offset == ReplicationGroupOffset.BEGIN) { + return true; + } + // otherwise, compare the timestamp long walTs = AbstractFSWALProvider.getTimestamp(wal); long startWalTs = AbstractFSWALProvider.getTimestamp(offset.getWal()); if (walTs < startWalTs) { @@ -892,7 +908,6 @@ Comparator. comparing(p -> AbstractFSWALProvider.getTimestamp(p.getN LOG.debug("Skip enqueuing log {} because it is before the start offset {}", file.getName(), groupOffset); } - walFilesPQ.add(file); } // the method is a bit long, so assign it to null here to avoid later we reuse it again by // mistake, we should use the sorted walFilesPQ instead diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java index a12081a76363..de226b13e8fc 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestClaimReplicationQueue.java @@ -156,7 +156,7 @@ public void testClaim() throws Exception { hbaseAdmin.enableReplicationPeer(PEER_ID3); EMPTY = false; - // wait until the SCP finished, ClaimReplicationQueuesProcedure is a sub procedure of SCP + // wait until the SCP finished, AssignReplicationQueuesProcedure is a sub procedure of SCP UTIL1.waitFor(30000, () -> master.getProcedures().stream() .filter(p -> p instanceof ServerCrashProcedure).allMatch(Procedure::isSuccess)); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestRemovePeerProcedureWaitForSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestRemovePeerProcedureWaitForSCP.java new file mode 100644 index 000000000000..e93fa3b01e87 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestRemovePeerProcedureWaitForSCP.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import static org.hamcrest.MatcherAssert.*; +import static org.hamcrest.Matchers.*; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.RegionServerList; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure; +import org.apache.hadoop.hbase.master.replication.RemovePeerProcedure; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.ReplicationTests; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hbase.thirdparty.com.google.common.io.Closeables; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.PeerModificationState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; + +/** + * Make sure we will wait until all the SCPs finished in RemovePeerProcedure. + *

    + * See HBASE-27109 for more details. + */ +@Category({ ReplicationTests.class, LargeTests.class }) +public class TestRemovePeerProcedureWaitForSCP extends TestReplicationBase { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRemovePeerProcedureWaitForSCP.class); + + private static final TableName tableName3 = TableName.valueOf("test3"); + + private static final String PEER_ID3 = "3"; + + private static Table table3; + + private static volatile boolean EMPTY = false; + + public static final class ServerManagerForTest extends ServerManager { + + public ServerManagerForTest(MasterServices master, RegionServerList storage) { + super(master, storage); + } + + @Override + public List getOnlineServersList() { + // return no region server to make the procedure hang + if (EMPTY) { + for (StackTraceElement e : Thread.currentThread().getStackTrace()) { + if (e.getClassName().equals(AssignReplicationQueuesProcedure.class.getName())) { + return Collections.emptyList(); + } + } + } + return super.getOnlineServersList(); + } + } + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException { + super(conf); + } + + @Override + protected ServerManager createServerManager(MasterServices master, RegionServerList storage) + throws IOException { + setupClusterConnection(); + return new ServerManagerForTest(master, storage); + } + } + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + CONF1.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); + TestReplicationBase.setUpBeforeClass(); + createTable(tableName3); + table3 = connection1.getTable(tableName3); + } + + @Override + public void setUpBase() throws Exception { + super.setUpBase(); + // set up two replication peers and only 1 rs to test claim replication queue with multiple + // round + addPeer(PEER_ID3, tableName3); + } + + @Override + public void tearDownBase() throws Exception { + super.tearDownBase(); + removePeer(PEER_ID3); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + Closeables.close(table3, true); + TestReplicationBase.tearDownAfterClass(); + } + + @Test + public void testWait() throws Exception { + // disable the peers + hbaseAdmin.disableReplicationPeer(PEER_ID2); + hbaseAdmin.disableReplicationPeer(PEER_ID3); + + // put some data + UTIL1.loadTable(htable1, famName); + UTIL1.loadTable(table3, famName); + + EMPTY = true; + UTIL1.getMiniHBaseCluster().stopRegionServer(0).join(); + UTIL1.getMiniHBaseCluster().startRegionServer(); + + // since there is no active region server to get the replication queue, the procedure should be + // in WAITING_TIMEOUT state for most time to retry + HMaster master = UTIL1.getMiniHBaseCluster().getMaster(); + UTIL1.waitFor(30000, + () -> master.getProcedures().stream() + .filter(p -> p instanceof AssignReplicationQueuesProcedure) + .anyMatch(p -> p.getState() == ProcedureState.WAITING_TIMEOUT)); + + // call remove replication peer, and make sure it will be stuck in the POST_PEER_MODIFICATION + // state. + hbaseAdmin.removeReplicationPeerAsync(PEER_ID3); + UTIL1.waitFor(30000, + () -> master.getProcedures().stream().filter(p -> p instanceof RemovePeerProcedure) + .anyMatch(p -> ((RemovePeerProcedure) p).getCurrentStateId() + == PeerModificationState.POST_PEER_MODIFICATION_VALUE)); + Thread.sleep(5000); + assertEquals(PeerModificationState.POST_PEER_MODIFICATION_VALUE, + ((RemovePeerProcedure) master.getProcedures().stream() + .filter(p -> p instanceof RemovePeerProcedure).findFirst().get()).getCurrentStateId()); + EMPTY = false; + // wait until the SCP finished, AssignReplicationQueuesProcedure is a sub procedure of SCP + UTIL1.waitFor(30000, () -> master.getProcedures().stream() + .filter(p -> p instanceof ServerCrashProcedure).allMatch(Procedure::isSuccess)); + // the RemovePeerProcedure should have also finished + UTIL1.waitFor(30000, () -> master.getProcedures().stream() + .filter(p -> p instanceof RemovePeerProcedure).allMatch(Procedure::isSuccess)); + // make sure there is no remaining replication queues for PEER_ID3 + assertThat(master.getReplicationPeerManager().getQueueStorage().listAllQueueIds(PEER_ID3), + empty()); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java index 6906db4cd466..1295ea14abcd 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSerialReplicationFailover.java @@ -32,12 +32,9 @@ import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; import org.junit.Before; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; -// revisit later when we reviewing the implementation for serial replication -@Ignore @Category({ ReplicationTests.class, MediumTests.class }) public class TestSerialReplicationFailover extends SerialReplicationTestBase { From 3966835fc021e673ed0ab6580677a9f68aa6bd2f Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Wed, 31 Aug 2022 21:24:09 +0800 Subject: [PATCH 03/16] HBASE-27214 Implement the new replication hfile/log cleaner (#4722) Signed-off-by: Xin Sun --- .../master/cleaner/FileCleanerDelegate.java | 2 +- .../hbase/master/region/MasterRegion.java | 2 +- .../master/replication/AddPeerProcedure.java | 21 +- .../replication/ReplicationPeerManager.java | 8 + .../hbase/regionserver/HRegionServer.java | 2 +- .../replication/ReplicationOffsetUtil.java | 47 +++ .../master/ReplicationLogCleaner.java | 234 +++++++---- .../master/ReplicationLogCleanerBarrier.java | 85 ++++ .../ReplicationSourceManager.java | 18 +- .../regionserver/ReplicationSyncUp.java | 5 +- .../hbase/wal/AbstractFSWALProvider.java | 29 ++ .../apache/hadoop/hbase/wal/WALFactory.java | 29 +- .../hbase/master/cleaner/TestLogsCleaner.java | 227 ++++------- .../cleaner/TestReplicationHFileCleaner.java | 43 +- .../TestReplicationOffsetUtil.java | 52 +++ .../master/TestLogCleanerBarrier.java | 60 +++ .../master/TestReplicationLogCleaner.java | 385 ++++++++++++++++++ .../TestReplicationSourceManager.java | 2 +- .../hadoop/hbase/wal/TestWALFactory.java | 2 +- .../hadoop/hbase/wal/TestWALMethods.java | 14 + 20 files changed, 1014 insertions(+), 253 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ReplicationOffsetUtil.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleanerBarrier.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationOffsetUtil.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestLogCleanerBarrier.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/FileCleanerDelegate.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/FileCleanerDelegate.java index d37bb6202730..e08f53294336 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/FileCleanerDelegate.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/FileCleanerDelegate.java @@ -50,7 +50,7 @@ default void preClean() { } /** - * Used to do some cleanup work + * Will be called after cleaner run. */ default void postClean() { } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index 86c231144581..e45b6271f7b9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -380,7 +380,7 @@ public static MasterRegion create(MasterRegionParams params) throws IOException params.archivedWalSuffix(), params.rollPeriodMs(), params.flushSize()); walRoller.start(); - WALFactory walFactory = new WALFactory(conf, server.getServerName().toString(), server, false); + WALFactory walFactory = new WALFactory(conf, server.getServerName(), server, false); Path tableDir = CommonFSUtils.getTableDir(rootDir, td.getTableName()); Path initializingFlag = new Path(tableDir, INITIALIZING_FLAG); Path initializedFlag = new Path(tableDir, INITIALIZED_FLAG); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java index 6d0acee76caa..1d02fab5f194 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AddPeerProcedure.java @@ -21,7 +21,6 @@ import org.apache.hadoop.hbase.client.replication.ReplicationPeerConfigUtil; import org.apache.hadoop.hbase.master.MasterCoprocessorHost; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; -import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.replication.ReplicationException; @@ -45,6 +44,8 @@ public class AddPeerProcedure extends ModifyPeerProcedure { private boolean enabled; + private boolean cleanerDisabled; + public AddPeerProcedure() { } @@ -84,15 +85,24 @@ protected ReplicationPeerConfig getNewPeerConfig() { @Override protected void releaseLatch(MasterProcedureEnv env) { + if (cleanerDisabled) { + env.getReplicationPeerManager().getReplicationLogCleanerBarrier().enable(); + } if (peerConfig.isSyncReplication()) { env.getReplicationPeerManager().releaseSyncReplicationPeerLock(); } - ProcedurePrepareLatch.releaseLatch(latch, this); + super.releaseLatch(env); } @Override protected void prePeerModification(MasterProcedureEnv env) throws IOException, ReplicationException, ProcedureSuspendedException { + if (!env.getReplicationPeerManager().getReplicationLogCleanerBarrier().disable()) { + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.warn("LogCleaner is run at the same time when adding peer {}, sleep {} secs", + peerId, backoff / 1000)); + } + cleanerDisabled = true; MasterCoprocessorHost cpHost = env.getMasterCoprocessorHost(); if (cpHost != null) { cpHost.preAddReplicationPeer(peerId, peerConfig); @@ -128,9 +138,14 @@ protected void postPeerModification(MasterProcedureEnv env) @Override protected void afterReplay(MasterProcedureEnv env) { if (getCurrentState() == getInitialState()) { - // will try to acquire the lock when executing the procedure, no need to acquire it here + // do not need to disable log cleaner or acquire lock if we are in the initial state, later + // when executing the procedure we will try to disable and acquire. return; } + if (!env.getReplicationPeerManager().getReplicationLogCleanerBarrier().disable()) { + throw new IllegalStateException("can not disable log cleaner, this should not happen"); + } + cleanerDisabled = true; if (peerConfig.isSyncReplication()) { if (!env.getReplicationPeerManager().tryAcquireSyncReplicationPeerLock()) { throw new IllegalStateException( diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index 53270bcbb04e..57380920d0fc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -60,6 +60,7 @@ import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; +import org.apache.hadoop.hbase.replication.master.ReplicationLogCleanerBarrier; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.zookeeper.ZKClusterId; import org.apache.hadoop.hbase.zookeeper.ZKConfig; @@ -102,6 +103,9 @@ public class ReplicationPeerManager implements ConfigurationObserver { // Only allow to add one sync replication peer concurrently private final Semaphore syncReplicationPeerLock = new Semaphore(1); + private final ReplicationLogCleanerBarrier replicationLogCleanerBarrier = + new ReplicationLogCleanerBarrier(); + private final String clusterId; private volatile Configuration conf; @@ -705,6 +709,10 @@ public void releaseSyncReplicationPeerLock() { syncReplicationPeerLock.release(); } + public ReplicationLogCleanerBarrier getReplicationLogCleanerBarrier() { + return replicationLogCleanerBarrier; + } + @Override public void onConfigurationChange(Configuration conf) { this.conf = conf; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 6daae10b726f..1bdf6a225c62 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1733,7 +1733,7 @@ public boolean isOnline() { * be hooked up to WAL. */ private void setupWALAndReplication() throws IOException { - WALFactory factory = new WALFactory(conf, serverName.toString(), this, true); + WALFactory factory = new WALFactory(conf, serverName, this, true); // TODO Replication make assumptions here based on the default filesystem impl Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME); String logName = AbstractFSWALProvider.getWALDirectoryName(this.serverName.toString()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ReplicationOffsetUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ReplicationOffsetUtil.java new file mode 100644 index 000000000000..052c5542d47a --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/ReplicationOffsetUtil.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public final class ReplicationOffsetUtil { + + private ReplicationOffsetUtil() { + } + + public static boolean shouldReplicate(ReplicationGroupOffset offset, String wal) { + // if no offset or the offset is just a place marker, replicate + if (offset == null || offset == ReplicationGroupOffset.BEGIN) { + return true; + } + // otherwise, compare the timestamp + long walTs = AbstractFSWALProvider.getTimestamp(wal); + long startWalTs = AbstractFSWALProvider.getTimestamp(offset.getWal()); + if (walTs < startWalTs) { + return false; + } else if (walTs > startWalTs) { + return true; + } + // if the timestamp equals, usually it means we should include this wal but there is a special + // case, a negative offset means the wal has already been fully replicated, so here we should + // check the offset. + return offset.getOffset() >= 0; + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java index 7135ca9a9b20..f1fd8f8d6b3a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java @@ -17,18 +17,29 @@ */ package org.apache.hadoop.hbase.replication.master; -import java.io.IOException; import java.util.Collections; +import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; -import org.apache.hadoop.conf.Configuration; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hbase.HBaseInterfaceAudience; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate; -import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationOffsetUtil; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,35 +51,129 @@ /** * Implementation of a log cleaner that checks if a log is still scheduled for replication before * deleting it when its TTL is over. + *

    + * The logic is a bit complicated after we switch to use table based replication queue storage, see + * the design doc in HBASE-27109 and the comments in HBASE-27214 for more details. */ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG) public class ReplicationLogCleaner extends BaseLogCleanerDelegate { private static final Logger LOG = LoggerFactory.getLogger(ReplicationLogCleaner.class); - private ZKWatcher zkw = null; - private boolean shareZK = false; - private ReplicationQueueStorage queueStorage; + private Set notFullyDeadServers; + private Set peerIds; + // ServerName -> PeerId -> WalGroup -> Offset + // Here the server name is the source server name, so we can make sure that there is only one + // queue for a given peer, that why we can use a String peerId as key instead of + // ReplicationQueueId. + private Map>> replicationOffsets; + private ReplicationPeerManager rpm; + private Supplier> getNotFullyDeadServers; + + private boolean canFilter; private boolean stopped = false; - private Set wals; - private long readZKTimestamp = 0; @Override public void preClean() { - readZKTimestamp = EnvironmentEdgeManager.currentTime(); - // TODO: revisit the implementation - // try { - // // The concurrently created new WALs may not be included in the return list, - // // but they won't be deleted because they're not in the checking set. - // wals = queueStorage.getAllWALs(); - // } catch (ReplicationException e) { - // LOG.warn("Failed to read zookeeper, skipping checking deletable files"); - // wals = null; - // } + if (this.getConf() == null) { + return; + } + canFilter = rpm.getReplicationLogCleanerBarrier().start(); + if (canFilter) { + notFullyDeadServers = getNotFullyDeadServers.get(); + peerIds = rpm.listPeers(null).stream().map(ReplicationPeerDescription::getPeerId) + .collect(Collectors.toSet()); + // must get the not fully dead servers first and then get the replication queue data, in this + // way we can make sure that, we should have added the missing replication queues for the dead + // region servers recorded in the above set, otherwise the logic in the + // filterForDeadRegionServer method may lead us delete wal still in use. + List allQueueData; + try { + allQueueData = rpm.getQueueStorage().listAllQueues(); + } catch (ReplicationException e) { + LOG.error("Can not list all replication queues, give up cleaning", e); + rpm.getReplicationLogCleanerBarrier().stop(); + canFilter = false; + notFullyDeadServers = null; + peerIds = null; + return; + } + replicationOffsets = new HashMap<>(); + for (ReplicationQueueData queueData : allQueueData) { + ReplicationQueueId queueId = queueData.getId(); + ServerName serverName = queueId.getServerWALsBelongTo(); + Map> peerId2Offsets = + replicationOffsets.computeIfAbsent(serverName, k -> new HashMap<>()); + Map offsets = + peerId2Offsets.computeIfAbsent(queueId.getPeerId(), k -> new HashMap<>()); + offsets.putAll(queueData.getOffsets()); + } + } else { + LOG.info("Skip replication log cleaner because an AddPeerProcedure is running"); + } } @Override public void postClean() { - // release memory - wals = null; + if (canFilter) { + rpm.getReplicationLogCleanerBarrier().stop(); + canFilter = false; + // release memory + notFullyDeadServers = null; + peerIds = null; + replicationOffsets = null; + } + } + + private boolean shouldDelete(ReplicationGroupOffset offset, FileStatus file) { + return !ReplicationOffsetUtil.shouldReplicate(offset, file.getPath().getName()); + } + + private boolean filterForLiveRegionServer(ServerName serverName, FileStatus file) { + Map> peerId2Offsets = + replicationOffsets.get(serverName); + if (peerId2Offsets == null) { + // if there are replication queues missing, we can not delete the wal + return false; + } + for (String peerId : peerIds) { + Map offsets = peerId2Offsets.get(peerId); + // if no replication queue for a peer, we can not delete the wal + if (offsets == null) { + return false; + } + String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName()); + ReplicationGroupOffset offset = offsets.get(walGroupId); + // if a replication queue still need to replicate this wal, we can not delete it + if (!shouldDelete(offset, file)) { + return false; + } + } + // if all replication queues have already finished replicating this wal, we can delete it. + return true; + } + + private boolean filterForDeadRegionServer(ServerName serverName, FileStatus file) { + Map> peerId2Offsets = + replicationOffsets.get(serverName); + if (peerId2Offsets == null) { + // no replication queue for this dead rs, we can delete all wal files for it + return true; + } + for (String peerId : peerIds) { + Map offsets = peerId2Offsets.get(peerId); + if (offsets == null) { + // for dead server, we only care about existing replication queues, as we will delete a + // queue after we finish replicating it. + continue; + } + String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName()); + ReplicationGroupOffset offset = offsets.get(walGroupId); + // if a replication queue still need to replicate this wal, we can not delete it + if (!shouldDelete(offset, file)) { + return false; + } + } + // if all replication queues have already finished replicating this wal, we can delete it. + return true; } @Override @@ -78,10 +183,12 @@ public Iterable getDeletableFiles(Iterable files) { if (this.getConf() == null) { return files; } - - if (wals == null) { + if (!canFilter) { + // We can not delete anything if there are AddPeerProcedure running at the same time + // See HBASE-27214 for more details. return Collections.emptyList(); } + return Iterables.filter(files, new Predicate() { @Override public boolean apply(FileStatus file) { @@ -90,65 +197,56 @@ public boolean apply(FileStatus file) { if (file == null) { return false; } - String wal = file.getPath().getName(); - boolean logInReplicationQueue = wals.contains(wal); - if (logInReplicationQueue) { - LOG.debug("Found up in ZooKeeper, NOT deleting={}", wal); + if (peerIds.isEmpty()) { + // no peer, can always delete + return true; + } + // not a valid wal file name, delete + if (!AbstractFSWALProvider.validateWALFilename(file.getPath().getName())) { + return true; + } + // meta wal is always deletable as we will never replicate it + if (AbstractFSWALProvider.isMetaFile(file.getPath())) { + return true; + } + ServerName serverName = + AbstractFSWALProvider.parseServerNameFromWALName(file.getPath().getName()); + if (notFullyDeadServers.contains(serverName)) { + return filterForLiveRegionServer(serverName, file); + } else { + return filterForDeadRegionServer(serverName, file); } - return !logInReplicationQueue && (file.getModificationTime() < readZKTimestamp); } }); } + private Set getNotFullyDeadServers(MasterServices services) { + List onlineServers = services.getServerManager().getOnlineServersList(); + return Stream.concat(onlineServers.stream(), + services.getMasterProcedureExecutor().getProcedures().stream() + .filter(p -> p instanceof ServerCrashProcedure).filter(p -> !p.isFinished()) + .map(p -> ((ServerCrashProcedure) p).getServerName())) + .collect(Collectors.toSet()); + } + @Override public void init(Map params) { super.init(params); - try { - if (MapUtils.isNotEmpty(params)) { - Object master = params.get(HMaster.MASTER); - if (master != null && master instanceof HMaster) { - zkw = ((HMaster) master).getZooKeeper(); - shareZK = true; - } - } - if (zkw == null) { - zkw = new ZKWatcher(getConf(), "replicationLogCleaner", null); + if (MapUtils.isNotEmpty(params)) { + Object master = params.get(HMaster.MASTER); + if (master != null && master instanceof MasterServices) { + MasterServices m = (MasterServices) master; + rpm = m.getReplicationPeerManager(); + getNotFullyDeadServers = () -> getNotFullyDeadServers(m); + return; } - // TODO: revisit the implementation - // this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); - } catch (IOException e) { - LOG.error("Error while configuring " + this.getClass().getName(), e); } - } - - @InterfaceAudience.Private - public void setConf(Configuration conf, ZKWatcher zk) { - super.setConf(conf); - try { - this.zkw = zk; - // TODO: revisit the implementation - // this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf); - } catch (Exception e) { - LOG.error("Error while configuring " + this.getClass().getName(), e); - } - } - - @InterfaceAudience.Private - public void setConf(Configuration conf, ZKWatcher zk, - ReplicationQueueStorage replicationQueueStorage) { - super.setConf(conf); - this.zkw = zk; - this.queueStorage = replicationQueueStorage; + throw new IllegalArgumentException("Missing " + HMaster.MASTER + " parameter"); } @Override public void stop(String why) { - if (this.stopped) return; this.stopped = true; - if (!shareZK && this.zkw != null) { - LOG.info("Stopping " + this.zkw); - this.zkw.close(); - } } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleanerBarrier.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleanerBarrier.java new file mode 100644 index 000000000000..d87565187280 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleanerBarrier.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication.master; + +import org.apache.yetus.audience.InterfaceAudience; + +/** + * A barrier to guard the execution of {@link ReplicationLogCleaner}. + *

    + * The reason why we introduce this class is because there could be race between + * {@link org.apache.hadoop.hbase.master.replication.AddPeerProcedure} and + * {@link ReplicationLogCleaner}. See HBASE-27214 for more details. + */ +@InterfaceAudience.Private +public class ReplicationLogCleanerBarrier { + + private enum State { + // the cleaner is not running + NOT_RUNNING, + // the cleaner is running + RUNNING, + // the cleaner is disabled + DISABLED + } + + private State state = State.NOT_RUNNING; + + // we could have multiple AddPeerProcedure running at the same time, so here we need to do + // reference counting. + private int numberDisabled = 0; + + public synchronized boolean start() { + if (state == State.NOT_RUNNING) { + state = State.RUNNING; + return true; + } + if (state == State.DISABLED) { + return false; + } + throw new IllegalStateException("Unexpected state " + state); + } + + public synchronized void stop() { + if (state != State.RUNNING) { + throw new IllegalStateException("Unexpected state " + state); + } + state = State.NOT_RUNNING; + } + + public synchronized boolean disable() { + if (state == State.RUNNING) { + return false; + } + if (state == State.NOT_RUNNING) { + state = State.DISABLED; + } + numberDisabled++; + return true; + } + + public synchronized void enable() { + if (state != State.DISABLED) { + throw new IllegalStateException("Unexpected state " + state); + } + numberDisabled--; + if (numberDisabled == 0) { + state = State.NOT_RUNNING; + } + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index 5d77600a187b..b521766ae3dc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -55,6 +55,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationOffsetUtil; import org.apache.hadoop.hbase.replication.ReplicationPeer; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerImpl; @@ -809,22 +810,7 @@ private boolean shouldReplicate(ReplicationGroupOffset offset, String wal) { if (AbstractFSWALProvider.isMetaFile(wal)) { return false; } - // if no offset or the offset is just a place marker, replicate - if (offset == null || offset == ReplicationGroupOffset.BEGIN) { - return true; - } - // otherwise, compare the timestamp - long walTs = AbstractFSWALProvider.getTimestamp(wal); - long startWalTs = AbstractFSWALProvider.getTimestamp(offset.getWal()); - if (walTs < startWalTs) { - return false; - } else if (walTs > startWalTs) { - return true; - } - // if the timestamp equals, usually it means we should include this wal but there is a special - // case, a negative offset means the wal has already been fully replicated, so here we should - // check the offset. - return offset.getOffset() >= 0; + return ReplicationOffsetUtil.shouldReplicate(offset, wal); } void claimQueue(ReplicationQueueId queueId) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java index 50ffd6df1afd..b63ad473719c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java @@ -117,7 +117,10 @@ public boolean isAborted() { System.out.println("Start Replication Server start"); Replication replication = new Replication(); replication.initialize(new DummyServer(zkw), fs, logDir, oldLogDir, - new WALFactory(conf, "test", null, false)); + new WALFactory(conf, + ServerName + .valueOf(getClass().getSimpleName() + ",16010," + EnvironmentEdgeManager.currentTime()), + null, false)); ReplicationSourceManager manager = replication.getReplicationManager(); manager.init(); claimReplicationQueues(zkw, manager); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java index db39a8ba0232..480866949993 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java @@ -19,6 +19,9 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -38,6 +41,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL; import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener; +import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils; @@ -582,4 +586,29 @@ private static String getWALNameGroupFromWALName(String name, int group) { public static String getWALPrefixFromWALName(String name) { return getWALNameGroupFromWALName(name, 1); } + + private static final Pattern SERVER_NAME_PATTERN = Pattern.compile("^[^" + + ServerName.SERVERNAME_SEPARATOR + "]+" + ServerName.SERVERNAME_SEPARATOR + + Addressing.VALID_PORT_REGEX + ServerName.SERVERNAME_SEPARATOR + Addressing.VALID_PORT_REGEX); + + /** + * Parse the server name from wal prefix. A wal's name is always started with a server name in non + * test code. + * @throws IllegalArgumentException if the name passed in is not started with a server name + * @return the server name + */ + public static ServerName parseServerNameFromWALName(String name) { + String decoded; + try { + decoded = URLDecoder.decode(name, StandardCharsets.UTF_8.name()); + } catch (UnsupportedEncodingException e) { + throw new AssertionError("should never happen", e); + } + Matcher matcher = SERVER_NAME_PATTERN.matcher(decoded); + if (matcher.find()) { + return ServerName.valueOf(matcher.group()); + } else { + throw new IllegalArgumentException(name + " is not started with a server name"); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java index 92d96c5e2100..bc0a9eec73a4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Abortable; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager; import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; @@ -191,17 +192,35 @@ static WALProvider createProvider(Class clazz) throws IOE } /** - * @param conf must not be null, will keep a reference to read params in later reader/writer - * instances. - * @param factoryId a unique identifier for this factory. used i.e. by filesystem implementations - * to make a directory + * Create a WALFactory. */ + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*|.*/HBaseTestingUtility.java") public WALFactory(Configuration conf, String factoryId) throws IOException { // default enableSyncReplicationWALProvider is true, only disable SyncReplicationWALProvider // for HMaster or HRegionServer which take system table only. See HBASE-19999 this(conf, factoryId, null, true); } + /** + * Create a WALFactory. + *

    + * This is the constructor you should use when creating a WALFactory in normal code, to make sure + * that the {@code factoryId} is the server name. We need this assumption in some places for + * parsing the server name out from the wal file name. + * @param conf must not be null, will keep a reference to read params + * in later reader/writer instances. + * @param serverName use to generate the factoryId, which will be append at + * the first of the final file name + * @param abortable the server associated with this WAL file + * @param enableSyncReplicationWALProvider whether wrap the wal provider to a + * {@link SyncReplicationWALProvider} n + */ + public WALFactory(Configuration conf, ServerName serverName, Abortable abortable, + boolean enableSyncReplicationWALProvider) throws IOException { + this(conf, serverName.toString(), abortable, enableSyncReplicationWALProvider); + } + /** * @param conf must not be null, will keep a reference to read params * in later reader/writer instances. @@ -211,7 +230,7 @@ public WALFactory(Configuration conf, String factoryId) throws IOException { * @param enableSyncReplicationWALProvider whether wrap the wal provider to a * {@link SyncReplicationWALProvider} */ - public WALFactory(Configuration conf, String factoryId, Abortable abortable, + private WALFactory(Configuration conf, String factoryId, Abortable abortable, boolean enableSyncReplicationWALProvider) throws IOException { // until we've moved reader/writer construction down into providers, this initialization must // happen prior to provider initialization, in case they need to instantiate a reader/writer. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java index 1a0537bcbafe..d7ba6c227c6d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java @@ -18,57 +18,60 @@ package org.apache.hadoop.hbase.master.cleaner; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.io.IOException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.Arrays; -import java.util.Iterator; -import java.util.List; +import java.util.Collections; import java.util.concurrent.ThreadLocalRandom; -import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.Server; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.TableNameTestRule; import org.apache.hadoop.hbase.Waiter; -import org.apache.hadoop.hbase.ZooKeeperConnectionException; -import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; -import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner; +import org.apache.hadoop.hbase.replication.master.ReplicationLogCleanerBarrier; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.MockServer; -import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; -import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Ignore; +import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// revisit later after we implement new replication log cleaner -@Ignore +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + @Category({ MasterTests.class, MediumTests.class }) public class TestLogsCleaner { @@ -88,22 +91,29 @@ public class TestLogsCleaner { private static DirScanPool POOL; + private static String peerId = "1"; + + private MasterServices masterServices; + + private ReplicationQueueStorage queueStorage; + + @Rule + public final TableNameTestRule tableNameRule = new TableNameTestRule(); + @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.startMiniZKCluster(); - TEST_UTIL.startMiniDFSCluster(1); + TEST_UTIL.startMiniCluster(); POOL = DirScanPool.getLogCleanerScanPool(TEST_UTIL.getConfiguration()); } @AfterClass public static void tearDownAfterClass() throws Exception { - TEST_UTIL.shutdownMiniZKCluster(); - TEST_UTIL.shutdownMiniDFSCluster(); + TEST_UTIL.shutdownMiniCluster(); POOL.shutdownNow(); } @Before - public void beforeTest() throws IOException { + public void beforeTest() throws Exception { conf = TEST_UTIL.getConfiguration(); FileSystem fs = TEST_UTIL.getDFSCluster().getFileSystem(); @@ -112,14 +122,51 @@ public void beforeTest() throws IOException { // root directory fs.mkdirs(OLD_WALS_DIR); + + TableName tableName = tableNameRule.getTableName(); + TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName); + TEST_UTIL.getAdmin().createTable(td); + TEST_UTIL.waitTableAvailable(tableName); + queueStorage = + ReplicationStorageFactory.getReplicationQueueStorage(TEST_UTIL.getConnection(), tableName); + + masterServices = mock(MasterServices.class); + when(masterServices.getConnection()).thenReturn(TEST_UTIL.getConnection()); + ReplicationPeerManager rpm = mock(ReplicationPeerManager.class); + when(masterServices.getReplicationPeerManager()).thenReturn(rpm); + when(rpm.getQueueStorage()).thenReturn(queueStorage); + when(rpm.getReplicationLogCleanerBarrier()).thenReturn(new ReplicationLogCleanerBarrier()); + when(rpm.listPeers(null)).thenReturn(new ArrayList<>()); + ServerManager sm = mock(ServerManager.class); + when(masterServices.getServerManager()).thenReturn(sm); + when(sm.getOnlineServersList()).thenReturn(Collections.emptyList()); + @SuppressWarnings("unchecked") + ProcedureExecutor procExec = mock(ProcedureExecutor.class); + when(masterServices.getMasterProcedureExecutor()).thenReturn(procExec); + when(procExec.getProcedures()).thenReturn(Collections.emptyList()); } /** * This tests verifies LogCleaner works correctly with WALs and Procedure WALs located in the same - * oldWALs directory. Created files: - 2 invalid files - 5 old Procedure WALs - 30 old WALs from - * which 3 are in replication - 5 recent Procedure WALs - 1 recent WAL - 1 very new WAL (timestamp - * in future) - masterProcedureWALs subdirectory Files which should stay: - 3 replication WALs - 2 - * new WALs - 5 latest Procedure WALs - masterProcedureWALs subdirectory + * oldWALs directory. + *

    + * Created files: + *

      + *
    • 2 invalid files
    • + *
    • 5 old Procedure WALs
    • + *
    • 30 old WALs from which 3 are in replication
    • + *
    • 5 recent Procedure WALs
    • + *
    • 1 recent WAL
    • + *
    • 1 very new WAL (timestamp in future)
    • + *
    • masterProcedureWALs subdirectory
    • + *
    + * Files which should stay: + *
      + *
    • 3 replication WALs
    • + *
    • 2 new WALs
    • + *
    • 5 latest Procedure WALs
    • + *
    • masterProcedureWALs subdirectory
    • + *
    */ @Test public void testLogCleaning() throws Exception { @@ -131,9 +178,6 @@ public void testLogCleaning() throws Exception { HMaster.decorateMasterConfiguration(conf); Server server = new DummyServer(); - ReplicationQueueStorage queueStorage = ReplicationStorageFactory - .getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf); - String fakeMachineName = URLEncoder.encode(server.getServerName().toString(), StandardCharsets.UTF_8.name()); @@ -159,14 +203,12 @@ public void testLogCleaning() throws Exception { for (int i = 1; i <= 30; i++) { Path fileName = new Path(OLD_WALS_DIR, fakeMachineName + "." + (now - i)); fs.createNewFile(fileName); - // Case 4: put 3 WALs in ZK indicating that they are scheduled for replication so these - // files would pass TimeToLiveLogCleaner but would be rejected by ReplicationLogCleaner - if (i % (30 / 3) == 0) { - // queueStorage.addWAL(server.getServerName(), fakeMachineName, fileName.getName()); - LOG.info("Replication log file: " + fileName); - } } - + // Case 4: the newest 3 WALs will be kept because they are beyond the replication offset + masterServices.getReplicationPeerManager().listPeers(null) + .add(new ReplicationPeerDescription(peerId, true, null, null)); + queueStorage.setOffset(new ReplicationQueueId(server.getServerName(), peerId), fakeMachineName, + new ReplicationGroupOffset(fakeMachineName + "." + (now - 3), 0), Collections.emptyMap()); // Case 5: 5 Procedure WALs that are new, will stay for (int i = 6; i <= 10; i++) { Path fileName = new Path(OLD_PROCEDURE_WALS_DIR, String.format("pv2-%020d.log", i)); @@ -189,7 +231,8 @@ public void testLogCleaning() throws Exception { // 10 procedure WALs assertEquals(10, fs.listStatus(OLD_PROCEDURE_WALS_DIR).length); - LogCleaner cleaner = new LogCleaner(1000, server, conf, fs, OLD_WALS_DIR, POOL, null); + LogCleaner cleaner = new LogCleaner(1000, server, conf, fs, OLD_WALS_DIR, POOL, + ImmutableMap.of(HMaster.MASTER, masterServices)); cleaner.chore(); // In oldWALs we end up with the current WAL, a newer WAL, the 3 old WALs which @@ -208,98 +251,14 @@ public void testLogCleaning() throws Exception { } } - @Test - public void testZooKeeperRecoveryDuringGetListOfReplicators() throws Exception { - ReplicationLogCleaner cleaner = new ReplicationLogCleaner(); - - List dummyFiles = Arrays.asList( - new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("log1")), - new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("log2"))); - - FaultyZooKeeperWatcher faultyZK = - new FaultyZooKeeperWatcher(conf, "testZooKeeperAbort-faulty", null); - final AtomicBoolean getListOfReplicatorsFailed = new AtomicBoolean(false); - - try { - faultyZK.init(false); - ReplicationQueueStorage queueStorage = spy(ReplicationStorageFactory - .getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf)); - // doAnswer(new Answer() { - // @Override - // public Object answer(InvocationOnMock invocation) throws Throwable { - // try { - // return invocation.callRealMethod(); - // } catch (ReplicationException e) { - // LOG.debug("Caught Exception", e); - // getListOfReplicatorsFailed.set(true); - // throw e; - // } - // } - // }).when(queueStorage).getAllWALs(); - - cleaner.setConf(conf, faultyZK, queueStorage); - // should keep all files due to a ConnectionLossException getting the queues znodes - cleaner.preClean(); - Iterable toDelete = cleaner.getDeletableFiles(dummyFiles); - - assertTrue(getListOfReplicatorsFailed.get()); - assertFalse(toDelete.iterator().hasNext()); - assertFalse(cleaner.isStopped()); - - // zk recovery. - faultyZK.init(true); - cleaner.preClean(); - Iterable filesToDelete = cleaner.getDeletableFiles(dummyFiles); - Iterator iter = filesToDelete.iterator(); - assertTrue(iter.hasNext()); - assertEquals(new Path("log1"), iter.next().getPath()); - assertTrue(iter.hasNext()); - assertEquals(new Path("log2"), iter.next().getPath()); - assertFalse(iter.hasNext()); - - } finally { - faultyZK.close(); - } - } - - /** - * When zk is working both files should be returned - * @throws Exception from ZK watcher - */ - @Test - public void testZooKeeperNormal() throws Exception { - ReplicationLogCleaner cleaner = new ReplicationLogCleaner(); - - // Subtract 1000 from current time so modtime is for sure older - // than 'now'. - long modTime = EnvironmentEdgeManager.currentTime() - 1000; - List dummyFiles = - Arrays.asList(new FileStatus(100, false, 3, 100, modTime, new Path("log1")), - new FileStatus(100, false, 3, 100, modTime, new Path("log2"))); - - ZKWatcher zkw = new ZKWatcher(conf, "testZooKeeperAbort-normal", null); - try { - cleaner.setConf(conf, zkw); - cleaner.preClean(); - Iterable filesToDelete = cleaner.getDeletableFiles(dummyFiles); - Iterator iter = filesToDelete.iterator(); - assertTrue(iter.hasNext()); - assertEquals(new Path("log1"), iter.next().getPath()); - assertTrue(iter.hasNext()); - assertEquals(new Path("log2"), iter.next().getPath()); - assertFalse(iter.hasNext()); - } finally { - zkw.close(); - } - } - @Test public void testOnConfigurationChange() throws Exception { // Prepare environments Server server = new DummyServer(); FileSystem fs = TEST_UTIL.getDFSCluster().getFileSystem(); - LogCleaner cleaner = new LogCleaner(3000, server, conf, fs, OLD_WALS_DIR, POOL, null); + LogCleaner cleaner = new LogCleaner(3000, server, conf, fs, OLD_WALS_DIR, POOL, + ImmutableMap.of(HMaster.MASTER, masterServices)); int size = cleaner.getSizeOfCleaners(); assertEquals(LogCleaner.DEFAULT_OLD_WALS_CLEANER_THREAD_TIMEOUT_MSEC, cleaner.getCleanerThreadTimeoutMsec()); @@ -338,7 +297,7 @@ private void createFiles(FileSystem fs, Path parentDir, int numOfFiles) throws I } } - static class DummyServer extends MockServer { + private static final class DummyServer extends MockServer { @Override public Configuration getConfiguration() { @@ -355,26 +314,4 @@ public ZKWatcher getZooKeeper() { return null; } } - - static class FaultyZooKeeperWatcher extends ZKWatcher { - private RecoverableZooKeeper zk; - - public FaultyZooKeeperWatcher(Configuration conf, String identifier, Abortable abortable) - throws ZooKeeperConnectionException, IOException { - super(conf, identifier, abortable); - } - - public void init(boolean autoRecovery) throws Exception { - this.zk = spy(super.getRecoverableZooKeeper()); - if (!autoRecovery) { - doThrow(new KeeperException.ConnectionLossException()).when(zk) - .getChildren("/hbase/replication/rs", null); - } - } - - @Override - public RecoverableZooKeeper getRecoverableZooKeeper() { - return zk; - } - } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java index 2409b081cce7..5aef1eaf1c6b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestReplicationHFileCleaner.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -34,7 +35,9 @@ import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.Server; +import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationFactory; @@ -48,19 +51,19 @@ import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.MockServer; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// TODO: revisit later -@Ignore +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + @Category({ MasterTests.class, SmallTests.class }) public class TestReplicationHFileCleaner { @@ -71,19 +74,25 @@ public class TestReplicationHFileCleaner { private static final Logger LOG = LoggerFactory.getLogger(TestReplicationHFileCleaner.class); private final static HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); private static Server server; + private static final TableName tableName = TableName.valueOf("test_cleaner"); private static ReplicationQueueStorage rq; private static ReplicationPeers rp; private static final String peerId = "TestReplicationHFileCleaner"; private static Configuration conf = TEST_UTIL.getConfiguration(); - static FileSystem fs = null; - Path root; + private static FileSystem fs = null; + private static Map params; + private Path root; @BeforeClass public static void setUpBeforeClass() throws Exception { TEST_UTIL.startMiniCluster(); server = new DummyServer(); + params = ImmutableMap.of(HMaster.MASTER, server); conf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true); HMaster.decorateMasterConfiguration(conf); + TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName); + TEST_UTIL.getAdmin().createTable(td); + conf.set(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, tableName.getNameAsString()); rp = ReplicationFactory.getReplicationPeers(server.getFileSystem(), server.getZooKeeper(), conf); rp.init(); @@ -93,7 +102,7 @@ public static void setUpBeforeClass() throws Exception { @AfterClass public static void tearDownAfterClass() throws Exception { - TEST_UTIL.shutdownMiniZKCluster(); + TEST_UTIL.shutdownMiniCluster(); } @Before @@ -116,6 +125,13 @@ public void cleanup() throws ReplicationException { rp.getPeerStorage().removePeer(peerId); } + private ReplicationHFileCleaner createCleaner() { + ReplicationHFileCleaner cleaner = new ReplicationHFileCleaner(); + cleaner.setConf(conf); + cleaner.init(params); + return cleaner; + } + @Test public void testIsFileDeletable() throws IOException, ReplicationException { // 1. Create a file @@ -123,8 +139,7 @@ public void testIsFileDeletable() throws IOException, ReplicationException { fs.createNewFile(file); // 2. Assert file is successfully created assertTrue("Test file not created!", fs.exists(file)); - ReplicationHFileCleaner cleaner = new ReplicationHFileCleaner(); - cleaner.setConf(conf); + ReplicationHFileCleaner cleaner = createCleaner(); // 3. Assert that file as is should be deletable assertTrue("Cleaner should allow to delete this file as there is no hfile reference node " + "for it in the queue.", cleaner.isFileDeletable(fs.getFileStatus(file))); @@ -161,8 +176,7 @@ public void testGetDeletableFiles() throws Exception { // 2. Add one file to hfile-refs queue rq.addHFileRefs(peerId, hfiles); - ReplicationHFileCleaner cleaner = new ReplicationHFileCleaner(); - cleaner.setConf(conf); + ReplicationHFileCleaner cleaner = createCleaner(); Iterator deletableFilesIterator = cleaner.getDeletableFiles(files).iterator(); int i = 0; while (deletableFilesIterator.hasNext() && i < 2) { @@ -183,6 +197,15 @@ public Configuration getConfiguration() { return TEST_UTIL.getConfiguration(); } + @Override + public ZKWatcher getZooKeeper() { + try { + return TEST_UTIL.getZooKeeperWatcher(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + @Override public Connection getConnection() { try { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationOffsetUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationOffsetUtil.java new file mode 100644 index 000000000000..f54a49583743 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationOffsetUtil.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import static org.apache.hadoop.hbase.replication.ReplicationOffsetUtil.shouldReplicate; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.testclassification.ReplicationTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({ ReplicationTests.class, SmallTests.class }) +public class TestReplicationOffsetUtil { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReplicationOffsetUtil.class); + + @Test + public void test() { + assertTrue(shouldReplicate(null, "whatever")); + assertTrue(shouldReplicate(ReplicationGroupOffset.BEGIN, "whatever")); + ServerName sn = ServerName.valueOf("host", 16010, EnvironmentEdgeManager.currentTime()); + ReplicationGroupOffset offset = new ReplicationGroupOffset(sn + ".12345", 100); + assertTrue(shouldReplicate(offset, sn + ".12346")); + assertFalse(shouldReplicate(offset, sn + ".12344")); + assertTrue(shouldReplicate(offset, sn + ".12345")); + // -1 means finish replication, so should not replicate + assertFalse(shouldReplicate(new ReplicationGroupOffset(sn + ".12345", -1), sn + ".12345")); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestLogCleanerBarrier.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestLogCleanerBarrier.java new file mode 100644 index 000000000000..06cb85523d3b --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestLogCleanerBarrier.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication.master; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({ MasterTests.class, SmallTests.class }) +public class TestLogCleanerBarrier { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestLogCleanerBarrier.class); + + @Test + public void test() { + ReplicationLogCleanerBarrier barrier = new ReplicationLogCleanerBarrier(); + assertThrows(IllegalStateException.class, () -> barrier.stop()); + assertThrows(IllegalStateException.class, () -> barrier.enable()); + assertTrue(barrier.start()); + assertThrows(IllegalStateException.class, () -> barrier.start()); + assertThrows(IllegalStateException.class, () -> barrier.enable()); + assertFalse(barrier.disable()); + assertThrows(IllegalStateException.class, () -> barrier.enable()); + barrier.stop(); + + for (int i = 0; i < 3; i++) { + assertTrue(barrier.disable()); + assertFalse(barrier.start()); + } + for (int i = 0; i < 3; i++) { + assertFalse(barrier.start()); + barrier.enable(); + } + assertTrue(barrier.start()); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java new file mode 100644 index 000000000000..7a227fb0603d --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication.master; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.emptyIterable; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; +import org.junit.After; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + +@Category({ MasterTests.class, SmallTests.class }) +public class TestReplicationLogCleaner { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReplicationLogCleaner.class); + + private static final Configuration CONF = HBaseConfiguration.create(); + + private MasterServices services; + + private ReplicationLogCleaner cleaner; + + @Before + public void setUp() throws ReplicationException { + services = mock(MasterServices.class); + ReplicationPeerManager rpm = mock(ReplicationPeerManager.class); + when(rpm.getReplicationLogCleanerBarrier()).thenReturn(new ReplicationLogCleanerBarrier()); + when(services.getReplicationPeerManager()).thenReturn(rpm); + when(rpm.listPeers(null)).thenReturn(new ArrayList<>()); + ReplicationQueueStorage rqs = mock(ReplicationQueueStorage.class); + when(rpm.getQueueStorage()).thenReturn(rqs); + when(rqs.listAllQueues()).thenReturn(new ArrayList<>()); + ServerManager sm = mock(ServerManager.class); + when(services.getServerManager()).thenReturn(sm); + when(sm.getOnlineServersList()).thenReturn(new ArrayList<>()); + @SuppressWarnings("unchecked") + ProcedureExecutor procExec = mock(ProcedureExecutor.class); + when(services.getMasterProcedureExecutor()).thenReturn(procExec); + when(procExec.getProcedures()).thenReturn(new ArrayList<>()); + + cleaner = new ReplicationLogCleaner(); + cleaner.setConf(CONF); + Map params = ImmutableMap.of(HMaster.MASTER, services); + cleaner.init(params); + } + + @After + public void tearDown() { + cleaner.postClean(); + } + + private static Iterable runCleaner(ReplicationLogCleaner cleaner, + Iterable files) { + cleaner.preClean(); + return cleaner.getDeletableFiles(files); + } + + private static FileStatus createFileStatus(Path path) { + return new FileStatus(100, false, 3, 256, EnvironmentEdgeManager.currentTime(), path); + } + + private static FileStatus createFileStatus(ServerName sn, int number) { + Path path = new Path(sn.toString() + "." + number); + return createFileStatus(path); + } + + private static ReplicationPeerDescription createPeer(String peerId) { + return new ReplicationPeerDescription(peerId, true, null, null); + } + + private void addServer(ServerName serverName) { + services.getServerManager().getOnlineServersList().add(serverName); + } + + private void addSCP(ServerName serverName, boolean finished) { + ServerCrashProcedure scp = mock(ServerCrashProcedure.class); + when(scp.getServerName()).thenReturn(serverName); + when(scp.isFinished()).thenReturn(finished); + services.getMasterProcedureExecutor().getProcedures().add(scp); + } + + private void addPeer(String... peerIds) { + services.getReplicationPeerManager().listPeers(null).addAll( + Stream.of(peerIds).map(TestReplicationLogCleaner::createPeer).collect(Collectors.toList())); + } + + private void addQueueData(ReplicationQueueData... datas) throws ReplicationException { + services.getReplicationPeerManager().getQueueStorage().listAllQueues() + .addAll(Arrays.asList(datas)); + } + + @Test + public void testNoConf() { + ReplicationLogCleaner cleaner = new ReplicationLogCleaner(); + List files = Arrays.asList(new FileStatus()); + assertSame(files, runCleaner(cleaner, files)); + cleaner.postClean(); + } + + @Test + public void testCanNotFilter() { + assertTrue(services.getReplicationPeerManager().getReplicationLogCleanerBarrier().disable()); + List files = Arrays.asList(new FileStatus()); + assertSame(Collections.emptyList(), runCleaner(cleaner, files)); + } + + @Test + public void testNoPeer() { + Path path = new Path("/wal." + EnvironmentEdgeManager.currentTime()); + assertTrue(AbstractFSWALProvider.validateWALFilename(path.getName())); + FileStatus file = createFileStatus(path); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testNotValidWalFile() { + addPeer("1"); + Path path = new Path("/whatever"); + assertFalse(AbstractFSWALProvider.validateWALFilename(path.getName())); + FileStatus file = createFileStatus(path); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testMetaWalFile() { + addPeer("1"); + Path path = new Path( + "/wal." + EnvironmentEdgeManager.currentTime() + AbstractFSWALProvider.META_WAL_PROVIDER_ID); + assertTrue(AbstractFSWALProvider.validateWALFilename(path.getName())); + assertTrue(AbstractFSWALProvider.isMetaFile(path)); + FileStatus file = createFileStatus(path); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testLiveRegionServerNoQueues() { + addPeer("1"); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + List files = Arrays.asList(createFileStatus(sn, 1)); + assertThat(runCleaner(cleaner, files), emptyIterable()); + } + + @Test + public void testLiveRegionServerWithSCPNoQueues() { + addPeer("1"); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addSCP(sn, false); + List files = Arrays.asList(createFileStatus(sn, 1)); + assertThat(runCleaner(cleaner, files), emptyIterable()); + } + + @Test + public void testDeadRegionServerNoQueues() { + addPeer("1"); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testDeadRegionServerWithSCPNoQueues() { + addPeer("1"); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addSCP(sn, true); + FileStatus file = createFileStatus(sn, 1); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testLiveRegionServerMissingQueue() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + addQueueData(data1); + assertThat(runCleaner(cleaner, Arrays.asList(file)), emptyIterable()); + } + + @Test + public void testLiveRegionServerShouldNotDelete() throws ReplicationException { + String peerId = "1"; + addPeer(peerId); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data = new ReplicationQueueData(new ReplicationQueueId(sn, peerId), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), 0))); + addQueueData(data); + assertThat(runCleaner(cleaner, Arrays.asList(file)), emptyIterable()); + } + + @Test + public void testLiveRegionServerShouldNotDeleteTwoPeers() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + ReplicationQueueData data2 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId2), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), 0))); + addQueueData(data1, data2); + assertThat(runCleaner(cleaner, Arrays.asList(file)), emptyIterable()); + } + + @Test + public void testLiveRegionServerShouldDelete() throws ReplicationException { + String peerId = "1"; + addPeer(peerId); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data = new ReplicationQueueData(new ReplicationQueueId(sn, peerId), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + services.getReplicationPeerManager().getQueueStorage().listAllQueues().add(data); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testLiveRegionServerShouldDeleteTwoPeers() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + addServer(sn); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + ReplicationQueueData data2 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId2), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + addQueueData(data1, data2); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testDeadRegionServerMissingQueue() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + addQueueData(data1); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testDeadRegionServerShouldNotDelete() throws ReplicationException { + String peerId = "1"; + addPeer(peerId); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data = new ReplicationQueueData(new ReplicationQueueId(sn, peerId), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), 0))); + addQueueData(data); + assertThat(runCleaner(cleaner, Arrays.asList(file)), emptyIterable()); + } + + @Test + public void testDeadRegionServerShouldNotDeleteTwoPeers() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + ReplicationQueueData data2 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId2), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), 0))); + addQueueData(data1, data2); + assertThat(runCleaner(cleaner, Arrays.asList(file)), emptyIterable()); + } + + @Test + public void testDeadRegionServerShouldDelete() throws ReplicationException { + String peerId = "1"; + addPeer(peerId); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data = new ReplicationQueueData(new ReplicationQueueId(sn, peerId), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + services.getReplicationPeerManager().getQueueStorage().listAllQueues().add(data); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } + + @Test + public void testDeadRegionServerShouldDeleteTwoPeers() throws ReplicationException { + String peerId1 = "1"; + String peerId2 = "2"; + addPeer(peerId1, peerId2); + ServerName sn = ServerName.valueOf("server,123," + EnvironmentEdgeManager.currentTime()); + FileStatus file = createFileStatus(sn, 1); + ReplicationQueueData data1 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId1), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + ReplicationQueueData data2 = new ReplicationQueueData(new ReplicationQueueId(sn, peerId2), + ImmutableMap.of(sn.toString(), new ReplicationGroupOffset(file.getPath().getName(), -1))); + addQueueData(data1, data2); + Iterator iter = runCleaner(cleaner, Arrays.asList(file)).iterator(); + assertSame(file, iter.next()); + assertFalse(iter.hasNext()); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java index 6aba327d7917..b7564ed9168d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java @@ -190,7 +190,7 @@ public void setUp() throws Exception { replication = new Replication(); replication.initialize(server, FS, logDir, oldLogDir, - new WALFactory(CONF, "test", null, false)); + new WALFactory(CONF, server.getServerName(), null, false)); manager = replication.getReplicationManager(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java index b59ebc0d9a66..26c1152c05a3 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java @@ -630,7 +630,7 @@ public void testWALProviders() throws IOException { assertEquals(wrappedWALProvider.getClass(), walFactory.getMetaProvider().getClass()); // if providers are not set and do not enable SyncReplicationWALProvider - walFactory = new WALFactory(conf, this.currentServername.toString(), null, false); + walFactory = new WALFactory(conf, this.currentServername, null, false); assertEquals(walFactory.getWALProvider().getClass(), walFactory.getMetaProvider().getClass()); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALMethods.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALMethods.java index 8273b3d60410..6a1e98d9fd5d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALMethods.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALMethods.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import java.io.IOException; @@ -183,4 +184,17 @@ private WAL.Entry createTestLogEntry(int i) { return entry; } + @Test + public void testParseServerNameFromWALName() { + assertEquals(ServerName.valueOf("abc,123,123"), + AbstractFSWALProvider.parseServerNameFromWALName("abc,123,123.1.12345.meta")); + assertEquals(ServerName.valueOf("abc,123,123"), + AbstractFSWALProvider.parseServerNameFromWALName("abc,123,123.12345")); + assertEquals(ServerName.valueOf("abc,123,123"), + AbstractFSWALProvider.parseServerNameFromWALName("abc,123,123")); + assertThrows(IllegalArgumentException.class, + () -> AbstractFSWALProvider.parseServerNameFromWALName("test,abc,123,123.12345")); + assertThrows(IllegalArgumentException.class, + () -> AbstractFSWALProvider.parseServerNameFromWALName("abc")); + } } From 70945cdd3e27bdbf4e51545e6740d3f5a4f0aab6 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Thu, 15 Sep 2022 22:58:29 +0800 Subject: [PATCH 04/16] HBASE-27215 Add support for sync replication (#4762) Signed-off-by: Xiaolin Ha --- .../regionserver/ReplicationSource.java | 2 +- .../ReplicationSourceManager.java | 53 ++++++++++--------- .../TestDrainReplicationQueuesForStandBy.java | 3 -- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index a49bfd7b623d..7eba88895112 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -462,7 +462,7 @@ private void uncaughtException(Thread t, Throwable e, ReplicationSourceManager m t.getName()); manager.refreshSources(peerId); break; - } catch (IOException e1) { + } catch (IOException | ReplicationException e1) { LOG.error("Replication sources refresh failed.", e1); sleepForRetries("Sleeping before try refreshing sources again", maxRetriesMultiplier); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index b521766ae3dc..2fb996c6e4dc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -404,38 +404,44 @@ public void drainSources(String peerId) throws IOException, ReplicationException // TODO: use empty initial offsets for now, revisit when adding support for sync replication ReplicationSourceInterface src = createSource(new ReplicationQueueData(queueId, ImmutableMap.of()), peer); - // synchronized here to avoid race with preLogRoll where we add new log to source and also + // synchronized here to avoid race with postLogRoll where we add new log to source and also // walsById. ReplicationSourceInterface toRemove; - Map> wals = new HashMap<>(); + ReplicationQueueData queueData; synchronized (latestPaths) { + // Here we make a copy of all the remaining wal files and then delete them from the + // replication queue storage after releasing the lock. It is not safe to just remove the old + // map from walsById since later we may fail to update the replication queue storage, and when + // we retry next time, we can not know the wal files that needs to be set to the replication + // queue storage + ImmutableMap.Builder builder = ImmutableMap.builder(); + synchronized (walsById) { + walsById.get(queueId).forEach((group, wals) -> { + if (!wals.isEmpty()) { + builder.put(group, new ReplicationGroupOffset(wals.last(), -1)); + } + }); + } + queueData = new ReplicationQueueData(queueId, builder.build()); + src = createSource(queueData, peer); toRemove = sources.put(peerId, src); if (toRemove != null) { LOG.info("Terminate replication source for " + toRemove.getPeerId()); toRemove.terminate(terminateMessage); toRemove.getSourceMetrics().clear(); } - // Here we make a copy of all the remaining wal files and then delete them from the - // replication queue storage after releasing the lock. It is not safe to just remove the old - // map from walsById since later we may fail to delete them from the replication queue - // storage, and when we retry next time, we can not know the wal files that need to be deleted - // from the replication queue storage. - walsById.get(queueId).forEach((k, v) -> wals.put(k, new TreeSet<>(v))); + } + for (Map.Entry entry : queueData.getOffsets().entrySet()) { + queueStorage.setOffset(queueId, entry.getKey(), entry.getValue(), Collections.emptyMap()); } LOG.info("Startup replication source for " + src.getPeerId()); src.startup(); - for (NavigableSet walsByGroup : wals.values()) { - // TODO: just need to reset the replication offset - // for (String wal : walsByGroup) { - // queueStorage.removeWAL(server.getServerName(), peerId, wal); - // } - } synchronized (walsById) { - Map> oldWals = walsById.get(queueId); - wals.forEach((k, v) -> { - NavigableSet walsByGroup = oldWals.get(k); + Map> wals = walsById.get(queueId); + queueData.getOffsets().forEach((group, offset) -> { + NavigableSet walsByGroup = wals.get(group); if (walsByGroup != null) { - walsByGroup.removeAll(v); + walsByGroup.headSet(offset.getWal(), true).clear(); } }); } @@ -458,13 +464,8 @@ public void drainSources(String peerId) throws IOException, ReplicationException } private ReplicationSourceInterface createRefreshedSource(ReplicationQueueId queueId, - ReplicationPeer peer) throws IOException { - Map offsets; - try { - offsets = queueStorage.getOffsets(queueId); - } catch (ReplicationException e) { - throw new IOException(e); - } + ReplicationPeer peer) throws IOException, ReplicationException { + Map offsets = queueStorage.getOffsets(queueId); return createSource(new ReplicationQueueData(queueId, ImmutableMap.copyOf(offsets)), peer); } @@ -474,7 +475,7 @@ private ReplicationSourceInterface createRefreshedSource(ReplicationQueueId queu * replication queue storage and only to enqueue all logs to the new replication source * @param peerId the id of the replication peer */ - public void refreshSources(String peerId) throws IOException { + public void refreshSources(String peerId) throws ReplicationException, IOException { String terminateMessage = "Peer " + peerId + " state or config changed. Will close the previous replication source and open a new one"; ReplicationPeer peer = replicationPeers.getPeer(peerId); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java index 8918f8422e1d..0189d4755754 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDrainReplicationQueuesForStandBy.java @@ -35,12 +35,9 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; -// TODO: revisit later -@Ignore @Category({ ReplicationTests.class, MediumTests.class }) public class TestDrainReplicationQueuesForStandBy extends SyncReplicationTestBase { From 3f294ae7227cac27078446f477df14b7ce0a4a73 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Thu, 29 Sep 2022 10:08:02 +0800 Subject: [PATCH 05/16] HBASE-27392 Add a new procedure type for implementing some global operations such as migration (#4803) Signed-off-by: Xin Sun --- .../hbase/procedure2/LockedResourceType.java | 3 +- .../procedure/GlobalProcedureInterface.java | 29 +++++ .../hbase/master/procedure/GlobalQueue.java | 35 ++++++ .../procedure/MasterProcedureScheduler.java | 119 +++++++++++++++++- .../hbase/master/procedure/SchemaLocking.java | 18 ++- .../TestMasterProcedureScheduler.java | 48 +++++++ 6 files changed, 246 insertions(+), 6 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalProcedureInterface.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalQueue.java diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java index 12f899d7565b..401410170097 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java @@ -26,5 +26,6 @@ public enum LockedResourceType { TABLE, REGION, PEER, - META + META, + GLOBAL } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalProcedureInterface.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalProcedureInterface.java new file mode 100644 index 000000000000..1ef168abfd8f --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalProcedureInterface.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.procedure; + +import org.apache.yetus.audience.InterfaceAudience; + +/** + * Procedure interface for global operations, such as migration. + */ +@InterfaceAudience.Private +public interface GlobalProcedureInterface { + + String getGlobalId(); +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalQueue.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalQueue.java new file mode 100644 index 000000000000..1633dc4856e7 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/GlobalQueue.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.procedure; + +import org.apache.hadoop.hbase.procedure2.LockStatus; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public class GlobalQueue extends Queue { + + public GlobalQueue(String globalId, LockStatus lockStatus) { + super(globalId, lockStatus); + } + + @Override + boolean requireExclusiveLock(Procedure proc) { + return true; + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java index 866f2f6f4032..fbf0eb8abf32 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.function.Function; import java.util.function.Supplier; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableExistsException; import org.apache.hadoop.hbase.TableName; @@ -95,16 +96,20 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { (n, k) -> n.compareKey((String) k); private final static AvlKeyComparator META_QUEUE_KEY_COMPARATOR = (n, k) -> n.compareKey((TableName) k); + private final static AvlKeyComparator GLOBAL_QUEUE_KEY_COMPARATOR = + (n, k) -> n.compareKey((String) k); private final FairQueue serverRunQueue = new FairQueue<>(); private final FairQueue tableRunQueue = new FairQueue<>(); private final FairQueue peerRunQueue = new FairQueue<>(); private final FairQueue metaRunQueue = new FairQueue<>(); + private final FairQueue globalRunQueue = new FairQueue<>(); private final ServerQueue[] serverBuckets = new ServerQueue[128]; private TableQueue tableMap = null; private PeerQueue peerMap = null; private MetaQueue metaMap = null; + private GlobalQueue globalMap = null; private final SchemaLocking locking; @@ -128,6 +133,8 @@ protected void enqueue(final Procedure proc, final boolean addFront) { doAdd(serverRunQueue, getServerQueue(spi.getServerName(), spi), proc, addFront); } else if (isPeerProcedure(proc)) { doAdd(peerRunQueue, getPeerQueue(getPeerId(proc)), proc, addFront); + } else if (isGlobalProcedure(proc)) { + doAdd(globalRunQueue, getGlobalQueue(getGlobalId(proc)), proc, addFront); } else { // TODO: at the moment we only have Table and Server procedures // if you are implementing a non-table/non-server procedure, you have two options: create @@ -163,14 +170,19 @@ private > void doAdd(FairQueue fairq, Queue queue, @Override protected boolean queueHasRunnables() { - return metaRunQueue.hasRunnables() || tableRunQueue.hasRunnables() - || serverRunQueue.hasRunnables() || peerRunQueue.hasRunnables(); + return globalRunQueue.hasRunnables() || metaRunQueue.hasRunnables() + || tableRunQueue.hasRunnables() || serverRunQueue.hasRunnables() + || peerRunQueue.hasRunnables(); } @Override protected Procedure dequeue() { - // meta procedure is always the first priority - Procedure pollResult = doPoll(metaRunQueue); + // pull global first + Procedure pollResult = doPoll(globalRunQueue); + // then meta procedure + if (pollResult == null) { + pollResult = doPoll(metaRunQueue); + } // For now, let server handling have precedence over table handling; presumption is that it // is more important handling crashed servers than it is running the // enabling/disabling tables, etc. @@ -268,6 +280,14 @@ private void clearQueue() { clear(peerMap, peerRunQueue, PEER_QUEUE_KEY_COMPARATOR); peerMap = null; + // Remove Meta + clear(metaMap, metaRunQueue, META_QUEUE_KEY_COMPARATOR); + metaMap = null; + + // Remove Global + clear(globalMap, globalRunQueue, GLOBAL_QUEUE_KEY_COMPARATOR); + globalMap = null; + assert size() == 0 : "expected queue size to be 0, got " + size(); } @@ -300,6 +320,7 @@ protected int queueSize() { count += queueSize(tableMap); count += queueSize(peerMap); count += queueSize(metaMap); + count += queueSize(globalMap); return count; } @@ -502,6 +523,51 @@ private static boolean isMetaProcedure(Procedure proc) { return proc instanceof MetaProcedureInterface; } + // ============================================================================ + // Global Queue Lookup Helpers + // ============================================================================ + private GlobalQueue getGlobalQueue(String globalId) { + GlobalQueue node = AvlTree.get(globalMap, globalId, GLOBAL_QUEUE_KEY_COMPARATOR); + if (node != null) { + return node; + } + node = new GlobalQueue(globalId, locking.getGlobalLock(globalId)); + globalMap = AvlTree.insert(globalMap, node); + return node; + } + + private void removeGlobalQueue(String globalId) { + globalMap = AvlTree.remove(globalMap, globalId, GLOBAL_QUEUE_KEY_COMPARATOR); + locking.removeGlobalLock(globalId); + } + + private void tryCleanupGlobalQueue(String globalId, Procedure procedure) { + schedLock(); + try { + GlobalQueue queue = AvlTree.get(globalMap, globalId, GLOBAL_QUEUE_KEY_COMPARATOR); + if (queue == null) { + return; + } + + final LockAndQueue lock = locking.getGlobalLock(globalId); + if (queue.isEmpty() && lock.tryExclusiveLock(procedure)) { + removeFromRunQueue(globalRunQueue, queue, + () -> "clean up global queue after " + procedure + " completed"); + removeGlobalQueue(globalId); + } + } finally { + schedUnlock(); + } + } + + private static boolean isGlobalProcedure(Procedure proc) { + return proc instanceof GlobalProcedureInterface; + } + + private static String getGlobalId(Procedure proc) { + return ((GlobalProcedureInterface) proc).getGlobalId(); + } + // ============================================================================ // Table Locking Helpers // ============================================================================ @@ -1006,6 +1072,51 @@ public void wakeMetaExclusiveLock(Procedure procedure) { } } + // ============================================================================ + // Global Locking Helpers + // ============================================================================ + /** + * Try to acquire the share lock on global. + * @see #wakeGlobalExclusiveLock(Procedure, String) + * @param procedure the procedure trying to acquire the lock + * @return true if the procedure has to wait for global to be available + */ + public boolean waitGlobalExclusiveLock(Procedure procedure, String globalId) { + schedLock(); + try { + final LockAndQueue lock = locking.getGlobalLock(globalId); + if (lock.tryExclusiveLock(procedure)) { + removeFromRunQueue(globalRunQueue, getGlobalQueue(globalId), + () -> procedure + " held shared lock"); + return false; + } + waitProcedure(lock, procedure); + logLockedResource(LockedResourceType.GLOBAL, HConstants.EMPTY_STRING); + return true; + } finally { + schedUnlock(); + } + } + + /** + * Wake the procedures waiting for global. + * @see #waitGlobalExclusiveLock(Procedure, String) + * @param procedure the procedure releasing the lock + */ + public void wakeGlobalExclusiveLock(Procedure procedure, String globalId) { + schedLock(); + try { + final LockAndQueue lock = locking.getGlobalLock(globalId); + lock.releaseExclusiveLock(procedure); + addToRunQueue(globalRunQueue, getGlobalQueue(globalId), + () -> procedure + " released shared lock"); + int waitingCount = wakeWaitingProcedures(lock); + wakePollIfNeeded(waitingCount); + } finally { + schedUnlock(); + } + } + /** * For debugging. Expensive. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SchemaLocking.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SchemaLocking.java index 13419ac455ca..853d13b0c93b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SchemaLocking.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SchemaLocking.java @@ -53,6 +53,7 @@ class SchemaLocking { // Single map for all regions irrespective of tables. Key is encoded region name. private final Map regionLocks = new HashMap<>(); private final Map peerLocks = new HashMap<>(); + private final Map globalLocks = new HashMap<>(); private final LockAndQueue metaLock; public SchemaLocking(Function> procedureRetriever) { @@ -94,6 +95,10 @@ LockAndQueue getMetaLock() { return metaLock; } + LockAndQueue getGlobalLock(String globalId) { + return getLock(globalLocks, globalId); + } + LockAndQueue removeRegionLock(String encodedRegionName) { return regionLocks.remove(encodedRegionName); } @@ -114,6 +119,10 @@ LockAndQueue removePeerLock(String peerId) { return peerLocks.remove(peerId); } + LockAndQueue removeGlobalLock(String globalId) { + return globalLocks.remove(globalId); + } + private LockedResource createLockedResource(LockedResourceType resourceType, String resourceName, LockAndQueue queue) { LockType lockType; @@ -164,6 +173,8 @@ List getLocks() { addToLockedResources(lockedResources, peerLocks, Function.identity(), LockedResourceType.PEER); addToLockedResources(lockedResources, ImmutableMap.of(TableName.META_TABLE_NAME, metaLock), tn -> tn.getNameAsString(), LockedResourceType.META); + addToLockedResources(lockedResources, globalLocks, Function.identity(), + LockedResourceType.GLOBAL); return lockedResources; } @@ -191,6 +202,10 @@ LockedResource getLockResource(LockedResourceType resourceType, String resourceN break; case META: queue = metaLock; + break; + case GLOBAL: + queue = globalLocks.get(resourceName); + break; default: queue = null; break; @@ -216,7 +231,8 @@ public String toString() { + filterUnlocked(this.namespaceLocks) + ", tableLocks=" + filterUnlocked(this.tableLocks) + ", regionLocks=" + filterUnlocked(this.regionLocks) + ", peerLocks=" + filterUnlocked(this.peerLocks) + ", metaLocks=" - + filterUnlocked(ImmutableMap.of(TableName.META_TABLE_NAME, metaLock)); + + filterUnlocked(ImmutableMap.of(TableName.META_TABLE_NAME, metaLock)) + ", globalLocks=" + + filterUnlocked(globalLocks); } private String filterUnlocked(Map locks) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureScheduler.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureScheduler.java index f0edf73715ea..0cf34126a945 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureScheduler.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestMasterProcedureScheduler.java @@ -940,6 +940,21 @@ public PeerOperationType getPeerOperationType() { } } + public static class TestGlobalProcedure extends TestProcedure + implements GlobalProcedureInterface { + private final String globalId; + + public TestGlobalProcedure(long procId, String globalId) { + super(procId); + this.globalId = globalId; + } + + @Override + public String getGlobalId() { + return globalId; + } + } + private static LockProcedure createLockProcedure(LockType lockType, long procId) throws Exception { LockProcedure procedure = new LockProcedure(); @@ -1093,6 +1108,39 @@ public void testListLocksPeer() throws Exception { assertEquals(1, resource.getWaitingProcedures().size()); } + @Test + public void testListLocksGlobal() throws Exception { + String globalId = "1"; + LockProcedure procedure = createExclusiveLockProcedure(4); + queue.waitGlobalExclusiveLock(procedure, globalId); + + List locks = queue.getLocks(); + assertEquals(1, locks.size()); + + LockedResource resource = locks.get(0); + assertLockResource(resource, LockedResourceType.GLOBAL, globalId); + assertExclusiveLock(resource, procedure); + assertTrue(resource.getWaitingProcedures().isEmpty()); + + // Try to acquire the exclusive lock again with same procedure + assertFalse(queue.waitGlobalExclusiveLock(procedure, globalId)); + + // Try to acquire the exclusive lock again with new procedure + LockProcedure procedure2 = createExclusiveLockProcedure(5); + assertTrue(queue.waitGlobalExclusiveLock(procedure2, globalId)); + + // Same peerId, still only has 1 LockedResource + locks = queue.getLocks(); + assertEquals(1, locks.size()); + + resource = locks.get(0); + assertLockResource(resource, LockedResourceType.GLOBAL, globalId); + // LockedResource owner still is the origin procedure + assertExclusiveLock(resource, procedure); + // The new procedure should in the waiting list + assertEquals(1, resource.getWaitingProcedures().size()); + } + @Test public void testListLocksWaiting() throws Exception { LockProcedure procedure1 = createExclusiveLockProcedure(1); From d49a8a1b6749fb9ad8f025658a03302bb31f9ef0 Mon Sep 17 00:00:00 2001 From: LiangJun He <2005hithlj@163.com> Date: Wed, 12 Oct 2022 14:40:05 +0800 Subject: [PATCH 06/16] HBASE-27405 Fix the replication hfile/log cleaner report that the replication table does not exist (#4811) Signed-off-by: Duo Zhang --- .../hbase/replication/ReplicationQueueStorage.java | 6 ++++++ .../hbase/replication/TableReplicationQueueStorage.java | 9 +++++++++ .../hbase/replication/master/ReplicationLogCleaner.java | 8 ++++++++ .../replication/master/TestReplicationLogCleaner.java | 1 + 4 files changed, 24 insertions(+) diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java index c4204f0e8c45..6f6aee38cc8f 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java @@ -178,4 +178,10 @@ void removeLastSequenceIds(String peerId, List encodedRegionNames) * created hfile references during the call may not be included. */ Set getAllHFileRefs() throws ReplicationException; + + /** + * Whether the replication queue table exists. + * @return Whether the replication queue table exists + */ + boolean hasData() throws ReplicationException; } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java index 0c9553f4fd89..392a3692d66f 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java @@ -532,4 +532,13 @@ public Set getAllHFileRefs() throws ReplicationException { throw new ReplicationException("failed to getAllHFileRefs", e); } } + + @Override + public boolean hasData() throws ReplicationException { + try { + return conn.getAdmin().getDescriptor(tableName) != null; + } catch (IOException e) { + throw new ReplicationException("failed to get replication queue table", e); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java index f1fd8f8d6b3a..3ab52da6158e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java @@ -76,6 +76,14 @@ public void preClean() { if (this.getConf() == null) { return; } + try { + if (!rpm.getQueueStorage().hasData()) { + return; + } + } catch (ReplicationException e) { + LOG.error("Error occurred while executing queueStorage.hasData()", e); + return; + } canFilter = rpm.getReplicationLogCleanerBarrier().start(); if (canFilter) { notFullyDeadServers = getNotFullyDeadServers.get(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java index 7a227fb0603d..7edadae03b14 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/master/TestReplicationLogCleaner.java @@ -86,6 +86,7 @@ public void setUp() throws ReplicationException { when(rpm.listPeers(null)).thenReturn(new ArrayList<>()); ReplicationQueueStorage rqs = mock(ReplicationQueueStorage.class); when(rpm.getQueueStorage()).thenReturn(rqs); + when(rpm.getQueueStorage().hasData()).thenReturn(true); when(rqs.listAllQueues()).thenReturn(new ArrayList<>()); ServerManager sm = mock(ServerManager.class); when(services.getServerManager()).thenReturn(sm); From dfaff40fc59b44bb031a1954d003475c07d17977 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sun, 6 Nov 2022 16:57:11 +0800 Subject: [PATCH 07/16] HBASE-27218 Support rolling upgrading (#4808) Signed-off-by: Yu Li --- .../hadoop/hbase/zookeeper/ZNodePaths.java | 8 +- .../hadoop/hbase/procedure2/Procedure.java | 15 + .../server/master/MasterProcedure.proto | 12 + hbase-replication/pom.xml | 10 + .../replication/ReplicationQueueStorage.java | 19 + .../TableReplicationQueueStorage.java | 65 +++- ...ZKReplicationQueueStorageForMigration.java | 351 ++++++++++++++++++ .../TestZKReplicationQueueStorage.java | 317 ++++++++++++++++ hbase-server/pom.xml | 6 + .../apache/hadoop/hbase/master/HMaster.java | 13 + .../procedure/ServerCrashProcedure.java | 19 + .../AbstractPeerNoLockProcedure.java | 5 +- ...eplicationQueueFromZkToTableProcedure.java | 244 ++++++++++++ .../replication/ModifyPeerProcedure.java | 26 ++ .../replication/ReplicationPeerManager.java | 104 +++++- ...nsitPeerSyncReplicationStateProcedure.java | 14 + .../TestMigrateReplicationQueue.java | 126 +++++++ ...eplicationQueueFromZkToTableProcedure.java | 226 +++++++++++ ...onQueueFromZkToTableProcedureRecovery.java | 128 +++++++ ...icationPeerManagerMigrateQueuesFromZk.java | 216 +++++++++++ .../replication/TestReplicationBase.java | 2 +- pom.xml | 7 +- 22 files changed, 1917 insertions(+), 16 deletions(-) create mode 100644 hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorageForMigration.java create mode 100644 hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueue.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedureRecovery.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZNodePaths.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZNodePaths.java index d19d21004667..3f66c7cdc0c2 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZNodePaths.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZNodePaths.java @@ -220,7 +220,11 @@ public String getRsPath(ServerName sn) { * @param suffix ending of znode name * @return result of properly joining prefix with suffix */ - public static String joinZNode(String prefix, String suffix) { - return prefix + ZNodePaths.ZNODE_PATH_SEPARATOR + suffix; + public static String joinZNode(String prefix, String... suffix) { + StringBuilder sb = new StringBuilder(prefix); + for (String s : suffix) { + sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(s); + } + return sb.toString(); } } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 34c74d92c161..43adba2bc21a 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.concurrent.ThreadLocalRandom; import org.apache.hadoop.hbase.exceptions.TimeoutIOException; import org.apache.hadoop.hbase.metrics.Counter; import org.apache.hadoop.hbase.metrics.Histogram; @@ -33,6 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; /** @@ -1011,6 +1013,19 @@ final void doReleaseLock(TEnvironment env, ProcedureStore store) { releaseLock(env); } + protected final ProcedureSuspendedException suspend(int timeoutMillis, boolean jitter) + throws ProcedureSuspendedException { + if (jitter) { + // 10% possible jitter + double add = (double) timeoutMillis * ThreadLocalRandom.current().nextDouble(0.1); + timeoutMillis += add; + } + setTimeout(timeoutMillis); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + skipPersistence(); + throw new ProcedureSuspendedException(); + } + @Override public int compareTo(final Procedure other) { return Long.compare(getProcId(), other.getProcId()); diff --git a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto index 76a1d676487a..b6f5d7e50bb0 100644 --- a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto @@ -722,3 +722,15 @@ enum AssignReplicationQueuesState { message AssignReplicationQueuesStateData { required ServerName crashed_server = 1; } + +enum MigrateReplicationQueueFromZkToTableState { + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE = 1; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER = 2; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE = 3; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING = 4; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER = 5; +} + +message MigrateReplicationQueueFromZkToTableStateData { + repeated string disabled_peer_id = 1; +} diff --git a/hbase-replication/pom.xml b/hbase-replication/pom.xml index 9acab39599fa..b4f1cfa224da 100644 --- a/hbase-replication/pom.xml +++ b/hbase-replication/pom.xml @@ -104,6 +104,16 @@ junit test + + org.hamcrest + hamcrest-core + test + + + org.hamcrest + hamcrest-library + test + org.mockito mockito-core diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java index 6f6aee38cc8f..1e36bbeb78f0 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java @@ -22,6 +22,7 @@ import java.util.Set; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkLastPushedSeqId; import org.apache.hadoop.hbase.util.Pair; import org.apache.yetus.audience.InterfaceAudience; @@ -184,4 +185,22 @@ void removeLastSequenceIds(String peerId, List encodedRegionNames) * @return Whether the replication queue table exists */ boolean hasData() throws ReplicationException; + + // the below 3 methods are used for migrating + /** + * Update the replication queue datas for a given region server. + */ + void batchUpdateQueues(ServerName serverName, List datas) + throws ReplicationException; + + /** + * Update last pushed sequence id for the given regions and peers. + */ + void batchUpdateLastSequenceIds(List lastPushedSeqIds) + throws ReplicationException; + + /** + * Add the given hfile refs to the given peer. + */ + void batchUpdateHFileRefs(String peerId, List hfileRefs) throws ReplicationException; } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java index 392a3692d66f..f3870f4d09d8 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java @@ -21,12 +21,14 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.NavigableMap; import java.util.Set; import java.util.function.Supplier; +import java.util.stream.Collectors; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellScanner; @@ -46,6 +48,7 @@ import org.apache.hadoop.hbase.client.Scan.ReadType; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.filter.KeyOnlyFilter; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkLastPushedSeqId; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FutureUtils; import org.apache.hadoop.hbase.util.Pair; @@ -74,12 +77,6 @@ public class TableReplicationQueueStorage implements ReplicationQueueStorage { private final TableName tableName; - @FunctionalInterface - private interface TableCreator { - - void create() throws IOException; - } - public TableReplicationQueueStorage(Connection conn, TableName tableName) { this.conn = conn; this.tableName = tableName; @@ -541,4 +538,60 @@ public boolean hasData() throws ReplicationException { throw new ReplicationException("failed to get replication queue table", e); } } + + @Override + public void batchUpdateQueues(ServerName serverName, List datas) + throws ReplicationException { + List puts = new ArrayList<>(); + for (ReplicationQueueData data : datas) { + if (data.getOffsets().isEmpty()) { + continue; + } + Put put = new Put(Bytes.toBytes(data.getId().toString())); + data.getOffsets().forEach((walGroup, offset) -> { + put.addColumn(QUEUE_FAMILY, Bytes.toBytes(walGroup), Bytes.toBytes(offset.toString())); + }); + puts.add(put); + } + try (Table table = conn.getTable(tableName)) { + table.put(puts); + } catch (IOException e) { + throw new ReplicationException("failed to batch update queues", e); + } + } + + @Override + public void batchUpdateLastSequenceIds(List lastPushedSeqIds) + throws ReplicationException { + Map peerId2Put = new HashMap<>(); + for (ZkLastPushedSeqId lastPushedSeqId : lastPushedSeqIds) { + peerId2Put + .computeIfAbsent(lastPushedSeqId.getPeerId(), peerId -> new Put(Bytes.toBytes(peerId))) + .addColumn(LAST_SEQUENCE_ID_FAMILY, Bytes.toBytes(lastPushedSeqId.getEncodedRegionName()), + Bytes.toBytes(lastPushedSeqId.getLastPushedSeqId())); + } + try (Table table = conn.getTable(tableName)) { + table + .put(peerId2Put.values().stream().filter(p -> !p.isEmpty()).collect(Collectors.toList())); + } catch (IOException e) { + throw new ReplicationException("failed to batch update last pushed sequence ids", e); + } + } + + @Override + public void batchUpdateHFileRefs(String peerId, List hfileRefs) + throws ReplicationException { + if (hfileRefs.isEmpty()) { + return; + } + Put put = new Put(Bytes.toBytes(peerId)); + for (String ref : hfileRefs) { + put.addColumn(HFILE_REF_FAMILY, Bytes.toBytes(ref), HConstants.EMPTY_BYTE_ARRAY); + } + try (Table table = conn.getTable(tableName)) { + table.put(put); + } catch (IOException e) { + throw new ReplicationException("failed to batch update hfile references", e); + } + } } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorageForMigration.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorageForMigration.java new file mode 100644 index 000000000000..22cc13145225 --- /dev/null +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorageForMigration.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import com.google.errorprone.annotations.RestrictedApi; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.yetus.audience.InterfaceAudience; +import org.apache.zookeeper.KeeperException; + +import org.apache.hbase.thirdparty.com.google.common.base.Splitter; + +/** + * Just retain a small set of the methods for the old zookeeper based replication queue storage, for + * migrating. + */ +@InterfaceAudience.Private +public class ZKReplicationQueueStorageForMigration extends ZKReplicationStorageBase { + + public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY = + "zookeeper.znode.replication.hfile.refs"; + public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT = "hfile-refs"; + + public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY = + "zookeeper.znode.replication.regions"; + public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT = "regions"; + + /** + * The name of the znode that contains all replication queues + */ + private final String queuesZNode; + + /** + * The name of the znode that contains queues of hfile references to be replicated + */ + private final String hfileRefsZNode; + + private final String regionsZNode; + + public ZKReplicationQueueStorageForMigration(ZKWatcher zookeeper, Configuration conf) { + super(zookeeper, conf); + String queuesZNodeName = conf.get("zookeeper.znode.replication.rs", "rs"); + String hfileRefsZNodeName = conf.get(ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY, + ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT); + this.queuesZNode = ZNodePaths.joinZNode(replicationZNode, queuesZNodeName); + this.hfileRefsZNode = ZNodePaths.joinZNode(replicationZNode, hfileRefsZNodeName); + this.regionsZNode = ZNodePaths.joinZNode(replicationZNode, conf + .get(ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY, ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT)); + } + + public interface MigrationIterator { + + T next() throws Exception; + } + + @SuppressWarnings("rawtypes") + private static final MigrationIterator EMPTY_ITER = new MigrationIterator() { + + @Override + public Object next() { + return null; + } + }; + + public static final class ZkReplicationQueueData { + + private final ReplicationQueueId queueId; + + private final Map walOffsets; + + public ZkReplicationQueueData(ReplicationQueueId queueId, Map walOffsets) { + this.queueId = queueId; + this.walOffsets = walOffsets; + } + + public ReplicationQueueId getQueueId() { + return queueId; + } + + public Map getWalOffsets() { + return walOffsets; + } + } + + private String getRsNode(ServerName serverName) { + return ZNodePaths.joinZNode(queuesZNode, serverName.getServerName()); + } + + private String getQueueNode(ServerName serverName, String queueId) { + return ZNodePaths.joinZNode(getRsNode(serverName), queueId); + } + + private String getFileNode(String queueNode, String fileName) { + return ZNodePaths.joinZNode(queueNode, fileName); + } + + private String getFileNode(ServerName serverName, String queueId, String fileName) { + return getFileNode(getQueueNode(serverName, queueId), fileName); + } + + @SuppressWarnings("unchecked") + public MigrationIterator>> listAllQueues() + throws KeeperException { + List replicators = ZKUtil.listChildrenNoWatch(zookeeper, queuesZNode); + if (replicators == null || replicators.isEmpty()) { + ZKUtil.deleteNodeRecursively(zookeeper, queuesZNode); + return EMPTY_ITER; + } + Iterator iter = replicators.iterator(); + return new MigrationIterator>>() { + + private ServerName previousServerName; + + @Override + public Pair> next() throws Exception { + if (previousServerName != null) { + ZKUtil.deleteNodeRecursively(zookeeper, getRsNode(previousServerName)); + } + if (!iter.hasNext()) { + ZKUtil.deleteNodeRecursively(zookeeper, queuesZNode); + return null; + } + String replicator = iter.next(); + ServerName serverName = ServerName.parseServerName(replicator); + previousServerName = serverName; + List queueIdList = ZKUtil.listChildrenNoWatch(zookeeper, getRsNode(serverName)); + if (queueIdList == null || queueIdList.isEmpty()) { + return Pair.newPair(serverName, Collections.emptyList()); + } + List queueDataList = new ArrayList<>(queueIdList.size()); + for (String queueIdStr : queueIdList) { + ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueIdStr); + ReplicationQueueId queueId; + if (queueInfo.getDeadRegionServers().isEmpty()) { + queueId = new ReplicationQueueId(serverName, queueInfo.getPeerId()); + } else { + queueId = new ReplicationQueueId(serverName, queueInfo.getPeerId(), + queueInfo.getDeadRegionServers().get(0)); + } + List wals = + ZKUtil.listChildrenNoWatch(zookeeper, getQueueNode(serverName, queueIdStr)); + ZkReplicationQueueData queueData; + if (wals == null || wals.isEmpty()) { + queueData = new ZkReplicationQueueData(queueId, Collections.emptyMap()); + } else { + Map walOffsets = new HashMap<>(); + for (String wal : wals) { + byte[] data = ZKUtil.getData(zookeeper, getFileNode(serverName, queueIdStr, wal)); + if (data == null || data.length == 0) { + walOffsets.put(wal, 0L); + } else { + walOffsets.put(wal, ZKUtil.parseWALPositionFrom(data)); + } + } + queueData = new ZkReplicationQueueData(queueId, walOffsets); + } + queueDataList.add(queueData); + } + return Pair.newPair(serverName, queueDataList); + } + }; + } + + public static final class ZkLastPushedSeqId { + + private final String encodedRegionName; + + private final String peerId; + + private final long lastPushedSeqId; + + ZkLastPushedSeqId(String encodedRegionName, String peerId, long lastPushedSeqId) { + this.encodedRegionName = encodedRegionName; + this.peerId = peerId; + this.lastPushedSeqId = lastPushedSeqId; + } + + public String getEncodedRegionName() { + return encodedRegionName; + } + + public String getPeerId() { + return peerId; + } + + public long getLastPushedSeqId() { + return lastPushedSeqId; + } + + } + + @SuppressWarnings("unchecked") + public MigrationIterator> listAllLastPushedSeqIds() + throws KeeperException { + List level1Prefixs = ZKUtil.listChildrenNoWatch(zookeeper, regionsZNode); + if (level1Prefixs == null || level1Prefixs.isEmpty()) { + ZKUtil.deleteNodeRecursively(zookeeper, regionsZNode); + return EMPTY_ITER; + } + Iterator level1Iter = level1Prefixs.iterator(); + return new MigrationIterator>() { + + private String level1Prefix; + + private Iterator level2Iter; + + private String level2Prefix; + + @Override + public List next() throws Exception { + for (;;) { + if (level2Iter == null || !level2Iter.hasNext()) { + if (!level1Iter.hasNext()) { + ZKUtil.deleteNodeRecursively(zookeeper, regionsZNode); + return null; + } + if (level1Prefix != null) { + // this will also delete the previous level2Prefix which is under this level1Prefix + ZKUtil.deleteNodeRecursively(zookeeper, + ZNodePaths.joinZNode(regionsZNode, level1Prefix)); + } + level1Prefix = level1Iter.next(); + List level2Prefixes = ZKUtil.listChildrenNoWatch(zookeeper, + ZNodePaths.joinZNode(regionsZNode, level1Prefix)); + if (level2Prefixes != null) { + level2Iter = level2Prefixes.iterator(); + // reset level2Prefix as we have switched level1Prefix, otherwise the below delete + // level2Prefix section will delete the znode with this level2Prefix under the new + // level1Prefix + level2Prefix = null; + } + } else { + if (level2Prefix != null) { + ZKUtil.deleteNodeRecursively(zookeeper, + ZNodePaths.joinZNode(regionsZNode, level1Prefix, level2Prefix)); + } + level2Prefix = level2Iter.next(); + List encodedRegionNameAndPeerIds = ZKUtil.listChildrenNoWatch(zookeeper, + ZNodePaths.joinZNode(regionsZNode, level1Prefix, level2Prefix)); + if (encodedRegionNameAndPeerIds == null || encodedRegionNameAndPeerIds.isEmpty()) { + return Collections.emptyList(); + } + List lastPushedSeqIds = new ArrayList<>(); + for (String encodedRegionNameAndPeerId : encodedRegionNameAndPeerIds) { + byte[] data = ZKUtil.getData(zookeeper, ZNodePaths.joinZNode(regionsZNode, + level1Prefix, level2Prefix, encodedRegionNameAndPeerId)); + long lastPushedSeqId = ZKUtil.parseWALPositionFrom(data); + Iterator iter = Splitter.on('-').split(encodedRegionNameAndPeerId).iterator(); + String encodedRegionName = level1Prefix + level2Prefix + iter.next(); + String peerId = iter.next(); + lastPushedSeqIds + .add(new ZkLastPushedSeqId(encodedRegionName, peerId, lastPushedSeqId)); + } + return Collections.unmodifiableList(lastPushedSeqIds); + } + } + } + }; + } + + private String getHFileRefsPeerNode(String peerId) { + return ZNodePaths.joinZNode(hfileRefsZNode, peerId); + } + + /** + * Pair<PeerId, List<HFileRefs>> + */ + @SuppressWarnings("unchecked") + public MigrationIterator>> listAllHFileRefs() throws KeeperException { + List peerIds = ZKUtil.listChildrenNoWatch(zookeeper, hfileRefsZNode); + if (peerIds == null || peerIds.isEmpty()) { + ZKUtil.deleteNodeRecursively(zookeeper, hfileRefsZNode); + return EMPTY_ITER; + } + Iterator iter = peerIds.iterator(); + return new MigrationIterator>>() { + + private String previousPeerId; + + @Override + public Pair> next() throws KeeperException { + if (previousPeerId != null) { + ZKUtil.deleteNodeRecursively(zookeeper, getHFileRefsPeerNode(previousPeerId)); + } + if (!iter.hasNext()) { + ZKUtil.deleteNodeRecursively(zookeeper, hfileRefsZNode); + return null; + } + String peerId = iter.next(); + List refs = ZKUtil.listChildrenNoWatch(zookeeper, getHFileRefsPeerNode(peerId)); + previousPeerId = peerId; + return Pair.newPair(peerId, refs != null ? refs : Collections.emptyList()); + } + }; + } + + public boolean hasData() throws KeeperException { + return ZKUtil.checkExists(zookeeper, queuesZNode) != -1 + || ZKUtil.checkExists(zookeeper, regionsZNode) != -1 + || ZKUtil.checkExists(zookeeper, hfileRefsZNode) != -1; + } + + public void deleteAllData() throws KeeperException { + ZKUtil.deleteNodeRecursively(zookeeper, queuesZNode); + ZKUtil.deleteNodeRecursively(zookeeper, regionsZNode); + ZKUtil.deleteNodeRecursively(zookeeper, hfileRefsZNode); + } + + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + String getQueuesZNode() { + return queuesZNode; + } + + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + String getHfileRefsZNode() { + return hfileRefsZNode; + } + + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + String getRegionsZNode() { + return regionsZNode; + } +} diff --git a/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java new file mode 100644 index 000000000000..e38b7b134e99 --- /dev/null +++ b/hbase-replication/src/test/java/org/apache/hadoop/hbase/replication/TestZKReplicationQueueStorage.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.replication; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.empty; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseZKTestingUtil; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.MigrationIterator; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkLastPushedSeqId; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkReplicationQueueData; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.ReplicationTests; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.MD5Hash; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.zookeeper.KeeperException; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; + +import org.apache.hbase.thirdparty.com.google.common.base.Splitter; +import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; +import org.apache.hbase.thirdparty.com.google.common.collect.Sets; +import org.apache.hbase.thirdparty.com.google.common.io.Closeables; + +@Category({ ReplicationTests.class, MediumTests.class }) +public class TestZKReplicationQueueStorage { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestZKReplicationQueueStorage.class); + + private static final HBaseZKTestingUtil UTIL = new HBaseZKTestingUtil(); + + private ZKWatcher zk; + + private ZKReplicationQueueStorageForMigration storage; + + @Rule + public final TestName name = new TestName(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + UTIL.startMiniZKCluster(); + } + + @AfterClass + public static void tearDownAfterClass() throws IOException { + UTIL.shutdownMiniZKCluster(); + } + + @Before + public void setUp() throws IOException { + Configuration conf = UTIL.getConfiguration(); + conf.set(ZKReplicationStorageBase.REPLICATION_ZNODE, name.getMethodName()); + zk = new ZKWatcher(conf, name.getMethodName(), null); + storage = new ZKReplicationQueueStorageForMigration(zk, conf); + } + + @After + public void tearDown() throws Exception { + ZKUtil.deleteNodeRecursively(zk, storage.replicationZNode); + Closeables.close(zk, true); + } + + public static void mockQueuesData(ZKReplicationQueueStorageForMigration storage, int nServers, + String peerId, ServerName deadServer) throws KeeperException { + ZKWatcher zk = storage.zookeeper; + for (int i = 0; i < nServers; i++) { + ServerName sn = + ServerName.valueOf("test-hbase-" + i, 12345, EnvironmentEdgeManager.currentTime()); + String rsZNode = ZNodePaths.joinZNode(storage.getQueuesZNode(), sn.toString()); + String peerZNode = ZNodePaths.joinZNode(rsZNode, peerId); + ZKUtil.createWithParents(zk, peerZNode); + for (int j = 0; j < i; j++) { + String wal = ZNodePaths.joinZNode(peerZNode, sn.toString() + "." + j); + ZKUtil.createSetData(zk, wal, ZKUtil.positionToByteArray(j)); + } + String deadServerPeerZNode = ZNodePaths.joinZNode(rsZNode, peerId + "-" + deadServer); + ZKUtil.createWithParents(zk, deadServerPeerZNode); + for (int j = 0; j < i; j++) { + String wal = ZNodePaths.joinZNode(deadServerPeerZNode, deadServer.toString() + "." + j); + if (j > 0) { + ZKUtil.createSetData(zk, wal, ZKUtil.positionToByteArray(j)); + } else { + ZKUtil.createWithParents(zk, wal); + } + } + } + ZKUtil.createWithParents(zk, + ZNodePaths.joinZNode(storage.getQueuesZNode(), deadServer.toString())); + } + + private static String getLastPushedSeqIdZNode(String regionsZNode, String encodedName, + String peerId) { + return ZNodePaths.joinZNode(regionsZNode, encodedName.substring(0, 2), + encodedName.substring(2, 4), encodedName.substring(4) + "-" + peerId); + } + + public static Map> mockLastPushedSeqIds( + ZKReplicationQueueStorageForMigration storage, String peerId1, String peerId2, int nRegions, + int emptyLevel1Count, int emptyLevel2Count) throws KeeperException { + ZKWatcher zk = storage.zookeeper; + Map> name2PeerIds = new HashMap<>(); + byte[] bytes = new byte[32]; + for (int i = 0; i < nRegions; i++) { + ThreadLocalRandom.current().nextBytes(bytes); + String encodeName = MD5Hash.getMD5AsHex(bytes); + String znode1 = getLastPushedSeqIdZNode(storage.getRegionsZNode(), encodeName, peerId1); + ZKUtil.createSetData(zk, znode1, ZKUtil.positionToByteArray(1)); + String znode2 = getLastPushedSeqIdZNode(storage.getRegionsZNode(), encodeName, peerId2); + ZKUtil.createSetData(zk, znode2, ZKUtil.positionToByteArray(2)); + name2PeerIds.put(encodeName, Sets.newHashSet(peerId1, peerId2)); + } + int addedEmptyZNodes = 0; + for (int i = 0; i < 256; i++) { + String level1ZNode = + ZNodePaths.joinZNode(storage.getRegionsZNode(), String.format("%02x", i)); + if (ZKUtil.checkExists(zk, level1ZNode) == -1) { + ZKUtil.createWithParents(zk, level1ZNode); + addedEmptyZNodes++; + if (addedEmptyZNodes <= emptyLevel2Count) { + ZKUtil.createWithParents(zk, ZNodePaths.joinZNode(level1ZNode, "ab")); + } + if (addedEmptyZNodes >= emptyLevel1Count + emptyLevel2Count) { + break; + } + } + } + return name2PeerIds; + } + + public static void mockHFileRefs(ZKReplicationQueueStorageForMigration storage, int nPeers) + throws KeeperException { + ZKWatcher zk = storage.zookeeper; + for (int i = 0; i < nPeers; i++) { + String peerId = "peer_" + i; + ZKUtil.createWithParents(zk, ZNodePaths.joinZNode(storage.getHfileRefsZNode(), peerId)); + for (int j = 0; j < i; j++) { + ZKUtil.createWithParents(zk, + ZNodePaths.joinZNode(storage.getHfileRefsZNode(), peerId, "hfile-" + j)); + } + } + } + + @Test + public void testDeleteAllData() throws Exception { + assertFalse(storage.hasData()); + ZKUtil.createWithParents(zk, storage.getQueuesZNode()); + assertTrue(storage.hasData()); + storage.deleteAllData(); + assertFalse(storage.hasData()); + } + + @Test + public void testEmptyIter() throws Exception { + ZKUtil.createWithParents(zk, storage.getQueuesZNode()); + ZKUtil.createWithParents(zk, storage.getRegionsZNode()); + ZKUtil.createWithParents(zk, storage.getHfileRefsZNode()); + assertNull(storage.listAllQueues().next()); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getQueuesZNode())); + assertNull(storage.listAllLastPushedSeqIds().next()); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getRegionsZNode())); + assertNull(storage.listAllHFileRefs().next()); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getHfileRefsZNode())); + } + + @Test + public void testListAllQueues() throws Exception { + String peerId = "1"; + ServerName deadServer = + ServerName.valueOf("test-hbase-dead", 12345, EnvironmentEdgeManager.currentTime()); + int nServers = 10; + mockQueuesData(storage, nServers, peerId, deadServer); + MigrationIterator>> iter = + storage.listAllQueues(); + ServerName previousServerName = null; + for (int i = 0; i < nServers + 1; i++) { + Pair> pair = iter.next(); + assertNotNull(pair); + if (previousServerName != null) { + assertEquals(-1, ZKUtil.checkExists(zk, + ZNodePaths.joinZNode(storage.getQueuesZNode(), previousServerName.toString()))); + } + ServerName sn = pair.getFirst(); + previousServerName = sn; + if (sn.equals(deadServer)) { + assertThat(pair.getSecond(), empty()); + } else { + assertEquals(2, pair.getSecond().size()); + int n = Integer.parseInt(Iterables.getLast(Splitter.on('-').split(sn.getHostname()))); + ZkReplicationQueueData data0 = pair.getSecond().get(0); + assertEquals(peerId, data0.getQueueId().getPeerId()); + assertEquals(sn, data0.getQueueId().getServerName()); + assertEquals(n, data0.getWalOffsets().size()); + for (int j = 0; j < n; j++) { + assertEquals(j, + data0.getWalOffsets().get( + (data0.getQueueId().isRecovered() ? deadServer.toString() : sn.toString()) + "." + j) + .intValue()); + } + ZkReplicationQueueData data1 = pair.getSecond().get(1); + assertEquals(peerId, data1.getQueueId().getPeerId()); + assertEquals(sn, data1.getQueueId().getServerName()); + assertEquals(n, data1.getWalOffsets().size()); + for (int j = 0; j < n; j++) { + assertEquals(j, + data1.getWalOffsets().get( + (data1.getQueueId().isRecovered() ? deadServer.toString() : sn.toString()) + "." + j) + .intValue()); + } + // the order of the returned result is undetermined + if (data0.getQueueId().getSourceServerName().isPresent()) { + assertEquals(deadServer, data0.getQueueId().getSourceServerName().get()); + assertFalse(data1.getQueueId().getSourceServerName().isPresent()); + } else { + assertEquals(deadServer, data1.getQueueId().getSourceServerName().get()); + } + } + } + assertNull(iter.next()); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getQueuesZNode())); + } + + @Test + public void testListAllLastPushedSeqIds() throws Exception { + String peerId1 = "1"; + String peerId2 = "2"; + Map> name2PeerIds = + mockLastPushedSeqIds(storage, peerId1, peerId2, 100, 10, 10); + MigrationIterator> iter = storage.listAllLastPushedSeqIds(); + int emptyListCount = 0; + for (;;) { + List list = iter.next(); + if (list == null) { + break; + } + if (list.isEmpty()) { + emptyListCount++; + continue; + } + for (ZkLastPushedSeqId seqId : list) { + name2PeerIds.get(seqId.getEncodedRegionName()).remove(seqId.getPeerId()); + if (seqId.getPeerId().equals(peerId1)) { + assertEquals(1, seqId.getLastPushedSeqId()); + } else { + assertEquals(2, seqId.getLastPushedSeqId()); + } + } + } + assertEquals(10, emptyListCount); + name2PeerIds.forEach((encodedRegionName, peerIds) -> { + assertThat(encodedRegionName + " still has unmigrated peers", peerIds, empty()); + }); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getRegionsZNode())); + } + + @Test + public void testListAllHFileRefs() throws Exception { + int nPeers = 10; + mockHFileRefs(storage, nPeers); + MigrationIterator>> iter = storage.listAllHFileRefs(); + String previousPeerId = null; + for (int i = 0; i < nPeers; i++) { + Pair> pair = iter.next(); + if (previousPeerId != null) { + assertEquals(-1, ZKUtil.checkExists(zk, + ZNodePaths.joinZNode(storage.getHfileRefsZNode(), previousPeerId))); + } + String peerId = pair.getFirst(); + previousPeerId = peerId; + int index = Integer.parseInt(Iterables.getLast(Splitter.on('_').split(peerId))); + assertEquals(index, pair.getSecond().size()); + } + assertNull(iter.next()); + assertEquals(-1, ZKUtil.checkExists(zk, storage.getHfileRefsZNode())); + } +} diff --git a/hbase-server/pom.xml b/hbase-server/pom.xml index 0dba4aa98339..b61b0252a052 100644 --- a/hbase-server/pom.xml +++ b/hbase-server/pom.xml @@ -102,6 +102,12 @@ org.apache.hbase hbase-replication + + org.apache.hbase + hbase-replication + test-jar + test + org.apache.hbase hbase-balancer diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 896f9a5d0860..adb53468ce72 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -170,6 +170,7 @@ import org.apache.hadoop.hbase.master.replication.AddPeerProcedure; import org.apache.hadoop.hbase.master.replication.DisablePeerProcedure; import org.apache.hadoop.hbase.master.replication.EnablePeerProcedure; +import org.apache.hadoop.hbase.master.replication.MigrateReplicationQueueFromZkToTableProcedure; import org.apache.hadoop.hbase.master.replication.RemovePeerProcedure; import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager; import org.apache.hadoop.hbase.master.replication.ReplicationPeerModificationStateStore; @@ -221,6 +222,7 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; import org.apache.hadoop.hbase.replication.master.ReplicationHFileCleaner; import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner; import org.apache.hadoop.hbase.replication.master.ReplicationSinkTrackerTableCreator; @@ -1058,6 +1060,17 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE this.balancer.initialize(); this.balancer.updateClusterMetrics(getClusterMetricsWithoutCoprocessor()); + // try migrate replication data + ZKReplicationQueueStorageForMigration oldReplicationQueueStorage = + new ZKReplicationQueueStorageForMigration(zooKeeper, conf); + // check whether there are something to migrate and we haven't scheduled a migration procedure + // yet + if ( + oldReplicationQueueStorage.hasData() && procedureExecutor.getProcedures().stream() + .allMatch(p -> !(p instanceof MigrateReplicationQueueFromZkToTableProcedure)) + ) { + procedureExecutor.submitProcedure(new MigrateReplicationQueueFromZkToTableProcedure()); + } // start up all service threads. startupTaskGroup.addTask("Initializing master service threads"); startServiceThreads(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 487c45e5c5cb..97976756d828 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure; +import org.apache.hadoop.hbase.master.replication.MigrateReplicationQueueFromZkToTableProcedure; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.procedure2.Procedure; @@ -52,6 +53,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ServerCrashState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; /** * Handle crashed server. This is a port to ProcedureV2 of what used to be euphemistically called @@ -266,6 +268,16 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state) } break; case SERVER_CRASH_CLAIM_REPLICATION_QUEUES: + if ( + env.getMasterServices().getProcedures().stream() + .filter(p -> p instanceof MigrateReplicationQueueFromZkToTableProcedure) + .anyMatch(p -> !p.isFinished()) + ) { + LOG.info("There is a pending {}, will retry claim replication queue later", + MigrateReplicationQueueFromZkToTableProcedure.class.getSimpleName()); + suspend(10_000, true); + return Flow.NO_MORE_STATE; + } addChildProcedure(new AssignReplicationQueuesProcedure(serverName)); setNextState(ServerCrashState.SERVER_CRASH_FINISH); break; @@ -431,6 +443,13 @@ protected void releaseLock(final MasterProcedureEnv env) { env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName()); } + @Override + protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { + setState(ProcedureProtos.ProcedureState.RUNNABLE); + env.getProcedureScheduler().addFront(this); + return false; + } + @Override public void toStringClassDetails(StringBuilder sb) { sb.append(getProcName()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AbstractPeerNoLockProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AbstractPeerNoLockProcedure.java index 660f9968573d..1f0a89f20762 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AbstractPeerNoLockProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AbstractPeerNoLockProcedure.java @@ -98,10 +98,7 @@ protected final ProcedureSuspendedException suspend(Configuration conf, } long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); backoffConsumer.accept(backoff); - setTimeout(Math.toIntExact(backoff)); - setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); - skipPersistence(); - throw new ProcedureSuspendedException(); + throw suspend(Math.toIntExact(backoff), false); } protected final void resetRetry() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java new file mode 100644 index 000000000000..536f232338e9 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; +import org.apache.hadoop.hbase.master.procedure.GlobalProcedureInterface; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.PeerProcedureInterface; +import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; +import org.apache.hadoop.hbase.procedure2.StateMachineProcedure; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.util.VersionInfo; +import org.apache.yetus.audience.InterfaceAudience; +import org.apache.zookeeper.KeeperException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; + +/** + * A procedure for migrating replication queue data from zookeeper to hbase:replication table. + */ +@InterfaceAudience.Private +public class MigrateReplicationQueueFromZkToTableProcedure + extends StateMachineProcedure + implements GlobalProcedureInterface { + + private static final Logger LOG = + LoggerFactory.getLogger(MigrateReplicationQueueFromZkToTableProcedure.class); + + private static final int MIN_MAJOR_VERSION = 3; + + private List disabledPeerIds; + + private List> futures; + + private ExecutorService executor; + + @Override + public String getGlobalId() { + return getClass().getSimpleName(); + } + + private ExecutorService getExecutorService() { + if (executor == null) { + executor = Executors.newFixedThreadPool(3, new ThreadFactoryBuilder() + .setNameFormat(getClass().getSimpleName() + "-%d").setDaemon(true).build()); + } + return executor; + } + + private void shutdownExecutorService() { + if (executor != null) { + executor.shutdown(); + executor = null; + } + } + + private void waitUntilNoPeerProcedure(MasterProcedureEnv env) throws ProcedureSuspendedException { + long peerProcCount; + try { + peerProcCount = env.getMasterServices().getProcedures().stream() + .filter(p -> p instanceof PeerProcedureInterface).filter(p -> !p.isFinished()).count(); + } catch (IOException e) { + LOG.warn("failed to check peer procedure status", e); + throw suspend(5000, true); + } + if (peerProcCount > 0) { + LOG.info("There are still {} pending peer procedures, will sleep and check later", + peerProcCount); + throw suspend(10_000, true); + } + LOG.info("No pending peer procedures found, continue..."); + } + + @Override + protected Flow executeFromState(MasterProcedureEnv env, + MigrateReplicationQueueFromZkToTableState state) + throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { + switch (state) { + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE: + waitUntilNoPeerProcedure(env); + List peers = env.getReplicationPeerManager().listPeers(null); + if (peers.isEmpty()) { + LOG.info("No active replication peer found, delete old replication queue data and quit"); + ZKReplicationQueueStorageForMigration oldStorage = + new ZKReplicationQueueStorageForMigration(env.getMasterServices().getZooKeeper(), + env.getMasterConfiguration()); + try { + oldStorage.deleteAllData(); + } catch (KeeperException e) { + LOG.warn("failed to delete old replication queue data, sleep and retry later", e); + suspend(10_000, true); + } + return Flow.NO_MORE_STATE; + } + // here we do not care the peers which have already been disabled, as later we do not need + // to enable them + disabledPeerIds = peers.stream().filter(ReplicationPeerDescription::isEnabled) + .map(ReplicationPeerDescription::getPeerId).collect(Collectors.toList()); + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER); + return Flow.HAS_MORE_STATE; + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER: + for (String peerId : disabledPeerIds) { + addChildProcedure(new DisablePeerProcedure(peerId)); + } + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE); + return Flow.HAS_MORE_STATE; + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE: + if (futures != null) { + // wait until all futures done + long notDone = futures.stream().filter(f -> !f.isDone()).count(); + if (notDone == 0) { + boolean succ = true; + for (Future future : futures) { + try { + future.get(); + } catch (Exception e) { + succ = false; + LOG.warn("Failed to migrate", e); + } + } + if (succ) { + shutdownExecutorService(); + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING); + return Flow.HAS_MORE_STATE; + } + // reschedule to retry migration again + futures = null; + } else { + LOG.info("There still {} pending migration tasks, will sleep and check later", notDone); + throw suspend(10_000, true); + } + } + try { + futures = env.getReplicationPeerManager() + .migrateQueuesFromZk(env.getMasterServices().getZooKeeper(), getExecutorService()); + } catch (IOException e) { + LOG.warn("failed to submit migration tasks", e); + throw suspend(10_000, true); + } + throw suspend(10_000, true); + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING: + long rsWithLowerVersion = + env.getMasterServices().getServerManager().getOnlineServers().values().stream() + .filter(sm -> VersionInfo.getMajorVersion(sm.getVersion()) < MIN_MAJOR_VERSION).count(); + if (rsWithLowerVersion == 0) { + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER); + return Flow.HAS_MORE_STATE; + } else { + LOG.info("There are still {} region servers which have a major version less than {}, " + + "will sleep and check later", rsWithLowerVersion, MIN_MAJOR_VERSION); + throw suspend(10_000, true); + } + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER: + for (String peerId : disabledPeerIds) { + addChildProcedure(new EnablePeerProcedure(peerId)); + } + return Flow.NO_MORE_STATE; + default: + throw new UnsupportedOperationException("unhandled state=" + state); + } + } + + @Override + protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { + setState(ProcedureProtos.ProcedureState.RUNNABLE); + env.getProcedureScheduler().addFront(this); + return false; + } + + @Override + protected void rollbackState(MasterProcedureEnv env, + MigrateReplicationQueueFromZkToTableState state) throws IOException, InterruptedException { + throw new UnsupportedOperationException(); + } + + @Override + protected MigrateReplicationQueueFromZkToTableState getState(int stateId) { + return MigrateReplicationQueueFromZkToTableState.forNumber(stateId); + } + + @Override + protected int getStateId(MigrateReplicationQueueFromZkToTableState state) { + return state.getNumber(); + } + + @Override + protected MigrateReplicationQueueFromZkToTableState getInitialState() { + return MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE; + } + + @Override + protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { + super.serializeStateData(serializer); + MigrateReplicationQueueFromZkToTableStateData.Builder builder = + MigrateReplicationQueueFromZkToTableStateData.newBuilder(); + if (disabledPeerIds != null) { + builder.addAllDisabledPeerId(disabledPeerIds); + } + serializer.serialize(builder.build()); + } + + @Override + protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { + super.deserializeStateData(serializer); + MigrateReplicationQueueFromZkToTableStateData data = + serializer.deserialize(MigrateReplicationQueueFromZkToTableStateData.class); + disabledPeerIds = data.getDisabledPeerIdList().stream().collect(Collectors.toList()); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java index 50214e205192..79bed1503bec 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ModifyPeerProcedure.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.InterruptedIOException; +import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.client.TableDescriptor; @@ -27,6 +28,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch; import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; @@ -152,12 +154,36 @@ protected void reopenRegions(MasterProcedureEnv env) throws IOException { } } + private boolean shouldFailForMigrating(MasterProcedureEnv env) throws IOException { + long parentProcId = getParentProcId(); + if ( + parentProcId != Procedure.NO_PROC_ID && env.getMasterServices().getMasterProcedureExecutor() + .getProcedure(parentProcId) instanceof MigrateReplicationQueueFromZkToTableProcedure + ) { + // this is scheduled by MigrateReplicationQueueFromZkToTableProcedure, should not fail it + return false; + } + return env.getMasterServices().getProcedures().stream() + .filter(p -> p instanceof MigrateReplicationQueueFromZkToTableProcedure) + .anyMatch(p -> !p.isFinished()); + } + @Override protected Flow executeFromState(MasterProcedureEnv env, PeerModificationState state) throws ProcedureSuspendedException, InterruptedException { switch (state) { case PRE_PEER_MODIFICATION: try { + if (shouldFailForMigrating(env)) { + LOG.info("There is a pending {}, give up execution of {}", + MigrateReplicationQueueFromZkToTableProcedure.class.getSimpleName(), + getClass().getName()); + setFailure("master-" + getPeerOperationType().name().toLowerCase() + "-peer", + new DoNotRetryIOException("There is a pending " + + MigrateReplicationQueueFromZkToTableProcedure.class.getSimpleName())); + releaseLatch(env); + return Flow.NO_MORE_STATE; + } checkPeerModificationEnabled(env); prePeerModification(env); } catch (IOException e) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index 57380920d0fc..f3cdaddb31ca 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -21,14 +21,18 @@ import java.io.IOException; import java.net.URI; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; @@ -40,6 +44,7 @@ import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.ReplicationPeerNotFoundException; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.replication.ReplicationPeerConfigUtil; import org.apache.hadoop.hbase.conf.ConfigurationObserver; @@ -51,17 +56,24 @@ import org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.replication.ReplicationUtils; import org.apache.hadoop.hbase.replication.SyncReplicationState; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.MigrationIterator; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkLastPushedSeqId; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkReplicationQueueData; import org.apache.hadoop.hbase.replication.master.ReplicationLogCleanerBarrier; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.zookeeper.ZKClusterId; import org.apache.hadoop.hbase.zookeeper.ZKConfig; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; @@ -116,7 +128,7 @@ public class ReplicationPeerManager implements ConfigurationObserver { private final ZKWatcher zk; @FunctionalInterface - private interface ReplicationQueueStorageInitializer { + interface ReplicationQueueStorageInitializer { void initialize() throws IOException; } @@ -151,6 +163,10 @@ private void checkQueuesDeleted(String peerId) } } + private void initializeQueueStorage() throws IOException { + queueStorageInitializer.initialize(); + } + void preAddPeer(String peerId, ReplicationPeerConfig peerConfig) throws ReplicationException, IOException { if (peerId.contains("-")) { @@ -165,7 +181,7 @@ void preAddPeer(String peerId, ReplicationPeerConfig peerConfig) } // lazy create table - queueStorageInitializer.initialize(); + initializeQueueStorage(); // make sure that there is no queues with the same peer id. This may happen when we create a // peer with the same id with a old deleted peer. If the replication queues for the old peer // have not been cleaned up yet then we should not create the new peer, otherwise the old wal @@ -718,4 +734,88 @@ public void onConfigurationChange(Configuration conf) { this.conf = conf; this.peerStorage = ReplicationStorageFactory.getReplicationPeerStorage(fs, zk, conf); } + + private ReplicationQueueData convert(ZkReplicationQueueData zkData) { + Map groupOffsets = new HashMap<>(); + zkData.getWalOffsets().forEach((wal, offset) -> { + String walGroup = AbstractFSWALProvider.getWALPrefixFromWALName(wal); + groupOffsets.compute(walGroup, (k, oldOffset) -> { + if (oldOffset == null) { + return new ReplicationGroupOffset(wal, offset); + } + // we should record the first wal's offset + long oldWalTs = AbstractFSWALProvider.getTimestamp(oldOffset.getWal()); + long walTs = AbstractFSWALProvider.getTimestamp(wal); + if (walTs < oldWalTs) { + return new ReplicationGroupOffset(wal, offset); + } + return oldOffset; + }); + }); + return new ReplicationQueueData(zkData.getQueueId(), ImmutableMap.copyOf(groupOffsets)); + } + + private void migrateQueues(ZKReplicationQueueStorageForMigration oldQueueStorage) + throws Exception { + MigrationIterator>> iter = + oldQueueStorage.listAllQueues(); + for (;;) { + Pair> pair = iter.next(); + if (pair == null) { + return; + } + queueStorage.batchUpdateQueues(pair.getFirst(), + pair.getSecond().stream().filter(data -> peers.containsKey(data.getQueueId().getPeerId())) + .map(this::convert).collect(Collectors.toList())); + } + } + + private void migrateLastPushedSeqIds(ZKReplicationQueueStorageForMigration oldQueueStorage) + throws Exception { + MigrationIterator> iter = oldQueueStorage.listAllLastPushedSeqIds(); + for (;;) { + List list = iter.next(); + if (list == null) { + return; + } + queueStorage.batchUpdateLastSequenceIds(list.stream() + .filter(data -> peers.containsKey(data.getPeerId())).collect(Collectors.toList())); + } + } + + private void migrateHFileRefs(ZKReplicationQueueStorageForMigration oldQueueStorage) + throws Exception { + MigrationIterator>> iter = oldQueueStorage.listAllHFileRefs(); + for (;;) { + Pair> pair = iter.next(); + if (pair == null) { + return; + } + if (peers.containsKey(pair.getFirst())) { + queueStorage.batchUpdateHFileRefs(pair.getFirst(), pair.getSecond()); + } + } + } + + /** + * Submit the migration tasks to the given {@code executor} and return the futures. + */ + List> migrateQueuesFromZk(ZKWatcher zookeeper, ExecutorService executor) + throws IOException { + // the replication queue table creation is asynchronous and will be triggered by addPeer, so + // here we need to manually initialize it since we will not call addPeer. + initializeQueueStorage(); + ZKReplicationQueueStorageForMigration oldStorage = + new ZKReplicationQueueStorageForMigration(zookeeper, conf); + return Arrays.asList(executor.submit(() -> { + migrateQueues(oldStorage); + return null; + }), executor.submit(() -> { + migrateLastPushedSeqIds(oldStorage); + return null; + }), executor.submit(() -> { + migrateHFileRefs(oldStorage); + return null; + })); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/TransitPeerSyncReplicationStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/TransitPeerSyncReplicationStateProcedure.java index ed0760c69924..df6078d64bed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/TransitPeerSyncReplicationStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/TransitPeerSyncReplicationStateProcedure.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.master.procedure.ReopenTableRegionsProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.StateMachineProcedure.Flow; import org.apache.hadoop.hbase.replication.ReplicationException; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationUtils; @@ -236,6 +237,19 @@ protected Flow executeFromState(MasterProcedureEnv env, switch (state) { case PRE_PEER_SYNC_REPLICATION_STATE_TRANSITION: try { + if ( + env.getMasterServices().getProcedures().stream() + .filter(p -> p instanceof MigrateReplicationQueueFromZkToTableProcedure) + .anyMatch(p -> !p.isFinished()) + ) { + LOG.info("There is a pending {}, give up execution of {}", + MigrateReplicationQueueFromZkToTableProcedure.class.getSimpleName(), + getClass().getSimpleName()); + setFailure("master-transit-peer-sync-replication-state", + new DoNotRetryIOException("There is a pending " + + MigrateReplicationQueueFromZkToTableProcedure.class.getSimpleName())); + return Flow.NO_MORE_STATE; + } checkPeerModificationEnabled(env); preTransit(env); } catch (IOException e) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueue.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueue.java new file mode 100644 index 000000000000..1b0f727a0722 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueue.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.replication.TestReplicationBase; +import org.apache.hadoop.hbase.replication.ZKReplicationStorageBase; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; + +@Category({ MasterTests.class, LargeTests.class }) +public class TestMigrateReplicationQueue extends TestReplicationBase { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMigrateReplicationQueue.class); + + private int disableAndInsert() throws Exception { + UTIL1.getAdmin().disableReplicationPeer(PEER_ID2); + return UTIL1.loadTable(htable1, famName); + } + + private String getQueuesZNode() throws IOException { + Configuration conf = UTIL1.getConfiguration(); + ZKWatcher zk = UTIL1.getZooKeeperWatcher(); + String replicationZNode = ZNodePaths.joinZNode(zk.getZNodePaths().baseZNode, + conf.get(ZKReplicationStorageBase.REPLICATION_ZNODE, + ZKReplicationStorageBase.REPLICATION_ZNODE_DEFAULT)); + return ZNodePaths.joinZNode(replicationZNode, conf.get("zookeeper.znode.replication.rs", "rs")); + } + + private void mockData() throws Exception { + // delete the replication queue table to simulate upgrading from an older version of hbase + TableName replicationQueueTableName = TableName + .valueOf(UTIL1.getConfiguration().get(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, + ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())); + List queueDatas = UTIL1.getMiniHBaseCluster().getMaster() + .getReplicationPeerManager().getQueueStorage().listAllQueues(); + assertEquals(UTIL1.getMiniHBaseCluster().getRegionServerThreads().size(), queueDatas.size()); + UTIL1.getAdmin().disableTable(replicationQueueTableName); + UTIL1.getAdmin().deleteTable(replicationQueueTableName); + // shutdown the hbase cluster + UTIL1.shutdownMiniHBaseCluster(); + ZKWatcher zk = UTIL1.getZooKeeperWatcher(); + String queuesZNode = getQueuesZNode(); + for (ReplicationQueueData queueData : queueDatas) { + String replicatorZNode = + ZNodePaths.joinZNode(queuesZNode, queueData.getId().getServerName().toString()); + String queueZNode = ZNodePaths.joinZNode(replicatorZNode, queueData.getId().getPeerId()); + assertEquals(1, queueData.getOffsets().size()); + ReplicationGroupOffset offset = Iterables.getOnlyElement(queueData.getOffsets().values()); + String walZNode = ZNodePaths.joinZNode(queueZNode, offset.getWal()); + ZKUtil.createSetData(zk, walZNode, ZKUtil.positionToByteArray(offset.getOffset())); + } + } + + @Test + public void testMigrate() throws Exception { + int count = disableAndInsert(); + mockData(); + restartSourceCluster(1); + UTIL1.waitFor(60000, + () -> UTIL1.getMiniHBaseCluster().getMaster().getProcedures().stream() + .filter(p -> p instanceof MigrateReplicationQueueFromZkToTableProcedure).findAny() + .map(Procedure::isSuccess).orElse(false)); + TableName replicationQueueTableName = TableName + .valueOf(UTIL1.getConfiguration().get(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, + ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())); + assertTrue(UTIL1.getAdmin().tableExists(replicationQueueTableName)); + ZKWatcher zk = UTIL1.getZooKeeperWatcher(); + assertEquals(-1, ZKUtil.checkExists(zk, getQueuesZNode())); + // wait until SCP finishes, which means we can finish the claim queue operation + UTIL1.waitFor(60000, () -> UTIL1.getMiniHBaseCluster().getMaster().getProcedures().stream() + .filter(p -> p instanceof ServerCrashProcedure).allMatch(Procedure::isSuccess)); + List queueDatas = UTIL1.getMiniHBaseCluster().getMaster() + .getReplicationPeerManager().getQueueStorage().listAllQueues(); + assertEquals(1, queueDatas.size()); + // should have 1 recovered queue, as we haven't replicated anything out so there is no queue + // data for the new alive region server + assertTrue(queueDatas.get(0).getId().isRecovered()); + assertEquals(1, queueDatas.get(0).getOffsets().size()); + // the peer is still disabled, so no data has been replicated + assertFalse(UTIL1.getAdmin().isReplicationPeerEnabled(PEER_ID2)); + assertEquals(0, HBaseTestingUtil.countRows(htable2)); + // enable peer, and make sure the replication can continue correctly + UTIL1.getAdmin().enableReplicationPeer(PEER_ID2); + waitForReplication(count, 100); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java new file mode 100644 index 000000000000..752abc380b84 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING; +import static org.junit.Assert.assertFalse; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CountDownLatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.ServerMetrics; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.StartTestingClusterOption; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.RegionServerList; +import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.PeerProcedureInterface; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestMigrateReplicationQueueFromZkToTableProcedure { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMigrateReplicationQueueFromZkToTableProcedure.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException { + super(conf); + } + + @Override + protected ServerManager createServerManager(MasterServices master, RegionServerList storage) + throws IOException { + setupClusterConnection(); + return new ServerManagerForTest(master, storage); + } + } + + private static final ConcurrentMap EXTRA_REGION_SERVERS = + new ConcurrentHashMap<>(); + + public static final class ServerManagerForTest extends ServerManager { + + public ServerManagerForTest(MasterServices master, RegionServerList storage) { + super(master, storage); + } + + @Override + public Map getOnlineServers() { + Map map = new HashMap<>(super.getOnlineServers()); + map.putAll(EXTRA_REGION_SERVERS); + return map; + } + } + + @BeforeClass + public static void setupCluster() throws Exception { + UTIL.startMiniCluster( + StartTestingClusterOption.builder().masterClass(HMasterForTest.class).build()); + } + + @AfterClass + public static void cleanupTest() throws Exception { + UTIL.shutdownMiniCluster(); + } + + private ProcedureExecutor getMasterProcedureExecutor() { + return UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(); + } + + @After + public void tearDown() throws Exception { + Admin admin = UTIL.getAdmin(); + for (ReplicationPeerDescription pd : admin.listReplicationPeers()) { + admin.removeReplicationPeer(pd.getPeerId()); + } + } + + private static CountDownLatch PEER_PROC_ARRIVE; + + private static CountDownLatch PEER_PROC_RESUME; + + public static final class FakePeerProcedure extends Procedure + implements PeerProcedureInterface { + + private String peerId; + + public FakePeerProcedure() { + } + + public FakePeerProcedure(String peerId) { + this.peerId = peerId; + } + + @Override + public String getPeerId() { + return peerId; + } + + @Override + public PeerOperationType getPeerOperationType() { + return PeerOperationType.UPDATE_CONFIG; + } + + @Override + protected Procedure[] execute(MasterProcedureEnv env) + throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { + PEER_PROC_ARRIVE.countDown(); + PEER_PROC_RESUME.await(); + return null; + } + + @Override + protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException { + throw new UnsupportedOperationException(); + } + + @Override + protected boolean abort(MasterProcedureEnv env) { + return false; + } + + @Override + protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { + } + + @Override + protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { + } + } + + @Test + public void testWaitUntilNoPeerProcedure() throws Exception { + PEER_PROC_ARRIVE = new CountDownLatch(1); + PEER_PROC_RESUME = new CountDownLatch(1); + ProcedureExecutor procExec = getMasterProcedureExecutor(); + procExec.submitProcedure(new FakePeerProcedure("1")); + PEER_PROC_ARRIVE.await(); + MigrateReplicationQueueFromZkToTableProcedure proc = + new MigrateReplicationQueueFromZkToTableProcedure(); + procExec.submitProcedure(proc); + // make sure we will wait until there is no peer related procedures before proceeding + UTIL.waitFor(30000, () -> proc.getState() == ProcedureState.WAITING_TIMEOUT); + // continue and make sure we can finish successfully + PEER_PROC_RESUME.countDown(); + UTIL.waitFor(30000, () -> proc.isSuccess()); + } + + @Test + public void testDisablePeerAndWaitUpgrading() throws Exception { + String peerId = "2"; + ReplicationPeerConfig rpc = ReplicationPeerConfig.newBuilder() + .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/testhbase") + .setReplicateAllUserTables(true).build(); + UTIL.getAdmin().addReplicationPeer(peerId, rpc); + // put a fake region server to simulate that there are still region servers with older version + ServerMetrics metrics = mock(ServerMetrics.class); + when(metrics.getVersion()).thenReturn("2.5.0"); + EXTRA_REGION_SERVERS + .put(ServerName.valueOf("localhost", 54321, EnvironmentEdgeManager.currentTime()), metrics); + + ProcedureExecutor procExec = getMasterProcedureExecutor(); + + MigrateReplicationQueueFromZkToTableProcedure proc = + new MigrateReplicationQueueFromZkToTableProcedure(); + procExec.submitProcedure(proc); + // wait until we reach the wait upgrading state + UTIL.waitFor(30000, + () -> proc.getCurrentStateId() + == MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING.getNumber() + && proc.getState() == ProcedureState.WAITING_TIMEOUT); + // make sure the peer is disabled for migrating + assertFalse(UTIL.getAdmin().isReplicationPeerEnabled(peerId)); + + // the procedure should finish successfully + EXTRA_REGION_SERVERS.clear(); + UTIL.waitFor(30000, () -> proc.isSuccess()); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedureRecovery.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedureRecovery.java new file mode 100644 index 000000000000..8d1a975400fa --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedureRecovery.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.hasSize; + +import java.io.IOException; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.replication.ZKReplicationStorageBase; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.hamcrest.Matchers; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestMigrateReplicationQueueFromZkToTableProcedureRecovery { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMigrateReplicationQueueFromZkToTableProcedureRecovery.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + @BeforeClass + public static void setupCluster() throws Exception { + UTIL.getConfiguration().setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); + UTIL.startMiniCluster(1); + } + + @AfterClass + public static void cleanupTest() throws Exception { + UTIL.shutdownMiniCluster(); + } + + private ProcedureExecutor getMasterProcedureExecutor() { + return UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(); + } + + @Before + public void setup() throws Exception { + ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(getMasterProcedureExecutor(), false); + } + + @After + public void tearDown() throws Exception { + ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(getMasterProcedureExecutor(), false); + } + + private String getHFileRefsZNode() throws IOException { + Configuration conf = UTIL.getConfiguration(); + ZKWatcher zk = UTIL.getZooKeeperWatcher(); + String replicationZNode = ZNodePaths.joinZNode(zk.getZNodePaths().baseZNode, + conf.get(ZKReplicationStorageBase.REPLICATION_ZNODE, + ZKReplicationStorageBase.REPLICATION_ZNODE_DEFAULT)); + return ZNodePaths.joinZNode(replicationZNode, + conf.get(ZKReplicationQueueStorageForMigration.ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY, + ZKReplicationQueueStorageForMigration.ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT)); + } + + @Test + public void testRecoveryAndDoubleExecution() throws Exception { + String peerId = "2"; + ReplicationPeerConfig rpc = ReplicationPeerConfig.newBuilder() + .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/testhbase") + .setReplicateAllUserTables(true).build(); + UTIL.getAdmin().addReplicationPeer(peerId, rpc); + + // here we only test a simple migration, more complicated migration will be tested in other UTs, + // such as TestMigrateReplicationQueue and TestReplicationPeerManagerMigrateFromZk + String hfileRefsZNode = getHFileRefsZNode(); + String hfile = "hfile"; + String hfileZNode = ZNodePaths.joinZNode(hfileRefsZNode, peerId, hfile); + ZKUtil.createWithParents(UTIL.getZooKeeperWatcher(), hfileZNode); + + ProcedureExecutor procExec = getMasterProcedureExecutor(); + + ProcedureTestingUtility.waitNoProcedureRunning(procExec); + ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); + + // Start the migration procedure && kill the executor + long procId = procExec.submitProcedure(new MigrateReplicationQueueFromZkToTableProcedure()); + // Restart the executor and execute the step twice + MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId); + // Validate the migration result + ProcedureTestingUtility.assertProcNotFailed(procExec, procId); + ReplicationQueueStorage queueStorage = + UTIL.getMiniHBaseCluster().getMaster().getReplicationPeerManager().getQueueStorage(); + List hfiles = queueStorage.getReplicableHFiles(peerId); + assertThat(hfiles, Matchers.> both(hasItem(hfile)).and(hasSize(1))); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java new file mode 100644 index 000000000000..76301ae67531 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.empty; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.TableNameTestRule; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager.ReplicationQueueStorageInitializer; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.replication.TableReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.TestZKReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestReplicationPeerManagerMigrateQueuesFromZk { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestReplicationPeerManagerMigrateQueuesFromZk.class); + + private static HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + private static ExecutorService EXECUTOR; + + ConcurrentMap peers; + + private ReplicationPeerStorage peerStorage; + + private ReplicationQueueStorage queueStorage; + + private ReplicationQueueStorageInitializer queueStorageInitializer; + + private ReplicationPeerManager manager; + + private int nServers = 10; + + private int nPeers = 10; + + private int nRegions = 100; + + private ServerName deadServerName; + + @Rule + public final TableNameTestRule tableNameRule = new TableNameTestRule(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + UTIL.startMiniCluster(1); + EXECUTOR = Executors.newFixedThreadPool(3, + new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat(TestReplicationPeerManagerMigrateQueuesFromZk.class.getSimpleName() + "-%d") + .build()); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + EXECUTOR.shutdownNow(); + UTIL.shutdownMiniCluster(); + } + + @Before + public void setUp() throws IOException { + Configuration conf = UTIL.getConfiguration(); + peerStorage = mock(ReplicationPeerStorage.class); + TableName tableName = tableNameRule.getTableName(); + UTIL.getAdmin() + .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName)); + queueStorage = new TableReplicationQueueStorage(UTIL.getConnection(), tableName); + queueStorageInitializer = mock(ReplicationQueueStorageInitializer.class); + peers = new ConcurrentHashMap<>(); + deadServerName = + ServerName.valueOf("test-hbase-dead", 12345, EnvironmentEdgeManager.currentTime()); + manager = new ReplicationPeerManager(UTIL.getTestFileSystem(), UTIL.getZooKeeperWatcher(), + peerStorage, queueStorage, peers, conf, "cluster", queueStorageInitializer); + } + + private Map> prepareData() throws Exception { + ZKReplicationQueueStorageForMigration storage = new ZKReplicationQueueStorageForMigration( + UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); + TestZKReplicationQueueStorage.mockQueuesData(storage, 10, "peer_0", deadServerName); + Map> encodedName2PeerIds = TestZKReplicationQueueStorage + .mockLastPushedSeqIds(storage, "peer_1", "peer_2", nRegions, 10, 10); + TestZKReplicationQueueStorage.mockHFileRefs(storage, 10); + return encodedName2PeerIds; + } + + @Test + public void testNoPeers() throws Exception { + prepareData(); + for (Future future : manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR)) { + future.get(1, TimeUnit.MINUTES); + } + // should have called initializer + verify(queueStorageInitializer).initialize(); + // should have not migrated any data since there is no peer + try (Table table = UTIL.getConnection().getTable(tableNameRule.getTableName())) { + assertEquals(0, HBaseTestingUtil.countRows(table)); + } + } + + @Test + public void testMigrate() throws Exception { + Map> encodedName2PeerIds = prepareData(); + // add all peers so we will migrate them all + for (int i = 0; i < nPeers; i++) { + // value is not used in this test, so just add a mock + peers.put("peer_" + i, mock(ReplicationPeerDescription.class)); + } + for (Future future : manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR)) { + future.get(1, TimeUnit.MINUTES); + } + // should have called initializer + verify(queueStorageInitializer).initialize(); + List queueDatas = queueStorage.listAllQueues(); + // there should be two empty queues so minus 2 + assertEquals(2 * nServers - 2, queueDatas.size()); + for (ReplicationQueueData queueData : queueDatas) { + assertEquals("peer_0", queueData.getId().getPeerId()); + assertEquals(1, queueData.getOffsets().size()); + String walGroup = queueData.getId().getServerWALsBelongTo().toString(); + ReplicationGroupOffset offset = queueData.getOffsets().get(walGroup); + assertEquals(0, offset.getOffset()); + assertEquals(queueData.getId().getServerWALsBelongTo().toString() + ".0", offset.getWal()); + } + // there is no method in ReplicationQueueStorage can list all the last pushed sequence ids + try (Table table = UTIL.getConnection().getTable(tableNameRule.getTableName()); + ResultScanner scanner = + table.getScanner(TableReplicationQueueStorage.LAST_SEQUENCE_ID_FAMILY)) { + for (int i = 0; i < 2; i++) { + Result result = scanner.next(); + String peerId = Bytes.toString(result.getRow()); + assertEquals(nRegions, result.size()); + for (Cell cell : result.rawCells()) { + String encodedRegionName = Bytes.toString(cell.getQualifierArray(), + cell.getQualifierOffset(), cell.getQualifierLength()); + encodedName2PeerIds.get(encodedRegionName).remove(peerId); + long seqId = + Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()); + assertEquals(i + 1, seqId); + } + } + encodedName2PeerIds.forEach((encodedRegionName, peerIds) -> { + assertThat(encodedRegionName + " still has unmigrated peers", peerIds, empty()); + }); + assertNull(scanner.next()); + } + for (int i = 0; i < nPeers; i++) { + List refs = queueStorage.getReplicableHFiles("peer_" + i); + assertEquals(i, refs.size()); + Set refsSet = new HashSet<>(refs); + for (int j = 0; j < i; j++) { + assertTrue(refsSet.remove("hfile-" + j)); + } + assertThat(refsSet, empty()); + } + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java index b6157ac0f184..27477527277f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java @@ -216,7 +216,7 @@ static void configureClusters(HBaseTestingUtil util1, HBaseTestingUtil util2) { conf2.setBoolean("hbase.tests.use.shortcircuit.reads", false); } - static void restartSourceCluster(int numSlaves) throws Exception { + protected static void restartSourceCluster(int numSlaves) throws Exception { Closeables.close(hbaseAdmin, true); Closeables.close(htable1, true); UTIL1.shutdownMiniHBaseCluster(); diff --git a/pom.xml b/pom.xml index 6dbb40c8c64d..3b48f73cb4c9 100644 --- a/pom.xml +++ b/pom.xml @@ -1046,13 +1046,18 @@ hbase-hadoop-compat ${project.version} test-jar - test org.apache.hbase hbase-replication ${project.version} + + org.apache.hbase + hbase-replication + ${project.version} + test-jar + org.apache.hbase hbase-balancer From 487f0c6f06f4af9a7bfe6eab08c6a8d8fba1d654 Mon Sep 17 00:00:00 2001 From: LiangJun He <2005hithlj@163.com> Date: Sun, 13 Nov 2022 22:03:36 +0800 Subject: [PATCH 08/16] HBASE-27217 Revisit the DumpReplicationQueues tool (#4810) Signed-off-by: Duo Zhang --- .../regionserver/DumpReplicationQueues.java | 240 +++++++++++------- .../hbase/wal/AbstractFSWALProvider.java | 20 ++ .../TestDumpReplicationQueues.java | 159 ++++++++---- 3 files changed, 284 insertions(+), 135 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java index 98d0a55fbc43..b284e3f6837f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/DumpReplicationQueues.java @@ -19,8 +19,12 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -31,7 +35,7 @@ import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hbase.Abortable; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.Admin; @@ -40,28 +44,33 @@ import org.apache.hadoop.hbase.client.replication.TableCFs; import org.apache.hadoop.hbase.io.WALLink; import org.apache.hadoop.hbase.procedure2.util.StringUtils; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationOffsetUtil; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; -import org.apache.hadoop.hbase.replication.ReplicationQueueInfo; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.zookeeper.ZKDump; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; import org.apache.hbase.thirdparty.com.google.common.util.concurrent.AtomicLongMap; /** - * TODO: reimplement this tool *

    * Provides information about the existing states of replication, replication peers and queues. * Usage: hbase org.apache.hadoop.hbase.replication.regionserver.DumpReplicationQueues [args] * Arguments: --distributed Polls each RS to dump information about the queue --hdfs Reports HDFS - * usage by the replication queues (note: can be overestimated). + * usage by the replication queues (note: can be overestimated). In the new version, we + * reimplemented the DumpReplicationQueues tool to support obtaining information from replication + * table. */ @InterfaceAudience.Private public class DumpReplicationQueues extends Configured implements Tool { @@ -185,7 +194,7 @@ protected static void printUsage(final String className, final String message) { System.err.println("General Options:"); System.err.println(" -h|--h|--help Show this help and exit."); System.err.println(" --distributed Poll each RS and print its own replication queue. " - + "Default only polls ZooKeeper"); + + "Default only polls replication table."); System.err.println(" --hdfs Use HDFS to calculate usage of WALs by replication." + " It could be overestimated if replicating to multiple peers." + " --distributed flag is also needed."); @@ -201,13 +210,7 @@ private int dumpReplicationQueues(DumpOptions opts) throws Exception { Connection connection = ConnectionFactory.createConnection(conf); Admin admin = connection.getAdmin(); - ZKWatcher zkw = - new ZKWatcher(conf, "DumpReplicationQueues" + EnvironmentEdgeManager.currentTime(), - new WarnOnlyAbortable(), true); - try { - // Our zk watcher - LOG.info("Our Quorum: " + zkw.getQuorum()); List replicatedTableCFs = admin.listReplicatedTableCFs(); if (replicatedTableCFs.isEmpty()) { LOG.info("No tables with a configured replication peer were found."); @@ -229,21 +232,72 @@ private int dumpReplicationQueues(DumpOptions opts) throws Exception { LOG.info("Found [--distributed], will poll each RegionServer."); Set peerIds = peers.stream().map((peer) -> peer.getPeerId()).collect(Collectors.toSet()); - System.out.println(dumpQueues(zkw, peerIds, opts.isHdfs())); + System.out.println(dumpQueues(connection, peerIds, opts.isHdfs(), conf)); System.out.println(dumpReplicationSummary()); } else { - // use ZK instead - System.out.print("Dumping replication znodes via ZooKeeper:"); - System.out.println(ZKDump.getReplicationZnodesDump(zkw)); + // use replication table instead + System.out.println("Dumping replication info via replication table."); + System.out.println(dumpReplicationViaTable(connection, conf)); } return (0); } catch (IOException e) { return (-1); } finally { - zkw.close(); + connection.close(); } } + public String dumpReplicationViaTable(Connection connection, Configuration conf) + throws ReplicationException, IOException { + StringBuilder sb = new StringBuilder(); + ReplicationQueueStorage queueStorage = + ReplicationStorageFactory.getReplicationQueueStorage(connection, conf); + + // The dump info format is as follows: + // peers: + // peers/1: zk1:2181:/hbase + // peers/1/peer-state: ENABLED + // rs: + // rs/rs1,16020,1664092120094/1/rs1%2C16020%2C1664092120094.1664096778778: 123 + // rs/rs2,16020,1664092120094/2/rs1%2C16020%2C1664092120094.1664096778778: 321 + // hfile-refs: + // hfile-refs/1/hfile1,hfile2 + // hfile-refs/2/hfile3,hfile4 + String peersKey = "peers"; + sb.append(peersKey).append(": ").append("\n"); + List repPeerDescs = connection.getAdmin().listReplicationPeers(); + for (ReplicationPeerDescription repPeerDesc : repPeerDescs) { + sb.append(peersKey).append("/").append(repPeerDesc.getPeerId()).append(": ") + .append(repPeerDesc.getPeerConfig().getClusterKey()).append("\n"); + sb.append(peersKey).append("/").append(repPeerDesc.getPeerId()).append("/peer-state: ") + .append(repPeerDesc.isEnabled() ? "ENABLED" : "DISABLED").append("\n"); + } + + List repQueueDataList = queueStorage.listAllQueues(); + String rsKey = "rs"; + sb.append(rsKey).append(": ").append("\n"); + for (ReplicationQueueData repQueueData : repQueueDataList) { + String peerId = repQueueData.getId().getPeerId(); + for (ImmutableMap.Entry entry : repQueueData.getOffsets() + .entrySet()) { + sb.append(rsKey).append("/").append(entry.getKey()).append("/").append(peerId).append("/") + .append(entry.getValue().getWal()).append(": ").append(entry.getValue().getOffset()) + .append("\n"); + } + } + + List peerIds = queueStorage.getAllPeersFromHFileRefsQueue(); + String hfileKey = "hfile-refs"; + sb.append(hfileKey).append(": ").append("\n"); + for (String peerId : peerIds) { + List hfiles = queueStorage.getReplicableHFiles(peerId); + sb.append(hfileKey).append("/").append(peerId).append("/").append(String.join(",", hfiles)) + .append("\n"); + } + + return sb.toString(); + } + public String dumpReplicationSummary() { StringBuilder sb = new StringBuilder(); if (!deletedQueues.isEmpty()) { @@ -294,71 +348,103 @@ public String dumpPeersState(List peers) throws Exce return sb.toString(); } - public String dumpQueues(ZKWatcher zkw, Set peerIds, boolean hdfs) throws Exception { - ReplicationQueueStorage queueStorage; + public String dumpQueues(Connection connection, Set peerIds, boolean hdfs, + Configuration conf) throws Exception { StringBuilder sb = new StringBuilder(); + ReplicationQueueStorage queueStorage = + ReplicationStorageFactory.getReplicationQueueStorage(connection, conf); + + Set liveRegionServers = + connection.getAdmin().getClusterMetrics().getLiveServerMetrics().keySet(); - // queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf()); - // Set liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, - // zkw.getZNodePaths().rsZNode) - // .stream().map(ServerName::parseServerName).collect(Collectors.toSet()); - // - // Loops each peer on each RS and dumps the queues - // List regionservers = queueStorage.getListOfReplicators(); - // if (regionservers == null || regionservers.isEmpty()) { - // return sb.toString(); - // } - // for (ServerName regionserver : regionservers) { - // List queueIds = queueStorage.getAllQueues(regionserver); - // if (!liveRegionServers.contains(regionserver)) { - // deadRegionServers.add(regionserver.getServerName()); - // } - // for (String queueId : queueIds) { - // ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId); - // List wals = queueStorage.getWALsInQueue(regionserver, queueId); - // Collections.sort(wals); - // if (!peerIds.contains(queueInfo.getPeerId())) { - // deletedQueues.add(regionserver + "/" + queueId); - // sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs)); - // } else { - // sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs)); - // } - // } - // } + List regionServers = queueStorage.listAllReplicators(); + if (regionServers == null || regionServers.isEmpty()) { + return sb.toString(); + } + for (ServerName regionServer : regionServers) { + List queueIds = queueStorage.listAllQueueIds(regionServer); + + if (!liveRegionServers.contains(regionServer)) { + deadRegionServers.add(regionServer.getServerName()); + } + for (ReplicationQueueId queueId : queueIds) { + List tmpWals = new ArrayList<>(); + // wals + AbstractFSWALProvider + .getWALFiles(connection.getConfiguration(), queueId.getServerWALsBelongTo()).stream() + .map(Path::toString).forEach(tmpWals::add); + + // old wals + AbstractFSWALProvider.getArchivedWALFiles(connection.getConfiguration(), + queueId.getServerWALsBelongTo(), URLEncoder + .encode(queueId.getServerWALsBelongTo().toString(), StandardCharsets.UTF_8.name())) + .stream().map(Path::toString).forEach(tmpWals::add); + + Map offsets = queueStorage.getOffsets(queueId); + // filter out the wal files that should replicate + List wals = new ArrayList<>(); + for (Map.Entry entry : offsets.entrySet()) { + ReplicationGroupOffset offset = entry.getValue(); + for (String wal : tmpWals) { + if (ReplicationOffsetUtil.shouldReplicate(offset, wal)) { + wals.add(wal); + } + } + } + Collections.sort(wals, Comparator.comparingLong(AbstractFSWALProvider::getTimestamp)); + if (!peerIds.contains(queueId.getPeerId())) { + deletedQueues.add(regionServer + "/" + queueId); + sb.append(formatQueue(regionServer, offsets, wals, queueId, true, hdfs)); + } else { + sb.append(formatQueue(regionServer, offsets, wals, queueId, false, hdfs)); + } + } + } return sb.toString(); } - private String formatQueue(ServerName regionserver, ReplicationQueueStorage queueStorage, - ReplicationQueueInfo queueInfo, String queueId, List wals, boolean isDeleted, - boolean hdfs) throws Exception { + private String formatQueue(ServerName regionServer, Map offsets, + List wals, ReplicationQueueId queueId, boolean isDeleted, boolean hdfs) + throws Exception { StringBuilder sb = new StringBuilder(); - List deadServers; - - sb.append("Dumping replication queue info for RegionServer: [" + regionserver + "]" + "\n"); - sb.append(" Queue znode: " + queueId + "\n"); - sb.append(" PeerID: " + queueInfo.getPeerId() + "\n"); - sb.append(" Recovered: " + queueInfo.isQueueRecovered() + "\n"); - deadServers = queueInfo.getDeadRegionServers(); - if (deadServers.isEmpty()) { - sb.append(" No dead RegionServers found in this queue." + "\n"); + sb.append("Dumping replication queue info for RegionServer: [" + regionServer + "]" + "\n"); + sb.append(" Queue id: " + queueId + "\n"); + sb.append(" PeerID: " + queueId.getPeerId() + "\n"); + sb.append(" Recovered: " + queueId.isRecovered() + "\n"); + // In new version, we only record the first dead RegionServer in queueId. + if (queueId.getSourceServerName().isPresent()) { + sb.append(" Dead RegionServer: " + queueId.getSourceServerName().get() + "\n"); } else { - sb.append(" Dead RegionServers: " + deadServers + "\n"); + sb.append(" No dead RegionServer found in this queue." + "\n"); } sb.append(" Was deleted: " + isDeleted + "\n"); sb.append(" Number of WALs in replication queue: " + wals.size() + "\n"); - peersQueueSize.addAndGet(queueInfo.getPeerId(), wals.size()); - - for (String wal : wals) { - // long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal); - // sb.append(" Replication position for " + wal + ": " - // + (position > 0 ? position : "0" + " (not started or nothing to replicate)") + "\n"); + peersQueueSize.addAndGet(queueId.getPeerId(), wals.size()); + + for (Map.Entry entry : offsets.entrySet()) { + String walGroup = entry.getKey(); + ReplicationGroupOffset offset = entry.getValue(); + for (String wal : wals) { + long position = 0; + if (offset.getWal().equals(wal)) { + position = offset.getOffset(); + } + sb.append( + " Replication position for " + (walGroup != null ? walGroup + "/" + wal : wal) + ": "); + if (position == 0) { + sb.append("0 (not started or nothing to replicate)"); + } else if (position > 0) { + sb.append(position); + } + sb.append("\n"); + } } if (hdfs) { FileSystem fs = FileSystem.get(getConf()); sb.append(" Total size of WALs on HDFS for this queue: " - + StringUtils.humanSize(getTotalWALSize(fs, wals, regionserver)) + "\n"); + + StringUtils.humanSize(getTotalWALSize(fs, wals, regionServer)) + "\n"); } return sb.toString(); } @@ -366,8 +452,7 @@ private String formatQueue(ServerName regionserver, ReplicationQueueStorage queu /** * return total size in bytes from a list of WALs */ - private long getTotalWALSize(FileSystem fs, List wals, ServerName server) - throws IOException { + private long getTotalWALSize(FileSystem fs, List wals, ServerName server) { long size = 0; FileStatus fileStatus; @@ -389,19 +474,4 @@ private long getTotalWALSize(FileSystem fs, List wals, ServerName server totalSizeOfWALs += size; return size; } - - private static class WarnOnlyAbortable implements Abortable { - @Override - public void abort(String why, Throwable e) { - LOG.warn("DumpReplicationQueue received abort, ignoring. Reason: " + why); - if (LOG.isDebugEnabled()) { - LOG.debug(e.toString(), e); - } - } - - @Override - public boolean isAborted() { - return false; - } - } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java index 480866949993..5bbc66791967 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java @@ -379,6 +379,26 @@ public static List getArchivedWALFiles(Configuration conf, ServerName serv return archivedWalFiles; } + /** + * List all the wal files for a logPrefix. + */ + public static List getWALFiles(Configuration c, ServerName serverName) throws IOException { + Path walRoot = new Path(CommonFSUtils.getWALRootDir(c), HConstants.HREGION_LOGDIR_NAME); + FileSystem fs = walRoot.getFileSystem(c); + List walFiles = new ArrayList<>(); + Path walDir = new Path(walRoot, serverName.toString()); + try { + for (FileStatus status : fs.listStatus(walDir)) { + if (status.isFile()) { + walFiles.add(status.getPath()); + } + } + } catch (FileNotFoundException e) { + LOG.info("WAL dir {} not exists", walDir); + } + return walFiles; + } + /** * Pulls a ServerName out of a Path generated according to our layout rules. In the below layouts, * this method ignores the format of the logfile component. Current format: [base directory for diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java index 3475ae5c1925..3e1dc624fe7d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestDumpReplicationQueues.java @@ -17,34 +17,43 @@ */ package org.apache.hadoop.hbase.replication.regionserver; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; +import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; -import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.testclassification.SmallTests; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; -import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.Pair; +import org.junit.After; +import org.junit.Before; import org.junit.ClassRule; -import org.junit.Ignore; +import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; /** * Tests for DumpReplicationQueues tool */ -// TODO: reimplement -@Ignore @Category({ ReplicationTests.class, SmallTests.class }) public class TestDumpReplicationQueues { @@ -52,49 +61,99 @@ public class TestDumpReplicationQueues { public static final HBaseClassTestRule CLASS_RULE = HBaseClassTestRule.forClass(TestDumpReplicationQueues.class); - /** - * Makes sure dumpQueues returns wals znodes ordered chronologically. - * @throws Exception if dumpqueues finds any error while handling list of znodes. - */ + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + private static Configuration CONF; + private static FileSystem FS = null; + private Path root; + private Path logDir; + @Rule + public final TestName name = new TestName(); + + @Before + public void setup() throws Exception { + UTIL.startMiniCluster(3); + CONF = UTIL.getConfiguration(); + TableName tableName = TableName.valueOf("replication_" + name.getMethodName()); + UTIL.getAdmin() + .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName)); + CONF.set(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, tableName.getNameAsString()); + FS = FileSystem.get(CONF); + root = UTIL.getDataTestDirOnTestFS("hbase"); + logDir = new Path(root, HConstants.HREGION_LOGDIR_NAME); + FS.mkdirs(logDir); + CommonFSUtils.setRootDir(CONF, root); + CommonFSUtils.setWALRootDir(CONF, root); + } + @Test - public void testDumpReplicationReturnsWalSorted() throws Exception { - Configuration config = HBaseConfiguration.create(); - ZKWatcher zkWatcherMock = mock(ZKWatcher.class); - ZNodePaths zNodePath = new ZNodePaths(config); - RecoverableZooKeeper recoverableZooKeeperMock = mock(RecoverableZooKeeper.class); - when(zkWatcherMock.getRecoverableZooKeeper()).thenReturn(recoverableZooKeeperMock); - when(zkWatcherMock.getZNodePaths()).thenReturn(zNodePath); - List nodes = new ArrayList<>(); - String server = "rs1,60030," + EnvironmentEdgeManager.currentTime(); - nodes.add(server); - when(recoverableZooKeeperMock.getChildren("/hbase/rs", null)).thenReturn(nodes); - when(recoverableZooKeeperMock.getChildren("/hbase/replication/rs", null)).thenReturn(nodes); - List queuesIds = new ArrayList<>(); - queuesIds.add("1"); - when(recoverableZooKeeperMock.getChildren("/hbase/replication/rs/" + server, null)) - .thenReturn(queuesIds); - List wals = new ArrayList<>(); - wals.add("rs1%2C60964%2C1549394085556.1549394101427"); - wals.add("rs1%2C60964%2C1549394085556.1549394101426"); - wals.add("rs1%2C60964%2C1549394085556.1549394101428"); - when(recoverableZooKeeperMock.getChildren("/hbase/replication/rs/" + server + "/1", null)) - .thenReturn(wals); + public void testDumpReplication() throws Exception { + String peerId = "1"; + String serverNameStr = "rs1,12345,123"; + addPeer(peerId, "hbase"); + ServerName serverName = ServerName.valueOf(serverNameStr); + String walName = "rs1%2C12345%2C123.10"; + Path walPath = new Path(logDir, serverNameStr + "/" + walName); + FS.createNewFile(walPath); + + ReplicationQueueId queueId = new ReplicationQueueId(serverName, peerId); + ReplicationQueueStorage queueStorage = + ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), CONF); + queueStorage.setOffset(queueId, "wal-group", + new ReplicationGroupOffset(FS.listStatus(walPath)[0].getPath().toString(), 123), + Collections.emptyMap()); + DumpReplicationQueues dumpQueues = new DumpReplicationQueues(); Set peerIds = new HashSet<>(); - peerIds.add("1"); - dumpQueues.setConf(config); - String dump = dumpQueues.dumpQueues(zkWatcherMock, peerIds, false); + peerIds.add(peerId); + List wals = new ArrayList<>(); + wals.add("rs1%2C12345%2C123.12"); + wals.add("rs1%2C12345%2C123.15"); + wals.add("rs1%2C12345%2C123.11"); + for (String wal : wals) { + Path wPath = new Path(logDir, serverNameStr + "/" + wal); + FS.createNewFile(wPath); + } + + String dump = dumpQueues.dumpQueues(UTIL.getConnection(), peerIds, false, CONF); + assertTrue(dump.indexOf("Queue id: 1-rs1,12345,123") > 0); + assertTrue(dump.indexOf("Number of WALs in replication queue: 4") > 0); + // test for 'Returns wal sorted' String[] parsedDump = dump.split("Replication position for"); - assertEquals("Parsed dump should have 4 parts.", 4, parsedDump.length); - assertTrue( - "First wal should be rs1%2C60964%2C1549394085556.1549394101426, but got: " + parsedDump[1], - parsedDump[1].indexOf("rs1%2C60964%2C1549394085556.1549394101426") >= 0); - assertTrue( - "Second wal should be rs1%2C60964%2C1549394085556.1549394101427, but got: " + parsedDump[2], - parsedDump[2].indexOf("rs1%2C60964%2C1549394085556.1549394101427") >= 0); - assertTrue( - "Third wal should be rs1%2C60964%2C1549394085556.1549394101428, but got: " + parsedDump[3], - parsedDump[3].indexOf("rs1%2C60964%2C1549394085556.1549394101428") >= 0); + assertTrue("First wal should be rs1%2C12345%2C123.10: 123, but got: " + parsedDump[1], + parsedDump[1].indexOf("rs1%2C12345%2C123.10: 123") >= 0); + assertTrue("Second wal should be rs1%2C12345%2C123.11: 0, but got: " + parsedDump[2], + parsedDump[2].indexOf("rs1%2C12345%2C123.11: 0 (not started or nothing to replicate)") >= 0); + assertTrue("Third wal should be rs1%2C12345%2C123.12: 0, but got: " + parsedDump[3], + parsedDump[3].indexOf("rs1%2C12345%2C123.12: 0 (not started or nothing to replicate)") >= 0); + assertTrue("Fourth wal should be rs1%2C12345%2C123.15: 0, but got: " + parsedDump[4], + parsedDump[4].indexOf("rs1%2C12345%2C123.15: 0 (not started or nothing to replicate)") >= 0); + + Path file1 = new Path("testHFile1"); + Path file2 = new Path("testHFile2"); + List> files = new ArrayList<>(1); + files.add(new Pair<>(null, file1)); + files.add(new Pair<>(null, file2)); + queueStorage.addHFileRefs(peerId, files); + // test for 'Dump Replication via replication table' + String dump2 = dumpQueues.dumpReplicationViaTable(UTIL.getConnection(), CONF); + assertTrue(dump2.indexOf("peers/1/peer-state: ENABLED") > 0); + assertTrue(dump2.indexOf("rs1,12345,123/rs1%2C12345%2C123.10: 123") >= 0); + assertTrue(dump2.indexOf("hfile-refs/1/testHFile1,testHFile2") >= 0); + } + + /** + * Add a peer + */ + private void addPeer(String peerId, String clusterKey) throws IOException { + ReplicationPeerConfigBuilder builder = ReplicationPeerConfig.newBuilder() + .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/" + clusterKey) + .setReplicationEndpointImpl( + TestReplicationSourceManager.ReplicationEndpointForTest.class.getName()); + UTIL.getAdmin().addReplicationPeer(peerId, builder.build(), true); } + @After + public void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } } From 6bb7d994f6841b6470f009518ae554790b4f203e Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Tue, 18 Oct 2022 16:46:03 +0800 Subject: [PATCH 09/16] HBASE-27429 Add exponential retry backoff support for MigrateReplicationQueueFromZkToTableProcedure Signed-off-by: Liangjun He --- .../procedure2/TimeoutExecutorThread.java | 10 +- ...eplicationQueueFromZkToTableProcedure.java | 131 ++++++++++++------ .../replication/ReplicationPeerManager.java | 45 +++--- ...icationPeerManagerMigrateQueuesFromZk.java | 9 +- 4 files changed, 125 insertions(+), 70 deletions(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/TimeoutExecutorThread.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/TimeoutExecutorThread.java index 3b99781a5585..c0287a99435c 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/TimeoutExecutorThread.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/TimeoutExecutorThread.java @@ -78,9 +78,13 @@ public void add(InlineChore chore) { } public void add(Procedure procedure) { - LOG.info("ADDED {}; timeout={}, timestamp={}", procedure, procedure.getTimeout(), - procedure.getTimeoutTimestamp()); - queue.add(new DelayedProcedure<>(procedure)); + if (procedure.getTimeout() > 0) { + LOG.info("ADDED {}; timeout={}, timestamp={}", procedure, procedure.getTimeout(), + procedure.getTimeoutTimestamp()); + queue.add(new DelayedProcedure<>(procedure)); + } else { + LOG.info("Got negative timeout {} for {}, skip adding", procedure.getTimeout(), procedure); + } } public boolean remove(Procedure procedure) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java index 536f232338e9..93ff27db3f72 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java @@ -25,19 +25,25 @@ import java.io.IOException; import java.util.List; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.Future; +import java.util.function.LongConsumer; import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.master.procedure.GlobalProcedureInterface; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.PeerProcedureInterface; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; import org.apache.hadoop.hbase.procedure2.StateMachineProcedure; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.util.FutureUtils; +import org.apache.hadoop.hbase.util.IdLock; +import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.VersionInfo; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; @@ -65,18 +71,34 @@ public class MigrateReplicationQueueFromZkToTableProcedure private List disabledPeerIds; - private List> futures; + private CompletableFuture future; private ExecutorService executor; + private RetryCounter retryCounter; + @Override public String getGlobalId() { return getClass().getSimpleName(); } + private ProcedureSuspendedException suspend(Configuration conf, LongConsumer backoffConsumer) + throws ProcedureSuspendedException { + if (retryCounter == null) { + retryCounter = ProcedureUtil.createRetryCounter(conf); + } + long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); + backoffConsumer.accept(backoff); + throw suspend(Math.toIntExact(backoff), true); + } + + private void resetRetry() { + retryCounter = null; + } + private ExecutorService getExecutorService() { if (executor == null) { - executor = Executors.newFixedThreadPool(3, new ThreadFactoryBuilder() + executor = Executors.newCachedThreadPool(new ThreadFactoryBuilder() .setNameFormat(getClass().getSimpleName() + "-%d").setDaemon(true).build()); } return executor; @@ -95,14 +117,17 @@ private void waitUntilNoPeerProcedure(MasterProcedureEnv env) throws ProcedureSu peerProcCount = env.getMasterServices().getProcedures().stream() .filter(p -> p instanceof PeerProcedureInterface).filter(p -> !p.isFinished()).count(); } catch (IOException e) { - LOG.warn("failed to check peer procedure status", e); - throw suspend(5000, true); + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.warn("failed to check peer procedure status, sleep {} secs and retry later", + backoff / 1000, e)); } if (peerProcCount > 0) { - LOG.info("There are still {} pending peer procedures, will sleep and check later", - peerProcCount); - throw suspend(10_000, true); + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.info( + "There are still {} pending peer procedures, sleep {} secs and retry later", + peerProcCount, backoff / 1000)); } + resetRetry(); LOG.info("No pending peer procedures found, continue..."); } @@ -122,8 +147,10 @@ protected Flow executeFromState(MasterProcedureEnv env, try { oldStorage.deleteAllData(); } catch (KeeperException e) { - LOG.warn("failed to delete old replication queue data, sleep and retry later", e); - suspend(10_000, true); + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.warn( + "failed to delete old replication queue data, sleep {} secs and retry later", + backoff / 1000, e)); } return Flow.NO_MORE_STATE; } @@ -132,6 +159,7 @@ protected Flow executeFromState(MasterProcedureEnv env, disabledPeerIds = peers.stream().filter(ReplicationPeerDescription::isEnabled) .map(ReplicationPeerDescription::getPeerId).collect(Collectors.toList()); setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER); + resetRetry(); return Flow.HAS_MORE_STATE; case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER: for (String peerId : disabledPeerIds) { @@ -140,39 +168,52 @@ protected Flow executeFromState(MasterProcedureEnv env, setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE); return Flow.HAS_MORE_STATE; case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE: - if (futures != null) { - // wait until all futures done - long notDone = futures.stream().filter(f -> !f.isDone()).count(); - if (notDone == 0) { - boolean succ = true; - for (Future future : futures) { - try { - future.get(); - } catch (Exception e) { - succ = false; - LOG.warn("Failed to migrate", e); - } - } - if (succ) { - shutdownExecutorService(); - setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING); - return Flow.HAS_MORE_STATE; - } - // reschedule to retry migration again - futures = null; - } else { - LOG.info("There still {} pending migration tasks, will sleep and check later", notDone); - throw suspend(10_000, true); + if (future != null) { + // should have finished when we arrive here + assert future.isDone(); + try { + future.get(); + } catch (Exception e) { + future = null; + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.warn("failed to migrate queue data, sleep {} secs and retry later", + backoff / 1000, e)); } + shutdownExecutorService(); + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING); + resetRetry(); + return Flow.HAS_MORE_STATE; } - try { - futures = env.getReplicationPeerManager() - .migrateQueuesFromZk(env.getMasterServices().getZooKeeper(), getExecutorService()); - } catch (IOException e) { - LOG.warn("failed to submit migration tasks", e); - throw suspend(10_000, true); - } - throw suspend(10_000, true); + future = env.getReplicationPeerManager() + .migrateQueuesFromZk(env.getMasterServices().getZooKeeper(), getExecutorService()); + FutureUtils.addListener(future, (r, e) -> { + // should acquire procedure execution lock to make sure that the procedure executor has + // finished putting this procedure to the WAITING_TIMEOUT state, otherwise there could be + // race and cause unexpected result + IdLock procLock = + env.getMasterServices().getMasterProcedureExecutor().getProcExecutionLock(); + IdLock.Entry lockEntry; + try { + lockEntry = procLock.getLockEntry(getProcId()); + } catch (IOException ioe) { + LOG.error("Error while acquiring execution lock for procedure {}" + + " when trying to wake it up, aborting...", ioe); + env.getMasterServices().abort("Can not acquire procedure execution lock", e); + return; + } + try { + setTimeoutFailure(env); + } finally { + procLock.releaseLockEntry(lockEntry); + } + }); + // here we set timeout to -1 so the ProcedureExecutor will not schedule a Timer for us + setTimeout(-1); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + // skip persistence is a must now since when restarting, if the procedure is in + // WAITING_TIMEOUT state and has -1 as timeout, it will block there forever... + skipPersistence(); + throw new ProcedureSuspendedException(); case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING: long rsWithLowerVersion = env.getMasterServices().getServerManager().getOnlineServers().values().stream() @@ -181,9 +222,11 @@ protected Flow executeFromState(MasterProcedureEnv env, setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER); return Flow.HAS_MORE_STATE; } else { - LOG.info("There are still {} region servers which have a major version less than {}, " - + "will sleep and check later", rsWithLowerVersion, MIN_MAJOR_VERSION); - throw suspend(10_000, true); + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.warn( + "There are still {} region servers which have a major version" + + " less than {}, sleep {} secs and check later", + rsWithLowerVersion, MIN_MAJOR_VERSION, backoff / 1000)); } case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER: for (String peerId : disabledPeerIds) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index f3cdaddb31ca..8cfb36a1bc17 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.net.URI; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; @@ -29,10 +28,10 @@ import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; @@ -72,6 +71,7 @@ import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkLastPushedSeqId; import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration.ZkReplicationQueueData; import org.apache.hadoop.hbase.replication.master.ReplicationLogCleanerBarrier; +import org.apache.hadoop.hbase.util.FutureUtils; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.zookeeper.ZKClusterId; @@ -797,25 +797,38 @@ private void migrateHFileRefs(ZKReplicationQueueStorageForMigration oldQueueStor } } + private interface ExceptionalRunnable { + void run() throws Exception; + } + + private CompletableFuture runAsync(ExceptionalRunnable task, ExecutorService executor) { + CompletableFuture future = new CompletableFuture<>(); + executor.execute(() -> { + try { + task.run(); + future.complete(null); + } catch (Exception e) { + future.completeExceptionally(e); + } + }); + return future; + } + /** - * Submit the migration tasks to the given {@code executor} and return the futures. + * Submit the migration tasks to the given {@code executor}. */ - List> migrateQueuesFromZk(ZKWatcher zookeeper, ExecutorService executor) - throws IOException { + CompletableFuture migrateQueuesFromZk(ZKWatcher zookeeper, ExecutorService executor) { // the replication queue table creation is asynchronous and will be triggered by addPeer, so // here we need to manually initialize it since we will not call addPeer. - initializeQueueStorage(); + try { + initializeQueueStorage(); + } catch (IOException e) { + return FutureUtils.failedFuture(e); + } ZKReplicationQueueStorageForMigration oldStorage = new ZKReplicationQueueStorageForMigration(zookeeper, conf); - return Arrays.asList(executor.submit(() -> { - migrateQueues(oldStorage); - return null; - }), executor.submit(() -> { - migrateLastPushedSeqIds(oldStorage); - return null; - }), executor.submit(() -> { - migrateHFileRefs(oldStorage); - return null; - })); + return CompletableFuture.allOf(runAsync(() -> migrateQueues(oldStorage), executor), + runAsync(() -> migrateLastPushedSeqIds(oldStorage), executor), + runAsync(() -> migrateHFileRefs(oldStorage), executor)); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java index 76301ae67531..d211d707e92d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestReplicationPeerManagerMigrateQueuesFromZk.java @@ -34,7 +34,6 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; @@ -146,9 +145,7 @@ private Map> prepareData() throws Exception { @Test public void testNoPeers() throws Exception { prepareData(); - for (Future future : manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR)) { - future.get(1, TimeUnit.MINUTES); - } + manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR).get(1, TimeUnit.MINUTES); // should have called initializer verify(queueStorageInitializer).initialize(); // should have not migrated any data since there is no peer @@ -165,9 +162,7 @@ public void testMigrate() throws Exception { // value is not used in this test, so just add a mock peers.put("peer_" + i, mock(ReplicationPeerDescription.class)); } - for (Future future : manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR)) { - future.get(1, TimeUnit.MINUTES); - } + manager.migrateQueuesFromZk(UTIL.getZooKeeperWatcher(), EXECUTOR).get(1, TimeUnit.MINUTES); // should have called initializer verify(queueStorageInitializer).initialize(); List queueDatas = queueStorage.listAllQueues(); From 9c5bfcf01c73ee2a0fc926277488881d4abac816 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sat, 3 Dec 2022 20:51:40 +0800 Subject: [PATCH 10/16] HBASE-27430 Should disable replication log cleaner when migrating replication queue data (#4901) Signed-off-by: Liangjun He --- .../server/master/MasterProcedure.proto | 12 +++-- ...eplicationQueueFromZkToTableProcedure.java | 47 ++++++++++++++++++- ...eplicationQueueFromZkToTableProcedure.java | 29 +++++++++++- 3 files changed, 80 insertions(+), 8 deletions(-) diff --git a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto index b6f5d7e50bb0..14d07c17c880 100644 --- a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto @@ -724,11 +724,13 @@ message AssignReplicationQueuesStateData { } enum MigrateReplicationQueueFromZkToTableState { - MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE = 1; - MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER = 2; - MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE = 3; - MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING = 4; - MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER = 5; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER = 1; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE = 2; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER = 3; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE = 4; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING = 5; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER = 6; + MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_CLEANER = 7; } message MigrateReplicationQueueFromZkToTableStateData { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java index 93ff27db3f72..b7c4e33ef858 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/MigrateReplicationQueueFromZkToTableProcedure.java @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hbase.master.replication; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER; import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_PEER; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_CLEANER; import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_PEER; import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_MIGRATE; import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE; @@ -111,6 +113,26 @@ private void shutdownExecutorService() { } } + private void disableReplicationLogCleaner(MasterProcedureEnv env) + throws ProcedureSuspendedException { + if (!env.getReplicationPeerManager().getReplicationLogCleanerBarrier().disable()) { + // it is not likely that we can reach here as we will schedule this procedure immediately + // after master restarting, where ReplicationLogCleaner should have not started its first run + // yet. But anyway, let's make the code more robust. And it is safe to wait a bit here since + // there will be no data in the new replication queue storage before we execute this procedure + // so ReplicationLogCleaner will quit immediately without doing anything. + throw suspend(env.getMasterConfiguration(), + backoff -> LOG.info( + "Can not disable replication log cleaner, sleep {} secs and retry later", + backoff / 1000)); + } + resetRetry(); + } + + private void enableReplicationLogCleaner(MasterProcedureEnv env) { + env.getReplicationPeerManager().getReplicationLogCleanerBarrier().enable(); + } + private void waitUntilNoPeerProcedure(MasterProcedureEnv env) throws ProcedureSuspendedException { long peerProcCount; try { @@ -136,6 +158,10 @@ protected Flow executeFromState(MasterProcedureEnv env, MigrateReplicationQueueFromZkToTableState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { switch (state) { + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER: + disableReplicationLogCleaner(env); + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE); + return Flow.HAS_MORE_STATE; case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE: waitUntilNoPeerProcedure(env); List peers = env.getReplicationPeerManager().listPeers(null); @@ -152,7 +178,8 @@ protected Flow executeFromState(MasterProcedureEnv env, "failed to delete old replication queue data, sleep {} secs and retry later", backoff / 1000, e)); } - return Flow.NO_MORE_STATE; + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_CLEANER); + return Flow.HAS_MORE_STATE; } // here we do not care the peers which have already been disabled, as later we do not need // to enable them @@ -232,6 +259,10 @@ protected Flow executeFromState(MasterProcedureEnv env, for (String peerId : disabledPeerIds) { addChildProcedure(new EnablePeerProcedure(peerId)); } + setNextState(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_CLEANER); + return Flow.HAS_MORE_STATE; + case MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_ENABLE_CLEANER: + enableReplicationLogCleaner(env); return Flow.NO_MORE_STATE; default: throw new UnsupportedOperationException("unhandled state=" + state); @@ -263,7 +294,19 @@ protected int getStateId(MigrateReplicationQueueFromZkToTableState state) { @Override protected MigrateReplicationQueueFromZkToTableState getInitialState() { - return MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_PREPARE; + return MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER; + } + + @Override + protected void afterReplay(MasterProcedureEnv env) { + if (getCurrentState() == getInitialState()) { + // do not need to disable log cleaner or acquire lock if we are in the initial state, later + // when executing the procedure we will try to disable and acquire. + return; + } + if (!env.getReplicationPeerManager().getReplicationLogCleanerBarrier().disable()) { + throw new IllegalStateException("can not disable log cleaner, this should not happen"); + } } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java index 752abc380b84..cb795edcd623 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestMigrateReplicationQueueFromZkToTableProcedure.java @@ -17,8 +17,11 @@ */ package org.apache.hadoop.hbase.master.replication; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER; import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MigrateReplicationQueueFromZkToTableState.MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_WAIT_UPGRADING; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -48,6 +51,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; +import org.apache.hadoop.hbase.replication.master.ReplicationLogCleanerBarrier; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -102,6 +106,8 @@ public Map getOnlineServers() { @BeforeClass public static void setupCluster() throws Exception { + // one hour, to make sure it will not run during the test + UTIL.getConfiguration().setInt(HMaster.HBASE_MASTER_CLEANER_INTERVAL, 60 * 60 * 1000); UTIL.startMiniCluster( StartTestingClusterOption.builder().masterClass(HMasterForTest.class).build()); } @@ -193,8 +199,10 @@ public void testWaitUntilNoPeerProcedure() throws Exception { UTIL.waitFor(30000, () -> proc.isSuccess()); } + // make sure we will disable replication peers while migrating + // and also tests disable/enable replication log cleaner and wait for region server upgrading @Test - public void testDisablePeerAndWaitUpgrading() throws Exception { + public void testDisablePeerAndWaitStates() throws Exception { String peerId = "2"; ReplicationPeerConfig rpc = ReplicationPeerConfig.newBuilder() .setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/testhbase") @@ -206,11 +214,22 @@ public void testDisablePeerAndWaitUpgrading() throws Exception { EXTRA_REGION_SERVERS .put(ServerName.valueOf("localhost", 54321, EnvironmentEdgeManager.currentTime()), metrics); + ReplicationLogCleanerBarrier barrier = UTIL.getHBaseCluster().getMaster() + .getReplicationPeerManager().getReplicationLogCleanerBarrier(); + assertTrue(barrier.start()); + ProcedureExecutor procExec = getMasterProcedureExecutor(); MigrateReplicationQueueFromZkToTableProcedure proc = new MigrateReplicationQueueFromZkToTableProcedure(); procExec.submitProcedure(proc); + + Thread.sleep(5000); + // make sure we are still waiting for replication log cleaner quit + assertEquals(MIGRATE_REPLICATION_QUEUE_FROM_ZK_TO_TABLE_DISABLE_CLEANER.getNumber(), + proc.getCurrentStateId()); + barrier.stop(); + // wait until we reach the wait upgrading state UTIL.waitFor(30000, () -> proc.getCurrentStateId() @@ -218,9 +237,17 @@ public void testDisablePeerAndWaitUpgrading() throws Exception { && proc.getState() == ProcedureState.WAITING_TIMEOUT); // make sure the peer is disabled for migrating assertFalse(UTIL.getAdmin().isReplicationPeerEnabled(peerId)); + // make sure the replication log cleaner is disabled + assertFalse(barrier.start()); // the procedure should finish successfully EXTRA_REGION_SERVERS.clear(); UTIL.waitFor(30000, () -> proc.isSuccess()); + + // make sure the peer is enabled again + assertTrue(UTIL.getAdmin().isReplicationPeerEnabled(peerId)); + // make sure the replication log cleaner is enabled again + assertTrue(barrier.start()); + barrier.stop(); } } From acd05c72d24d2f72c2fadecd4a45b205a889a815 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sat, 18 Mar 2023 21:38:53 +0800 Subject: [PATCH 11/16] HBASE-27216 Revisit the ReplicationSyncUp tool (#4966) Signed-off-by: Liangjun He --- .../apache/hadoop/hbase/util/JsonMapper.java | 4 + .../server/master/MasterProcedure.proto | 1 + .../replication/ReplicationQueueStorage.java | 21 + .../ReplicationStorageFactory.java | 25 +- .../TableReplicationQueueStorage.java | 20 + .../apache/hadoop/hbase/master/HMaster.java | 41 ++ .../AssignReplicationQueuesProcedure.java | 48 ++- .../ClaimReplicationQueueRemoteProcedure.java | 32 ++ .../OfflineTableReplicationQueueStorage.java | 382 ++++++++++++++++++ .../replication/ReplicationPeerManager.java | 2 +- .../ReplicationSourceManager.java | 188 +++++---- .../regionserver/ReplicationSyncUp.java | 195 +++++++-- .../hbase/wal/AbstractFSWALProvider.java | 4 + .../hbase/master/cleaner/TestLogsCleaner.java | 4 +- .../TestReplicationSyncUpTool.java | 185 +++++---- .../TestReplicationSyncUpToolBase.java | 3 +- ...plicationSyncUpToolWithBulkLoadedData.java | 92 ++--- .../TestTableReplicationQueueStorage.java | 51 +++ ...icationSyncUpToolWithMultipleAsyncWAL.java | 3 - ...tReplicationSyncUpToolWithMultipleWAL.java | 3 - .../TestReplicationSourceManager.java | 14 +- .../TestSerialReplicationChecker.java | 4 +- 22 files changed, 1029 insertions(+), 293 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/OfflineTableReplicationQueueStorage.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/util/JsonMapper.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/util/JsonMapper.java index 0ff131f23bf2..f2c4585a6a85 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/util/JsonMapper.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/util/JsonMapper.java @@ -40,4 +40,8 @@ public static String writeMapAsString(Map map) throws IOExceptio public static String writeObjectAsString(Object object) throws IOException { return GSON.toJson(object); } + + public static T fromJson(String json, Class clazz) { + return GSON.fromJson(json, clazz); + } } diff --git a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto index 14d07c17c880..901abf6bd0c5 100644 --- a/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/server/master/MasterProcedure.proto @@ -717,6 +717,7 @@ message ModifyColumnFamilyStoreFileTrackerStateData { enum AssignReplicationQueuesState { ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 1; ASSIGN_REPLICATION_QUEUES_CLAIM = 2; + ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES = 3; } message AssignReplicationQueuesStateData { diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java index 1e36bbeb78f0..b5bc64eb55aa 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java @@ -203,4 +203,25 @@ void batchUpdateLastSequenceIds(List lastPushedSeqIds) * Add the given hfile refs to the given peer. */ void batchUpdateHFileRefs(String peerId, List hfileRefs) throws ReplicationException; + + // the below method is for clean up stale data after running ReplicatoinSyncUp + /** + * Remove all the last sequence ids and hfile references data which are written before the given + * timestamp. + *

    + * The data of these two types are not used by replication directly. + *

    + * For last sequence ids, we will check it in serial replication, to make sure that we will + * replicate all edits in order, so if there are stale data, the worst case is that we will stop + * replicating as we think we still need to finish previous ranges first, although actually we + * have already replicated them out. + *

    + * For hfile references, it is just used by hfile cleaner to not remove these hfiles before we + * replicate them out, so if there are stale data, the worst case is that we can not remove these + * hfiles, although actually they have already been replicated out. + *

    + * So it is OK for us to just bring up the cluster first, and then use this method to delete the + * stale data, i.e, the data which are written before a specific timestamp. + */ + void removeLastSequenceIdsAndHFileRefsBefore(long ts) throws ReplicationException; } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java index d0c204f99349..0b0eb0fc43fd 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java @@ -32,6 +32,8 @@ import org.apache.hadoop.hbase.util.ReflectionUtils; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Used to create replication storage(peer, queue) classes. @@ -39,6 +41,8 @@ @InterfaceAudience.Private public final class ReplicationStorageFactory { + private static final Logger LOG = LoggerFactory.getLogger(ReplicationStorageFactory.class); + public static final String REPLICATION_PEER_STORAGE_IMPL = "hbase.replication.peer.storage.impl"; // must use zookeeper here, otherwise when user upgrading from an old version without changing the @@ -51,6 +55,8 @@ public final class ReplicationStorageFactory { public static final TableName REPLICATION_QUEUE_TABLE_NAME_DEFAULT = TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "replication"); + public static final String REPLICATION_QUEUE_IMPL = "hbase.replication.queue.storage.impl"; + public static TableDescriptor createReplicationQueueTableDescriptor(TableName tableName) throws IOException { return TableDescriptorBuilder.newBuilder(tableName) @@ -108,15 +114,26 @@ public static ReplicationPeerStorage getReplicationPeerStorage(FileSystem fs, ZK */ public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn, Configuration conf) { - return getReplicationQueueStorage(conn, TableName.valueOf(conf.get(REPLICATION_QUEUE_TABLE_NAME, - REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString()))); + return getReplicationQueueStorage(conn, conf, TableName.valueOf(conf + .get(REPLICATION_QUEUE_TABLE_NAME, REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString()))); } /** * Create a new {@link ReplicationQueueStorage}. */ public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn, - TableName tableName) { - return new TableReplicationQueueStorage(conn, tableName); + Configuration conf, TableName tableName) { + Class clazz = conf.getClass(REPLICATION_QUEUE_IMPL, + TableReplicationQueueStorage.class, ReplicationQueueStorage.class); + try { + Constructor c = + clazz.getConstructor(Connection.class, TableName.class); + return c.newInstance(conn, tableName); + } catch (Exception e) { + LOG.debug( + "failed to create ReplicationQueueStorage with Connection, try creating with Configuration", + e); + return ReflectionUtils.newInstance(clazz, conf, tableName); + } } } diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java index f3870f4d09d8..e59edd52f793 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/TableReplicationQueueStorage.java @@ -594,4 +594,24 @@ public void batchUpdateHFileRefs(String peerId, List hfileRefs) throw new ReplicationException("failed to batch update hfile references", e); } } + + @Override + public void removeLastSequenceIdsAndHFileRefsBefore(long ts) throws ReplicationException { + try (Table table = conn.getTable(tableName); + ResultScanner scanner = table.getScanner(new Scan().addFamily(LAST_SEQUENCE_ID_FAMILY) + .addFamily(HFILE_REF_FAMILY).setFilter(new KeyOnlyFilter()))) { + for (;;) { + Result r = scanner.next(); + if (r == null) { + break; + } + Delete delete = new Delete(r.getRow()).addFamily(LAST_SEQUENCE_ID_FAMILY, ts) + .addFamily(HFILE_REF_FAMILY, ts); + table.delete(delete); + } + } catch (IOException e) { + throw new ReplicationException( + "failed to remove last sequence ids and hfile references before timestamp " + ts, e); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index adb53468ce72..3d59db245015 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -34,6 +34,9 @@ import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -59,6 +62,7 @@ import javax.servlet.http.HttpServlet; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.CatalogFamilyFormat; @@ -226,6 +230,8 @@ import org.apache.hadoop.hbase.replication.master.ReplicationHFileCleaner; import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner; import org.apache.hadoop.hbase.replication.master.ReplicationSinkTrackerTableCreator; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp.ReplicationSyncUpToolInfo; import org.apache.hadoop.hbase.rsgroup.RSGroupAdminEndpoint; import org.apache.hadoop.hbase.rsgroup.RSGroupBasedLoadBalancer; import org.apache.hadoop.hbase.rsgroup.RSGroupInfoManager; @@ -246,6 +252,7 @@ import org.apache.hadoop.hbase.util.HFileArchiveUtil; import org.apache.hadoop.hbase.util.IdLock; import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hbase.util.JsonMapper; import org.apache.hadoop.hbase.util.ModifyRegionUtils; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.RetryCounter; @@ -267,7 +274,9 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Lists; import org.apache.hbase.thirdparty.com.google.common.collect.Maps; import org.apache.hbase.thirdparty.com.google.common.collect.Sets; +import org.apache.hbase.thirdparty.com.google.common.io.ByteStreams; import org.apache.hbase.thirdparty.com.google.common.io.Closeables; +import org.apache.hbase.thirdparty.com.google.gson.JsonParseException; import org.apache.hbase.thirdparty.com.google.protobuf.Descriptors; import org.apache.hbase.thirdparty.com.google.protobuf.Service; import org.apache.hbase.thirdparty.org.eclipse.jetty.server.Server; @@ -1286,6 +1295,38 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE status.setStatus("Initializing MOB Cleaner"); initMobCleaner(); + // delete the stale data for replication sync up tool if necessary + status.setStatus("Cleanup ReplicationSyncUp status if necessary"); + Path replicationSyncUpInfoFile = + new Path(new Path(dataRootDir, ReplicationSyncUp.INFO_DIR), ReplicationSyncUp.INFO_FILE); + if (dataFs.exists(replicationSyncUpInfoFile)) { + // info file is available, load the timestamp and use it to clean up stale data in replication + // queue storage. + byte[] data; + try (FSDataInputStream in = dataFs.open(replicationSyncUpInfoFile)) { + data = ByteStreams.toByteArray(in); + } + ReplicationSyncUpToolInfo info = null; + try { + info = JsonMapper.fromJson(Bytes.toString(data), ReplicationSyncUpToolInfo.class); + } catch (JsonParseException e) { + // usually this should be a partial file, which means the ReplicationSyncUp tool did not + // finish properly, so not a problem. Here we do not clean up the status as we do not know + // the reason why the tool did not finish properly, so let users clean the status up + // manually + LOG.warn("failed to parse replication sync up info file, ignore and continue...", e); + } + if (info != null) { + LOG.info("Remove last sequence ids and hfile references which are written before {}({})", + info.getStartTimeMs(), DateTimeFormatter.ISO_DATE_TIME.withZone(ZoneId.systemDefault()) + .format(Instant.ofEpochMilli(info.getStartTimeMs()))); + replicationPeerManager.getQueueStorage() + .removeLastSequenceIdsAndHFileRefsBefore(info.getStartTimeMs()); + // delete the file after removing the stale data, so next time we do not need to do this + // again. + dataFs.delete(replicationSyncUpInfoFile, false); + } + } status.setStatus("Calling postStartMaster coprocessors"); if (this.cpHost != null) { // don't let cp initialization errors kill the master diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java index d33259dd4368..b547c87009dd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AssignReplicationQueuesProcedure.java @@ -24,7 +24,9 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -37,6 +39,7 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerDescription; import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp; import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -102,7 +105,7 @@ private void addMissingQueues(MasterProcedureEnv env) throws ReplicationExceptio } } - private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException { + private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException, IOException { Set existingPeerIds = env.getReplicationPeerManager().listPeers(null).stream() .map(ReplicationPeerDescription::getPeerId).collect(Collectors.toSet()); ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); @@ -130,18 +133,51 @@ private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException { return Flow.HAS_MORE_STATE; } + // check whether ReplicationSyncUp has already done the work for us, if so, we should skip + // claiming the replication queues and deleting them instead. + private boolean shouldSkip(MasterProcedureEnv env) throws IOException { + MasterFileSystem mfs = env.getMasterFileSystem(); + Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR); + return mfs.getFileSystem().exists(new Path(syncUpDir, crashedServer.getServerName())); + } + + private void removeQueues(MasterProcedureEnv env) throws ReplicationException, IOException { + ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage(); + for (ReplicationQueueId queueId : storage.listAllQueueIds(crashedServer)) { + storage.removeQueue(queueId); + } + MasterFileSystem mfs = env.getMasterFileSystem(); + Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR); + // remove the region server record file + mfs.getFileSystem().delete(new Path(syncUpDir, crashedServer.getServerName()), false); + } + @Override protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { try { switch (state) { case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES: - addMissingQueues(env); - retryCounter = null; - setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM); - return Flow.HAS_MORE_STATE; + if (shouldSkip(env)) { + setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES); + return Flow.HAS_MORE_STATE; + } else { + addMissingQueues(env); + retryCounter = null; + setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM); + return Flow.HAS_MORE_STATE; + } case ASSIGN_REPLICATION_QUEUES_CLAIM: - return claimQueues(env); + if (shouldSkip(env)) { + retryCounter = null; + setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES); + return Flow.HAS_MORE_STATE; + } else { + return claimQueues(env); + } + case ASSIGN_REPLICATION_QUEUES_REMOVE_QUEUES: + removeQueues(env); + return Flow.NO_MORE_STATE; default: throw new UnsupportedOperationException("unhandled state=" + state); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java index 7b637384398a..d3aeeba541a2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ClaimReplicationQueueRemoteProcedure.java @@ -19,16 +19,22 @@ import java.io.IOException; import java.util.Optional; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.ServerOperation; import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface; import org.apache.hadoop.hbase.master.procedure.ServerRemoteProcedure; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; import org.apache.hadoop.hbase.replication.ReplicationQueueId; import org.apache.hadoop.hbase.replication.regionserver.ClaimReplicationQueueCallable; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,6 +60,32 @@ public ClaimReplicationQueueRemoteProcedure(ReplicationQueueId queueId, ServerNa this.targetServer = targetServer; } + // check whether ReplicationSyncUp has already done the work for us, if so, we should skip + // claiming the replication queues and deleting them instead. + private boolean shouldSkip(MasterProcedureEnv env) throws IOException { + MasterFileSystem mfs = env.getMasterFileSystem(); + Path syncUpDir = new Path(mfs.getRootDir(), ReplicationSyncUp.INFO_DIR); + return mfs.getFileSystem().exists(new Path(syncUpDir, getServerName().getServerName())); + } + + @Override + protected synchronized Procedure[] execute(MasterProcedureEnv env) + throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { + try { + if (shouldSkip(env)) { + LOG.info("Skip claiming {} because replication sync up has already done it for us", + getServerName()); + return null; + } + } catch (IOException e) { + LOG.warn("failed to check whether we should skip claiming {} due to replication sync up", + getServerName(), e); + // just finish the procedure here, as the AssignReplicationQueuesProcedure will reschedule + return null; + } + return super.execute(env); + } + @Override public Optional remoteCallBuild(MasterProcedureEnv env, ServerName remote) { assert targetServer.equals(remote); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/OfflineTableReplicationQueueStorage.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/OfflineTableReplicationQueueStorage.java new file mode 100644 index 000000000000..9faca74f710d --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/OfflineTableReplicationQueueStorage.java @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeMap; +import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ClientSideRegionScanner; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationQueueData; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.replication.TableReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ZKReplicationQueueStorageForMigration; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.yetus.audience.InterfaceAudience; + +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap; + +@InterfaceAudience.Private +public class OfflineTableReplicationQueueStorage implements ReplicationQueueStorage { + + private final Map> offsets = + new HashMap<>(); + + private final Map> lastSequenceIds = new HashMap<>(); + + private final Map> hfileRefs = new HashMap<>(); + + private void loadRegionInfo(FileSystem fs, Path regionDir, + NavigableMap startKey2RegionInfo) throws IOException { + RegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir); + // TODO: we consider that the there will not be too many regions for hbase:replication table, so + // here we just iterate over all the regions to find out the overlapped ones. Can be optimized + // later. + Iterator> iter = startKey2RegionInfo.entrySet().iterator(); + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + if (hri.isOverlap(entry.getValue())) { + if (hri.getRegionId() > entry.getValue().getRegionId()) { + // we are newer, remove the old hri, we can not break here as if hri is a merged region, + // we need to remove all its parent regions. + iter.remove(); + } else { + // we are older, just return, skip the below add + return; + } + } + + } + startKey2RegionInfo.put(hri.getStartKey(), hri); + } + + private void loadOffsets(Result result) { + NavigableMap map = + result.getFamilyMap(TableReplicationQueueStorage.QUEUE_FAMILY); + if (map == null || map.isEmpty()) { + return; + } + Map offsetMap = new HashMap<>(); + map.forEach((k, v) -> { + String walGroup = Bytes.toString(k); + ReplicationGroupOffset offset = ReplicationGroupOffset.parse(Bytes.toString(v)); + offsetMap.put(walGroup, offset); + }); + ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow())); + offsets.put(queueId, offsetMap); + } + + private void loadLastSequenceIds(Result result) { + NavigableMap map = + result.getFamilyMap(TableReplicationQueueStorage.LAST_SEQUENCE_ID_FAMILY); + if (map == null || map.isEmpty()) { + return; + } + Map lastSeqIdMap = new HashMap<>(); + map.forEach((k, v) -> { + String encodedRegionName = Bytes.toString(k); + long lastSeqId = Bytes.toLong(v); + lastSeqIdMap.put(encodedRegionName, lastSeqId); + }); + String peerId = Bytes.toString(result.getRow()); + lastSequenceIds.put(peerId, lastSeqIdMap); + } + + private void loadHFileRefs(Result result) { + NavigableMap map = + result.getFamilyMap(TableReplicationQueueStorage.HFILE_REF_FAMILY); + if (map == null || map.isEmpty()) { + return; + } + Set refs = new HashSet<>(); + map.keySet().forEach(ref -> refs.add(Bytes.toString(ref))); + String peerId = Bytes.toString(result.getRow()); + hfileRefs.put(peerId, refs); + } + + private void loadReplicationQueueData(Configuration conf, TableName tableName) + throws IOException { + Path rootDir = CommonFSUtils.getRootDir(conf); + Path tableDir = CommonFSUtils.getTableDir(rootDir, tableName); + FileSystem fs = tableDir.getFileSystem(conf); + FileStatus[] regionDirs = + CommonFSUtils.listStatus(fs, tableDir, new FSUtils.RegionDirFilter(fs)); + if (regionDirs == null) { + return; + } + NavigableMap startKey2RegionInfo = new TreeMap<>(Bytes.BYTES_COMPARATOR); + for (FileStatus regionDir : regionDirs) { + loadRegionInfo(fs, regionDir.getPath(), startKey2RegionInfo); + } + TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName); + for (RegionInfo hri : startKey2RegionInfo.values()) { + try (ClientSideRegionScanner scanner = + new ClientSideRegionScanner(conf, fs, rootDir, td, hri, new Scan(), null)) { + for (;;) { + Result result = scanner.next(); + if (result == null) { + break; + } + loadOffsets(result); + loadLastSequenceIds(result); + loadHFileRefs(result); + } + } + } + } + + public OfflineTableReplicationQueueStorage(Configuration conf, TableName tableName) + throws IOException { + loadReplicationQueueData(conf, tableName); + } + + @Override + public synchronized void setOffset(ReplicationQueueId queueId, String walGroup, + ReplicationGroupOffset offset, Map lastSeqIds) throws ReplicationException { + Map offsetMap = offsets.get(queueId); + if (offsetMap == null) { + offsetMap = new HashMap<>(); + offsets.put(queueId, offsetMap); + } + offsetMap.put(walGroup, offset); + Map lastSeqIdsMap = lastSequenceIds.get(queueId.getPeerId()); + if (lastSeqIdsMap == null) { + lastSeqIdsMap = new HashMap<>(); + lastSequenceIds.put(queueId.getPeerId(), lastSeqIdsMap); + } + for (Map.Entry entry : lastSeqIds.entrySet()) { + Long oldSeqId = lastSeqIdsMap.get(entry.getKey()); + if (oldSeqId == null || oldSeqId < entry.getValue()) { + lastSeqIdsMap.put(entry.getKey(), entry.getValue()); + } + } + } + + @Override + public synchronized Map getOffsets(ReplicationQueueId queueId) + throws ReplicationException { + Map offsetMap = offsets.get(queueId); + if (offsetMap == null) { + return Collections.emptyMap(); + } + return ImmutableMap.copyOf(offsetMap); + } + + @Override + public synchronized List listAllQueueIds(String peerId) + throws ReplicationException { + return offsets.keySet().stream().filter(rqi -> rqi.getPeerId().equals(peerId)) + .collect(Collectors.toList()); + } + + @Override + public synchronized List listAllQueueIds(ServerName serverName) + throws ReplicationException { + return offsets.keySet().stream().filter(rqi -> rqi.getServerName().equals(serverName)) + .collect(Collectors.toList()); + } + + @Override + public synchronized List listAllQueueIds(String peerId, ServerName serverName) + throws ReplicationException { + return offsets.keySet().stream() + .filter(rqi -> rqi.getPeerId().equals(peerId) && rqi.getServerName().equals(serverName)) + .collect(Collectors.toList()); + } + + @Override + public synchronized List listAllQueues() throws ReplicationException { + return offsets.entrySet().stream() + .map(e -> new ReplicationQueueData(e.getKey(), ImmutableMap.copyOf(e.getValue()))) + .collect(Collectors.toList()); + } + + @Override + public synchronized List listAllReplicators() throws ReplicationException { + return offsets.keySet().stream().map(ReplicationQueueId::getServerName).distinct() + .collect(Collectors.toList()); + } + + @Override + public synchronized Map claimQueue(ReplicationQueueId queueId, + ServerName targetServerName) throws ReplicationException { + Map offsetMap = offsets.remove(queueId); + if (offsetMap == null) { + return Collections.emptyMap(); + } + offsets.put(queueId.claim(targetServerName), offsetMap); + return ImmutableMap.copyOf(offsetMap); + } + + @Override + public synchronized void removeQueue(ReplicationQueueId queueId) throws ReplicationException { + offsets.remove(queueId); + } + + @Override + public synchronized void removeAllQueues(String peerId) throws ReplicationException { + Iterator iter = offsets.keySet().iterator(); + while (iter.hasNext()) { + if (iter.next().getPeerId().equals(peerId)) { + iter.remove(); + } + } + } + + @Override + public synchronized long getLastSequenceId(String encodedRegionName, String peerId) + throws ReplicationException { + Map lastSeqIdMap = lastSequenceIds.get(peerId); + if (lastSeqIdMap == null) { + return HConstants.NO_SEQNUM; + } + Long lastSeqId = lastSeqIdMap.get(encodedRegionName); + return lastSeqId != null ? lastSeqId.longValue() : HConstants.NO_SEQNUM; + } + + @Override + public synchronized void setLastSequenceIds(String peerId, Map lastSeqIds) + throws ReplicationException { + Map lastSeqIdMap = lastSequenceIds.get(peerId); + if (lastSeqIdMap == null) { + lastSeqIdMap = new HashMap<>(); + lastSequenceIds.put(peerId, lastSeqIdMap); + } + lastSeqIdMap.putAll(lastSeqIds); + } + + @Override + public synchronized void removeLastSequenceIds(String peerId) throws ReplicationException { + lastSequenceIds.remove(peerId); + } + + @Override + public synchronized void removeLastSequenceIds(String peerId, List encodedRegionNames) + throws ReplicationException { + Map lastSeqIdMap = lastSequenceIds.get(peerId); + if (lastSeqIdMap == null) { + return; + } + for (String encodedRegionName : encodedRegionNames) { + lastSeqIdMap.remove(encodedRegionName); + } + } + + @Override + public synchronized void removePeerFromHFileRefs(String peerId) throws ReplicationException { + hfileRefs.remove(peerId); + } + + @Override + public synchronized void addHFileRefs(String peerId, List> pairs) + throws ReplicationException { + Set refs = hfileRefs.get(peerId); + if (refs == null) { + refs = new HashSet<>(); + hfileRefs.put(peerId, refs); + } + for (Pair pair : pairs) { + refs.add(pair.getSecond().getName()); + } + } + + @Override + public synchronized void removeHFileRefs(String peerId, List files) + throws ReplicationException { + Set refs = hfileRefs.get(peerId); + if (refs == null) { + return; + } + refs.removeAll(files); + } + + @Override + public synchronized List getAllPeersFromHFileRefsQueue() throws ReplicationException { + return ImmutableList.copyOf(hfileRefs.keySet()); + } + + @Override + public synchronized List getReplicableHFiles(String peerId) throws ReplicationException { + Set refs = hfileRefs.get(peerId); + if (refs == null) { + return Collections.emptyList(); + } + return ImmutableList.copyOf(refs); + } + + @Override + public synchronized Set getAllHFileRefs() throws ReplicationException { + return hfileRefs.values().stream().flatMap(Set::stream).collect(Collectors.toSet()); + } + + @Override + public boolean hasData() throws ReplicationException { + return true; + } + + @Override + public void batchUpdateQueues(ServerName serverName, List datas) + throws ReplicationException { + throw new UnsupportedOperationException(); + } + + @Override + public void batchUpdateLastSequenceIds( + List lastPushedSeqIds) + throws ReplicationException { + throw new UnsupportedOperationException(); + } + + @Override + public void batchUpdateHFileRefs(String peerId, List hfileRefs) + throws ReplicationException { + throw new UnsupportedOperationException(); + } + + @Override + public void removeLastSequenceIdsAndHFileRefsBefore(long ts) throws ReplicationException { + throw new UnsupportedOperationException(); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java index 8cfb36a1bc17..8b01225e553e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/ReplicationPeerManager.java @@ -654,7 +654,7 @@ public void initialize() throws IOException { }; } return Pair.newPair(ReplicationStorageFactory.getReplicationQueueStorage( - services.getConnection(), replicationQueueTableName), initializer); + services.getConnection(), conf, replicationQueueTableName), initializer); } public static ReplicationPeerManager create(MasterServices services, String clusterId) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index 2fb996c6e4dc..d54cda92d901 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -25,7 +25,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -126,6 +125,7 @@ */ @InterfaceAudience.Private public class ReplicationSourceManager { + private static final Logger LOG = LoggerFactory.getLogger(ReplicationSourceManager.class); // all the sources that read this RS's logs and every peer only has one replication source private final ConcurrentMap sources; @@ -147,13 +147,15 @@ public class ReplicationSourceManager { // All logs we are currently tracking // Index structure of the map is: queue_id->logPrefix/logGroup->logs - // For normal replication source, the peer id is same with the queue id private final ConcurrentMap>> walsById; // Logs for recovered sources we are currently tracking // the map is: queue_id->logPrefix/logGroup->logs - // For recovered source, the queue id's format is peer_id-servername-* + // for recovered source, the WAL files should already been moved to oldLogDir, and we have + // different layout of old WAL files, for example, with server name sub directories or not, so + // here we record the full path instead of just the name, so when refreshing we can enqueue the + // WAL file again, without trying to guess the real path of the WAL files. private final ConcurrentMap>> walsByIdRecoveredQueues; + Map>> walsByIdRecoveredQueues; private final SyncReplicationPeerMappingManager syncReplicationPeerMappingManager; @@ -515,9 +517,9 @@ public void refreshSources(String peerId) throws ReplicationException, IOExcepti ReplicationSourceInterface recoveredReplicationSource = createRefreshedSource(oldSourceQueueId, peer); this.oldsources.add(recoveredReplicationSource); - for (SortedSet walsByGroup : walsByIdRecoveredQueues.get(oldSourceQueueId) + for (NavigableSet walsByGroup : walsByIdRecoveredQueues.get(oldSourceQueueId) .values()) { - walsByGroup.forEach(wal -> recoveredReplicationSource.enqueueLog(new Path(wal))); + walsByGroup.forEach(wal -> recoveredReplicationSource.enqueueLog(wal)); } toStartup.add(recoveredReplicationSource); } @@ -657,9 +659,11 @@ public void logPositionAndCleanOldLogs(ReplicationSourceInterface source, void cleanOldLogs(String log, boolean inclusive, ReplicationSourceInterface source) { String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(log); if (source.isRecovered()) { - NavigableSet wals = walsByIdRecoveredQueues.get(source.getQueueId()).get(logPrefix); + NavigableSet wals = walsByIdRecoveredQueues.get(source.getQueueId()).get(logPrefix); if (wals != null) { - NavigableSet walsToRemove = wals.headSet(log, inclusive); + // here we just want to compare the timestamp, so it is OK to just create a fake WAL path + NavigableSet walsToRemove = wals.headSet(new Path(oldLogDir, log), inclusive) + .stream().map(Path::getName).collect(Collectors.toCollection(TreeSet::new)); if (walsToRemove.isEmpty()) { return; } @@ -815,6 +819,93 @@ private boolean shouldReplicate(ReplicationGroupOffset offset, String wal) { } void claimQueue(ReplicationQueueId queueId) { + claimQueue(queueId, false); + } + + // sorted from oldest to newest + private PriorityQueue getWALFilesToReplicate(ServerName sourceRS, boolean syncUp, + Map offsets) throws IOException { + List walFiles = AbstractFSWALProvider.getArchivedWALFiles(conf, sourceRS, + URLEncoder.encode(sourceRS.toString(), StandardCharsets.UTF_8.name())); + if (syncUp) { + // we also need to list WALs directory for ReplicationSyncUp + walFiles.addAll(AbstractFSWALProvider.getWALFiles(conf, sourceRS)); + } + PriorityQueue walFilesPQ = + new PriorityQueue<>(AbstractFSWALProvider.TIMESTAMP_COMPARATOR); + // sort the wal files and also filter out replicated files + for (Path file : walFiles) { + String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getName()); + ReplicationGroupOffset groupOffset = offsets.get(walGroupId); + if (shouldReplicate(groupOffset, file.getName())) { + walFilesPQ.add(file); + } else { + LOG.debug("Skip enqueuing log {} because it is before the start offset {}", file.getName(), + groupOffset); + } + } + return walFilesPQ; + } + + private void addRecoveredSource(ReplicationSourceInterface src, ReplicationPeerImpl oldPeer, + ReplicationQueueId claimedQueueId, PriorityQueue walFiles) { + ReplicationPeerImpl peer = replicationPeers.getPeer(src.getPeerId()); + if (peer == null || peer != oldPeer) { + src.terminate("Recovered queue doesn't belong to any current peer"); + deleteQueue(claimedQueueId); + return; + } + // Do not setup recovered queue if a sync replication peer is in STANDBY state, or is + // transiting to STANDBY state. The only exception is we are in STANDBY state and + // transiting to DA, under this state we will replay the remote WAL and they need to be + // replicated back. + if (peer.getPeerConfig().isSyncReplication()) { + Pair stateAndNewState = + peer.getSyncReplicationStateAndNewState(); + if ( + (stateAndNewState.getFirst().equals(SyncReplicationState.STANDBY) + && stateAndNewState.getSecond().equals(SyncReplicationState.NONE)) + || stateAndNewState.getSecond().equals(SyncReplicationState.STANDBY) + ) { + src.terminate("Sync replication peer is in STANDBY state"); + deleteQueue(claimedQueueId); + return; + } + } + // track sources in walsByIdRecoveredQueues + Map> walsByGroup = new HashMap<>(); + walsByIdRecoveredQueues.put(claimedQueueId, walsByGroup); + for (Path wal : walFiles) { + String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName()); + NavigableSet wals = walsByGroup.get(walPrefix); + if (wals == null) { + wals = new TreeSet<>(AbstractFSWALProvider.TIMESTAMP_COMPARATOR); + walsByGroup.put(walPrefix, wals); + } + wals.add(wal); + } + oldsources.add(src); + LOG.info("Added source for recovered queue {}, number of wals to replicate: {}", claimedQueueId, + walFiles.size()); + for (Path wal : walFiles) { + LOG.debug("Enqueueing log {} from recovered queue for source: {}", wal, claimedQueueId); + src.enqueueLog(wal); + } + src.startup(); + } + + /** + * Claim a replication queue. + *

    + * We add a flag to indicate whether we are called by ReplicationSyncUp. For normal claiming queue + * operation, we are the last step of a SCP, so we can assume that all the WAL files are under + * oldWALs directory. But for ReplicationSyncUp, we may want to claim the replication queue for a + * region server which has not been processed by SCP yet, so we still need to look at its WALs + * directory. + * @param queueId the replication queue id we want to claim + * @param syncUp whether we are called by ReplicationSyncUp + */ + void claimQueue(ReplicationQueueId queueId, boolean syncUp) { // Wait a bit before transferring the queues, we may be shutting down. // This sleep may not be enough in some cases. try { @@ -873,76 +964,17 @@ void claimQueue(ReplicationQueueId queueId) { server.abort("Failed to create replication source after claiming queue.", e); return; } - List walFiles; + PriorityQueue walFiles; try { - walFiles = AbstractFSWALProvider.getArchivedWALFiles(conf, sourceRS, - URLEncoder.encode(sourceRS.toString(), StandardCharsets.UTF_8.name())); + walFiles = getWALFilesToReplicate(sourceRS, syncUp, offsets); } catch (IOException e) { - LOG.error("Can not list all wal files for peer {} and queue {}", peerId, queueId, e); - server.abort("Can not list all wal files after claiming queue.", e); + LOG.error("Can not list wal files for peer {} and queue {}", peerId, queueId, e); + server.abort("Can not list wal files after claiming queue.", e); return; } - PriorityQueue walFilesPQ = new PriorityQueue<>( - Comparator. comparing(p -> AbstractFSWALProvider.getTimestamp(p.getName())) - .thenComparing(Path::getName)); - // sort the wal files and also filter out replicated files - for (Path file : walFiles) { - String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getName()); - ReplicationGroupOffset groupOffset = offsets.get(walGroupId); - if (shouldReplicate(groupOffset, file.getName())) { - walFilesPQ.add(file); - } else { - LOG.debug("Skip enqueuing log {} because it is before the start offset {}", file.getName(), - groupOffset); - } - } - // the method is a bit long, so assign it to null here to avoid later we reuse it again by - // mistake, we should use the sorted walFilesPQ instead - walFiles = null; // synchronized on oldsources to avoid adding recovered source for the to-be-removed peer synchronized (oldsources) { - peer = replicationPeers.getPeer(src.getPeerId()); - if (peer == null || peer != oldPeer) { - src.terminate("Recovered queue doesn't belong to any current peer"); - deleteQueue(claimedQueueId); - return; - } - // Do not setup recovered queue if a sync replication peer is in STANDBY state, or is - // transiting to STANDBY state. The only exception is we are in STANDBY state and - // transiting to DA, under this state we will replay the remote WAL and they need to be - // replicated back. - if (peer.getPeerConfig().isSyncReplication()) { - Pair stateAndNewState = - peer.getSyncReplicationStateAndNewState(); - if ( - (stateAndNewState.getFirst().equals(SyncReplicationState.STANDBY) - && stateAndNewState.getSecond().equals(SyncReplicationState.NONE)) - || stateAndNewState.getSecond().equals(SyncReplicationState.STANDBY) - ) { - src.terminate("Sync replication peer is in STANDBY state"); - deleteQueue(claimedQueueId); - return; - } - } - // track sources in walsByIdRecoveredQueues - Map> walsByGroup = new HashMap<>(); - walsByIdRecoveredQueues.put(claimedQueueId, walsByGroup); - for (Path wal : walFilesPQ) { - String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName()); - NavigableSet wals = walsByGroup.get(walPrefix); - if (wals == null) { - wals = new TreeSet<>(); - walsByGroup.put(walPrefix, wals); - } - wals.add(wal.getName()); - } - oldsources.add(src); - LOG.info("Added source for recovered queue {}", claimedQueueId); - for (Path wal : walFilesPQ) { - LOG.debug("Enqueueing log {} from recovered queue for source: {}", wal, claimedQueueId); - src.enqueueLog(new Path(oldLogDir, wal)); - } - src.startup(); + addRecoveredSource(src, oldPeer, claimedQueueId, walFiles); } } @@ -971,16 +1003,6 @@ public Map>> getWALs() { return Collections.unmodifiableMap(walsById); } - /** - * Get a copy of the wals of the recovered sources on this rs - * @return a sorted set of wal names - */ - @RestrictedApi(explanation = "Should only be called in tests", link = "", - allowedOnPath = ".*/src/test/.*") - Map>> getWalsByIdRecoveredQueues() { - return Collections.unmodifiableMap(walsByIdRecoveredQueues); - } - /** * Get a list of all the normal sources of this rs * @return list of all normal sources @@ -1100,8 +1122,6 @@ MetricsReplicationGlobalSourceSource getGlobalMetrics() { return this.globalMetrics; } - @RestrictedApi(explanation = "Should only be called in tests", link = "", - allowedOnPath = ".*/src/test/.*") ReplicationQueueStorage getQueueStorage() { return queueStorage; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java index b63ad473719c..f071cf6f1f81 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java @@ -17,13 +17,17 @@ */ package org.apache.hadoop.hbase.replication.regionserver; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; -import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Abortable; @@ -35,11 +39,18 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.AsyncClusterConnection; import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.master.replication.OfflineTableReplicationQueueStorage; import org.apache.hadoop.hbase.replication.ReplicationException; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.JsonMapper; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.wal.WALFactory; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; @@ -59,6 +70,31 @@ @InterfaceAudience.Private public class ReplicationSyncUp extends Configured implements Tool { + public static class ReplicationSyncUpToolInfo { + + private long startTimeMs; + + public ReplicationSyncUpToolInfo() { + } + + public ReplicationSyncUpToolInfo(long startTimeMs) { + this.startTimeMs = startTimeMs; + } + + public long getStartTimeMs() { + return startTimeMs; + } + + public void setStartTimeMs(long startTimeMs) { + this.startTimeMs = startTimeMs; + } + } + + // For storing the information used to skip replicating some wals after the cluster is back online + public static final String INFO_DIR = "ReplicationSyncUp"; + + public static final String INFO_FILE = "info"; + private static final long SLEEP_TIME = 10000; /** @@ -69,41 +105,116 @@ public static void main(String[] args) throws Exception { System.exit(ret); } - private Set getLiveRegionServers(ZKWatcher zkw) throws KeeperException { - List rsZNodes = ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode); - return rsZNodes == null - ? Collections.emptySet() - : rsZNodes.stream().map(ServerName::parseServerName).collect(Collectors.toSet()); + // Find region servers under wal directory + // Here we only care about the region servers which may still be alive, as we need to add + // replications for them if missing. The dead region servers which have already been processed + // fully do not need to add their replication queues again, as the operation has already been done + // in SCP. + private Set listRegionServers(FileSystem walFs, Path walDir) throws IOException { + FileStatus[] statuses; + try { + statuses = walFs.listStatus(walDir); + } catch (FileNotFoundException e) { + System.out.println("WAL directory " + walDir + " does not exists, ignore"); + return Collections.emptySet(); + } + Set regionServers = new HashSet<>(); + for (FileStatus status : statuses) { + // All wal files under the walDir is within its region server's directory + if (!status.isDirectory()) { + continue; + } + ServerName sn = AbstractFSWALProvider.getServerNameFromWALDirectoryName(status.getPath()); + if (sn != null) { + regionServers.add(sn); + } + } + return regionServers; + } + + private void addMissingReplicationQueues(ReplicationQueueStorage storage, ServerName regionServer, + Set peerIds) throws ReplicationException { + Set existingQueuePeerIds = new HashSet<>(); + List queueIds = storage.listAllQueueIds(regionServer); + for (Iterator iter = queueIds.iterator(); iter.hasNext();) { + ReplicationQueueId queueId = iter.next(); + if (!queueId.isRecovered()) { + existingQueuePeerIds.add(queueId.getPeerId()); + } + } + + for (String peerId : peerIds) { + if (!existingQueuePeerIds.contains(peerId)) { + ReplicationQueueId queueId = new ReplicationQueueId(regionServer, peerId); + System.out.println("Add replication queue " + queueId + " for claiming"); + storage.setOffset(queueId, regionServer.toString(), ReplicationGroupOffset.BEGIN, + Collections.emptyMap()); + } + } + } + + private void addMissingReplicationQueues(ReplicationQueueStorage storage, + Set regionServers, Set peerIds) throws ReplicationException { + for (ServerName regionServer : regionServers) { + addMissingReplicationQueues(storage, regionServer, peerIds); + } } // When using this tool, usually the source cluster is unhealthy, so we should try to claim the // replication queues for the dead region servers first and then replicate the data out. - private void claimReplicationQueues(ZKWatcher zkw, ReplicationSourceManager mgr) - throws ReplicationException, KeeperException { - // TODO: reimplement this tool - // List replicators = mgr.getQueueStorage().getListOfReplicators(); - // Set liveRegionServers = getLiveRegionServers(zkw); - // for (ServerName sn : replicators) { - // if (!liveRegionServers.contains(sn)) { - // List replicationQueues = mgr.getQueueStorage().getAllQueues(sn); - // System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues); - // for (String queue : replicationQueues) { - // mgr.claimQueue(sn, queue); - // } - // } - // } + private void claimReplicationQueues(ReplicationSourceManager mgr, Set regionServers) + throws ReplicationException, KeeperException, IOException { + // union the region servers from both places, i.e, from the wal directory, and the records in + // replication queue storage. + Set replicators = new HashSet<>(regionServers); + ReplicationQueueStorage queueStorage = mgr.getQueueStorage(); + replicators.addAll(queueStorage.listAllReplicators()); + FileSystem fs = CommonFSUtils.getCurrentFileSystem(getConf()); + Path infoDir = new Path(CommonFSUtils.getRootDir(getConf()), INFO_DIR); + for (ServerName sn : replicators) { + List replicationQueues = queueStorage.listAllQueueIds(sn); + System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues); + // record the rs name, so when master restarting, we will skip claiming its replication queue + fs.createNewFile(new Path(infoDir, sn.getServerName())); + for (ReplicationQueueId queueId : replicationQueues) { + mgr.claimQueue(queueId, true); + } + } + } + + private void writeInfoFile(FileSystem fs) throws IOException { + // Record the info of this run. Currently only record the time we run the job. We will use this + // timestamp to clean up the data for last sequence ids and hfile refs in replication queue + // storage. See ReplicationQueueStorage.removeLastSequenceIdsAndHFileRefsBefore. + ReplicationSyncUpToolInfo info = + new ReplicationSyncUpToolInfo(EnvironmentEdgeManager.currentTime()); + String json = JsonMapper.writeObjectAsString(info); + Path infoDir = new Path(CommonFSUtils.getRootDir(getConf()), INFO_DIR); + try (FSDataOutputStream out = fs.create(new Path(infoDir, INFO_FILE), false)) { + out.write(Bytes.toBytes(json)); + } } @Override public int run(String[] args) throws Exception { Abortable abortable = new Abortable() { + + private volatile boolean abort = false; + @Override public void abort(String why, Throwable e) { + if (isAborted()) { + return; + } + abort = true; + System.err.println("Aborting because of " + why); + e.printStackTrace(); + System.exit(1); } @Override public boolean isAborted() { - return false; + return abort; } }; Configuration conf = getConf(); @@ -114,16 +225,24 @@ public boolean isAborted() { Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME); Path logDir = new Path(walRootDir, HConstants.HREGION_LOGDIR_NAME); - System.out.println("Start Replication Server start"); + System.out.println("Start Replication Server"); + writeInfoFile(fs); Replication replication = new Replication(); - replication.initialize(new DummyServer(zkw), fs, logDir, oldLogDir, + // use offline table replication queue storage + getConf().setClass(ReplicationStorageFactory.REPLICATION_QUEUE_IMPL, + OfflineTableReplicationQueueStorage.class, ReplicationQueueStorage.class); + DummyServer server = new DummyServer(getConf(), zkw); + replication.initialize(server, fs, new Path(logDir, server.toString()), oldLogDir, new WALFactory(conf, ServerName .valueOf(getClass().getSimpleName() + ",16010," + EnvironmentEdgeManager.currentTime()), null, false)); ReplicationSourceManager manager = replication.getReplicationManager(); manager.init(); - claimReplicationQueues(zkw, manager); + Set regionServers = listRegionServers(fs, logDir); + addMissingReplicationQueues(manager.getQueueStorage(), regionServers, + manager.getReplicationPeers().getAllPeerIds()); + claimReplicationQueues(manager, regionServers); while (manager.activeFailoverTaskCount() > 0) { Thread.sleep(SLEEP_TIME); } @@ -138,23 +257,22 @@ public boolean isAborted() { return 0; } - class DummyServer implements Server { - String hostname; - ZKWatcher zkw; + private static final class DummyServer implements Server { + private final Configuration conf; + private final String hostname; + private final ZKWatcher zkw; + private volatile boolean abort = false; - DummyServer(ZKWatcher zkw) { + DummyServer(Configuration conf, ZKWatcher zkw) { // a unique name in case the first run fails hostname = EnvironmentEdgeManager.currentTime() + ".SyncUpTool.replication.org"; + this.conf = conf; this.zkw = zkw; } - DummyServer(String hostname) { - this.hostname = hostname; - } - @Override public Configuration getConfiguration() { - return getConf(); + return conf; } @Override @@ -174,11 +292,18 @@ public ServerName getServerName() { @Override public void abort(String why, Throwable e) { + if (isAborted()) { + return; + } + abort = true; + System.err.println("Aborting because of " + why); + e.printStackTrace(); + System.exit(1); } @Override public boolean isAborted() { - return false; + return abort; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java index 5bbc66791967..5dc40dd60493 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java @@ -311,6 +311,10 @@ public static long getTimestamp(String name) { return matcher.matches() ? Long.parseLong(matcher.group(2)) : NO_TIMESTAMP; } + public static final Comparator TIMESTAMP_COMPARATOR = + Comparator. comparing(p -> AbstractFSWALProvider.getTimestamp(p.getName())) + .thenComparing(Path::getName); + /** * Construct the directory name for all WALs on a given server. Dir names currently look like this * for WALs: hbase//WALs/kalashnikov.att.net,61634,1486865297088. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java index d7ba6c227c6d..5d474bc21640 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestLogsCleaner.java @@ -127,8 +127,8 @@ public void beforeTest() throws Exception { TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName); TEST_UTIL.getAdmin().createTable(td); TEST_UTIL.waitTableAvailable(tableName); - queueStorage = - ReplicationStorageFactory.getReplicationQueueStorage(TEST_UTIL.getConnection(), tableName); + queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(TEST_UTIL.getConnection(), + conf, tableName); masterServices = mock(MasterServices.class); when(masterServices.getConnection()).thenReturn(TEST_UTIL.getConnection()); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java index 7a89af15902e..38225613b9d4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java @@ -22,24 +22,28 @@ import static org.apache.hadoop.hbase.replication.TestReplicationBase.NB_ROWS_IN_BATCH; import static org.apache.hadoop.hbase.replication.TestReplicationBase.SLEEP_TIME; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.replication.regionserver.ReplicationSyncUp; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// revisit later when we implement the new ReplicationSyncUpTool -@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpTool extends TestReplicationSyncUpToolBase { @@ -55,39 +59,70 @@ public class TestReplicationSyncUpTool extends TestReplicationSyncUpToolBase { */ @Test public void testSyncUpTool() throws Exception { - - /** - * Set up Replication: on Master and one Slave Table: t1_syncup and t2_syncup columnfamily: - * 'cf1' : replicated 'norep': not replicated - */ + // Set up Replication: on Master and one Slave + // Table: t1_syncup and t2_syncup + // columnfamily: + // 'cf1' : replicated + // 'norep': not replicated setupReplication(); - /** - * at Master: t1_syncup: put 100 rows into cf1, and 1 rows into norep t2_syncup: put 200 rows - * into cf1, and 1 rows into norep verify correctly replicated to slave - */ + // + // at Master: + // t1_syncup: put 100 rows into cf1, and 1 rows into norep + // t2_syncup: put 200 rows into cf1, and 1 rows into norep + // + // verify correctly replicated to slave putAndReplicateRows(); - /** - * Verify delete works step 1: stop hbase on Slave step 2: at Master: t1_syncup: delete 50 rows - * from cf1 t2_syncup: delete 100 rows from cf1 no change on 'norep' step 3: stop hbase on - * master, restart hbase on Slave step 4: verify Slave still have the rows before delete - * t1_syncup: 100 rows from cf1 t2_syncup: 200 rows from cf1 step 5: run syncup tool on Master - * step 6: verify that delete show up on Slave t1_syncup: 50 rows from cf1 t2_syncup: 100 rows - * from cf1 verify correctly replicated to Slave - */ + // Verify delete works + // + // step 1: stop hbase on Slave + // + // step 2: at Master: + // t1_syncup: delete 50 rows from cf1 + // t2_syncup: delete 100 rows from cf1 + // no change on 'norep' + // + // step 3: stop hbase on master, restart hbase on Slave + // + // step 4: verify Slave still have the rows before delete + // t1_syncup: 100 rows from cf1 + // t2_syncup: 200 rows from cf1 + // + // step 5: run syncup tool on Master + // + // step 6: verify that delete show up on Slave + // t1_syncup: 50 rows from cf1 + // t2_syncup: 100 rows from cf1 + // + // verify correctly replicated to Slave mimicSyncUpAfterDelete(); - /** - * Verify put works step 1: stop hbase on Slave step 2: at Master: t1_syncup: put 100 rows from - * cf1 t2_syncup: put 200 rows from cf1 and put another row on 'norep' ATTN: put to 'cf1' will - * overwrite existing rows, so end count will be 100 and 200 respectively put to 'norep' will - * add a new row. step 3: stop hbase on master, restart hbase on Slave step 4: verify Slave - * still has the rows before put t1_syncup: 50 rows from cf1 t2_syncup: 100 rows from cf1 step - * 5: run syncup tool on Master step 6: verify that put show up on Slave and 'norep' does not - * t1_syncup: 100 rows from cf1 t2_syncup: 200 rows from cf1 verify correctly replicated to - * Slave - */ + // Verify put works + // + // step 1: stop hbase on Slave + // + // step 2: at Master: + // t1_syncup: put 100 rows from cf1 + // t2_syncup: put 200 rows from cf1 + // and put another row on 'norep' + // ATTN: + // put to 'cf1' will overwrite existing rows, so end count will be 100 and 200 respectively + // put to 'norep' will add a new row. + // + // step 3: stop hbase on master, restart hbase on Slave + // + // step 4: verify Slave still has the rows before put + // t1_syncup: 50 rows from cf1 + // t2_syncup: 100 rows from cf1 + // + // step 5: run syncup tool on Master + // + // step 6: verify that put show up on Slave and 'norep' does not + // t1_syncup: 100 rows from cf1 + // t2_syncup: 200 rows from cf1 + // + // verify correctly replicated to Slave mimicSyncUpAfterPut(); } @@ -172,7 +207,8 @@ private void mimicSyncUpAfterDelete() throws Exception { int rowCount_ht2Source = countRows(ht2Source); assertEquals("t2_syncup has 101 rows on source, after remove 100 of the replicated colfam", 101, rowCount_ht2Source); - + List sourceRses = UTIL1.getHBaseCluster().getRegionServerThreads().stream() + .map(rst -> rst.getRegionServer().getServerName()).collect(Collectors.toList()); shutDownSourceHBaseCluster(); restartTargetHBaseCluster(1); @@ -184,40 +220,33 @@ private void mimicSyncUpAfterDelete() throws Exception { assertEquals("@Peer1 t1_syncup should still have 100 rows", 100, rowCountHt1TargetAtPeer1); assertEquals("@Peer1 t2_syncup should still have 200 rows", 200, rowCountHt2TargetAtPeer1); + syncUp(UTIL1); + // After sync up - for (int i = 0; i < NB_RETRIES; i++) { - syncUp(UTIL1); - rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); - rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); - if (i == NB_RETRIES - 1) { - if (rowCountHt1TargetAtPeer1 != 50 || rowCountHt2TargetAtPeer1 != 100) { - // syncUP still failed. Let's look at the source in case anything wrong there - restartSourceHBaseCluster(1); - rowCount_ht1Source = countRows(ht1Source); - LOG.debug("t1_syncup should have 51 rows at source, and it is " + rowCount_ht1Source); - rowCount_ht2Source = countRows(ht2Source); - LOG.debug("t2_syncup should have 101 rows at source, and it is " + rowCount_ht2Source); - } - assertEquals("@Peer1 t1_syncup should be sync up and have 50 rows", 50, - rowCountHt1TargetAtPeer1); - assertEquals("@Peer1 t2_syncup should be sync up and have 100 rows", 100, - rowCountHt2TargetAtPeer1); - } - if (rowCountHt1TargetAtPeer1 == 50 && rowCountHt2TargetAtPeer1 == 100) { - LOG.info("SyncUpAfterDelete succeeded at retry = " + i); - break; - } else { - LOG.debug("SyncUpAfterDelete failed at retry = " + i + ", with rowCount_ht1TargetPeer1 =" - + rowCountHt1TargetAtPeer1 + " and rowCount_ht2TargetAtPeer1 =" - + rowCountHt2TargetAtPeer1); - } - Thread.sleep(SLEEP_TIME); + rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); + rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); + assertEquals("@Peer1 t1_syncup should be sync up and have 50 rows", 50, + rowCountHt1TargetAtPeer1); + assertEquals("@Peer1 t2_syncup should be sync up and have 100 rows", 100, + rowCountHt2TargetAtPeer1); + + // check we have recorded the dead region servers and also have an info file + Path rootDir = CommonFSUtils.getRootDir(UTIL1.getConfiguration()); + Path syncUpInfoDir = new Path(rootDir, ReplicationSyncUp.INFO_DIR); + FileSystem fs = UTIL1.getTestFileSystem(); + for (ServerName sn : sourceRses) { + assertTrue(fs.exists(new Path(syncUpInfoDir, sn.getServerName()))); } + assertTrue(fs.exists(new Path(syncUpInfoDir, ReplicationSyncUp.INFO_FILE))); + assertEquals(sourceRses.size() + 1, fs.listStatus(syncUpInfoDir).length); + + restartSourceHBaseCluster(1); + // should finally removed all the records after restart + UTIL1.waitFor(60000, () -> fs.listStatus(syncUpInfoDir).length == 0); } private void mimicSyncUpAfterPut() throws Exception { LOG.debug("mimicSyncUpAfterPut"); - restartSourceHBaseCluster(1); shutDownTargetHBaseCluster(); Put p; @@ -261,34 +290,14 @@ private void mimicSyncUpAfterPut() throws Exception { assertEquals("@Peer1 t2_syncup should be NOT sync up and have 100 rows", 100, rowCountHt2TargetAtPeer1); - // after syun up - for (int i = 0; i < NB_RETRIES; i++) { - syncUp(UTIL1); - rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); - rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); - if (i == NB_RETRIES - 1) { - if (rowCountHt1TargetAtPeer1 != 100 || rowCountHt2TargetAtPeer1 != 200) { - // syncUP still failed. Let's look at the source in case anything wrong there - restartSourceHBaseCluster(1); - rowCount_ht1Source = countRows(ht1Source); - LOG.debug("t1_syncup should have 102 rows at source, and it is " + rowCount_ht1Source); - rowCount_ht2Source = countRows(ht2Source); - LOG.debug("t2_syncup should have 202 rows at source, and it is " + rowCount_ht2Source); - } - assertEquals("@Peer1 t1_syncup should be sync up and have 100 rows", 100, - rowCountHt1TargetAtPeer1); - assertEquals("@Peer1 t2_syncup should be sync up and have 200 rows", 200, - rowCountHt2TargetAtPeer1); - } - if (rowCountHt1TargetAtPeer1 == 100 && rowCountHt2TargetAtPeer1 == 200) { - LOG.info("SyncUpAfterPut succeeded at retry = " + i); - break; - } else { - LOG.debug("SyncUpAfterPut failed at retry = " + i + ", with rowCount_ht1TargetPeer1 =" - + rowCountHt1TargetAtPeer1 + " and rowCount_ht2TargetAtPeer1 =" - + rowCountHt2TargetAtPeer1); - } - Thread.sleep(SLEEP_TIME); - } + syncUp(UTIL1); + + // after sync up + rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); + rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); + assertEquals("@Peer1 t1_syncup should be sync up and have 100 rows", 100, + rowCountHt1TargetAtPeer1); + assertEquals("@Peer1 t2_syncup should be sync up and have 200 rows", 200, + rowCountHt2TargetAtPeer1); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java index d31421063628..8a28db3b1859 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java @@ -136,7 +136,8 @@ final void setupReplication() throws Exception { } final void syncUp(HBaseTestingUtil util) throws Exception { - ToolRunner.run(util.getConfiguration(), new ReplicationSyncUp(), new String[0]); + ToolRunner.run(new Configuration(util.getConfiguration()), new ReplicationSyncUp(), + new String[0]); } // Utilities that manager shutdown / restart of source / sink clusters. They take care of diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java index b5de8e6324fe..e9acc1bc45ee 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolWithBulkLoadedData.java @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Table; @@ -45,14 +46,11 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.HFileTestUtil; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -//revisit later when we implement the new ReplicationSyncUpTool -@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithBulkLoadedData extends TestReplicationSyncUpToolBase { @@ -74,40 +72,50 @@ protected void customizeClusterConf(Configuration conf) { @Test public void testSyncUpTool() throws Exception { - /** - * Set up Replication: on Master and one Slave Table: t1_syncup and t2_syncup columnfamily: - * 'cf1' : replicated 'norep': not replicated - */ + // Set up Replication: + // on Master and one Slave Table: t1_syncup and t2_syncup + // columnfamily: + // 'cf1' : replicated + // 'norep': not replicated setupReplication(); - /** - * Prepare 24 random hfile ranges required for creating hfiles - */ + // Prepare 24 random hfile ranges required for creating hfiles Iterator randomHFileRangeListIterator = null; Set randomHFileRanges = new HashSet<>(24); for (int i = 0; i < 24; i++) { - randomHFileRanges.add(UTIL1.getRandomUUID().toString()); + randomHFileRanges.add(HBaseTestingUtil.getRandomUUID().toString()); } List randomHFileRangeList = new ArrayList<>(randomHFileRanges); Collections.sort(randomHFileRangeList); randomHFileRangeListIterator = randomHFileRangeList.iterator(); - /** - * at Master: t1_syncup: Load 50 rows into cf1, and 50 rows from other hdfs into cf1, and 3 rows - * into norep t2_syncup: Load 100 rows into cf1, and 100 rows from other hdfs into cf1, and 3 - * rows into norep verify correctly replicated to slave - */ + // at Master: + // t1_syncup: Load 50 rows into cf1, and 50 rows from other hdfs into cf1, and 3 rows into norep + // t2_syncup: Load 100 rows into cf1, and 100 rows from other hdfs into cf1, and 3 rows into + // norep + // verify correctly replicated to slave loadAndReplicateHFiles(true, randomHFileRangeListIterator); - /** - * Verify hfile load works step 1: stop hbase on Slave step 2: at Master: t1_syncup: Load - * another 100 rows into cf1 and 3 rows into norep t2_syncup: Load another 200 rows into cf1 and - * 3 rows into norep step 3: stop hbase on master, restart hbase on Slave step 4: verify Slave - * still has the rows before load t1_syncup: 100 rows from cf1 t2_syncup: 200 rows from cf1 step - * 5: run syncup tool on Master step 6: verify that hfiles show up on Slave and 'norep' does not - * t1_syncup: 200 rows from cf1 t2_syncup: 400 rows from cf1 verify correctly replicated to - * Slave - */ + // Verify hfile load works + // + // step 1: stop hbase on Slave + // + // step 2: at Master: + // t1_syncup: Load another 100 rows into cf1 and 3 rows into norep + // t2_syncup: Load another 200 rows into cf1 and 3 rows into norep + // + // step 3: stop hbase on master, restart hbase on Slave + // + // step 4: verify Slave still has the rows before load + // t1_syncup: 100 rows from cf1 + // t2_syncup: 200 rows from cf1 + // + // step 5: run syncup tool on Master + // + // step 6: verify that hfiles show up on Slave and 'norep' does not + // t1_syncup: 200 rows from cf1 + // t2_syncup: 400 rows from cf1 + // verify correctly replicated to Slave mimicSyncUpAfterBulkLoad(randomHFileRangeListIterator); } @@ -142,34 +150,12 @@ private void mimicSyncUpAfterBulkLoad(Iterator randomHFileRangeListItera syncUp(UTIL1); // After syun up - for (int i = 0; i < NB_RETRIES; i++) { - syncUp(UTIL1); - rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); - rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); - if (i == NB_RETRIES - 1) { - if (rowCountHt1TargetAtPeer1 != 200 || rowCountHt2TargetAtPeer1 != 400) { - // syncUP still failed. Let's look at the source in case anything wrong there - restartSourceHBaseCluster(1); - rowCount_ht1Source = countRows(ht1Source); - LOG.debug("t1_syncup should have 206 rows at source, and it is " + rowCount_ht1Source); - rowCount_ht2Source = countRows(ht2Source); - LOG.debug("t2_syncup should have 406 rows at source, and it is " + rowCount_ht2Source); - } - assertEquals("@Peer1 t1_syncup should be sync up and have 200 rows", 200, - rowCountHt1TargetAtPeer1); - assertEquals("@Peer1 t2_syncup should be sync up and have 400 rows", 400, - rowCountHt2TargetAtPeer1); - } - if (rowCountHt1TargetAtPeer1 == 200 && rowCountHt2TargetAtPeer1 == 400) { - LOG.info("SyncUpAfterBulkLoad succeeded at retry = " + i); - break; - } else { - LOG.debug("SyncUpAfterBulkLoad failed at retry = " + i + ", with rowCount_ht1TargetPeer1 =" - + rowCountHt1TargetAtPeer1 + " and rowCount_ht2TargetAtPeer1 =" - + rowCountHt2TargetAtPeer1); - } - Thread.sleep(SLEEP_TIME); - } + rowCountHt1TargetAtPeer1 = countRows(ht1TargetAtPeer1); + rowCountHt2TargetAtPeer1 = countRows(ht2TargetAtPeer1); + assertEquals("@Peer1 t1_syncup should be sync up and have 200 rows", 200, + rowCountHt1TargetAtPeer1); + assertEquals("@Peer1 t2_syncup should be sync up and have 400 rows", 400, + rowCountHt2TargetAtPeer1); } private void loadAndReplicateHFiles(boolean verifyReplicationOnSlave, diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java index 4148c1c1a2c0..9041831d0e8a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestTableReplicationQueueStorage.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.MD5Hash; import org.apache.hadoop.hbase.util.Pair; import org.apache.zookeeper.KeeperException; @@ -420,4 +421,54 @@ public void testRemovePeerForHFileRefs() throws ReplicationException, KeeperExce assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size()); assertTrue(storage.getReplicableHFiles(peerId2).isEmpty()); } + + private void addLastSequenceIdsAndHFileRefs(String peerId1, String peerId2) + throws ReplicationException { + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + storage.setLastSequenceIds(peerId1, ImmutableMap.of(encodedRegionName, (long) i)); + } + + List> files1 = new ArrayList<>(3); + files1.add(new Pair<>(null, new Path("file_1"))); + files1.add(new Pair<>(null, new Path("file_2"))); + files1.add(new Pair<>(null, new Path("file_3"))); + storage.addHFileRefs(peerId2, files1); + } + + @Test + public void testRemoveLastSequenceIdsAndHFileRefsBefore() + throws ReplicationException, InterruptedException { + String peerId1 = "1"; + String peerId2 = "2"; + addLastSequenceIdsAndHFileRefs(peerId1, peerId2); + // make sure we have write these out + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId1)); + } + assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size()); + assertEquals(3, storage.getReplicableHFiles(peerId2).size()); + + // should have nothing after removal + long ts = EnvironmentEdgeManager.currentTime(); + storage.removeLastSequenceIdsAndHFileRefsBefore(ts); + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + assertEquals(HConstants.NO_SEQNUM, storage.getLastSequenceId(encodedRegionName, peerId1)); + } + assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size()); + + Thread.sleep(100); + // add again and remove with the old timestamp + addLastSequenceIdsAndHFileRefs(peerId1, peerId2); + storage.removeLastSequenceIdsAndHFileRefsBefore(ts); + // make sure we do not delete the data which are written after the give timestamp + for (int i = 0; i < 100; i++) { + String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i)); + assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId1)); + } + assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size()); + assertEquals(3, storage.getReplicableHFiles(peerId2).size()); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java index 28779be43995..83cd41773ca8 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleAsyncWAL.java @@ -25,11 +25,8 @@ import org.apache.hadoop.hbase.wal.RegionGroupingProvider; import org.apache.hadoop.hbase.wal.WALFactory; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.experimental.categories.Category; -//revisit later when we implement the new ReplicationSyncUpTool -@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithMultipleAsyncWAL extends TestReplicationSyncUpTool { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java index f495f433bc9b..673b841430eb 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/multiwal/TestReplicationSyncUpToolWithMultipleWAL.java @@ -25,11 +25,8 @@ import org.apache.hadoop.hbase.wal.RegionGroupingProvider; import org.apache.hadoop.hbase.wal.WALFactory; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.experimental.categories.Category; -//revisit later when we implement the new ReplicationSyncUpTool -@Ignore @Category({ ReplicationTests.class, LargeTests.class }) public class TestReplicationSyncUpToolWithMultipleWAL extends TestReplicationSyncUpTool { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java index b7564ed9168d..1bb9a3e2949b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java @@ -45,11 +45,8 @@ import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; -import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; -import org.apache.hadoop.hbase.client.TableDescriptor; -import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.regionserver.wal.ProtobufLogWriter; import org.apache.hadoop.hbase.replication.DummyReplicationEndpoint; import org.apache.hadoop.hbase.replication.ReplicationException; @@ -125,8 +122,6 @@ public void init(Context context) throws IOException { private static final TableName TABLE_NAME = TableName.valueOf("test"); - private static TableDescriptor TD; - private static RegionInfo RI; private static NavigableMap SCOPES; @@ -152,10 +147,6 @@ public static void setUpBeforeClass() throws Exception { FS = UTIL.getTestFileSystem(); CONF = new Configuration(UTIL.getConfiguration()); CONF.setLong("replication.sleep.before.failover", 0); - TD = TableDescriptorBuilder.newBuilder(TABLE_NAME) - .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(F1) - .setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build()) - .setColumnFamily(ColumnFamilyDescriptorBuilder.of(F2)).build(); RI = RegionInfoBuilder.newBuilder(TABLE_NAME).build(); SCOPES = new TreeMap<>(Bytes.BYTES_COMPARATOR); @@ -176,7 +167,8 @@ public void setUp() throws Exception { when(server.getConfiguration()).thenReturn(CONF); when(server.getZooKeeper()).thenReturn(UTIL.getZooKeeperWatcher()); when(server.getConnection()).thenReturn(UTIL.getConnection()); - when(server.getServerName()).thenReturn(ServerName.valueOf("hostname.example.org", 1234, 1)); + ServerName sn = ServerName.valueOf("hostname.example.org", 1234, 1); + when(server.getServerName()).thenReturn(sn); oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); FS.mkdirs(oldLogDir); logDir = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); @@ -189,7 +181,7 @@ public void setUp() throws Exception { CONF.set(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, tableName.getNameAsString()); replication = new Replication(); - replication.initialize(server, FS, logDir, oldLogDir, + replication.initialize(server, FS, new Path(logDir, sn.toString()), oldLogDir, new WALFactory(CONF, server.getServerName(), null, false)); manager = replication.getReplicationManager(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java index 1544265435c7..8731adbe4c2b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestSerialReplicationChecker.java @@ -99,8 +99,8 @@ public static void setUpBeforeClass() throws Exception { TableName repTable = TableName.valueOf("test_serial_rep"); UTIL.getAdmin() .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(repTable)); - QUEUE_STORAGE = - ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), repTable); + QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), + UTIL.getConfiguration(), repTable); } @AfterClass From 252c4d73d6b8b13ee7978a0f3af1026ac96b4ed3 Mon Sep 17 00:00:00 2001 From: Liangjun He Date: Wed, 5 Apr 2023 23:37:04 +0800 Subject: [PATCH 12/16] HBASE-27623 Start a new ReplicationSyncUp after the previous failed (#5150) Signed-off-by: Duo Zhang --- .../regionserver/ReplicationSyncUp.java | 46 +++++++++++++++++-- .../TestReplicationSyncUpTool.java | 36 +++++++++++++++ .../TestReplicationSyncUpToolBase.java | 7 ++- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java index f071cf6f1f81..cd6a4d9ac4d1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSyncUp.java @@ -19,9 +19,11 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.hadoop.conf.Configuration; @@ -182,7 +184,7 @@ private void claimReplicationQueues(ReplicationSourceManager mgr, Set argv = new LinkedList<>(); + argv.addAll(Arrays.asList(args)); + String cmd = null; + while ((cmd = argv.poll()) != null) { + if (cmd.equals("-h") || cmd.equals("--h") || cmd.equals("--help")) { + printUsageAndExit(null, 0); + } + if (cmd.equals("-f")) { + return true; + } + if (!argv.isEmpty()) { + printUsageAndExit("ERROR: Unrecognized option/command: " + cmd, -1); + } + } + return false; + } + + private static void printUsageAndExit(final String message, final int exitCode) { + printUsage(message); + System.exit(exitCode); + } + + private static void printUsage(final String message) { + if (message != null && message.length() > 0) { + System.err.println(message); + } + System.err.println("Usage: hbase " + ReplicationSyncUp.class.getName() + " \\"); + System.err.println(" [-D]*"); + System.err.println(); + System.err.println("General Options:"); + System.err.println(" -h|--h|--help Show this help and exit."); + System.err + .println(" -f Start a new ReplicationSyncUp after the previous ReplicationSyncUp failed. " + + "See HBASE-27623 for details."); + } + @Override public int run(String[] args) throws Exception { Abortable abortable = new Abortable() { @@ -217,6 +256,7 @@ public boolean isAborted() { return abort; } }; + boolean isForce = parseOpts(args); Configuration conf = getConf(); try (ZKWatcher zkw = new ZKWatcher(conf, "syncupReplication" + EnvironmentEdgeManager.currentTime(), abortable, true)) { @@ -226,7 +266,7 @@ public boolean isAborted() { Path logDir = new Path(walRootDir, HConstants.HREGION_LOGDIR_NAME); System.out.println("Start Replication Server"); - writeInfoFile(fs); + writeInfoFile(fs, isForce); Replication replication = new Replication(); // use offline table replication queue storage getConf().setClass(ReplicationStorageFactory.REPLICATION_QUEUE_IMPL, diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java index 38225613b9d4..66de933832b5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpTool.java @@ -27,6 +27,8 @@ import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileAlreadyExistsException; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; @@ -300,4 +302,38 @@ private void mimicSyncUpAfterPut() throws Exception { assertEquals("@Peer1 t2_syncup should be sync up and have 200 rows", 200, rowCountHt2TargetAtPeer1); } + + /** + * test "start a new ReplicationSyncUp after the previous failed". See HBASE-27623 for details. + */ + @Test + public void testStartANewSyncUpToolAfterFailed() throws Exception { + // Start syncUpTool for the first time with non-force mode, + // let's assume that this will fail in sync data, + // this does not affect our test results + syncUp(UTIL1); + Path rootDir = CommonFSUtils.getRootDir(UTIL1.getConfiguration()); + Path syncUpInfoDir = new Path(rootDir, ReplicationSyncUp.INFO_DIR); + Path replicationInfoPath = new Path(syncUpInfoDir, ReplicationSyncUp.INFO_FILE); + FileSystem fs = UTIL1.getTestFileSystem(); + assertTrue(fs.exists(replicationInfoPath)); + FileStatus fileStatus1 = fs.getFileStatus(replicationInfoPath); + + // Start syncUpTool for the second time with non-force mode, + // startup will fail because replication info file already exists + try { + syncUp(UTIL1); + } catch (Exception e) { + assertTrue("e should be a FileAlreadyExistsException", + (e instanceof FileAlreadyExistsException)); + } + FileStatus fileStatus2 = fs.getFileStatus(replicationInfoPath); + assertEquals(fileStatus1.getModificationTime(), fileStatus2.getModificationTime()); + + // Start syncUpTool for the third time with force mode, + // startup will success and create a new replication info file + syncUp(UTIL1, new String[] { "-f" }); + FileStatus fileStatus3 = fs.getFileStatus(replicationInfoPath); + assertTrue(fileStatus3.getModificationTime() > fileStatus2.getModificationTime()); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java index 8a28db3b1859..442582410581 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSyncUpToolBase.java @@ -136,8 +136,11 @@ final void setupReplication() throws Exception { } final void syncUp(HBaseTestingUtil util) throws Exception { - ToolRunner.run(new Configuration(util.getConfiguration()), new ReplicationSyncUp(), - new String[0]); + syncUp(util, new String[0]); + } + + final void syncUp(HBaseTestingUtil util, String[] args) throws Exception { + ToolRunner.run(new Configuration(util.getConfiguration()), new ReplicationSyncUp(), args); } // Utilities that manager shutdown / restart of source / sink clusters. They take care of From 565ff49c3d0cd691fee3c050480bf19f6cb542c1 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sat, 8 Apr 2023 10:50:42 +0800 Subject: [PATCH 13/16] HBASE-27775 Use a separate WAL provider for hbase:replication table (#5157) Signed-off-by: Liangjun He --- .../ReplicationStorageFactory.java | 6 + .../MetricsRegionServerWrapperImpl.java | 12 +- .../hbase/wal/LazyInitializedWALProvider.java | 108 +++++++++++ .../apache/hadoop/hbase/wal/WALFactory.java | 172 +++++++++++------- .../TestMultiSlaveReplication.java | 6 +- .../hadoop/hbase/wal/TestWALFactory.java | 28 +++ 6 files changed, 260 insertions(+), 72 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/wal/LazyInitializedWALProvider.java diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java index 0b0eb0fc43fd..ada127ee7831 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationStorageFactory.java @@ -136,4 +136,10 @@ public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn return ReflectionUtils.newInstance(clazz, conf, tableName); } } + + public static boolean isReplicationQueueTable(Configuration conf, TableName tableName) { + TableName replicationQueueTableName = TableName.valueOf(conf.get(REPLICATION_QUEUE_TABLE_NAME, + REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())); + return replicationQueueTableName.equals(tableName); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java index dd8c9c551270..6c7fc504b5fd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java @@ -982,12 +982,12 @@ synchronized public void run() { lastRan = currentTime; - final WALProvider provider = regionServer.getWalFactory().getWALProvider(); - final WALProvider metaProvider = regionServer.getWalFactory().getMetaWALProvider(); - numWALFiles = (provider == null ? 0 : provider.getNumLogFiles()) - + (metaProvider == null ? 0 : metaProvider.getNumLogFiles()); - walFileSize = (provider == null ? 0 : provider.getLogFileSize()) - + (metaProvider == null ? 0 : metaProvider.getLogFileSize()); + List providers = regionServer.getWalFactory().getAllWALProviders(); + for (WALProvider provider : providers) { + numWALFiles += provider.getNumLogFiles(); + walFileSize += provider.getLogFileSize(); + } + // Copy over computed values so that no thread sees half computed values. numStores = tempNumStores; numStoreFiles = tempNumStoreFiles; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/LazyInitializedWALProvider.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/LazyInitializedWALProvider.java new file mode 100644 index 000000000000..2a95b1821300 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/LazyInitializedWALProvider.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.wal; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.Abortable; +import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; +import org.apache.hadoop.hbase.wal.WALFactory.Providers; +import org.apache.yetus.audience.InterfaceAudience; + +/** + * A lazy initialized WAL provider for holding the WALProvider for some special tables, such as + * hbase:meta, hbase:replication, etc. + */ +@InterfaceAudience.Private +class LazyInitializedWALProvider implements Closeable { + + private final WALFactory factory; + + private final String providerId; + + private final String providerConfigName; + + private final Abortable abortable; + + private final AtomicReference holder = new AtomicReference<>(); + + LazyInitializedWALProvider(WALFactory factory, String providerId, String providerConfigName, + Abortable abortable) { + this.factory = factory; + this.providerId = providerId; + this.providerConfigName = providerConfigName; + this.abortable = abortable; + } + + WALProvider getProvider() throws IOException { + Configuration conf = factory.getConf(); + for (;;) { + WALProvider provider = this.holder.get(); + if (provider != null) { + return provider; + } + Class clz = null; + if (conf.get(providerConfigName) == null) { + try { + clz = conf.getClass(WALFactory.WAL_PROVIDER, Providers.defaultProvider.clazz, + WALProvider.class); + } catch (Throwable t) { + // the WAL provider should be an enum. Proceed + } + } + if (clz == null) { + clz = factory.getProviderClass(providerConfigName, + conf.get(WALFactory.WAL_PROVIDER, WALFactory.DEFAULT_WAL_PROVIDER)); + } + provider = WALFactory.createProvider(clz); + provider.init(factory, conf, providerId, this.abortable); + provider.addWALActionsListener(new MetricsWAL()); + if (this.holder.compareAndSet(null, provider)) { + return provider; + } else { + // someone is ahead of us, close and try again. + provider.close(); + } + } + } + + /** + * Get the provider if it already initialized, otherwise just return {@code null} instead of + * creating it. + */ + WALProvider getProviderNoCreate() { + return holder.get(); + } + + @Override + public void close() throws IOException { + WALProvider provider = this.holder.get(); + if (provider != null) { + provider.close(); + } + } + + void shutdown() throws IOException { + WALProvider provider = this.holder.get(); + if (provider != null) { + provider.shutdown(); + } + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java index bc0a9eec73a4..63bef79fa455 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java @@ -20,6 +20,7 @@ import com.google.errorprone.annotations.RestrictedApi; import java.io.IOException; import java.io.InterruptedIOException; +import java.util.ArrayList; import java.util.List; import java.util.concurrent.atomic.AtomicReference; import org.apache.hadoop.conf.Configuration; @@ -28,10 +29,12 @@ import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.io.asyncfs.monitor.ExcludeDatanodeManager; import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL; import org.apache.hadoop.hbase.regionserver.wal.ProtobufWALStreamReader; import org.apache.hadoop.hbase.regionserver.wal.ProtobufWALTailingReader; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.LeaseNotRecoveredException; @@ -99,15 +102,22 @@ enum Providers { public static final String META_WAL_PROVIDER = "hbase.wal.meta_provider"; + public static final String REPLICATION_WAL_PROVIDER = "hbase.wal.replication_provider"; + public static final String WAL_ENABLED = "hbase.regionserver.hlog.enabled"; + static final String REPLICATION_WAL_PROVIDER_ID = "rep"; + final String factoryId; final Abortable abortable; private final WALProvider provider; // The meta updates are written to a different wal. If this // regionserver holds meta regions, then this ref will be non-null. // lazily intialized; most RegionServers don't deal with META - private final AtomicReference metaProvider = new AtomicReference<>(); + private final LazyInitializedWALProvider metaProvider; + // This is for avoid hbase:replication itself keeps trigger unnecessary updates to WAL file and + // generate a lot useless data, see HBASE-27775 for more details. + private final LazyInitializedWALProvider replicationProvider; /** * Configuration-specified WAL Reader used when a custom reader is requested @@ -144,13 +154,15 @@ private WALFactory(Configuration conf) { factoryId = SINGLETON_ID; this.abortable = null; this.excludeDatanodeManager = new ExcludeDatanodeManager(conf); + this.metaProvider = null; + this.replicationProvider = null; } Providers getDefaultProvider() { return Providers.defaultProvider; } - public Class getProviderClass(String key, String defaultValue) { + Class getProviderClass(String key, String defaultValue) { try { Providers provider = Providers.valueOf(conf.get(key, defaultValue)); @@ -246,6 +258,10 @@ private WALFactory(Configuration conf, String factoryId, Abortable abortable, this.factoryId = factoryId; this.excludeDatanodeManager = new ExcludeDatanodeManager(conf); this.abortable = abortable; + this.metaProvider = new LazyInitializedWALProvider(this, + AbstractFSWALProvider.META_WAL_PROVIDER_ID, META_WAL_PROVIDER, this.abortable); + this.replicationProvider = new LazyInitializedWALProvider(this, REPLICATION_WAL_PROVIDER_ID, + REPLICATION_WAL_PROVIDER, this.abortable); // end required early initialization if (conf.getBoolean(WAL_ENABLED, true)) { WALProvider provider = createProvider(getProviderClass(WAL_PROVIDER, DEFAULT_WAL_PROVIDER)); @@ -263,19 +279,45 @@ private WALFactory(Configuration conf, String factoryId, Abortable abortable, } } + public Configuration getConf() { + return conf; + } + /** * Shutdown all WALs and clean up any underlying storage. Use only when you will not need to * replay and edits that have gone to any wals from this factory. */ public void close() throws IOException { - final WALProvider metaProvider = this.metaProvider.get(); - if (null != metaProvider) { - metaProvider.close(); + List ioes = new ArrayList<>(); + // these fields could be null if the WALFactory is created only for being used in the + // getInstance method. + if (metaProvider != null) { + try { + metaProvider.close(); + } catch (IOException e) { + ioes.add(e); + } + } + if (replicationProvider != null) { + try { + replicationProvider.close(); + } catch (IOException e) { + ioes.add(e); + } + } + if (provider != null) { + try { + provider.close(); + } catch (IOException e) { + ioes.add(e); + } } - // close is called on a WALFactory with null provider in the case of contention handling - // within the getInstance method. - if (null != provider) { - provider.close(); + if (!ioes.isEmpty()) { + IOException ioe = new IOException("Failed to close WALFactory"); + for (IOException e : ioes) { + ioe.addSuppressed(e); + } + throw ioe; } } @@ -285,18 +327,36 @@ public void close() throws IOException { * if you can as it will try to leave things as tidy as possible. */ public void shutdown() throws IOException { - IOException exception = null; - final WALProvider metaProvider = this.metaProvider.get(); - if (null != metaProvider) { + List ioes = new ArrayList<>(); + // these fields could be null if the WALFactory is created only for being used in the + // getInstance method. + if (metaProvider != null) { try { metaProvider.shutdown(); - } catch (IOException ioe) { - exception = ioe; + } catch (IOException e) { + ioes.add(e); } } - provider.shutdown(); - if (null != exception) { - throw exception; + if (replicationProvider != null) { + try { + replicationProvider.shutdown(); + } catch (IOException e) { + ioes.add(e); + } + } + if (provider != null) { + try { + provider.shutdown(); + } catch (IOException e) { + ioes.add(e); + } + } + if (!ioes.isEmpty()) { + IOException ioe = new IOException("Failed to shutdown WALFactory"); + for (IOException e : ioes) { + ioe.addSuppressed(e); + } + throw ioe; } } @@ -304,38 +364,16 @@ public List getWALs() { return provider.getWALs(); } - /** - * Called when we lazily create a hbase:meta WAL OR from ReplicationSourceManager ahead of - * creating the first hbase:meta WAL so we can register a listener. - * @see #getMetaWALProvider() - */ - public WALProvider getMetaProvider() throws IOException { - for (;;) { - WALProvider provider = this.metaProvider.get(); - if (provider != null) { - return provider; - } - Class clz = null; - if (conf.get(META_WAL_PROVIDER) == null) { - try { - clz = conf.getClass(WAL_PROVIDER, Providers.defaultProvider.clazz, WALProvider.class); - } catch (Throwable t) { - // the WAL provider should be an enum. Proceed - } - } - if (clz == null) { - clz = getProviderClass(META_WAL_PROVIDER, conf.get(WAL_PROVIDER, DEFAULT_WAL_PROVIDER)); - } - provider = createProvider(clz); - provider.init(this, conf, AbstractFSWALProvider.META_WAL_PROVIDER_ID, this.abortable); - provider.addWALActionsListener(new MetricsWAL()); - if (metaProvider.compareAndSet(null, provider)) { - return provider; - } else { - // someone is ahead of us, close and try again. - provider.close(); - } - } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + WALProvider getMetaProvider() throws IOException { + return metaProvider.getProvider(); + } + + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + WALProvider getReplicationProvider() throws IOException { + return replicationProvider.getProvider(); } /** @@ -343,14 +381,14 @@ public WALProvider getMetaProvider() throws IOException { */ public WAL getWAL(RegionInfo region) throws IOException { // Use different WAL for hbase:meta. Instantiates the meta WALProvider if not already up. - if ( - region != null && region.isMetaRegion() - && region.getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID - ) { - return getMetaProvider().getWAL(region); - } else { - return provider.getWAL(region); + if (region != null && RegionReplicaUtil.isDefaultReplica(region)) { + if (region.isMetaRegion()) { + return metaProvider.getProvider().getWAL(region); + } else if (ReplicationStorageFactory.isReplicationQueueTable(conf, region.getTable())) { + return replicationProvider.getProvider().getWAL(region); + } } + return provider.getWAL(region); } public WALStreamReader createStreamReader(FileSystem fs, Path path) throws IOException { @@ -527,16 +565,28 @@ public static Writer createWALWriter(final FileSystem fs, final Path path, return FSHLogProvider.createWriter(configuration, fs, path, false); } - public final WALProvider getWALProvider() { + public WALProvider getWALProvider() { return this.provider; } /** - * @return Current metaProvider... may be null if not yet initialized. - * @see #getMetaProvider() + * Returns all the wal providers, for example, the default one, the one for hbase:meta and the one + * for hbase:replication. */ - public final WALProvider getMetaWALProvider() { - return this.metaProvider.get(); + public List getAllWALProviders() { + List providers = new ArrayList<>(); + if (provider != null) { + providers.add(provider); + } + WALProvider meta = metaProvider.getProviderNoCreate(); + if (meta != null) { + providers.add(meta); + } + WALProvider replication = replicationProvider.getProviderNoCreate(); + if (replication != null) { + providers.add(replication); + } + return providers; } public ExcludeDatanodeManager getExcludeDatanodeManager() { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java index 5b62b210f4b2..66386d275b2e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestMultiSlaveReplication.java @@ -51,7 +51,6 @@ import org.apache.hadoop.hbase.testclassification.ReplicationTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster; -import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -108,7 +107,6 @@ public static void setUpBeforeClass() throws Exception { utility1.startMiniZKCluster(); MiniZooKeeperCluster miniZK = utility1.getZkCluster(); utility1.setZkCluster(miniZK); - new ZKWatcher(conf1, "cluster1", null, true); conf2 = new Configuration(conf1); conf2.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/2"); @@ -118,11 +116,9 @@ public static void setUpBeforeClass() throws Exception { utility2 = new HBaseTestingUtil(conf2); utility2.setZkCluster(miniZK); - new ZKWatcher(conf2, "cluster2", null, true); utility3 = new HBaseTestingUtil(conf3); utility3.setZkCluster(miniZK); - new ZKWatcher(conf3, "cluster3", null, true); table = TableDescriptorBuilder.newBuilder(tableName) .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(famName) @@ -133,7 +129,7 @@ public static void setUpBeforeClass() throws Exception { @Test public void testMultiSlaveReplication() throws Exception { LOG.info("testCyclicReplication"); - SingleProcessHBaseCluster master = utility1.startMiniCluster(); + utility1.startMiniCluster(); utility2.startMiniCluster(); utility3.startMiniCluster(); try (Connection conn = ConnectionFactory.createConnection(conf1); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java index 26c1152c05a3..244c37bfe847 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALFactory.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -64,6 +65,7 @@ import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener; import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec; import org.apache.hadoop.hbase.regionserver.wal.WALCoprocessorHost; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.RegionServerTests; import org.apache.hadoop.hbase.util.Bytes; @@ -708,6 +710,32 @@ public void testCustomMetaProvider() throws IOException { assertEquals(IOTestProvider.class, metaWALProvider.getClass()); } + @Test + public void testCustomReplicationProvider() throws IOException { + final Configuration config = new Configuration(); + config.set(WALFactory.REPLICATION_WAL_PROVIDER, IOTestProvider.class.getName()); + final WALFactory walFactory = new WALFactory(config, this.currentServername.toString()); + Class walProvider = + walFactory.getProviderClass(WALFactory.WAL_PROVIDER, Providers.filesystem.name()); + assertEquals(Providers.filesystem.clazz, walProvider); + WALProvider replicationWALProvider = walFactory.getReplicationProvider(); + assertEquals(IOTestProvider.class, replicationWALProvider.getClass()); + } + + /** + * Confirm that we will use different WALs for hbase:meta and hbase:replication + */ + @Test + public void testDifferentWALs() throws IOException { + WAL normalWAL = wals.getWAL(null); + WAL metaWAL = wals.getWAL(RegionInfoBuilder.FIRST_META_REGIONINFO); + WAL replicationWAL = wals.getWAL(RegionInfoBuilder + .newBuilder(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME_DEFAULT).build()); + assertNotSame(normalWAL, metaWAL); + assertNotSame(normalWAL, replicationWAL); + assertNotSame(metaWAL, replicationWAL); + } + @Test public void testReaderClosedOnBadCodec() throws IOException { // Create our own Configuration and WALFactory to avoid breaking other test methods From 14db76f8d9238a24a0f7ad33038314ee7d5c707c Mon Sep 17 00:00:00 2001 From: Liangjun He Date: Tue, 18 Apr 2023 21:50:32 +0800 Subject: [PATCH 14/16] HBASE-27274 Re-enable the disabled tests when implementing HBASE-27212 (#5178) Signed-off-by: Duo Zhang --- .../hbase/util/TestHBaseFsckReplication.java | 136 +++++++++++------- 1 file changed, 83 insertions(+), 53 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java index fdf0d2d6a250..6580a65fbe2d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsckReplication.java @@ -17,16 +17,33 @@ */ package org.apache.hadoop.hbase.util; +import static org.junit.Assert.assertEquals; + +import java.util.Collections; +import java.util.List; +import java.util.stream.Stream; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.replication.ReplicationGroupOffset; +import org.apache.hadoop.hbase.replication.ReplicationPeerConfig; +import org.apache.hadoop.hbase.replication.ReplicationPeerStorage; +import org.apache.hadoop.hbase.replication.ReplicationQueueId; +import org.apache.hadoop.hbase.replication.ReplicationQueueStorage; +import org.apache.hadoop.hbase.replication.ReplicationStorageFactory; +import org.apache.hadoop.hbase.replication.SyncReplicationState; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.ReplicationTests; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.apache.hadoop.hbase.util.HbckErrorReporter.ERROR_CODE; +import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil; +import org.junit.After; +import org.junit.Before; import org.junit.ClassRule; -import org.junit.Ignore; +import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; @Category({ ReplicationTests.class, MediumTests.class }) public class TestHBaseFsckReplication { @@ -36,65 +53,78 @@ public class TestHBaseFsckReplication { HBaseClassTestRule.forClass(TestHBaseFsckReplication.class); private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + @Rule + public final TestName name = new TestName(); - @BeforeClass - public static void setUp() throws Exception { + @Before + public void setUp() throws Exception { UTIL.getConfiguration().setBoolean("hbase.write.hbck1.lock.file", false); UTIL.startMiniCluster(1); + TableName tableName = TableName.valueOf("replication_" + name.getMethodName()); + UTIL.getAdmin() + .createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName)); + UTIL.getConfiguration().set(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME, + tableName.getNameAsString()); } - @AfterClass - public static void tearDown() throws Exception { + @After + public void tearDown() throws Exception { UTIL.shutdownMiniCluster(); } - // TODO: reimplement - @Ignore @Test public void test() throws Exception { - // ReplicationPeerStorage peerStorage = ReplicationStorageFactory - // .getReplicationPeerStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); - // ReplicationQueueStorage queueStorage = ReplicationStorageFactory - // .getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); - // - // String peerId1 = "1"; - // String peerId2 = "2"; - // peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), - // true, SyncReplicationState.NONE); - // peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), - // true, SyncReplicationState.NONE); - // for (int i = 0; i < 10; i++) { - // queueStorage.addWAL(ServerName.valueOf("localhost", 10000 + i, 100000 + i), peerId1, - // "file-" + i); - // } - // queueStorage.addWAL(ServerName.valueOf("localhost", 10000, 100000), peerId2, "file"); - // HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); - // HbckTestingUtil.assertNoErrors(fsck); - // - // // should not remove anything since the replication peer is still alive - // assertEquals(10, queueStorage.getListOfReplicators().size()); - // peerStorage.removePeer(peerId1); - // // there should be orphan queues - // assertEquals(10, queueStorage.getListOfReplicators().size()); - // fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false); - // HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { - // return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; - // }).limit(10).toArray(ERROR_CODE[]::new)); - // - // // should not delete anything when fix is false - // assertEquals(10, queueStorage.getListOfReplicators().size()); - // - // fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); - // HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { - // return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; - // }).limit(10).toArray(ERROR_CODE[]::new)); - // - // List replicators = queueStorage.getListOfReplicators(); - // // should not remove the server with queue for peerId2 - // assertEquals(1, replicators.size()); - // assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0)); - // for (String queueId : queueStorage.getAllQueues(replicators.get(0))) { - // assertEquals(peerId2, queueId); - // } + ReplicationPeerStorage peerStorage = ReplicationStorageFactory.getReplicationPeerStorage( + UTIL.getTestFileSystem(), UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()); + ReplicationQueueStorage queueStorage = ReplicationStorageFactory + .getReplicationQueueStorage(UTIL.getConnection(), UTIL.getConfiguration()); + + String peerId1 = "1"; + String peerId2 = "2"; + peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), + true, SyncReplicationState.NONE); + peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(), + true, SyncReplicationState.NONE); + ReplicationQueueId queueId = null; + for (int i = 0; i < 10; i++) { + queueId = new ReplicationQueueId(getServerName(i), peerId1); + queueStorage.setOffset(queueId, "group-" + i, + new ReplicationGroupOffset("file-" + i, i * 100), Collections.emptyMap()); + } + queueId = new ReplicationQueueId(getServerName(0), peerId2); + queueStorage.setOffset(queueId, "group-" + 0, new ReplicationGroupOffset("file-" + 0, 100), + Collections.emptyMap()); + HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); + HbckTestingUtil.assertNoErrors(fsck); + + // should not remove anything since the replication peer is still alive + assertEquals(10, queueStorage.listAllReplicators().size()); + peerStorage.removePeer(peerId1); + // there should be orphan queues + assertEquals(10, queueStorage.listAllReplicators().size()); + fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false); + HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { + return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; + }).limit(10).toArray(ERROR_CODE[]::new)); + + // should not delete anything when fix is false + assertEquals(10, queueStorage.listAllReplicators().size()); + + fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true); + HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> { + return ERROR_CODE.UNDELETED_REPLICATION_QUEUE; + }).limit(10).toArray(HbckErrorReporter.ERROR_CODE[]::new)); + + List replicators = queueStorage.listAllReplicators(); + // should not remove the server with queue for peerId2 + assertEquals(1, replicators.size()); + assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0)); + for (ReplicationQueueId qId : queueStorage.listAllQueueIds(replicators.get(0))) { + assertEquals(peerId2, qId.getPeerId()); + } + } + + private ServerName getServerName(int i) { + return ServerName.valueOf("localhost", 10000 + i, 100000 + i); } } From 79265c9d4ecf6f014ea2063b54c35844b0454fa6 Mon Sep 17 00:00:00 2001 From: Liangjun He Date: Sun, 23 Apr 2023 23:56:48 +0800 Subject: [PATCH 15/16] HBASE-27809 Attach move replication queue storage from zookeeper to a separated HBase table design doc to git repo (#5197) Signed-off-by: Duo Zhang --- ...om zookeeper to a separated HBase table.pdf | Bin 0 -> 104629 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dev-support/design-docs/HBASE-27109 Move replication queue storage from zookeeper to a separated HBase table.pdf diff --git a/dev-support/design-docs/HBASE-27109 Move replication queue storage from zookeeper to a separated HBase table.pdf b/dev-support/design-docs/HBASE-27109 Move replication queue storage from zookeeper to a separated HBase table.pdf new file mode 100644 index 0000000000000000000000000000000000000000..095fe0b1c9f947331c4dfb4bfc29eb5a1ab87736 GIT binary patch literal 104629 zcmcedW0Yjwwx&~6X=kNv+qP}nwr$(CZQC|0ZQJVlj{2OwJ#OD${U;*!T5HB!d&d|% z=JURD6Uhn+Q_)c~KoAWc-W{Ho+~>{?4nr{D(&F0an?rDN;?fA4Svnfo|9Q96b2Jh( zGO#f;!ljWkvNmxv#bsciW5VU;hH!ASH`23$a0RSV->}~whW8p#=|w6dD*6THiwh{a z_L9$fzItA$xvX(gQe`hsb{)_F`}&!A#m$upm`haFq$mXEaC&+=ae9UJmHnOgoo$1| zqx||F<5u>Kvt3X|c{aRtIia_L;+56&9qV%+I9*=}`?~PnNzQ>qG^5oqlcANte%85oAtOqs-M_6 zQM+j&syQocDKJ_oXh15MDucMHWyRSXD?nNz3%Qj=VW@!TBBFegiLEvFbhjgQ>n%d+ z>mf&m!j4}$vh_T#1&Y~|4et@EDcHIs+Ak!wvtk^pjYAT1ps0ZRR=f~UuMJl7c2o1{ zsj1oIJi{XO8<+ZVXC!CJ;64ZEzJ2xh8@R9JJKOfCl3Xg;OC_pFbS&L4Q-&*7GXZgMdjMFCrSVxg=~Sv5e0)E-ecK z_vdrvla&Ue5$vhb6(ECklUD@T1C3xVng@Tux`NKpdo>lzD`={y3Zs^BP0#JGu8{om zyKQ@{vFqrT;NRCmZEy{L1Qn!H)PG=>ihJokfbttAvI1py+dHkEH;`l;9FHg3x}y@SE` zGv4_)#1=4sJ@HV_Rd}hnDk&(WD*bBLo2rJE!yT%Vd$cPP;b`TVSFTN}qJpfoK-W%M zFVRd;i8xZZ5XP~WF1o|tw!0yQ`7ZXxR$1__y3kqhzCpQs=G;8?8j~-Gqr8#N!c!w4 zfaQn@FeLUWHkLobO7Qg!Vq33p+)#&TRks|l8No}HG?s-nn_jUhcZckpELE}aJrS^$ zeZ0kLua^$tPM@?M_YU|BRDY;sg478x7<}tktKd7N(_&M^-0Y#chBV)R-}jPO3W?x` z<27xluRt@BR%w)i%SdlmQ)c)#i66(R?@onUsSwt>&rYrj_mk81cW+D<=}V2xt`lZ z@(>1d5fF!UZ99S{Yg@m`ZyrytzNdoPE(;f-j*UZ?fnk&WCvYTLyJ_Lz=B(RDn_TNS zNIUVT9kLZm?%=#|Z9rx{NctT4e1uZ`7(|mJ1Kj;s@y-X8c%m9)=hl4teME+1U}`+g zIJW71s3);A7Clbi+OuTDOvwJP0;z{(h03q^ta9_P?ISR)GcYBE7NNDg^faYyJH#+P z=E)T_i>1wU21ggi;OGI4m;R;2U(L{t}6^t8MocXm9iT_j}D z2%5+{{Op!s92tl=8VW$qz?$y8D-D9w$9xUT`9_*L(=V@f0%4R2Fm2ztkWbDq)!8AL zT?z?m<`RO!_(udqBXmYm;GC3})f+ddNqy|O(L1Lw9kVbby`K6VZmrKO^WI>c4sHm+vY=hsCMGO{c2K+yQI4olskG^W(Pe;h{k}^E1r^NBN z6wiLEqr<;FFjWbPMny$PJ-HRnUC9y#*qMU|Z> zx8MHf_fw2vwcdi#Habs0YspyjM1Y-I=@U#^HRUi*rGYbICYK<`OqH_Qaho0+wu9bK z?$*ricDu4g;L|w*(hA6|o?jUCo15^`tv?-Dy;Oi~nH2V>VqCoySc@B#tE?`F}hD;mPUN;_@V1?SLKr6$u9c(T+j@E5U?iU%eM4cVd z3A5j(sBWD18&SO%G|$yQreaGp{2>aM26D|bB6j;aDZa|caqpUf`udQVY%SXAMWben zR*_{?>bEDh4hrzPW|{I*7Y?JK+t%g0ft{~E>Kc$ES70YuAO*2C9u1o{Cuadl(Hpt^C*8E3L4E7cCUc2VdOC!x{~Z4+wZ~)lZ?32H!F)m=>eY>Veyf zY>)5Eg|)SXGV|@{g;bt?f&3`GaKAz5ypdW%jA$D{m_Uf~-~^x9jt%9OJ{udY%5F0Aovv zW6UDO23r?aMg=Bni79f$(1{SFyLtJ$lltbPeRMbEidg%$M0VL`SL$3_P2UCkwTb}Q z92ke7<8DB!0r9XV+ed%^2!%qXep<4npDP?9nM?=FY$p}!zau5@UKT_YM$z&s_|vH^ zf2teu zW-`k&h1>*^b$R-YjE_3n7@Bj+h{*f-p-ebOL6Ag3;G`6jV(1?PfVfZh;TC~-hO$0? z%Hm{>lhM-HZgoRifM$$M*QL3kSbAt44K?NbUZeWN4Phq~q!QS{c~!g&0mL+2uxk~Y zKGu|Y!hatiXkBKrhD&MK-%v(~kOE$4 zppOy@lU>qMdUz+^z3`VL*m}IEb6mPK$^8BW9S$Muq>2tYP-vteLO_5lr61XbmLGo- zb|h+Jrs(d8Cpvh3aK12Wpr zx%jMzID1Dt{X?;u9uy3$52@+a&ujurIt4~-Ld-)g{oYkh>`BiJ-Mo zMEdtGA<;gtDa~-o_>4w6Ez&M#JR~}3iRX+Xt>4##&Vw8WcbMZ|@l4Fce*Wj^2*wON z@rY#dmW2XtajZ#w& z)Ms=!my!(4>8itnrnP4W`F-|P!uyplU}m+Lzlh>P z)S0#iz@ckY3-{!QxwSgc7{hJ(Brm*NS@m3_KN@+%mS#3hvRs6#4(dFLgbt_nmvOrx zo!C@%mz7uonCIv=0=9eXGXM=Xdw)iy(K-@1nS}|CE|Aq}lv~Q-D#7Mcoa@Qp8zpoi zdUwl9lWQ%Q&O-ILDkk21L|f(UMJb(CtSu*@?pK;+JNIh6pQwVFSQt2t>a0wyW`Q6` ze|EAD zba7wBEJrltH($(y;r2;cjK+xzR4GC!bOulLveFe6|E|TnZ}rAarVc-=M4FtgZ8&Ln zTTvEL+izz>n3sfjTKEP$m8Ly|FtRrM2mSo{^cOSH{Tn+ey4n7LC0RX_zrETUSv%s= z|CyG@rI9ysuyL|CFmk|U{QH1_jkV*Sc?aCT*z3;%DI-HOJ$@TkTn*Yk159j8xU7uy zS`dF=>mPakiu)&{$=cf(C>S~7YWztiD2z*^XyodMtA$G=U}I@xuVAZZV1)Y@$_Y5o z;WGX0>$$mcX@p!IMHC$WK;7TwL>%ZK=$QYWTf~7LmyYFM3x)q|?Qd%o73p#RK1`#i z$cW4K_tlDu%(!%ao%NqxD=M<$(*5lWApU%&qs9Hhhks4{$@jl^{peVLZkl%8GlG3qYtK|N6rJ! zuf*RUtg(D5IilzoOHCm@$U|3~M(bqEj_i=#N+;Sahy=;gOeNYeDewd}>QWR-2RM8t zU08U@lmwtfO4FGxj9R@=6`sHEva26!S=&%kQvPErDBwED?=xLYZ6jSF!11E6s$ z^Bz0cQagTd4~hgb;^M;60Wimcn&O1zJ;&c>yeF9LIc?(VU74d8g~yLT>=%QP&|z^& zv4r5{Bos`_jN#`q-1mMjo7vRZ0A*%QFUoDuiIUt&B6wh!X-5#JpK2_%Aw;0h(yk9G zGYKe~ca{Z`sgn+XR}h;$)?3yWr(H@lNL7|ZIbt@3WAJ#OsFe z!Suo2j=>r3A&CW72eI&n#Y@H|v?Nr7LkBws`wWEdXO#0Rm2t!mhI1R#)F-QpUghf) z!z7kYKoSckM3E;XOKbE~6f?)yBIY9Q!b=!qHx#i)RpZr`sV2<9)ggk7=aEe$B}S4F z#}wjUlV%fd<8|Y86DX30F121%El+LFQkUrx^b+@?5lLW3xlh?2t4gSdvq(Bm@gR~( zTuE_B(V@;E(ZTZ(|0@1y1(NII?Njh8s8!Ar-fisX3KA1a&ri}1)lb(C(+}8V8j>UG zjQ5LgL69lqJ*}N5D#AnOUnp{$Z*7!nWM4O0r^OOsDSTf@I30SxIq7-;HzG39I3zv9 zJVZZi7?=OqPpkXFAdglQnGo3#8FSBg)Hbs}v%y?rrv2SXmh?()P41b5pJbc_m=sek zsbsT+w3JnDRW7qgYffygXTE0szC>LSHB&OP0!Y?C1qe`PwBX^z0EtgZ3Q`jTYqv12& zJBz!n8^M>+SDF`|*Q1xMSFme9P##w?M_!;_z%k5f&}#$IKJV zpYogdoeq>M#5JS~=6Ti8nY1I&YYG1RWVdztHgX@7#Dk=6L9ApuOKqBL8hW|_5(?!8 zWswTie$H;U71KVB+K@U@$JI7(QE4M-U;X@TQfejY*m=-uaC4Pw?X&fr0#*VxP9$O^ zu_WObeFjoHG^Iu*PNk+L{#pOp{JMmjojZ&>iW{-pxx2!P=0)!l=+pUa^M2*#_-*A~ z280!)68r?B5JVV68I&869vtC^6UaaxU|)wn4$Zs{q2?ko5dVjOa}Xyu6O`7kHwRQ| zO2$g!3%6D!S8rDr5Gs8kL4m;z(U{TsQIUu!WC>&!;W3dRp(PP1ArGMxA&C@KO0|Xq z;YjsJ++r?d9E@KcgwFP5&-0rSA^R755c^lu44TOe7wh&-hh-Bp)v-E#YIfw<7&utl zaNl8w{bh%e)ekytE{7m12^}Ht@yEjy$5FRhFGe46ASr?MfpdZ>gIZw*V%1{5#P)$? z_?mW{>UHk?4$2Z^6AQ=~%I33QZx@ZO%3upz}$IX_mlo0cI z=i($u9n_3FQlq~lA4MYxGl&AzARElnH*;g22;(hsaBu*UAk-o2TOnOvMq zx}Cp*Kl+%iEMTgFma8>4+N52y`r3CNyU$~+aIHkwZYG2$qKq)Mt;4q#<#Od}yLC+) zu8)TK+xX>nyQro$U@V6$e_38`8aGs#Z=~RB&pql-?1k^i5dDrn-&XTddEK0R3cTzp zdMbK%*>MJQ`Mr7Cey^IQuco~=TfL)IbvSvKf7jBXa?!1`7>nYBK$vC^|`<+A`71v~`Gjg5ra=`;E)Yf;5#HKg}@w{l?S zJpZh{$HYg+NAObgHf}~FNTg__KgMZ^V{wJ0lqHe5KPrB>Zr^Ui=5yy!zyq=3RFq{sHD??M*VG!_+4 z#<%A~6-Vy&#)J8+>e}k93BQyY?b|l{ zvW>3y52cg}lWI0C4zDb)uFK%hsAcSq>|~t_Z?n^;YpbqP+lJ5N{i^8hA700ATJJPl zPwifZ(cf7Zz!5*+uBf{`zO{dIoqBD3&ViP}YV%CGvwoU>8eHt>lR(IjXP5I-eJ;JM z7?*WDT;#6k&NPIY#00kr(0nJo*2oO}m?@i~5N;Gc6R8rJiF}RRbgz3Dx|xVfS=bNi zA^U26PMWL8@pgIdGSMF!ys?-&Pva5aD(z-_TYj=SHat04a51|sE(zbN=;nFy`^O0y^wj4Ny>@haq$HIxN{}8GCtaV4!nQ=uaxjtS^QTq`=??T6A<9nb1*W*rTeQ` zmB-ckXN6DySLOfDiub>>VHjE2SpU<8(cFl~Xo2_2sX=uR3)7@BITp4rrAi^FK`b*I z3fp(G9C=76&r~G$w*83Ro;kf*Kn2LuijOZU$J$#qeZYdA{80DS`cmK8`d&`F_Mx!& z{NC_TTvFQ5So5~*@M^M|cBeNg3>0bENc8HV!1gAy!NG1eQsDm3 z{8FXnadTF!mQ}y2Q%$mJ`~CRV%3k>})@-8u?pc!*q2!l6V_kS#a<^;sIs&R&(Yoa$ zpxM(#vth3A1xt(b^^UauW7gVSW5>Jn%zC@I^hD$1!uLhC>C#vM{V71N=fhBm`(vZH zD#u?3Rc)C+#hAiz?FxyHGmzQ}R-a4@UGgk2jZCfjC>Lz>UhPwgKM zX!$Kgs!*eeeC5RS)4Ix5t0uf!lUaOx8Q1Y*lp3ao0qSUl88{U{_eldSa9-HW&dW%_ zvPlHQ$WX?En1=V4-f(pV^JTZqxN%2Kg>?_2y3%{zRhDiS{6&B(ApB37w<_tNkehtK zjxWkxV=_-f=W-Taxd?UY-%Awzj$e!i2^}4(w|_H}ut) z3&qXriRor?tlakpo=mpo1HWUmXbChTMI9W{_yGkDdotLA)g;rjdE4mb(WDol!Qb+W z=)tq;MSO(l!O@>6lPdcmW?0RZ?)W0`w9J^s#uS@W_L-3kZY@~J8hKD9^rTKQe=0AY z?@ao1de{>eyTeMOsUwdHtK#<}G6F`X_DN!rmJQy@(e30oa&7&LSxTr^JMyfi@=xWl z0kK%qbXw`jJI8|ci$KQOgZ{1CKD3|$`mxCq>#u0U21u$LIMx?nNIgSb4Koics&(n= zY4ZspXo|k~MV~0vC(<}HvR*b#CWtB?1V~;uv7iwq=_0l zb!;FmV#q*__BHQV)ADO#JpW!3AVsS2Ad3b4$B|I}_AgP*Qz5`u)N+i6lk z@i>6?(%Aq!e2v6Hgy};tSj_n zXMf8x8niwIf~P$C%U!ihI(0g?07Rt-U(nnDP!;KZ$)^S*WI^Te63U8DSX@Dz=a+FU zursF>rvVsHFqx;rK>>NHZf)9Cw-*_DTBml8b{^o}12nwNtlu7f9DYPq&;8&YHOzR0 ziE&xA+nEgcWfGj*e|L$L8pwptMWe}Iv-)m5oxTd3Rc{gpV%3p%R_mt80?tQh%pu}0 zt={PDU^v(P${N882tUY1bViq-fP9I10jkEx!qtfE8S7zuWvW#@m0L7C7A0hkWt!C) zH!BE)i+lFqpXPOcLzSWRC26ZFrng)8`)A=%^gGy-q#vE&&LkSQwj`9+u6sXO6bPAO z3ECcCoSiTfnI3xqvI((0H$jSfGDPo@7%{%QA7EG#Hpb3l9oPpo`7GWHT{`+wWsgrL zgd{JB5Rdwg=sxdQh**(fvvBKi1q=C~=iEZA(%AC!nGKlP;}sD&1i<~p6vc3I#jm)w zA>;WH@VmCuL%FtOgaE^_nQ86`;&dMM@mz60@k4ckZY8%SmpGE(@~10)L6&3$rrqh0 zhGB0%+7YrVa`X=sci(d4d79zHF|Q;ofA&kC$nhkCV6Rb2Z91^HVlzOn!%(<5UwxiE zQD*Uk$FvL-pM}88K@*QBAwGPdD#atMv~M4EbMGqPUq~db%S!TLq{6y=6u{%0wzw?v?lfPmz$^cw59{e=`e-_ccnmxi zVQA5r&eeDDL=qZ!3{BT$Q89S|<6h0a)kzhD|g{)v9W$ET)} zL8={2G_k@=!2$x|f3m@fVfTx#9zDQ>1a+2tvNQPSj@}Q8=56KyXe5Lw0PgCUxy0az zxx}_5b(xtg+CDK3=|STiw_R>fViFmbDwa{|s6NnVHkYal9R|}Ox=nHgVab9E)bq} zQgmno3cD3a2%_D+P==ZK(!4+V+9!<}1wSuqxc@?{u2J-sT31NXL$>oGK+HsLbsfkwkyYGrW);Jt zf`oMoPDvyT$0&~Xq>~8X(8;_x;C`s>ED@J8RzUe6!<#zsZ$i!A09c)|4LSi;brSL5b9dX<7ir}PWjuH+ABDq2#^fyR4U9>D2SvuF-cAK~2tl9E=tlvKt7CI10 zMD9!ufy0ivAh^S85*IQ6K*w87mpT>&@+TFi!Vpsfa{)<8(x;wPaP8|aC`<>IT7RLojrKPeP2$wfRZWJx2)muqo z!!~*tS&;AuNOHab?Ti6jSZw`78=riT1D8fp!!~H2U{a?@X4j{W?}!-_n#65*#8oDv zXR8jePYSII|6U)@fudJ@6GgDV4HME-{Wgq;)2}WV1{A7QwjOs7WyLMq5nRZ(zT}Ly zr9k-O9M@8hCl}rUq8sx)Poi_rPDwB>Dii#OmH(wGNjFO<9b7{~uh=oj3r%b*yxG-+7Gg1IYjA9`IEf@E~BDyb8bP z`FcUcTv6EMV}E%qvn;2u62|s%KZNS7*##~266Z0gC9ydKz>6?HVF(Tb0Gr26wx;t> z!KGIv_|b-7&)D#Z{~jrBL?02lsdvY)@Ug1oYtE*_j7X{~8HfWI_Rh&CFY{7&jV~3z zGH7P9PN$Yx#ciOsV4DBWob7^$%333Fw|UY zd<=9mJ&tW0jjuxj;xVOrSTQ{hG3ui&2PZ5SMc^C$F&`$2a9UVY67XBLrPGhG0-Qix z%$TFXBSKXmk?;sDAvg*ZFUVoolaGOf$|PMq7e|?%hPei8u*D(TKi@#OX&r93dGSFTX263x9-L-jOJ}LavRKAi6|R z(FOk~pUAPa*&?e0Z+5$2RuPb()L2A~ z9~|Csqag(1fOf0ofxCn;u8uS7NgMQt1#vg1)V1bLp6M&$5~!|q_CNKATwrO?%3U-tk4q1Na$ zk#HA>X{@Xj^}XP`8G;?wY5jf#JX7U z#{>PhoNJV^t;vn=u4`mDn&Zr`_(edG-$B2}t(lnKv0eHG+5HT%b##ec%k<{U1g2Qq zW(`B?i6K9mr*H>&m9p;~e$BcYwV{>ry{OIB zcc-xQA#~A_4_h(DKhISvL2OnI3V7g`>bNik8NKlnU1($xA*M34o!2hY7Qy8SHvngs zA{|Lf6-JkK${{Clb5%i-VD-2j+aW&;T3_31m`4e!FbXOyMZ8yZEj=9V;MpI& z&;{pIJz?Y{Jo{P31VF$QW0&^nhP&5A+w=+g=vaB5#D*K<>CS=s^r9W4er&oqhZ&fL zly6`94js(vVs7FVk{6>AG&-CMbEWCgR8o|eiu!G|>0>E-zw%p_P}8kC!dK|rsDJH02xSi*8-;H?Y*wGKw_SQeVfOEcn<Wtuy?ApTh}rV*K<{7YubGs>BH+h z3GSip9loH8EaxO22kwuBg1Su*)OA%83OXXKDg%2-SJ>Q3w9e;(sl+;oXTa%0B9^@vX>K9-s9`?Iv#v8R>#F^u~ zzk<;2=V<3%x+tmMdyJ*MU3}Q^-#NL#+mXZY5Dtr~BME^_lM^F#Tjtvk-2Xv3$SDXm zB+uYgJ}56nn@^l4116k1EdL5te3rm7+rF%&bIH;QhF9qC^qHCgmjH!I{4p0(Uj{hU zYCNo3$hkt!A4n|?Yb02Q#DoUQE#wf#{cy2R8vPLnLOLf8Zp+NYZ)^l8N9>4YQ*x1d zPUhH(IepC#MjkoQC)5!7h}lQY2jm1$;<5)vRxNyrdKFnqm9Z*9Uy&UDYgy!G2Tp7e zNCe1)2!~8rljn6bY|SWBXC;+JljNXWo*p4m0cr@&G}~R_`Ife<>i*pM&i(Rpg8cxK z(=h&&FJgS*2j2M8&#q!eRld){oMmJY4jVNz^(mzH;Ix(U<7`{Kk{zKyn7!s9*dV5f z5`pG7Se#K_*NX2O3sMq6LDk zCXp?!oWp=Er%C+n;_ZuaEvgy-znV2>Ac;{|wztU?@x9m|nM-9TUy~2Fpa$BuwS56| zGh=)OBkj z`mZ^%zm-=d^G>?dzEoZty!!;)f=|`ct}+9n$2^rbB(<<6LoKbbESgim7ON(Tk88!v zEaPB=bWRhGJyp@1CCJRsHd!e|Z#cFa4_1)UkSUYLHh$t&xyDGvn4eAFuuqg>V=?VU z5AeESiZR7)_G%n$IIj%62kYFafh=b!)4Zxnm*|-G^;Me^Vz(EWZRFW8g>n-H5@usJ z&}4J1lq3!Mj&c8B#_mR-1OmU&px)f~Lq#b#;NZ6fFksq#R8~SITI-Gk7|ejkp<%t1 z!XwG>()?+|e|>0<#ZA^|6N`=@F$a?y{F|>{peZZ{!t2uqW*+GDzv^*xf0^0;)Y|?h zwmtnn9PfYm_J33B{>`|D_z&a$-;C_PMgB9@|2OCU|K`#E<=6i)yy^eZa`5MX5rY2j zhW&rI_kRxl&F%kV-_!rA3*e7tN&ojgfd8=g8UDQs;=hwF8CYnU{}1WXqULuq{AaaJ zUiHdpb&ZNjVx#*issF0AC8}KnA&MzrO+usa0)8kY;n!@>Hbdq>h_*|UOAsza9Dc~a zZW?Gv=NHu{^Cwl8*Xvp#4o``j^6NK`&tM2Ag)G7Msf3oZn#!NyBAToDf=}F-dbVOE z6+)Ms`HhYad>JYCXNNcL%}+fVu7yqG9PEn;g6I0fl#%K^$mcteccQgdh?R%N1)0;U zMxQY3hnZIUG^y^pj*bs)7g~4e(=* z)%8crW7gXB)?z!sMETzrx>v4iWeh_*RbKrswoR$lSaSo5)0AOS>v0evMI(jmuPGHi z!=5B;M#t^$Mr$GEAm6a#p5)^}|Gsh$qRHV$Kf9xzP)d#D2RdP@p%1QDb zaYqs=plkTn0vQ1L=_9iT_(}AZNloNS_|2ow24J@4v&Ic z1_c!2&iZ{vu7N+0CkVulmr_cOb=i`P4Oa4Kghv8b1V3BwmhkLs z5hZ_Qz}Y>g2%3Og^26w(D2-KhUXKH#U}m#x^J@OA=h@jXX_kIGZ}KLfv7xk+$CnnP zrCBX*u#)DWw`IRXyy+XxFp(|s-DjxC+Y4Ze95Oj;ABRwoj<|j_fataN>!W?*MLL+_ zw$5UuOqf+=1RbI6>mZR8Nz5&OQQ}tx@FbO>!zm3E-+O+lV~#90>TikFOg*983-I6! zd5KH^WdwuESM}?54=d$N9Y{-qn_kO)HfsJfy=-zIiD1^Dy&1VJe{!(8QmDGJ0Y5~k z)3_da+z4k(X~|g2?Shz8!gW6r5^EN8Nw75cr0FErR}sfwKe2K|%=rN|zr7TxX>cEd za~4)L3DgTw<107GlI+YMX8M~|XAGW=J8NI#k_;I{>m*7mrlx5*UJ7&MVZ<==qSqKq zrJ4ZEEw4#{$I^?*gmX8f50vNW_^m+xT@0t$U{C)?c&b9?#aE1K*73!%sEE`ivYK}9 zhBBCj6$nROLByEbkj2$)wMprZru)Un{YwOtk+z@q+7UIm%w@#VSs{+ciG*ueO7V{B zV!@#e+_7)AY-*aeQ4`0Y1zPN-?(b0AI2oexjs(!rw@0GphwzNu}-`jz-AiQg5-R>T3~ZIBSvW z2+7R(dzoThslY57%=1pLXdWzRSK=+-083(EZxFrXfG;@J7<6!d11_7@CDl!iD>E23 zR}kTkRNvB$T`mNy>hYz2T3tBroi4aMf!~uvZfw6`{X{ZDT;=mLa#6KI+Bw+aZUn}J zHjmA?BWyh4*a$}Vdl1zVz-H`Myp?cAnV;oLz>2IdDP+jMT6bUKAf-(`V2wDKXzd?& zNwmKUmsO;jXz^$X&s4R=& zl}cUuailLeGvX97FG_7_Y{GghEaImnSc4|+C1!0TI4cw$a)<8zoDo5Hispc)tQ*$` zR|`886p=xRYx%HG1>cC#grFD%%IL*=&=^850cpzMZ8nr`N*yvhRibR&tPw^!C!J5u znJrUfqM0G`7Js1&GW*nM;I61oOLh$ky`U$s){}>~u}v;R#kR`2(kWVYiWeYHqPGpV zqoic5;gBQ0QJUsHVU@W-uHYGwwC?VmR&Z@fCA^JJDSsvh2A7E5kS;SkL$({L9XoG# zY1(Y}!B$2iXHJirJV)_(`)BJEkEsul@{l_yWK9u(U*cGJaMB#$KS9tGb<6j-*0V0Q z+%BKgxfJmv?8So&OX1p8CHnD5O)dAn7xEtXlYT2)StMgrWlt&6l>fVNpvuD!wf=nA*@$`YyDVc>$*JopA^p_Ag zVwTL#Fs&0Hl@m5)V292|VbY6**qiB20fDras+p(SJt~?VKdKB=YVu9$*e7{9s+Lme zbKC8|&}@+~LaKr}lZ!I4apTjuk698bMl8-Tm8gd7sekvCBhmaC=i+P5CHt2jxenk=*=-Dw_qi52YETkCP zm?IU=L_Dvxvd%MT&2M0Xa%?#dM1jDK1c*Swg;~yA4~b-7tS+i01-#%z+P*5AkKa2! zwr87Axl4zv#c{}P59()cr*1g%_L;D7~E-t{lm0x4#d$4e!mc7nGw*v!R z=sO%?&ss8|PduG4&~h!Wj4S($o=8FhEA+67xq}f{a)?Xw4j`uZq{^y3WGb$f9R#uv zXKovg2-W4iZ4eIH7msyRR%JZEDPP)0K|J_F^Tsb`ooGtiiVI0y(PU}v*&P=QoP(LW zvUYy#J~;U^C+gb16WEb%k?*j?G+%!Z?jYmGG8+;U)otD7SrzWF}4yfkGirnWFST&5JEAx2`)N8&=X z<%)}0cz>S)ng`S93*Puv@3gRhgH)W{CqR!iS^f6OAHdH-cO%P`*bU}ac=&Vg1b%uu zu+Yr;cs6m-Ah=Bd?clV!BW4hKBfGq-tkAku(HT^#!B4{HzWwAmol#HD9J!Rrz7E(l8dCXs>qPwX#qXbDoxYNmj3GkyL+fUUU&s=ZK(_DCcyR zS)M|FJbvekY7)H__7FW1^>Slz5V9~l7rDR1e^$TMQp1A+^w1W$DjIR{b=rse+s(@~ z+=E_Ht{xBZS`)nTesO4dut;eMeF2aGUzfC()*BA)<|obu?cs#3Ums*;j36BK86KAV z^A{Gjb7RO$*!6BOp@!vwxfph_s<7W@5-5cj*LjfMh0M=nsR@K8tUhD`{OCTl_{@x= zEPvRHxA>d)VTKMYx{FkpNVcJg(}(**cKy9*FIF7syNI3ypjW(^k9Cs>i8dv@#~wGn z;dv*2fTmHAj*OcCid(_%VZ?xaIUx(P;0(mv$AbXSJIb*M)`n3*cpS?EfX>IPVNg(v zcWC>K2Ro;R+gG_bEaW~Mt=iu<+wF}jPZeJF+A#Z?v3*s~5PB1Re}oJ4oZk-nUeo=t z-n9_7ctK3e_JuXW#PtvgeqA9tU>8*fY~Ny*-41fHg+X3fiI=Q_Z=A@Qj^dfJ^C>5G8JgEdg32GHn6ZKZsHX#|!O)Ucn z6E>C77Lhyx%ud;`)OWmA*4bT!!f*{}On3&5;Kmn?CPev?b!n~Ea2Csg)7cdq-PWHo zkdnNs!fBW-PgdxP#&k3jY?U!xW`xKWPsS+6P<>-wY6|QyS?ii9O)M(#nSw&C#`MKh zxuKfBmfzzvOI9-R?Ra5%M-RTp-(Ir9!3-=0>TI z0v823T0LM!sXj2$ePgQG_iRjUzv+qH0C$98cz`rj7-6|T(-`HoSkf6E>~=GOg?yl_ ztMOUe1p7pz%w_;!@Eea`9w={H^*%h9eF9y^ojJ@!t$*I~Fyze7xrE&CmQGqMTds}g0x6)904?y z>8axf$etvQ@bIlsCN`(bq%z}6Mf5Vxe(g#08U*IsP6Zd`{WT}ReDT+b=_;Zg1q@(Q ziN{K|z2ja;lEW|gd4uMDrsV)a9Y;`JyP42Ev3|5r(X;osYD{wI{EgpMd|bw*5cg zPfriQ@b||8{uA*3y|Mnk1$;&Zmj3|!CQXZ&4QAxO0G}OukpXccpYWV605^4nn6J5` zw^`e;&dy~{IvrW4o;L&b^PQ{n`HpBT+0a66(LF9C<)95V-sMi?Bj%;0RnBg1(N1pJ zPC5DZnrG;G?|UdCgofPv4WZ;=JB0*EGNE$Eq?_!X*sT8L)WNeSB0QdZH=e~QzL-qb zuk{_%#+f5&n0(7CQUxXD_G;yYhQquShX$Whk=Zp!Jb|3XIAG#VP-2;2Gh}Yw%1zal z?2f&x-0F1q*Zf+Iw68-z2;f0q2F&sBbpPRNrGp*vZ}uNNiip?QA8N^Fr}#ZFDl06o zGc@~6;VKs0geWO--km*zsx`?gSLe^SSC@AFbb1EKKm~UPb}f_(g`x}iSHB3>YMA{{ zBOYuBd#CPSl)wNwHUJNvY?!oMaI zT`4u9$Db(CBJfDZ&X8GUrG$-#lRNWIY}K~n*%o{6e*4+&?+mYAhAnwr?ix|Dh;);|OzzUcY+uvc1J4-m< zp&oaUOU>eE<>Uq=M!eZ4rmRG^TzGs-1{L{vCzZe^`3=nKjKN`AjCs<&$Wf7)9j5)5 zu+Zt;(3h-st`qimCvPO)tw@)8=S3_3zsr& z%YSfkGM9la`*6*s`UODt(ox{4kx;=TM=6^yORfu^1qg0h|6ZqvkB&a@Y;q*3JnMVH}87YMs(eM&(-FS{QUj>t;cMt590+o4~yH*A)J^Ee4NctdyQ zU3`!(OrM7}zugQ}%4{f6YgY>E*Ull9*~tF#X#6JDKg$C0`JNzit6pzLAM|`alkR7Y zzOU}GjVs`kL<+vUy*b+-+^OtOR!va8u~x1IAFkXV?DAI5q6-v;qT*%uT~0k@*hMp7 zM#-i#HrcJ4v9v3$9j=Uko^E7bam3_lY$Fclds4Mmp7B9w5L`)Ft&n7?bjeS6mzdnR zq1GFLE^Dk#s9g6~R~o5_k|hB&~leBwKlN;^;`c+{1tgRMX7*tltjrGxieJu;sX2BDV5_|vrwWB?zv zRUzvXwjZkt0Ayfyrajb7sxEvx`1ylP0^)}F8k{vo7S*8dP6%8?mYmD(VOHaWvwN=&iS_ zwI#YturP0Bp=-YgXME64?rmEu+rS$gbQ(hf7nu^WWn}-;B;RQ<7a7%&i4~0?SnypP zsYyt#MN0oQCeYukBaDI1!58#}&UXj|kEVl443L3=(0 zJOXh*-gP`RE=O!e0yZgU?2ZTm0(A&>+`om5Xq<69SNxhW9n*Z%GM5&E#mrG^QiClS(serxHD$(qeTarzAU(#@ZhZXkAX0e zpa+h>HwzI6NgFy3UNp81BT4hvB_#I2eaLFUm_boix?i7m{@lfK(uLt^+*1e$aP7H> z`R?EByF$nJr-2Ql)a}l;%ZQtz*{2~tJgZYm1eZg`3pO_B70|w$I?%(yXu8oY@Yz(}Vx6H>7sI)1-L6zwOgY%~ zH&U;GNLiu9EZJ%+rl#9y(OT_BvsYU@zRKL-&Ujt;ebU&jujF4;Cc$xoBj%o8RrllUaU+n*^jjz@?iCYwm zuZ#QIM!Q1F9VbKR4xN!Do8s(9;Rjo=q-N-fZaWi4r`@ck7B|Hb8Z_oV&wt6^7i)Ds z(+ux97bCTPZ29wU#+cBaP?@D;kOZkAv@;iw!&W-Vl)=;-IW8N$N?%y9@}SkuS&$NEQL%PjUsE%>s1kyiwnxR9 z0EGN$!A@ehB7@4Kwm2s14^~E;Hr9NwNlrYO8#6j&eF6o1e7(_tt3RJ7-=U@IuJ98w zojv(JO+m}(K03iaMRS3)NQ`qq0ic)@A-GvQqRXQ_yo_~pSr1v%83(902bIGMoG_!C zqEth_L(-S~HI1hm3Ow*Zhw3`O<^#69ZWn0Am8jjyEbapg&Z;_zqTV$kcWGF@cIIwE z5I#uA$QBmGYcMZ??^A_l#6WAvs@AdWcH489I0r86C6O>io!7AOu3LUXx1wlR0?5co ze&g*E{2CT3pl0!Z{UJSD?m@RCG`*`cZ72Puy}iuW!Rkn=ThKn&h0e{PVAZ`|)^Ylm z%`A9a8Cgy8m+>cZ-<3sAaC21V#mgg|3`9H85J>kHR;JEFc;YN!&w=s7YiHPR@-?+D zQ0)r(xlSJjHJ}bu@79^=LUfc;kBK2(b5KM0CBEr-1K5+&P5-YB8&?9ax@OC;^5z z%unESgg702RJR|QUPEsxQUN`OXfojqkr(oRMsrmD`YE=Vqpz{kZ<1O8Mt4yi2Estm2Rw}H&;H8`HWsit`lDPU3)>RO#eKenT zn`f||PKkJ){`8^7=|U_GUM+g;MM6)6RL@qi{O!A_!-x+W^1wAG&LBg>r29z2osz{B z`RL+0{p2xitcfy?;^m(zz<>0|7*$Z^ao?s>f16Tow8Wzyq7>1UfMAL>kHs(pMwlyz zs;)NWN)d(CV~A#r-H*Xc1%=&1*7bJcjD#;>mM_03kvwNxRR_5zM9_dp-T?M=E|YO# z$;;$LtQ(IG7QdTmLixAPMkhn^ZP}@N^ixicruQ~mcz?x~q<#c2&P+|Gh|A9K7z^U<^`M`#_7CN4 z1jf0~g5(m+A-D}X)I_qn*KxIX*EcmfE$k(kvi0GM5p+o&Bn>i@*mdNVjb-`(qVMp8~yq8 z<_I&>t?SB8FAk~zwIRy@s$d})13^hiXraA;+xXtxo257Ttq#7HFtN7HWwp|oFJ-kH z{Rzei{fR+t|vh|gZXY4qAE$BA!%{nIVU=-Zp!?BZF)S{rn+@fi1_Eh~S zRtxr}kJAwTx7$=}aZP(u%p!9HHrE7of*2=`iQV3nd2N@pJ_TI-QmN^>cJ-t{_fxxy zyb#%+PfJ|XrC>%SNyP0lp+L&z#pieBDsgwLaa*lN2Os;haq{}Jv}TJnRA#?YO0q(z z%_xa)-^=3Ez-}z0bl$ZDLkpU0 z61@d??>YTsz&R^qmEF4^{4qFs$O`w_$?$X;Mc;8t08(LhQi;U>6@y#p`96;k9Aumb zc>vk0ieG#)GVjJD7rHoXzql!R@DwsIpB{E)1SY?YnI#FaT_57qUlH+TZPZ-cZtarA zpfpS(D=Mq3+*7VeZ6Hk+=CYu1d0m4^q$5Powc()!QCkY|N)!*6v#4jDZ%&s12ucm- zmU-v>ef`EBWrDJ=V2)+HD8+Y=nfFJcYxS3j%-xfJHRoRnRj|ut!nowFonyN~>d$P! zuW08^-3@+8?gM}B`WUr&dtE%LDHwkRH2J8oA2fmBV1po;UV0Vldjl!!ZFYH#t+D+g zkh|bz%V}#xzMug?MmYMM%GG$)&N8#uCp|_-zu=ku4KMVIvCip|{F11~H>c79@1)_% zan>oK4MnHXo|{c%`H_9783^9N1emGJf?5j0IW316(SN_Cbzf>qs+_*^E7Knz=#c*; zUG58?%}R_FTI4++vugY}=lr8HYMIkNDY%83Ks81n67&7n#g(rr`;tK*-TkfPkggQ_;;`6@2XS!%V*^B*nPX`apM zlB91+8AA<1KeE!pU_l+;WeVP$D(J(%1kcu(jD#{Uzs zRHptA!Ng*|Ys7qcgSp1SgFmCHZ{A?LTQ-jA-SgV*$kS%LTxsCS2!EH+xj*P$Hpy4F zj1`0$X}}w80ey_C4!VjkL`+858lNH_sLDav(&izj_4=5hy|SI#nIdN1eEUGNN0AxT zXe&Lg)2FXrkS`5sif1aZavHf5#gaP1JM*d0cU zLDbkO{u2V1JY_HQCcQt9cbQ72r4xk^+nJ!CAg@lVd3hOw zc`3ZoLi61HcwEm}-Jb9!biO}E?aihL?Jd0V+C^o{p!2~*!iQU@OGY)9eaPXqhoPT# zB`&4jeKX4Jqpe_=kO(^2?XhP|45d6sxC#N$?d+Gr?0WkSeq_^t!<+-CKjt6EGr> z=fZYF(|M2mv(~NzdiJDJ|K-JV~hUV#IMRJXw3(>nWwc?sBM0GgiXzcmZAj&d9w#;*V3Lqi`ot& zT)1*u$v;|69;mKGwB3>2zM)+ZzV3+H)#5y@WWn38NXjPDa<>o(Ye)&zYfWwc{H@j~iP>H4%Wkiq|KRZD*TrgRRQ*(gT&TJpS+=GgN$b5jD3S+<28YIPK?3H)<$iM;rT3Z*)= zfOz4f?AVR-WcNn>_V$njE-K)aYwKkCpKXv zFe-#;d1Zp0Ifz(%nmG(T!~X6aYtT{Pc}Eo7-+c%iNJ~S++*Ijy(*i7c$MN28_{u}}UnD7*WpN@JQ< zayTs;A(o-|D6C>8N!hF)Uq16#2nJf|Z8P3}JJ{D=X1xx@L_4}@;A}J(EBKrJF}VD1 z`5jRelq=2-s^}ieTTnP|fS_2QxVtg}5(h80>ngrAuxxA}DYEn^4j>#nd!j3N%#sNyo->I3NXNcs70_tdNG=}p2u*G0*}(HhGnSX z>g#4SZQ`f-FWmacg);$X`Gy4F2#6DG-$gg0WjrOsK+SZ{gy87))IV@R1{oqt;V&9U zO}m6~cdb5110;uAdM_TbL5Tf#h|eDNFWrIL%QFZQNR^UKcwS^$OxtP;-62Dermm?q zdvRKFb_yO3!Z7ex#ry=Fz7p%$+65R%KCfUeBi|ujMtZcj?2C~#qwQ0w&tJ%$j>EGi zoP%ah-V8seIbkrr&fHt>yua%|DQCloxaig<0!vltXZRiZFd6|ih_a)*6nmIUa=xVK zTDKGX4f*npGocVOGC1`daL6?iq+Er6EE z1;H@>#2h-gYNfE0?&y zA*cjAbep{#C<5yNF=ZRn~vy@~$X5xFNK8 zn#h<)rAbBu)VI0RhGN+>t(um;3ey#B7;yK-%8nj*Dy~)MXPi}mL01Q&8a7Y73ai2yBthN{~NSIEOYVgIZ;u+4BC-n{t|Vta~n-bV_U z>3r5tigK!l4*ozX*Df~AU?{abxQZkX>ZD8{zH`n5;l-BjN+2o%K!=t3Uo;rF8MDVK zHclw+bHQlH;BgY$&$Pckph$bqRUEOu&JXSvRT-I7yGCM7Ggb{{=1SM@Th0K~EN79goXqZ$n+pQ(YZ5Ld1 zw!*#5jo?s%FGIV@+Ox@vbTMfz*PTNmfFS<(P>5eU`!E`r93C$1;UOTtH44mMtV5#hVIi){UZc3>EWZmA z{)~`e`cd>00$n3Ap%c)%Zy6!+t_{yrPtIlwFBZ%e<(>;4yNjY) zukBBw@^z=gC3Q!+%SR3fAud35Ct;dFWrsuM8nQsWj*Hr?M_E0Bi$$gmjo`+(#$T{+ zw2pR`4o`jHK2QS*U0YMEAxye)vs$BW0nfwS9Z5;%*fS(RP#aTatx@$dCQGEkwBh+gzf*JM1+gFWfc#iS*EaqE6 z%puznpGhuA<^?BaN$Mm{(b+1&C3SMZ9t$5%Mw=p1_}h5ihGYTn^y_&CcD{n)-a=pA zK`VR5t+4Dudj5gv(U*`8ur_mt5jxqw@|eZDs3|OgN{Qy-dP83%VfU&YqayaI&Q|mS zVI7uI1(J*a^CsMk(5kqZ_pq%}0sjH<47rhx{s<^rq~AfW2A1f%AS}fyuWJj>X?hUq zr_$>BUEnx}DP15xo}Xc;d14U#0FE_9zS9CgYzCT6P2}805$Z5x0F%?1%smL zpv*`Gvg)(KR!emY=~p^D=PTIoO3Ea=8L)Lw%-x>m$sd&FU4Se3fHsdhzkE5$GK&$l zv^iXBeA{_~BO&!+n(?a~J^di32Hk-H0-+7&X=z7g%r3pcWa^+!Hm|ksq7a_c--gDYi=hd+D97WNm(p|nn*19_CBSw;6luL6{ZW!ZBu_hW0o$uK*o+lt5b z?si_ggL}=1?_HlFVL8^PtMx7a!=ECGMy>$fPIZuL1*C8ceVr(eT;V%=11{aP<+(*B z3Fa#0{)s!3@J4ct3PycR2a-%Q8Yi;zu)PG~gM4-ex%qf5;vy%C094v(?Z?GZ_gG=6 z24yL9{@Uv@sbvaWQocK zh3R?SNtzLwvf(HvLZDqcgV4la-_lYWqmogj#+YVD6Xbaab?}=-hn1^S{LZ=;>1y!b zG$*L7uAePIo6hRvzwtw0PkZKmMbe#3K^%i49N=n=1L8eK=qKLhv=@X*x70lnD@&RI zn_VnJ1Fiz8zh;kq>)#h`fCE~t0)IfN&wE!E6{V)?Xy`>UFiZ16U_8TSlhSie3%dj| zLt*F;R=g7sBhS?=%KRJH9yJe%M(Svc!SvIL^EGVCg4x&-!+AoDMUJ9=KvmX!eg9`g zknR7e%Z2U#?sEBe4edYUa{2ek|Nkh0|LT7E-?&`Z{xxLt|KxIEXXj-75Axu!j!oRb zNW%Awe*ctYy||Tw=(!JN)E`!^OG9g}XAg|FEOeqi5J=Jhfy2+eZ{M5>&x-Zs!|1Y* zph3f7gT~J7osP8yfeXTOzm=D`-j=5m{R@{yxgWo;Xvm9_!SK&Uf#*yWr}OjK-X%ZJ z8n<^rhRng5?VOGCx3u@~w|4DSo3BrSCxxf|y6#Gi-`b3>-x3#o-Y%4i7sJu3rZ)<% zZYlKAPCIxv|NLIF3QvB#L4O8eK~kseLvweczx^Gy+{)>(3D^_C^0v8|K+dg z;}NYNDXzwh(KST^y*_qzv%`cEh9GXbT=zge-dw{sj!OsI0{dfUGlJX6yH@OvqJ(rT zUZO6)xPakEk?#{DbeniflMimq-^E-o#)0j_l1G+Mcd+#~b8H_1T#J#Abz7$wUzMbC zKO=~iCoiqp9KJx&o*)k?%8TrgD8{afK~okU^hXscg*N*yKN0@& z%IQza$?!Wdw9*iy)&4*5O-b(o-Nf#3QR-;F>+`kuiTyRzS|e@Li&Ky>tk`DX5zp=h zXCpTE2~a8DV_>{mX(vpsyb>q{*Vg7*Ts8Rt7G?7d~nF4M;g4>+dj6*JyOJXm3DgM{Wg6f2lJ z*lCmKV-IzIz^n&ofPTd0P-(ROH=ZC^vZ+t4pNN44m}yXNuO9L8n6U^{`++Grm<7WC z<)?BEE|b3T!)1~T{L8BNOFqyv%>-}r*+1{(Ga-r|s`645T&|nRtT2zV8pNI) zfH;y1y~o&-VTB{C$%L3jiE?Pb?7kvgg{VQC`{0-)F9f$(?>rGsBN}Gxh?KZ7gk&iX zlB5zZHf23mq?>G2G8r~0YKV^HBGpr5-F-n#c#j_tcAXH(a~N1!Q;9Goeo-VC7_ifb zO-WI5ntpsn>Ui+*5oO*L{$4TEObpy4Nt?woB_uQqZQFDn9gSs(lO|m6JkGc=II9SF zA~Xglr(Qg7cXoHSSmT&9gA%A|tx^$PH9`c4ocn5{qHQ~#HB(?(y`{q=&qO6*=DFK6 zvZ}N13CW8Wxde5mOGnw>lcsPXmwNuaRK*`h4mH7Z4;D4GrY(cAsE>HqG$yj$8#Xf~ z-#W;I0^gbRQR@v93W}xk6GvNNCU+fLVvPNUW(+&ER%!!Bg0;{$)K}hx$D+zT9Dn1o zLpzBN-|x{F6_ezR#bSd-aKc7#=_V1a?^1pP7!Dfi@vLNSxshIjwp>z}IZm4OFH)bL zu5m&&Nv}(UX)QdmRcRXO@-QNs!rkv6mFI1!FZ*VP=#-I447rGa!W@5Lo$%vRAsz^P z{@^Z?b1P;IQ5bgC`@u1YI#ik&13D66Cj`Wkp`z2tY%5uTAsHAS9}E>iH0EkhB_=-N z1cUT}a5rY&wpsvB3*i{-k?I&5;KU#1KDjY9@@-yLqM_aY1JrPzXxz1h@SR?>CMwTt z1xQEgY}C6Ti25Z-Y_y-sR2hgp?LiVsuQXwI`AR%+6`MF{D(A*_JF&!6+|O-*Pph*r zsf&@eAr2tdD3I@|_e2(>zJAqWgZP0TDAK7Go69A#X)l^#Af0X502o}=mW#|A z9Cz6Ke%v?!E*MvX`0iUa?^e(gkCEk2O&Y6bOH{{m6V{4NqdT0m@+; zxhUa|#L*vLd?Q!|?mqot7G3i|i8V~dzwzwKI6IJN)*IuXkYw>()ZUb2P#27YKATF3 zZ8rZ5uKXm>7cn&pzG!+_Pbc`HVhwZ6(Y0(?2pwVp|5D9t|LYFFj& z$dTUZAUe)4TlBsh<2}Piu+`j&}l)3XiP()sXpFBqxcgjUX92i z$vm4*TikM1F$VA9WOnE}ZE9<&2(B6LOmsKS^1CTHbn0Gu4*K%qt@zBwP+NPmvo!TW zZLW>w_8XhhEtM9HEE+J6C-(1i^JXiFf#uN?1x8f|OL(>i+D-BiHj~GnTEXHh(l=@z znlNvS!h=5gVQ4I&zw5kozndgIp1qv@hC66M$?2_`DZwOJBe4dVvT0L>O)+;lRS2LN zf!XW5!q8UqO6rsO5A#;%r!A{iCbza-G(Sp0;y!FS4AI`uOHcW6SZK*g*Ax**&jtwu z6!xz1uJg(arq>(sPL%&G73B_ibWCQ25&5mFRYhbHZ6gdj;FBc@GU(kfI=LB0Oewri z1YwDmaV?hJHpA^gSKxUW@@;I|HzNp_y`Mn*dnPD6#)`#$mfB!6-gA)g>gT}AvFWVy z$#%!OHPaJ3#;Nt|W`ieM-Rk4Y0PDLq$^;lc=33zwFhb|gXxXysE~vF2QLD1zKyR$o#umdT}wIfq(a z4^D4=ooyH&!UcndD5~HVa!zQ3&Yo4&>|x#leW1wC?z=B=%aBo2!aR_*?LA%k|(#gbsDVFXSIbDWUYEI|zIO zUe>~=^cOL9hjp9;F`y$GbvrCITfP>7$A*!*^+v>^OjdaG>KI=m426GQ9!Q+%y@GBo z)bUMv*r9hS(Ev;|47WW}mn#cCL-Mo&|IU(+Kpu#2RC+~-cFY7T+|b?xegqclD>ep^9{L* z$In1$_?lume0u(Ci$UZ-P#8wN(BStO(|*3)-xND#uC)w^=)=*60*hMOD#%i^mya6m z>o}?^Az<1~i;!mL6ULWArx6L*3)h&E@7*U&CCy5_31LM#hDuihS;x^!>yR=9WL)8T zn8E~V$0h`~&3;Eb1OcagTqCp^MLRc$}qIGH?Yi={RX;ykiFd7bjz>oF~GN zN4us~$2p`go5wh;{pK;%U3*rcVeF#^dv$kk1cB9h|5=_3Y|4K znyDn7olb3IbTtQ@k{w&I47-&n>KiHu0kvZd6xp8iXNVsJdPszrx$AYvvTd2!< z%68{bS67HD;GDFO;@T>*s3xT)haFQMW>I&2GKxmi?Mn^)aztwT#zy6mI9rua*jFMM zRDi7FF&QgbkhQW0xiNdS8kYKaouR#z^0VgZ2G}e4yEB&oiY4s68TQ0-QBc}iIct`C z2zX|$LcShNrG|dvW3zr)1~j1PPw{l(WvPvjF^+*$Ig0^IubwJHX?7tryWJf-(D^N! zGPH5&f;n?{*oA)`}>O3?et8LvXK8opd75v^Vth{6qSVFj^Y? zUlYaLRQn-=9Xu^5nJs@CXyw(SVixCCz;&?gnFAWvC&-5k9s>m-*LVL7P8Ye1lqa+G zQBPGE@gHDPj_i!^nwa{!0vaS3il~`w;OLS4=`|^-as0?(e7WoQm-d;j3@_%L4$BWJv=NL-QvmV$kMb_JB zn05}wp+?pNmuoU}fH(#&pA@A6pkYZw+L&Gi8jVY`#3a64fdY6ZZLU{1#xyjRmW(qoKqt#%#3+I$ z;co*6B~2CfBaIDq=I;C8!hb=Lgx$BDVE9yI8adcn*M;dydK-%-uuO#R#SO{=W??nDKR{n8rg&!U2F!=teBt(D4v8Y%?k|Jx@YbQX{pu0n=yC zZhCV|cb-HJP>b`as%8h603(=85-A+9I6A~D>T|y?NyHKyx$KbOc~jd2839VGtts;} zNo0scYo`b_)hfFeLWStDgqkENtl7(a0_a>Wh`3~_dITv%H~`T*{O}momD`IG0a znoMViu(aaoWnGr6p5>UDxb?!RJ@#L+C*NtM6(y2v7aAa)@1nm;aY zX&PdW28Cmq{*Gl$hVMsjL?p(|lKkBn6U~(!ZGs?)cC$8>yEznZkfBthPsD7M8nMgp zVRP2dXLAQvd>-Tufj^QIj|T5L0I}msg442Nb8eW8LKVXcHYzOcUsQ1ynwnN#CcKJ` zrhHkXcDG3ZXC4jbPIHS6k`rE`Z$R?G`gNEQJ6RXB;T4UP2=_-cs8))9!{!VCLwA2u zrQD|TQ(;$mSP^mpRgv~B;g#pS^+Tfh9d8igE6L@-B{Li~xF;gpU6s~lIoP^5XwlOG zXq|SYJnAoVRPG$;K-vIR-h=P#I-4HLV|+BE47@>9={+r;vDpD>j|JR``r-XjwvzZ> zq+q2$K3@nSj2TG5n(Qw-tt+WXiFJ=j^a-(IzGTHh#ld!ADvG?GG$T3@2Gx^))~8KGC;z zBFQRqUW)0scgQ^%igmkYdo7y57YY{(%31o9xv^Uq_ajLkj{_(&tKg!F-7KO7wK}f@ zoNO^>a;ovPA3OOYsq5Y4IFbc(j!FpMa5XS5e`fS~#5OO|VT8ws%5MwCpEH^TIRP$> zJ5sQ0Fo!ZYFh@cU7y@IE!$=0P!U*25(ZQqx|y&A;Jy zQlv#rw6$A^Wf_?MphSRC6opPw;AcbIVEcUO>wdc=ey%D$hnWW9d(Sxf(G=agp>hE$ z`>Hr?l*e}o1ERg5%NG~Fj~G+F$ku`0q^p`0w4*ySd@egl+>ePfkhP`Tee?g~TpQ-o zF$4-R#NWpKg4nRL<~>DbNPE(f=vnSX1z&h*n$)DIUK%gpdkz=UXQ_jJ z2GfwY|^o!Q$ww-Ce!P&3@RoRtyD3Itptm< z&0lfowxLLzX@tp)KV5Su+r8^RwlvM1?}izyEL@o~t{}qePG+ag5N!@l2i?r35G*UX z3=;*ykx?RivM1!BN@VBW$Q3OD2PC5cRRqj8SLB6| zNXl-0(Ema!|JPpJ{|phz_Fu!4{=0kj-v#CWlve)V0^*SBxZblK)y{QGjy)<*}w^L=o3`6TYCL6cpsv64_= zDKk5Ez@&R;CG()e_p`mn$J;uyqnhwX<)>1QF4M|eFRSK>W%Y;iRiEDW=^^D?fIbm7 z(?BF{)kTBRI$pLC^ZB#b*SA?`Pm7*wi-^JOeI{>hs`dR){s-{+*chWpOO|OxyY2+) zDS5o{@Nqa!3u-BH_3-5aZfs*t^zHNY+VTx3j69GcSe-d|{`=6TYqwHavkqHWiCn1P z(@~T5+tDIzSqdwQ_b4D4Zhs1%eUzuPh8k@4ci&&}Crt(&3BB3+Y>#xp=@wyUGwI zf`4n++t~(T0;|*QZy~w+*Q>2hgoknev@8Lyv#izP%HfOI=qJ1vd1n6C3Y{YYx__d0 zp!@iwB2!hS47?}X=WYs;$HDa+^{8pmCZ_O2q)ab#h4jtjI7tiSB=C9_A$ZS!UMToa z_WV_q4}b!hA%%J9xdI=u?!4vpG@`h7ei4lZ$BAG>Kv3aw^Eb1g3-wPpes|JXC-e5; zifH_yc1GP^3k5dN5?l_;^{NH=!1lbp{>@au^^+UGoHQht9;%R!_k+)u6>(k&4b=)Rp1mJ= ziy1qy>K*|Psu>8D62rZl<_8?ZlGyqbw6JzEGNJ@T)ee)XEk}de-i4))mSWeF5#-W7 za=$$p^8fzq4&~-NSpNK;spc%TWuHX%>Y0!eN7J8&nf;~w8HJuc-++Vd+DVJ`OrC?u zz3d{d07a^nPgX-P#_XuH0QN<-j-6XAS4;1$oQJa8qTcx1HGWtMWsw84GeH*n`svWN>a9|fk(xh#?9T@3 z%Sdn<12Kx17TbGyLog!>huKJfP@x0;pI;xZM9nDE+aU_ftB|=Ewg_lqtekaS_@VAf zP4;4o5f1wet<#>(Hh}2#v34a_UVsUrj&`Sw8!h$j;KQg1avo##KkSbk1a7TQPI z7*F;=`(>_GtE451d7hnX)t1Eyll-W+J0LryYM1`qLFXHF`r-HqIpjiTU6*4<9cssW z+0>diT7_0&vr|jl)oR5+^aX>@cGs@&PHJ}q{ETzBf)Rq_oO}PX0KR%Cjq5hYWOe zy#zn*akTc;ns*sH&7|4y-7F^~IQplxDPYqD->@bFz7?>qNHt%~a>Ljy-D z@^Zd%UDJjo*C&VysAAl8UJs`Y`UE3hCLukX`1=QTy9s)kQ!&zPqJFnN&D<|q`Z~X3 zrDUt@LKqn}ZN>~4wA`WC1e8PZgdZS7_^Z<)>f zW`4543HAH58U*o9W=29qwcTUf|<7kyz|y;4r{sCBM}fk3-cokuzIo% z1y9)|r=+`(F>>)oJOc*NMk1#DfnPP(&Xq)=*i`f4Y%=b=EL0wxe*#OfIhdK$m|_KD zvQHJe5W+^}s|077mP_8(Fhsr#1ZFT+e-pk2alYD#Q*sTFYQ=ZA)<5syjD2gGVv-K2 zUgd~}mx#T%HwG-iiZ&t{@F6&ZxNiA2+7weVy*rzF`RmEKLayMB2e&|O;rEH5EFPIg z@qq#e1Wl)}0u)Rc1xd0%udarbyv3T=&PB+I^RySa;p)vM7oWcFpBYC)oah*f^1OkM zpM|4=Dmn=ech+phzotdDNMu)j1{33cvJbOu?1&l&GM}lA`tpbQC256Um1DO)r%oUn?BLA;m?2YlOyzo)5)J<_xjwJe(RjylOZeCE*1v6{^Jix7j%MOAz# zs$Vi7Hg}*z02S0ekM%qr#8CqnM(&L*%0y_=Y@`j zj$mitu=4GVd#V=2*YF09R8G`9EE8CxMeg(Z&p_9hm1Ni$Bm;`OqRAj4YN6mry;R!B zV5uwCY~j)IuTM-^qOGYVD9orM`r=Ra*fyos3gR{-m7C~DjzgRulakFyB$<})teKIG^F-0QFQ<_swqs^0=6v&&d2tiE9<06G zG&Ke2tLTU@w2&va94-(Vc98U@t8mF}O#~~Q+^ERWEt+0+i=e`mT-v=Y-IDp+1FsEo z0KZe9O3IPydiSjfu2i^;B4Cv!^EH!1iUgl=Uj=szL^%&Vkh7Tc;&!i08$+OpzWp27 zrVYUe#d6LU;J=H;Xu-0-YOrCQk(_lqt0(;n9OYk~_bq*+L@FKf!hYDv$68<>ZA=?W3AO_s$+u567J?IkE zN;jILcn#-8kp|F}d;CS+>T8j!`!b5T(hm~A0L65qH8lp}}hRsKRO0Ub7 z>WaA}>`?b8Oq+AG_BN~`NS5aZt;Q0HMRYo%i1Z%C`db=NLq3sgnJ{d8~|awoq90oZGWhte8Z#*T#zM#<6|;P?ZSkpuda?Sp!?(S?K! zjTW4q*I3aPh`it3^qfOjPQEV>7%6^Xz~qOF`p zzl7cm2pFY^m}L-9#h3RN0}zmuh1 z^Bxwom2{{<1P+8}}hoBSgzPi%4Tzr2ATDOQur4XvR z)O-MkYnFiT^*Su2KS1*=sZ!&mqe_Hyqq7ObT0TXEBPc{sJ!}Pe_C?oP4Zl`=(gO1$ z_~$JHfXjQyI@7FwO`k5#qU7AZTgJSGUc{Ju=n)l_^S<8uJTaHv#Q{v41^$qlN6-U= zb0-!0f-{TBbUJ66;M5ZHUA{cN6d98m2y8S64QaF(oX8RUE^P!%j+% zvWH6ytF95ilSDCRkgnlnshz{lAgQ`B!>;f=OAg_>iF5HF_lacgiL6=)1^T~&odBg; zqs%{ZW{$=`Di5eHr5Brwe)*)ISot5irC~CNhL!V@SC*Bgxolee0-YL~1o*k+Z z-TEf;f%Ut?S#7`Wm{TrqM2Z8Y%d;?6npU@(LZa6ZjpoG)9Q_&A{Rnb|eR1Lvec@&j z9e^Y0qGH`ZR$hCAy^^K#PG$%@(C2KQL-Jb9gvNRPmYkIs#zwJQ zCFfXvT!tN+l;6^vWg$_WEN-`pzRL&Z$W6`LBO2VeyNWGgO<#B$fDFGM57|JRq`mxu z-dxux4;21BiRAv~wncsNKKGm1`&qf2>C}G9PLhn2nLavY!0;lyypjYH92~}0sKR=N zkI}&bTd{MO{nbPFMI~N-WqL|I=gB~3I3Z~EmSo`@!-(#vc%Cmt`cX@MxZ^47iWWxR zw|J)sR8cY}Uto{$qLXImpgMaP%|jb!L+1&F%nhG^ror`={2RU}{FO2atfy+G@)len z2C*$ORSlykvSEWA{O9fwlSO_N+HN#L!V;yongqQ@LOO4Ef?7ybc_#ZoxWAl#*6Kqv zCos?Jw&|ZhTUHMCz`e9Oz?N;$4ZWlk$Tfl?1)aeg3~iun)WI72uvuQ<4@PZuVgvK@ zCVy!}zu1C80v$n$mD@*Paq3{r#D3y{B9v5A|6^mQu5=bykm>IK3vKTdCQ7(qS*C5#;>V9zl)z4_uo zV3s}sN%#@@B62A_Qn4N-oc^2>w$up3!#`p0m@KIS!PIDlExtdfB3djia~*aZ)Mva4 zLd$swx{zcIU`rukZIWwT17TF+zp(V=S6*Y%rNEBRQIEaz`oG;$F9NY;e>D{j+xFTp z;Mz1$@Rj9P^}DU2Zi=kMQh&_6-%U=EqZOH;C1U8+7(A}#h4_T4Gn&U_o+xs|RL@@x z_JfsU{547Y0g509kRY%;{&4dyW{E53zfE3v>eBy74+MXw!h~Py+aNWJlmw&5pDUcM zWT9fz>7T#dwzF}BK4N3>Nhla3y%hNk8+bn~ROjX0g0FiP>vTGJDXz{Wv-i#zp1<@5 z=rb^k`Nguyu56-}(Y%pkhI>Dh!C5`=6jzjrI(q70u zdE@GI6tzeYpMf|#%VP)SGRfURi)WXC*FH?gTbh;DAn4`UrLHsvtkjji3FMv+j-X)W zF!U|8q^^Xc#J=3MxWdG^4D|~-dNdDI53|L;@Vi2%>$WsVsrcG_-_15;ozl}>!(`o3 zx_4>LG9u0QI3f<(Kp^ck_8gpIuCmAiGu%uA>;RQn0@swvzBbP3*dt7fX%e9j zn9c^>J!1f2I8{K!%pyf0oIHt}&!po?!a-+n?=;C!4pptVpMw(%IdTXl&{zuedJKU` z9AvCv64u;3Bj5H8Gz+a4q=94y>~HgRs%qf)-qrb*?Zp9~vQ`-JhKvb`G(7JNkeEL{ z8~P9ftjvgN~U<^#sh-yFE507LDTtC0>7GAbj%AjTn+QT8a zH`8PbKY*9b*ooqIMIQeRz+gw}gWRTx#eks~+djc~YSuL2JofuPykNO-&FkhBB)&tq zL87$+WpMuTy^V}I(n(=dr07B9ZvoBDsQ4G~na6c!GdNlZ1sd^jSrWRGo0!?A{ncRV z02|G;45EY1QNB|IO@K-hA!RVx08FJ*x=N#rBr$)|LtxPy>%LcORe-=y#Sv1-NDxup zl4_ra54@_lue#?&%7$CNLin?qFFWQjC2h(>?W6TfcbI+Cq$Nu(&F?A7fRC!)e;-XS z- zj1BxR2<*S8=>A89_@5%M|E4SWe+cZqsq1A;O)L$C?A`xA{RJ%lXt?{|C9walCHT(- z_MdR+v;Jd3`+o*qS=iZF{zC}4`d>oGEr{QLReP_OL!DmBQohNqUVl&O&v23sf>`)* zM0D?a{zMd>fb-s`J2%YCvkD}#wO6}!{!ImAdtI{*5^pQ-AMY#g-)F5g-?!RZY+D@{ zt{c&xt+sAoo}@o}%hj$=_V|CAZBs5flKDT(JFzkhHT_;Sy0=#%TP!#1%3ko#pVYke zTGnmPpXw@J@Gm~TdaN&Mwtp(W%X9FLLo?T8yH5}GzaNxxD&Gb&^kvq(t!EcV^<}j1 z-IoZLz8k>9vxy#f0Gp8kfRKUSU3MO&^YfXo{`6bE+S~`0ZRJgu z8*H42B(&16dZ$gsSzQamvx5Z&d0&5BS8=BM+fI~t@&Q!^MZ(2B#H&vqZUfYN;1pHa zbJB8mw<#*bL>{o6!Hg)4lc0%7@S2~DpI}<-Du=^ZCHFITI43++a8KrRH0r3&yr70# zE<@h1J^zC0y#k@Sno{6eTObvs0$;!#{OD3tA&(Y{Gt#lTbZ%(f~Ot5qOx z62@o~vcUj*+qdG)_fZIK7i>mn%k0Bmnn?pWUmWNC5_tZ;RdL(Z{Ne}L{AvAu^1!s4 z%Cudc2KZ^I4t*+Fd*0Bquy-BAva*CZg|<|};FiYQUKFLC;tAO~qGr$~E8^*YjgP8~<^ zFbs~f23g%1t_`G;eFin#L1JwktCr!eY&Z3FNp9NpQN*a_>sX^PLlS%6Vu-8o^D}8F zG*&S4fyJQFHq#JX`i$L<#+a&(gxYXz;8>g3If3F_duiRxlQs07{p*Lj+c$sZ=QO!b zqA#eR$(P?U)_yT|^>t7%Q}1F4Kzqf{YEs;K{bG>Yg;O6ZrPm_yX`-map2x$obYYY@ zNx)oGm}iRn#Y{p2tYuag%|FtZpkW^_v$ImCiVWC7>+u<~GN zbUPPcI5utx6Y*<+fR)$MDCmN~ zp(>{-1J|g526N_PiBaJ~b+l-z^YpM#sdf;kL8~7pw9=EZW~GzqJgTsF-GH;BQ5fHs z>|)|g9vHzZJ>^xm?MIQCER<5qDv+>)cut~^ZHlDC2$z4b$JI5xKrmdngPL%eCUYX6 zQUk)T!$2!~9a}=m=EQD4G%)7cUXfXk2vicdc4~$gNvdMg!iHZXS<}C_nn84U$!(yAoLd1puyvD?x4$tqZn*yZN4irXEM6&B8I#!o_yCaV$ z5m41gmyCm2S;wSTr5>!pohPcrL25tLo%y#`bu*Yf8i7Fm0K*~Cj$%R1!n7TCuuZ-_ z@Hn<`Yf*g2CC@{Ytbn{*QI*lE4BYQaibKR5itND3S=M1TD6kLKxF<0jl(paqPrk94o%Gq zGe;x9{QICwDB6b~86Cdu@=8TMa~iVuo=Cvf`dF9dm=@9~ewC0~0fGWbqc+JbHCh8Z(K4gQ^ut1DSLRK!F% zH`nH-EVg;Iu}m?=+24O!RXFH|+A^qYt4mxH8QooE6oOz{C6&ifZ$ptc33X)LY9j_( z5Ud%5afjSWwKBh!bTP1+e#_t74u>n=)@Czn5fRS!?->S1}P`v!f+u8HFvI*Y~oc7WhA!1 zrG7goOCkmlkBHAqTVi~AZy7>FE5XEE6xor*SW6&d>H^uQrynR07x)Ke?fxk1l}Wx0 z49U_3m2X0Dat-0&8@vgGDVi%4MlJ*X!ky#F)2_{Y0y|rmI4{)!w+Rh&yrJwB*$@(v zB9)_k2HJvNuleihiA$~dyRLqJ1y#eU#Ot@^OaA*>XK z1;a{3%7zOrW%LeK8udXSQfCoUo_!i))nnu|c;cI2ZB(*dacR;zn_D8Vt@T({a|4zv z1GL|-=jT^YzhU-F6%KaL4sHh7c01i>WTo?lWiv7uC4c+vj`Zk=?9pe72|x&I`?7wT z4h};1JJ_d9xU4NOG@u|6k0Sg?^E?RmtmKL`1wWqD?sWNp(K~X*X&lRC9B-l++$bX#}lBt6~`3DNUw3dkedwo=s<{ z)5vU&9(v}4^$&V}2J4^i`S(jN*S%=9+Z;DIKz2}=k3SMib_06kJH;PZ;v4BuA#CyO z)YnuINjmWHrFFFq2=b4tmK@KBch?rHmqoD>(V1s(uqkzth1$+(Z3*Gyp^MD#LzfxR zB*I2J;Vi+`A{T2&+-pwV&H^9jwW9_9rtHC?63SMllk@+NOm{NGx{9 z@bGGrlvCXgNyur!Jyuby5ahxY&zQ(FC<8-SFy}-_68zI$+Ojceyqxk^6B)NLgmwRv zD{v_oVkCOQ)JrUa@boZon1+IP+C?)fbAyoI9=LzG-g??O_+~VBsXp>B`(`lhjWh38 z>GdCg5dfU`ZJDk!>H_2nLE(}rVa((4By^76W*e6~Qy3{>bI!(zuuDCa*XW|T!eU># zHA!D)4ad+|r;K+bd-7lZUV8%>roUVbRTZHyk9Re%xVuKsi3)}z?aJVPg~N0ike%iN zMVUX&Y9|`9>QT4sTCrY5hB124QYM3ti=5bfWL%1JgQ9@AjpyaeX|3a3WJ`slk*u8n zqR#fTEOT@Vz_ZfTBmY&>t9Dshl1~ARtz8mR%j^05M+U#%5p!{dm>Y%lc1{!T+@bt9 z4%dzi#=937QrggHceRS)7)q-USI-iApC-mt+|lHERWWK@%slgdwjnu@?LBBh1!nn){y#d|(fEi_cs>iW&qG$JAF%r;#$2V&=T- z5ml(a4;*Z{HgU2b>noY4X-_+qId&3S@Xk%m_vuRZNVB?iuXkNOhIRapZS@ZTN2OE?^KqgqQ`0 z>15{Yuhd#i=z|U|$E6~7T-8~taS&?*#i(whcih5DJ6lC?(>hZY-B}g)kD`upTfO3x z8KmC;;oC~Jxx1L<`{qHrN#*MtxnrlowB0Gqc3U8+Lkjgjt0W6dB8Slj0x^0_wv)C= zFynjF?d?NBN9eQRNfQ7fZ=AP`Q+K}T89^KV>osQ+q~Yvqz$N3>i!viWh|&J)wQL?pU6y6aa2>;CzZln;gHrMZ|9n!k)|_7mh%+Nt*Z0h zg>iSD;@s8ww0~5uN zrbe7sM3e~m=1k_N>a}2r+76nZoHYw1V=aIt7L~Vt=axm$7PnxoxCUK9T)n(%R^o=@ z`uf9;?Y-A@O3-m&FZ6t4`CPg$b2xpP$RLpA@8jh$?ik~pZ)Ca0#1H>8g8x7iip$}Dg6$xh%GEud(=*UdP^wT4qAXw_+Gpe83M z1*!J+!7)Q7x*vcJ20Y)0#on~61p_GVa%ovtJ_JQLGPcbH9wg8(o&iJ3T$?7y$YC9t zn^G_2X2^B51%rGuqS6)*}gEM4Ky{7d?)5|NIeA(AD@>wcmbBN z6>D=)>33triM1;F#YBv{F3-XC_jKi1Iu2?mmbqyZ1EH1q@rz#76+B2XLMg*~VtZIJuY zADntyvsXdAtzpIJOCX~lgo={1L4LIfUL4_e)*+4uvPfmca|1`y6NyufvMy#SI72?j4e+F9*w)k!AR9lMCCJa){&MIKRsal?|pEJ+Xt-o1?IC^ zYnt0ZJc}7W1K?gB;6#5tV@(PrAI4QHzrvm8frz;hLPTA*bwteP)bB{_=Wls76)XNz z&`LDs_-YI(x^G6LEXBTA6P_Ptw6|H4UsrV_@+Z5iyGs^0;HhxPauyCD7>0xiShqc9Y5yYBd(5H=podxz%)#V~$O0C(W}v`sVgKX1!j@ zv)$Jv(}qK`8-wkcYR*(XPURxL94nO){??Mnst>dGvjk#{q*~{EsKz^NOEq>cjkeH$ zm`uT;Q(z-Zts^&+o?yM9{de}S7{0k|w~9K$+iiVeuUmb3ZL4Fo7eK~Upn(3aZ3w=$ zS#$5n^Q5rYBiKX0?^?B?tVIa`gEwLY1eLP(a6f<*a}riIn^JoYyQI*y=69X-&>V3e>&dVK<2B%%SZ zU1-RFtcU3mRO+6*`JiY?Vrf!z`jBv4M`OLzYXbo>ARGzkU=#MlD_te~GYI&p!tj1P zcyNt^?bT;3fw^5@J4gPJw98H9rP6v9vA+|+Bfl(W>chIL)$sFA+TGF&duVns8RB?9 z<*h+n3oH%g4CQljIlT>sPV=CZ8WEo-0oXZZqCj!SfTk(Jkku%p7`4aA+d zL^kiolM}dVsk-*GD4q)25}O}fp9WM4{F&Y%p4DPgHUXL2=EY^qe@#GL6XvQ@FA66 z?tk<~B9P7iXQ0Xw_NY*bhWneu6o|moLkvI>yvBhrNfkRw)eEP1t zg8GC`E!mjuF`k~6Btbn(IFf%861ft}EDkB+wUj8rso&jI{h3eiQosU9yDO_JKLp_w zq$$Fh$rvvEh!LR3afBjGaX{A*KuuL4ilAr~Qbv6*;=A-28x)=eE3p%cEm=_(dB=Nm z763A!ltkxG`le@A0E0qbmf9vpyHr#jQ&Ukh28U zsrtU~Om???$t#cbQGFS8bnc#`#qeAUn%A_T(dzpw)NmxcdTv;g>%3n>cvvCN2DLZI zu-yR4pP?)A52KV>Mn z2Qd#zg)yhPX?1C~&On@3c-zD@n!Z|7s z^ix-Q?G1LmeS8QEjXh7`sd~;@0FcEvOBT0igB~^p zh^+ewc*GMt%2mlx=;cp9p}z0f=f~;5319WrN+bpSwYz^%U4c2$P%uSh6FhU#=sdoF zl9#C)Vy4vk^>gokt8d_f&U;2SSHfxQ_m60rS9~aH)fRe^Sz>d+~qLEctN=jyD zMh0KjdUcIt*RyP_iQq_oZ1hlI{#sq^eHkew87wlP?&5B`Op%pI=7x#7uEhXuH)7tU zAfij*Z(?bv`@*1g*DeulinC6W_m5F1$u>!9bEf(SjktyV#O(TIhcb%BURt~oh$F9r zg@X8gI$~M}1@zjY@;se48y-bEI<-(N7*x4}A$o4Lu^tWrS8{ZT5iwwH^kZ*R+0#+^ z8nuf)*4VTd^qcL3Ml{f%u{}M1Z1PG3~vXvt9N~5WA*zo2_b>umaYLe_x3%mrrRWx_k0Jp4QcegLF=T0!u<8? zL?&mYaGd^2b!_mSYPtKN%djzdyPSuOgY|x7!bHA+ zg+ieUqhei1NW~dJm)2(sHL=4EX4iEi{_h1iaE;y=;IJbI(DB!fUN+95jKD>a;AQpj zOK%ZZcr__GfP3Qjw%dTHx@C9LNr#lR7e^NE{SOo~^~ElCr$G$J8K}x12H=gL{;y zRs&VC88#&Kww9uxY-L0tHTX|b;_t#{pcM-@!9pC8Cu6{83D_RaB=qI30j6D_ySG#C zSy<;&0VaY4C=mw>fQu0#%8-@~{aG+`;iVlMR6H;3-QpBJa;{(<#?i{oH9?|0GM??o zdyJ)47CQJ=ics=hdzQrW1p!x2_5Cyv=yhM);)^!!NIN3{dQ4#0_@$4#WjIHl{`Cac zM#-+)c4NHB#m-vTHu6we_l)NVn=*6Lu97dA;FBcWqzY{{^71I(D~?dTxW6+FT7<8M zXS{k})*aTz51UYxq69m)!uT&+i7_63bKnu_;lQJZSJx;Y+u61_N_K|1{DF9=0AA$? z=h_F$Ib+$U`f#;C?Hv_0qxYofKgBVpo(hOdyv4*K9dHP>jBkY)|NSW$t>PY^4g~G$ z$cTa_7&;9k(Do6HSh{aQ>K|cFGGmTZXMysG6V@ng{`YedXSa6$V{v1xRD|>y%xa=X zVN;a2IF0ER7k5qv!XtY&j}x>YNhPvG*oP1~4vqnm21S52TjD&bE{~{H6}8agUco$# zwDJ`{82M7N50%pYL-X1ge-73vtlE)Nc&Q9gnl6(>+ud1Jnh7WlBX|GhDakZtKp-3K zH3|bgf!VZppd7Ce^$1~Q*gr31pfYjJ0{|DvV5^TXolq&8bxlJc_iQj|=B`Mryo?i@ zwNetKOeF~@T_?HIE`z$N+@}$m$@8^8THAUeDH#-d%YCaI)AZgOxst!f9GTNW0o6WK zF#-@9RH)4}g4!_gE0%An+w-3>XBNy%791M%*U;$fS!O=5sgWTA z7m3r++4xSN4%1Nr0(~kP;+I2|tI*i%NW z_j~aX$=9?hg*N!*;Bqo3_#U13e}9km)A7X|a~PWy^9IiI1`^*vRjo~;L%tLK^A-Jr0Sy$_B4fnHygO?_;onvVBbNwxd+*DlPGp2o{~K5 zaNBax$^pn@Al{0Zw$)KXL)LT0rPatjW(>Fxm|&c#ou8M|HcX>eNq~yCn+~UpQNZl0 zrXgQOa3!NFW#fo8imMQmIFU^Q?zo2=DV8zu=Ju0cSWl(SbkyuH?S8!H&qi}8f~0qaH~6@bAFls2Ar4K z=Se*c@3bX;5`2$r@pB;K;I*bl?y?L;Zm`Ci$T?~IngGdiM&RweUO@?kxtARs0LbDP z;hVxQAMYD_GIB4!W0oXi<&>9xzC#bM9Ngepx-Lu{*_F8qlHAsYZHc==A--#+%mZ57 zAa8Us+w~d(r8%=->Tv);ns8X97CFmm6zM7zO`Fx)TG#om`Uwl?7vIz8Oi~74-~RAd zTJC4<=CNW)@g-K?3OT7zDv(q<>{5vq}a=#sCn_ z7~o-hL#v*7h%=;MFqt;YA7kD&jEleOcE+qM`11mmDlOaJw-t_EZh`OD9!ugz$3o1oUNHwMu8ip9w$d_kstq;KI->@kgN(ldkF6nEa|f-$UU@Qr_se>o*#JJ`M#`WG zrWdivdn^~m8w+8?xtz=Ujmr!*EIi2WzROP=G>*e`ezBQntk8Sj(#Dftlv>QwsV`%h z96*=oy2~afl7%XcJAnz>HUs!3K1Iq9ueX((R06cTVQ8AH5eZ&lUfcYLKW_!KJoR+l zs8!WmcbZP3Z5_hmg!-_@QzAYP2Y=g8JYw4jsk)EbF7%~r(sne@^gfzuH@L(X1NE%> zia7g*eQ^hJMo~Ai=!JvX#QL1wk5&jLY+T)j7(~mf6l5-?M7hK!jayv3JpYjFh$0)6`8$fBxY6hLM~^z$Nqe$Rp4r?9kebLTg+0*!Er zaF}$-jdSHcCTyCX%Gr^WW-1`+>Esv>L#M|ZrP(?+_l)lR9@o9nKAff7}K$M#}?NB~3lQ?4*Oiii6BtLs}pC8Xt7oKQc!OOWN zXUr#N{&4edkKvqmqf!sK-($)0^Gpr{I+k)=R-U!L;#G@zzP1;C zk`vHH(I07S%1_^Mem|x4dy+lW6*_m+Eei3sC7s^~Hi>4=rVvcsu=x`uRjt)uxAMG~ zpYOiSS_khV>UU}spAb@_EBEk0{`sBnq#IHb5L##YU3~gyafB}MD`$o5Ap<{Esd+Vx|GB?+h=g z&A~j|4*I5@+fx2uZuZ(PXm$KMTh)BG?|s3+oW8iSrlBDsE_y#s{4!nsrY&GYe`2hKS1FGvJtH8SX%riU5q~%DX~KFO4x=yhI{e-=3O+h27nk?b&5#njEGuret861E4oypq3BQk$VBmF;E}^e?iMH?%+UJ@ zbkK)B1~A}56TzXCJ%?`1d$IB|WI3W)2Y_hEIc3X)!4k{O1sA-TgNfg||xn z>ag)U^TKP+b!lz1TYP*Fu`OUpY4XNWAxzsXs@^S}Kt&^?tFI4#XQ6sCeN2j9Q~iJ% z=|?L6-=v-OAEmbcMTVC3-=qafrq1>*PR6Fr1g!s-nf><=E$e?@g#P~@0sGfG|F59- zf1vA3Yy_5}}zr|;&+%iY*X06;xVg!XWL=Jvva<~P|p^OLf}>sxl; zihtiw*7{VlrINVA*Cnd{%kn|#`*N~n4gG`5?}N|lDX62tX8rTbw&$z~8E@Jyu{Bfn z`;o8L%d?`zBbnbP*DsackFhhO_MS`Mw_{>OwP#cHd2^?hiBkVc+jjfZym6rK$j}3O zsT#gg9mlJ=vtteqY~Hha<%;S<`1_IUXBzobrOH_$|B%mjFv1@1p<;FUO?>zM@qiME zU7NUy_Sv)jZmr-?g-u#ge zLHSDA$W>N*F|!*XDH@_9;4lcOA1-#kjzp>|EqUm1`s>pdFJ8%Xaxmh^2yD9F0-TLJ ziZj4(z!x3_YAa#-_HGiw&!v}%7`A$?C7)<9HZzW!r9L}&-^1z(mudc}J zI+}|&;Ln-0&7HJ|zQP)*RlZNqSGh0S(GuSnLm5pEU0ICbrex?7Phhf1^YBYEvXsAg z#8Q*8nq&j7`YI6wYSe`#vZz|ZU-3w%yE+;#M1f;Y6(|ZAjv8BVLPGjHUjx%xqvbPU zNMRUe;_riO2(;dA4!kb1F1+`Eik!tUL~;NYC?)6=qa;ospM(l`p7SSAEfC zLk19JXC|n0u3jiBrtqL-TSh=M#1ZyQ0^^T77X%{XIRUD7#OqyKqb~v!l>QKH9k_~P z*^}pdg}oiEsI>$j4;fTm2+XDuF_bsbu_P-R!BPU~U3|3<8W zm+BoJ0ZK%@P2muly3~tRILnvHGuDh^Wn0Zan{XnoMt+-XT|rTXxrc0GqdDYB1}9_p zJ9_Gm{B@`~LZoE6UZSwmh0-s4Bhmdz)2yy^;y3%sXpiXtemKg5OiCAb^Q%?0wr)kN zr8SNi3BZYJoC=*2v+lDxainosK~ONKhgA9i=j@Xjyk(c2t$(!e05vFvVS$q44dkE57PO8v?NL>cTSW4X|D z(6Sbc5D3Y6w&-3dKS!d5vSE-@fD17z#dy~_R$p?_ysWRLtQ(!UdA^FL)VMSh?PZ|; z?R-}UiaF^!lu8x;z%I-H0kviy;%+mu`R-D%0F@rctwC4zy*M3|4jyO#(?{&iJAA%t zXG3H~*sUH|-^R3p?HWD7>C$%&Iluc+@@F$*GaxbrH=w(`&^ODt5aC0lXR!{m3uOTK zd5gvJE`-qPIvE#s#?=Oy$@$$!pRp?DxCS%Ge`@kjBFCLI!rc1%a9*8sC8_?%yrnY- z>mC|)NBXPis@Qia@ufddJ?i5rkgKwAnmU8ty_nT(^|xqC0=JF@#lv)I+L2!k+>%a2 zKzQ%RJjXNsGBoz4z|_VK3`zHF{BT&=&Hk&A$S$TO@QF);vtW>C0#kNO_OI<}R}AFc zuH1OZ8{c3$roau1q}sXnvjJD{8!(eI1jZ_fvXPBp+~$Q%S}d`u;O^b>dsBb1{P45$y1x~m zAI%R*P6EW?8<(qbkk#3@96l@Y2pSSDZej6p< z+*;g|;#Yl2Fz`#R*u0rvWXQ$G{3N>UwZ&$=`cR_RC4}Uz(pfLONOY>A-A&t2-t-xBWTU){6gh?T`B4l=t|8RCAbh|3vWUdC z5duiw5R@z6UrrE|=C=qm6cXqlcK+Qo@kcN%c0LvX4=;|M&ku;c2o4SkzD#&F&{#2k zrk%fI!w^#2C87KARM*#aWl&kFHTp(V-d||SzS637%iy4HJ*5+l-03w$eCe{`#Qs$T zX4$qNsehwe{1L0ws154v@;o@y%#4*jO_FFN(6;qGoB#^8RUSlSkvM06VVSyR)_6x> zhtRiR*r~J)_IDzLCkXG&eCO3tZ)>YmIy--x~ z*%eBR!MRRO(4Zqxa1UuEkkJJweCHl9l7Vfb3Ks57;06|jq!1myql*W;w~gjk?LQLp z9zFml*0yiw%XewJUBXJ}34)gZ!U%`VBy5n#2=A|^iDRk&Q z+cxK*cRptVp%#1<0|p2BNjS(#Fo`sKH1zgheOc6iEykg`jQ6)pbN$XSB%#V6voxOL z-C6d>7pV_V&BiG1SPq2q!oMeU8e*)A7Dq%HxJ+#snsz8g?RJIZZ0xjXr`c81LOpQ9 zbztvZzuK$!j@eh$r3H6{mQ?A#36~!oLT}SzODA6V+SgrG1B55Mw;FBc8Go|X=`mFv z&x^@E&z2^id~X8iJG|QKl8f3kHwgO_!LkrH>G`HPp*M!AEl{DCXEHNdsO^q{K8E@j zL<}?=rv-292iGE9*wULZP_TgrfgtQ0z!aEvZKhR2%8s95$}6r~$`J8`se1};meko4 z7}o6kz%=;nkP>3iz=4}P(kVZsc5_TJwBU(7CfRL9R>tVLy z5kXFXAy{dam#94of#4(c=!S6-jru^@TxCJHDe`!JnP$JeX8{kU08JL%C}R(B`k9j& zNjm&kE#EBGQ;(8ooxXwG(5A^K*lOD<5`Q4joRA1oc7}0M+fRi$bKDs?{Gvm|!sZnt zC8_itFoQ7UZlna-Zy{F$VJDKi+^^E@Jm6lfo77cMv9VYS0l5hq3o@a?3k1bW0T6<} z^)dX2Zb39xT{$;m8mBBB?ZU$qgd{*@%jvoE^f~E9i)}fu!JF#>&LVhs;$y%B6&`YE zEJr^c4s(gkx?Fz~tBh_|>|!zd2w3jZw%Ob^V+WR|!nF-UG^j=IA@nt|Bb|GPdp17U zxF-k7f^>Fh-sqCs3R?-U1?;=v2^x(5%?-z@0n0|DxV=^~|7FEic^o!Tk$Z)Zgg0WSlhRBlMXu9Ck(a?zCq{SPW!rA zgT7I;mLhBLagOb&XC2L!RJ?DuOT5`veM$aLzviy76=r3YB@o==p9ns}Dbo`I<}P|+ z*wvR~m^%3#>CLr;NreI6T+nVbS_9(FhWW3Ijuo4U?q|BR>u~r$=eboJK=^2G6d3A- zHc}y>SFAiK3mXt>QjC&UWSD^aJt~10C|js5zK-~7Ns%^Rx|#-yxCO!WfJx-HL&iPC zyo}nTr*Unvi;DiGXA<)`HwpQ8&fp9a%5m3Qpekwn!vRBBU6WEr=UurlQ0*8 zEWRRS>sgx&d=RM&&IJ+~CiF+@@OUvmRl1sasThn?yP^^V0%;TwODXXAxGX!O6}Vtr zKql%IIu<;+*`{XOi^CX zjL6SlhS>6GYc~#oG;vkGXr}}qb%siZpNByuBL%R?X|D^&{d-4f9n&XnbyvHT{)oWA zPI2;lCu)+=OjxZNP&gA6uF!$&()E{Pld5*+LWqZr6+HK`b+8eW{l=Q3;wg~M^`%s$ z>nxAaR529?;^Hs4kX31CEUc68XsaLw^FQ0`pj*`6%nC=>Z{u4;wWVDN?F+1De~Jo7 zuQs8~$_NX-H`MexImA=!`IKSZljQGRn&se*U%?RR9;n7nh^XwIIh+~ZMFz1e8u(+? zIpa*IF5oeR+IT)~Hc+uJNo6(`z$t|j-x%eJ*yaCcEavjA3LW8qb02gwD8w3|5m75^ zO`;KlnV)Hz-{gDXtR91Bo4BG=S8;+<0Q6hI5Mj|2o!2+u5u|TCOK00iLw&>3BEr0T zL|EqI18tA!gG3_jh!mYJWV}$XB}V|&_lI(t9Al{sUetaUS`|WiN<;SV=It4yH%Mp0 z7$?tiv0-w_xM-vWHW58Ag>+7KQ5)04E>5CGeKfC+?>h*6gK}?G-)A!?lhP3{XPgFO z2l7(ABQQ*>90#zM6HY+!K^-~OA#KIv6?3y0Sl>N#2ggnPdT!?4MW>q2DB;(RUKn8bL7= zy?&|!0O6H&(Xjr--=!r|RYUiGf)C;WBb=of>12?>*GCZRU4UZTkVJ*!hZ<^A1_ol1 zO^{GGM-$`bqzFXi(O`+0rkJyK91baCf(0L4#{> z3GVLh?w;Td!Civ}cPDsoceoFcnVfHC&N+A8f8KAcm$jOHdTaOIwX3UsRlAyoG28ad z)K10yMukN9fF2FU@Zp&!S6?>%=R21B4#c-s@I!}Ft99%JbdAS^wlZysy1{4K@TNmc zb(OiNYJ+ID!6Vwq7zyfS2@lxs2&GQ_RlHKF`gabtkjs0PJF=Vw-Z!w)K6rFY>T~Ma zoTT?bgD6o%OcC6diDyv^_BLOW2R@ntG^si9^$E<#Vcxry?WIzh)nQdBgdLGGrP4Ve$}xiq9y7+A2nJThnZz zcZi0((5W6x-C11~mqqoE4jH@+i5(AWJh@Vsatq9U^C<9QOuXB@42Ti6U3>d8m~0KI zP5>8!$b(=oBF8C$>7R?I>38K8#D&sW+UY3uC0 zg=4n)?i=!}i5=*A ztAKa(Y&UyMc+&^;jR0 znG;Nk9P6dtp%mYHHXZ<<#nA*XYs$Qd@CeZzkVvSHK{AJcg$zZoN!7-oWVfFO?-|S+ zOM!2bP+?{#S9^5}4O`PhWKs{aFWM?(h?*_nvb8A9edV7dof~f7l4_L@J2z8KvtpBe zyviM%ywgvD^tKlt`NP>*X4@#Cj$F6L6&uSMegfi}!4z|x3+s{dgnpQ$?Mc`BuYt3| zAXqG#q{0(2pOOwYlAR`3J8yT~qFzDo_G3;58_rzkAm-TyihPuQr;=EqjpwElXromB zd4Q!_Mf=#dy8k0^u=gBOuZwlmu`L{^aN$b{)>s^ob0^I%QgDT`o@&`sYzYjXuN@FI zk*+?NddFZwVlLDv>U^N3s7KulDaX|DT}t*G_h# z@^(6Q1_aMRhed4}U;r_J=?MUxi`p_0F#IqA03dd{-$D!jL)2hEoZ&x0UJyJ79ftV< z&;x!lvJn9M*S|LlxcCR?{`cs||IQbliQ``*UZl$}NPsdRxsND^>28$C_P^=k-9+ zyxmbJy#5N)>Ek0?asJe?KV(MNNIoFQC~i2Mmd^AkkY>N541uE>(&`#_-flCDZdX!7 z6AYERnyWsN-u`pLqL5j3$rzP+e0dg}e-Kv%!xn-~Q28+!CHAg9Rpg#R7c*|z$*NU+ z_~-P+s?>FX0y7#|n=d3hZL6jqt*BU>Wc8|Ffk|qaI%jBqc%68L@j%jxk#Kf_siehU zfo=Qg6#3&ywU`)wlJetS?^`Jpi$H+e-C)0TTZWd>wIGSfrj3bHK)rWh&^$aW>99Wx zsNfBU8|r(8%a@9ka)VOpZ3nK>s>yL;s z|6*!-UbMgZ`*WGw0wQ;C(`nf~beyBY8;!FLt~I6Y6# zTY}$P{(jSc+w!joAW(n4Wfl$=0(NFbO#-?fkH^69M;||!*-wMw?`^aFl`#tG*y)&A z8aKhQS{p7zQCN{QqLdH5a1WbS-lhpZn z0kCJ%DVylq89&b+J0Ob%U_QpoR%(Bz`hcjM;PlKMhm0Yv?w}b)u+b1!cwxzg%Nu@BD0Tz$d2Blmq@(B}e z^%+K72YP&sMFVMrE5;lnyWcGEx@PA~r#+W{mP!PH+)CMU84}&H$6Y^(`5HYgZL}^i_^D87+s#Z zx@Ti!mznCJiki?zGugvMANV`&%;{+4jy>w)%jB{eqb+2QTaU+SV9aQ9xX?y(T_QnR z(u)wuvk=E2z^mhG$+M_T0zpJebAiE&$XL({Xj5I$WHqZcXuyhM-&Dc4IKHidQ2++~ zQYe>;Q5;!&QJeNL;R?5A|zjDCu#P(MT{*xSkkf*ecIe?h|Lbzr+Mz#dcb8Ts1CkTjJ zMW9Z@3SfvdOpGi9jPwA?VW4N?(1f8A2dp9!Jpl_NGXubap%bvxduBi!^eg~v2|)PI z7c`6vjDW?eV1&I}%Rq@eYf`K{vA_dD;06vx- z!ubtGWe-%iIw)MQk*h*wg`J9i=X$m<^1Trkms*`^Z}MTIpX<^QiCOX8$nsFbcA9J3 z5!cGi!|>7IlB{BSq=qc6E|Qvh!Y4cqS~uv%9`M}MX^w3YV8X<#&igd(Z5%0nd@U%% zG+P`9cUhe|h_elD+}qCakeb+|VC>VYM)T z749R2&y)AK0O!+zOAPz^+IW$(RrQ>2(5JKxD6Ah6>O}bi#v}YB#g2$MH?bOAX>_10 z@TbBxRAERng$;;pzY?k&rl?p^3qYL`W01Ob*wf^SCD4^FJ z8s3HkcqpJa%z82#-iy)LC`7z5z#u3jqHmee8$Rr6v%%LJApy&5llnUr%W}*IdbFc7 z11Yb@UXSR*oDjO8$gswAZ9q7G@uP<@MAXy6V|dzd_9D*aKV5^Y^o#ORAw&qk4qzq& zNqU0{d)*fS;*_{^hvDZmaswKbNe2>Fqw6n17V84zcYY`xr;{Da)w~j`i zv$?Y*1KqjCoJdL!BR*QYP;v(91!hG0T|YBW!oWg1_dcK$l*4r8zUz3Z5-+}Xhv+bC z&|SUQ_C|Pwe!B{Yln#u+jWw4NAjp#LudFzJ%58M1*am0NiIpr zFW8O)4mg+PU4aOiufH&5Aksm+m>@=@Mj{7QY^*r8YFW>Rl`oF-2Fd0OFa}0wkfgkb z>yM`tA+UQzKl#30vR8QX0#h9l^ChZ1DI!e8rO@ECRO@#0faO88U4BSjA1I{zM#Bsx zBa6Xne_YCyTl4@X`2K|$o~3+eNbshFHp~a{T;|5r!u-%bTXZkF$iO>m0ZmzqJDGb&(=-o6(GpW2Q9l^)E5d^1zKTuj@lp1H(W7YV;p$ePZr zsb^uyJ#WTm4abz3+k@;#u%frIwV4e`JVS7c!ry~_^W3wp4-QkjFvttEQz|3I~(mCu$wD zu$$(FDw3g!f%TwbLasVzkPub@*aqe;(Z0FJB1zgnrcHL?_g)ZaucM>)s&!=sd?ZdW zxc5rrSx&HPRhkzocNe=d2Q82A78m26j<4DH@V?Ezb-mp;JlnN9s@9g$3V2+k#kr`5 zU*D>24E{Q!x5Xs6bMOXJvnOwVD4w;0rG!7V#nl&~2O4n?NcC&CFQM*pN zrx~JHoK%+vLJ_(H!I5t}QxioSRyH2zTrAKdl@xvw6|Y_GsKY$1c* zFfaH9B|P7$NNF!{FP2$dh1i6eE_8vf69Gg7+Nx|4$p}k9mjrp2P!TK_iCn-^K4!Gx z>{f`n*tv2K`byr@QeMyUosc|TwDtua>J<;ytuEJP*MvxbsKq<{R`CYQuOUt_L3_aa z#K9_i!XV$MvdxNnhSc5I>_ragsuwBCJl@7uI!4qF>hs#Z_dVfO?ru9)RZ}}ltwl|9 zpBWdM9=40n!fs`AJEODn7cL8OhM@|ILbM`u$e|9f+2U3_>N9sR-V1?U91P*g)GFWC zM&~iq1ANNW-G_>Z_AK@UhnjnM%JJZ3q|tChgjS37)gye74}^+7)Yni?z41aI!sF>t zWO=kEPAbE-&KuPX4{k+Q1KkaZix0E=l$W&Ep9N{y5nGSCmM;*j$4`qgKzs3u$axUj zrm%}(IVRci5p-Sik4g$z_IwcGa3z96p_Z;r5@26`wYB*CVO}@c>~IJdQ(YMY-u&eq zhUi_K!saVssumv>eDgP2)fC;HO1e_sGgU(dF1eBA-7I3*%BjiLW{QX#sOMYg(3|)l zLJ$(PC%_{MN^PHR@4B-;5)pc|v^kz9s6f0^75)^-FH~p;-L48pmrYc@uMady609 z+Mg003_x0c7LMqCQ#gJqwf~@SJWFAJC>+lU!oMg4zr!TIWGO~KjsK_+M0Hqn5g-X4 zx%J@13V~g|BvdRUx*26HibZU%sEvBhZ|Ziuf*BMm!L#q3>`YJ1=XjLv2z?#zv&+!2 z(@d(-DFLC=-tRnJt(`7@SNPs=I{F0eY9Yqz@csqA1VnoqQx(bl>ak&$SMLY+=+`zL zK3xxs%DM0K?mJ$(gLK4!gzB{(3iZnH5FdwRaHhH^*2?ESqSVn4ZHOcr)C-$O zRcA(t6UH@bpb1$%sdCYWxmKCSQ4@N{+kXjO=QG;QRELZG78uYdI z0JdPZLiy`NejoL}&g7r-@?)NUle~YMz`qf^0eA*v-!J&r&xa?>DKN;aP0{ zQ|e}A2T1dOk-EPZ`XzQV1AkWYr#tn=Clo9sq|%3MQ?VjTp6 z;lBw3utH8;gaC7g z*@qvb0K`w=+Zs@}j?Y=r3$PdGa%f%~stsl{lo?tWBbI0J-q;^p@8I$wqRKuA1n@mQ z-Fae@!`RRvE>cHTaXlT;7tk{TZMK0nAcYf1A+>{H7z5e9$gbjsGvjA^@dC};&F+H0 zKMYP#)|wSU3j#tE$!ZE=cnF^Miyjcrb}=pEU+D#aVEj3B|92~2^q;YYANc9-OaEtV z;(u8B|AI9<;{iXh1(;_-_>L|7h9&$RzVHh}crMQhc&6_d#ZO$~XZ`QE#WVi!T>oEK z%kR;Tf3g-PHURqZA6ZMx;QKCmq}NxE&sal++4h(ZLtFwshWltWzV@4SZDak9?4kGW zBL#FjfT(7o#JetWD@S+^I!RKGD57G)g?)0QJ+Z*V$0G+I`UCK4Z09pusOS!lKR5rf_bd?@sq^n zx8jd`ZHWo5<*BC2%wneuf_-XI-_{6r%wI^@2ZK!2e1icmsw*-@8`ZM+1qqcuf`>8A zol-|Dp_yo)fjOtGna}pJP{4&LC~!d=t&Y#qK44sG5cPiF@ioLVJ1d=175(&^0Jw|^yKz-0X$5&s*e_dD(W4cGf03OD?KM1SLZ zzrVJB@jXUX0G|2>^|8PW5nzQg!3jkPO#&gy zeO4y`a3S~t0x)opJ{EF`;JtWIP@?*v5A6L3U?0l@6xd-({mj72r3|R37=kI}=j^?& z7R!xhyEifq3O$xqT;F;29&x!iY-8pTLn?T`@T=X-I~1<V%&zXUq}ijIYCY53?G zCimcOg^wXhd;SU6qFvzC=99f_C$7IFEP<)BWr*NH`KL)xM5FL{@P zEigh2>yam3*A=*LJJylOETM(uV186JvR8(Y3g&zZ^F@Gi*H%$RGWcb{Y|2TgG?Z8QQ3h8TgMSgzrPS%nrPka*O*@?c-wBuYD! zSr46~${C&c!E#NKr%Mf8AXq7v#-x2-MZufd!H4nsBI_(K&rZ)9JYEO$CGWM4g@-OL z`OY)ZAw(~4F0Z^CBm&kxbTslfU8sigzC#LBhyJsEe}>1EoArzBBWK4pg$(0ev*KZ1 zKhrqRmmL|im9Vh`hY52WTdA3zH#6>&hsII&Nc(xo{t6ICrNB-HRUOb~zM~fGs4*VJ zZq@X6p+z*k7#dq{jWD6ZX*3iTh2|DX-0m<;cT#fQ21F_K2=I$L_}Go$8AO4tmSjam6mrQX=YWcg7S7rCCF z_u}(QXR(=Pp&^eEub;%+8eN%A#^!UA4<=(tn}crl;+UJs? z913YQ`+)j_Ev+DFz=QRzTLg5o?Zf=&m&a73kDTO_qFy;+N`W-@vS}qLHQ<^gMtdce znskjwMl?g>>@OfHJ?76v7Md0f$KNVfZOOj8LcZt#A@5*w6^+6=w}Bv0Em107J?ZPq z(a1{10hMa+;RY>Bc92quti%fSMs$ogo4cL|ZOzq~<(dklJAJKzb+SD8s-uuH%DlS2 zc23y<{Zd+Z;Q+$~h^0F0xIYEoAaqNTX2rcK~!!*C`wt>A(_b{I-TV81a z9J+Bt*_Qz)F!i!?fV` zs%md#4s~Dm1ELz-;Fq0GPF-r0748wfE#Xn<_WT0Rw;b3~aW=X-nl1wYNq%YXmcS2g zqeDeFex(0rdT?0O%d|xNKY0ovv79%5+dkUrt;Q=^KpPzGkqL z1kK89-1($Fnwv#TD^`?m=O}6Z5fm)*FoCta2X12KJ66mZZ5JRZcePt)gBN`nsu!fW zJrGaC53iJ8`-9-JbcKv8>L8!Yp=R`Mag zS`r4X>mS6Y0H7kX+F#)Rd9&R*O| z1Kc3a5d?^PxiAJ5axg45-fW)th@O^p4J+@~&cLwBIQ4qxiF7M73$;ByldWx6=T&H! zL;381W0k%TT{?!AOfiDf1*|n zB|`k4-~;v_CgR_82=<>A%zw}!{w;j)z3#tihycCff8hfQ{hyZ8pC0@G@37pT^a4P8 z{-dkHn}384*ngN+e_#4PvBZB^`V7xT-anwjzw=f2-Ou0|BltVtg+FiwK>jZuhv&AQ z@dN0LnV1M}U;0U1IKUM3W;0PPwCjO&NsTntDu|SU|e02*$JMmgu zjpP6f2@H$^#Ayuj+a;)eFeBJx6#*6OllzE9zDabCYM2vU&r4pVk*1L*Yn8=scWxF} z&ejj~4z{gn@3vW+gm@A3Drq&@ss~f6EDs8kL(sq6?HTmvFU@9ZDxwdUdW_nrZWTAV zej5(F7hh@pqL~qRvUBLcb96BHXrJP-Esuw}9op!id6<0ohRNMrdaktO!efSkmwLvv z@lo-j?x{KR=r2LhoKR%lFIEB}?iLVJa!-SBcYWZJn1Ox852s06) zXLg(Sp67aP{k8T~e3qQtF_o{#oOm~ZN(_QN5rjZxI0~pp6poWfu;6I1d|Z$qlE2OR z{tGm!WMMQP-KBbAI1m$gWT7f@cEm6g-_^GHFT!v>y6GR_8K4BGJ*-FILhO0xbbRGv zoiA7*@MWRGISdgf5C0l3MC*I`uGxlRw+FN58eJ^)Ww{0q z2_o!AC-g8%oWo>q1uP9!&_WXDSh$?~Q`Q%f{`Z!)7l*s<9-cA_GtOHSHKL1Jsx28^ zNh>QiyWA!q>xjM}Ze@REc;9)`uXn|t4Da8(Ab;m`fAfa?p9wXw|1j13_Ky6W(f@i! zvM~bK*Dt6E5UcsWftomAHHPLMx9>gLl6M7C@=TiFPE5iPy_cL2iYXAA{E*1~;T)mG zk6RECCX|NyLzRLAxmiC^WK_4yM#J>srp03DQp@3oWPxym=6hPUg8DBX+8lj*twQ?n1Q79=W}1d(?FTn|@r8 zdFPeo+xc{pjVM5lc4Ef_$M~A~5rlu{W@bJ^1WM`4F4Jvk2M{Q(_S{HF_IV!7aYRp` znt2!nBb1M57vEJUZ^sKpaOczyh&A{L$&Z+-UhrQ*>DNP$CcF)83&p4_gx^@@S?_!8g|v4jEMfsnlSZ&vv-OPI#30q8K2j zd2}*qhD!bK9{E!dXL#vz=pmS<`p`c^d8I>iVOo_2L_@iB^Kc&dUl+h+6#+rWU?b3K41(pqI4X?hMLkBM@3$X? zy81~JP8(1TG4eN^&l1jPPdS~iTK)KtzOWv!1@mhIyQJoSuTY zUDu13h|C!V8oG7%UbIyREj~EXhB?Ak8~R5HlgLxN zejBflqrWQ~Vo;($ZD2q|K@}5{!$25M6WeFXO_ak9))Z)lSZH$5i?N|5W=-FK789{_ z-{0d=Uj+3EZ4HYtJc8)0g0PP@IB0!aSHw0LO@UE9_n zkqGl^U0c}P*iY}{(t0Gqo+4{VNqev(GKao!glM*Ob@qk_cFdbiarcZ!)OHEtQP+?4 zL+h+*TxW=4jxcLA_|fGZ%qOK>UgS+87P@s^d9F&4BJK*pwno(!(}&Z_${;|}P}r&M zkLk4fO4}M*uEZ!;WW4rCY9IiXdSxTRE^$zzbP9fqo~nDFv*W_=B!Br95VRa`AvTNb z9wD{aCF`IOxi@4Gw2f_B`PcSKc*ihaeBJmqR9izs8wGhD zng$)Eb*)a8A^u@kH76#dC5iBQVZE0kLgcz16sJPanz^>7##?3xC==;)*vBoQqH8h? ztG5&G1?V0529f-G-0VUPL*msDvHBXib(Q)0YxUf^L<^IGp@E_jjbs)R3|m;+*^e&L zxVs+wP#o_*t&=PZ)wX%~>RI0x*K1@wfZ^1_763iHO^zDDEbwwColrcoH@&^g6v?q3 z?A2Gry=>ux)D|V~h6Zv#5_Hfl)PRo~o)pk$dy}$M zFK~XiRqA<%YepOTLP36=#pQ7>9y5bRY_`_bx8UJ1G-b3X@8KpoLIJa5X@mndO9zp$ zn8t-0Nnhc~`7RpcLctsSp6GaH)us4@pJI&xRb1?sFJba+U&14)n9CDe+0&d^bOa_- z==$j#N9@;Y+#R=YRbv(3Yjx_ECAAqYaqg|tF!X-d+IM(i=Q5#sddU&K453Y&7`wNw zA?%AO0y^slZ)!S$GdU+>WRDvLU6Y5mC9_a>q4KmIAj4Y$_0Zj5yG@wdNlX1%r8&ra z3MoOSit2dAeglLWd+|1;%nZ-c1< z^|almQc2d}lCBIVJ6!KtKLJ?rs8xg`gpDht3j9ow1`QU8Z6%bNn5o{T`8i3JkjnTu ztFYO{FnP5FLD>_I~v2WdD$4H|OFdTW}x^N$DK~MwKj})=cl#rDl)g{D& zi=~zleFIdAeB^BqqEl)y^?TB}oEFrFdd~zSsaB$k-gy%2^>ZEV$_*ZKa9`rmiUl`2 zd|mc?xz!Tfld?;z8v1Zb)Jq$IuYSnKbg6ZzwclX5RiZiH*U1ZthUZ;>wdFgzbu5H( z_LYXBkh_ZUu%jr>=yg8+;~vAIZ>f6uWOWRFh8AwoPT;HZm9>^KdjXHinZz07iBkV_ zObF-KKo>Cm1a$%q{>=je=x)G+;A+vaGT<=9mhKEaK!-|FohyQ>fO?a}0mZ3mDYC!>rivWv|J zj%V9Q5U`~Le|Z@-F8ys1^$8}b6s(O8t(B3Q#(toR#+(D}TStN=&MPo8nY6b;WhYoY z39;MJm#~n~g<*C@)(&Z-&Z{^0LWI%U3J~-6;Kux=U@*F)NfvR|wJ9jmTDF0jy{09E z8B;lq@cha^b_o2~9tfHpywluz5yqetIqKmU_qZQPMrA>b`w;LH96H-}`UHurwE4xW z)nwTq#r^j6$xN6UDv$8y)ZYby;ve=n6*nt!9(8x7ZIQlB%gbS1 zox3{xI(eI9*@o3%Y|lU1Sa{<~7v)$?Jf$L5cm(-n&MgjR1BKF`@p@@x$j#Vd`qVjm z9#rxg5`wc7qCk4vfV@ak9&Ll{gb@vl!*aY2cRH)F@WLK4}rT&s#k$KNH zV11z^gae%~2wOL>ox4}oLOS@>RA`wBa9pjBX>BvI%MSe5E09XrHl0|tQS8N)XtQ06 z)QjHcCqvN~4+>5QTj@j{H6(E}vlj~(ZS9&RTfT@h^SNI=_NySMt+v)wficoa2p3?EHFp+BMUO+jrvudr-CB3s zUkq_an2JR;nitpWF)*7uK|)lSZ`kDA)h_!=(wv{{cD4gO=XlSE@Dy<`Fn>bMorQpH zNj!3n2kWj)Y?ouz47{60;!A+?qEG?qs#;|?E&gQ{3xT!iER~J_#`>ld=_fL-GU7ew za5BTOS25DarL`$N{>4PoA+8fcFXvl=uVCsih1x>)+Z_oMWrtlsA|yaQ4f7B|6>`6C z@NrE|{Hj)!f3FK`QpDr)N+VmB)HG9>1Q*2QFgEkpOSBT1>EqQ<)rhSo?-_N3lNG)w zO9t@@H@kF(84iOfn*&ckcx^NSNc%=(XAA!ETNt%KIN;+n6lp zTeN7}KF{`Ph`-_l2=Su0uB^t;z)CBnXXc>kzP!A1&g2yM3ZiBUb)BgU?2Y@YLwc#@ z7mT#__kJ&uNyfD_@mh{G6c%gDVtu6MnDQy(p?#MkjdJSp^Xx%n+F@s)I7hG6M&F40 zhXtwFVD|xO9i^lowt-4gIN^lz*&+#2>|3*-vllj$6(ZT#rhXKgIoR9i`e3+{g6=4D zjKv~~k1BOTw%8P#Uc}+Bb=_=ehLF_W=FICA>>wsfV>kApJ1p_?^vcQKH~x#2hLbof z@WNE3i<}`D8yEo-n167y&@(uNuygD_rZ`rM`H?F+@j zcj~%v0DcO?r-ROTI6v*FZL;cT{pqAVv-(vs39Hw&zCo@>uqliEym{P5LhWSYcI17< zqKZ#gjTj6VVm(4hGu|F4dm5f!;^}KYWbmM|6y}K#2W7u(@#lx^YB0vqAc*$ss#_Ck+6@MlA`QY;nmT4I7wMhIk{l{9YruGqLa8SIjnwZ zy#z{~A$<6X_)?n{XafUM%eTmMM0-tk`gUb-skSn2T+m_oRx`o?-$-FHqZ<&^KAdVl z5F{?nBEr_%ezqx@_+tVHs#hE3YVVzMEKkEPZWO)q0#H)(yknd(F+|u&dwLT3X=a9x zIpUZ|T|*kgj~;6sMEbjUIx(&^HCFjpZDNGBQp*5A*zcW=`6oJniSM_D)4o;u*(Aei zjJp->U1eh`!w}XJI428w7HbpMdZD_13J=6Dk>TJhcQ{cwUD(Y7Gn@>gc{#O`W27KN z=+xx=ZL7F&gQcMf5;HSdZgkpZ{VldJeZ?K-WoDM|W0o{@xJ+OD?Xt`Y?GCAdD1MEb z`!(+^T6+c^&R|}P-tVN$C9n5#V{JPKT4q5lA_dt^}fmW)q z1>d4)Jx(h7AT=UX23k*(-gXvo?Vi-l4AbDr<=2!+k6h>JIvbi;i0dfdXbo9Fo$5+v zk*8=jxg3uBnqu-(J!ZuVwAa<=PmEcYmA}{mDWpxPj3cwh=H+U#o-GcLW`CK=*XS!% zv4#0jkK+us1~Dm1i?(C$6Qy-UPBWgm4#ridZ$IKjOQF5F9yRU2MFbCj3*6TrRrm&J zK_;rSj_j3OXoPWWZOqU$99r&L$MS-w`LKRuP>ra5!QM8zE)Het2Leuf>dZ@qi1=WZl&Sklb4Ade!mfKcc>8H!1>0up- zvjmxiYC+eKfha^6ZE#O`mt977jHcbJp>tUkDTAp^ zRTsg&(pQzN!%FYBU2D7ddemYlMl`j2Q9BW0h0ioFj76XVF=bS8U!%%~# zuXQ~ZTK;K$JA~Yx6DhND>HX$ZOcAGQchc*3Q-LB6JX&76wuyF)ct^ZOOwSm~7Ak!eqnz;QWMPt3DEBZ5);8&z=H`0f_ZnE5 zs=83p!X{>B+Ei`+T_og@5>Uo+wN{YiR6FH(qRe@NUf4?8<5wZG%6o$j)YFjwJ?Dnr zZ#Jh93`8ZfaRM3d7qrSce65a5OJ^*!%vvN$6CoR9VVu6~sA__}RLqAi7&_nd)*#g9 zJ8Nuyuu#`Kv+U02Q^jkY6L2CRJ{#?*p8QA>;pieHE30!AVKc5sKPR=&b7XLu) z9x``T^y@6Xw+}_LqVNkXoE=6OQX%&{Isrbjoy(W`{d@j(N?IBkdGSg{d3Xh1#OMO6 zEmwKLxwJcYt1jE~5SPuQ)E90N797=$skeZl-+1Atm>lciKHakWMQA3;vyu(lZtXLy ze{L9I#qaKagL&70t8ncCaCBJ?Xvv6*j_B0V9Qp7DnYo_|hv1qZQQq;o*}(fchlVJg z|IoQ)I&LAHYL7I(vK*3KmQGRLCy5Kw zcs&s+3-8l^4iBw2?|Hi89NaZsTFm94q0i?eyQ?+9FlI1xFGwv$f(=`<01_}Z89wb} z#@Cq&xntpKLz6a0$ehiq!oRUA7C;bne6Orcra<$@Qkvv{##bu*0QImuAHX`m(&CfTFh=0ajO1 z#{v+PEj@P}np5ZpPyg59qWSHjFB>%0u#il?9K+#0Xl!)QDpjtBe-Sh!9odF!-co58 z!e(`F;rqFD3~Er-?8M7(oOG zlEMX{V%AS5tI~S|4o?gl1`3LWW_EOBFOT5G5H)pD+>>s{sGWuJH?xdYTa2Uv8!{^% zj=U7)1D*HHy`Beu(=+(nx98}(uW|DcxFIqgeVjUIH)pi`%@`(8gD6`K&7ZKN8 z)BawJXU`Ma=Lx4BQmA>s-n;3ukEdPUv@n1ZYe;LmJXTaXkpF5*}^HBa}J^XGZ{AEpiw(bFr zXRF^Yqucj(0Qo;HmN4Hvmw%T37bD#7{;fZaaR0T3vM)bcz!h%~^~7uI4-L!&g5nhV zW3WE)WXA^w89 zOBG+_dOi-RZ`U9FSmg4@j6&iZ6jF zcA|qka|Lk`aY-yQIvQrM+28HV#m;`U1lh3MZ~ekv;43UOT+82l@{|4d9SVOE@xL(} z{Z708|1lf=3K;RcjGwJ!?0@yG2NFpjykh2a+;z6L)kA{a{cl z<5evIB)X9j?Vczi|0Q`p^>F2?@_qO~!Xx6-3t_8+C*BCJEc_nNC)o~Ye*#@RLNTA| zMiD}sfK|M&_TOHuHCrgzXd3 z=*PMBt1zi2OKefwVk;p_HZGBzGvk?pJ0ZkfL=h6pihgV4`gW@<77qqWF|ERoxE)QHmygOk7QbMe`h5AK5^=u z#`c@XcA57*5-*>O*7|f6L>FM`wve~GJ#UJ7y4|?(sji)iMWq)75%=-U)m+G|kcElx z(ZZtY#BwetcR+o8e7=uUAlnhn$26uDrac9 zy|HPoS;eVqRH$b%xOI`7I<4@im(5EA#&>QVeN9plJ~A7RcEWm51>9^0Nk-C#UM~xp zF3=uo#h^-K#?#h-F(LG#M<@NUoWBk~d#AfA_wKbY z$5Tw3BUW9cxn*)jGY>C3_bWOcy;cx}v{mi6IFN1#PA_2pw>h?&OpoN-^aRfIj#F9gN+GAP zvSvPDvwD9)Q^PS5w6T~RQbJoS9W;K&HRKGZ@PhZ&T+u+eD|y;KLwX2DJechAtg>6@ z{nyBKTX{DAJ45Xr`1=I;gB535z!P>4bW?(pf)_Jy5$iidUVFOVa6F+5;tlD#HeV09 z6!131rxH`t5*B`$+O{sL>h)V<@HMYQdpxkeiq`+6>R5n1W zBlpjJF7s*y5emF(LN&S#oZ5}P^E8>SbbMo8z(0{t|H`yC=)^HCA&qt(b$)u#J}P@) zU;|Oc3h+s%S)EoOt`GyRL5{PO=+B2CfiE@tQA}ekb=+<^yz{!Qw;uhxgfuuz$SERt zIeNlDL>a8g)JFA0sk{_W;L3;^hH#R`XNbu1>Tb5)Ret#PdQk7Ncu*)DUa0FLTu3;i zrWfvhd+|K+&TfT^g6HNAL)y7Dr6Xc;k67o=3L98}hpLbLo|&wDh!V*czmfZ6=v^ zAFzgY)r!FeeFU_J$j+seI=wse$IVu*qh;!YEd|i{O?MHY*NB-iC7mcISk%0OGZ(n2 zD}8X%GVj{#ua;@B{4ccian3(Iaf!bH3t3zI)9$WjXU~8cuby$qZ6~r z2PFk(_f>bTkn;F!Xzk_=KxpYv+wRiCExcI|G(22~O6;skOM&dKy8EGcVuEu83&^uMp_Q<;Q=a)_b9}-8KWZp^fUn#q#&3kt?B>uT zl&5@dPS>$AIUc+gn&~lpX%;a{Aw93}$CM6z!93NfWpVcpg8B7YDX0}iipPX>1q{Gg z1O%!_gcU~)55;3Y_*hQ^+6Zji9KoxifCScqVYW+Qmu&kT{#X@zjl z(kRNs8oKH@#RW@@6qAgs6~WlT$1B%o5yWo2o)_Yiacs*cYbBpI)wF+4km;)Rb$wH# zeq3?2E4`?olr^f;pYJ8x{vt#oQ^1RyL}n7Q7RQgfv8$Mx37M*K$Df`+`k)fx|3(G) z-TCBC>FK{w0c8Hszxc;ys^2sKj-MXK|DXW?tForUp(`NfAJ1DE%#sP z1rR~;@95>fS^0ARsQh#M41D%yWCD(#9>@Q%^q(Eu{%|k;uB87^_FM!T&D2dvK}fE@O-Lh8pf6)oJ%^Y!T_KIp<&T+|>%J2b^r+d9>h-$9CCgcrMI+p1LV9 z(Ww4E?0r>u9ND?8n3^+9ixLjiWb?g2jX6b}Z&Qg10VBSTTP z;AL?mDcN%$iJQPv+a-S#Ua+6az({0YyZSr{C^?geqOF7B`GMsf36|xOh^__xZgO1c z`QP{o)J;^oS{TQkoiX zCib*{jh~a85?#Uq!RGuxA^UZOi4Lft+s0Tp(3#}Tixd?cMkL84L_~z`ND*Y0zq|Mj zx6@!JfjLhw){(#zeeF$iPn5tD)m9oZI)#lGrzq$Mx7!bP6S}XJi%m5gA{z8Zh~F}X z3A#7Z=Ro9d6NF9YRk_L28HO8!yLl2D-t*)n0K{^tBwSn?E8=chJ@$#5djOcKI5 zpqt?anZTVe?}fikk3C2#qo;|+;$at*7Lfp(OzB@#ry+6rRDi|4HqEkFi! z5KRa34WI%&kqDc%yoRE^F!ILAxDb_*FH&-u*ZPi5Uw}C+?o8jAc$(*DG{jmfM_r%c zG42(_8IsyZ`|T@fBv%lm;F2&9kQVn3a6hajY{%j$c=>}o;oZqh(Quq5|6P%6>|MM7 ztBPJ2ZH1`a29-g~>7;$0`uV+Gl0!ew(vx6XKEZ9 zjRfz3|y76UKYV7s&*B&1?U{1jhssy=A)vZ-sb$J*2 zS!H#NQXG6R5gpdz(OkT?lC)kb6@4!p$bC-&yIGb%6U6$hma1%x=qor$2JU-!kYpD2 z5{7KQ)4Jc4;bW1FV8M2f>o*fSfk#odW`ZYm2jy>Ag-_UO-e*2#mJbbPO+*=ew%%_# zAOEOOTI?jSN0-8ZvuHb?DvK9BIk3&M;j|$i4q+GEQ7BXx`G!+FV5*Wl#c+|6jI7v_UHI3{II z|D%`N1cY=Z;YlM%Q=lY4+i1hs=f421EDXuHw{F^jdfLFIn&HPzExOE z*uDrI{lsCL*I)>QO`C;YMWB6tVOOGAR|L zpomDBNDr`3lh%JDzNWg#;i7w}aQ-^e3jK)0ppzQkw7vI4-~H}#+;}Axj}r`XyFDUf zK?C8mv#)a{DA-`Obl`nX&Q=2b1VgiS!3lP+sWj8@#|d z-LGJ64P5SyZ)aDFJ@)fZ5nJ+7c#fwPhY?+zv$Pjt27B&T`xS|+`+M)S&jkqO>T#+< zh!#OGg<2rr=fXf*`9qhAk)(VnAs(X+0LJQPCB}EXKT;83{H`gktChJlW0+Z-s&^V(6Y^oGsx*z2 zfA1tBL=av*7^z{_SB08h!YazgqiIwIMuD#3m%1iD*(V>)oebQ>vRHzpOsz0l zlbmZqOzXwaQkH^VZrZ25nAP<&my4;$Jn(&>Px^`l$xqqcU4iC#-aZ#hIZd@yHE(qGUqvj4g7}J(p zf}e9|yWM}Cc%g9SOrN`eOlFc(T;2 z$X3-sByomU=u25w9s642DxU(7hbr!LRhCWtiqKmWEW*l#{{m{@DsHbnCS}+NflSR^ zn1q@Pj5IU=D>L)qx`J(`+`NjmlHf-;>_@*B)zcfBtRFh6-LHr51&^)h>ABasy3Xyt zab=}?iYMHWrVW$ozU`0Jn0H9es$Wu1g|N8K`1M38YjL%zYR(ehs+h?W~aom;wM z!oocD_y|EB?RZXbtcIYXU7pmKkeVSu!$-758vq>A*e6>dE+JvRWT5%rjUOkxo~|{$ z+q7+UHIoSAbN~HLK7!Wh%V(A^h~HA^WZ7^Pb=2lJj>%O8xeP4xd0=2s1uB}?@N2K3 zk4f>D&B=l)%WN{#PNw{7=HrMOSRYKp5AcnWukR3 zOU`Jnj>|0ZmuCp_y+u@9xfBKpP^%T>GuFQRc8<*|f)S;MU1TVKJuh0eJeN|{xE5?z zENsdJys527@S!Zhs5~7?8i@TXPD(XCUmWFbZnN27!$)SFNLd)r4p-d`bVB4^Rl(+K zYZNbUXMG9j&uToQ9!?r_&(UK=K5h<6_Z5w$@rIGL`rrMM1}7LUU9Vyo6mXwDzT#IJ zbwrI!s-i0AE3P!>!k@b%bQgNNw@dkYjfR6J0h3K=(Q`={@~Ts9w|ts5UG~l3%;9s9 z+5K^1wL}T0xUNWFJs4?7w)bq#`52ER3ha@?2CA;v=fq?7GWWKWhamiymsf|B!~?ZihD zF#Wwb)?@qKb5K1JUW|Y<;{pO!60U5!Mt$uGSh@qi>`#5Rjzkw~~K>zyCe;_D9p| zujT*$5A6pSVE;a}|F^sTzXFSYXuyAh75`6CZ@)Wn{4H#e`;Yw5@15iSQ*Xaz283Gp z8}&xbAoS;v4~QK9Z%6*Se@(spB@f^qsJFlH@c&}y1IVy{O}+i1+J2+i0Cd|gn(ZgG z_Vd^usky(>Zh-P%wA`=EfIrXs6QlpfiSIAd3M)V|;eT{h&`TJz?PEd$I(vX2Tm_Ak z&hUCF{+JMp+DWGXv_VOsdW`7uP>ugs<|~R)z0^yd}sdC zspcy;jF{>PD9oW+0W4aDUg04Mc*Fc540wZ*jgruuuov1A4LL8G(u_%LU}L>=Ty76k zX*mXXK5c^RsYu>JIZJuqXR7*?MUTdq1TFLrV-m?)=xwuEvS#l}T*}7Faw}=-X&UnQ zQ^(7c#)LZtrjmaY93DLQSi60UcT*6g^8xl*VHi21#CsIxl=22O9Bp0tbKm9uqua@U zM2!8h%l?ze?*Aq+21tzh+jH}8^WErSxJ!~#7c?7 z!F*7GLKwmk3BVsGqagjd5}lf+eni;GYd-aI>U($h(zSV)<#cPW zEKQ*ux*-B5as+M(8Nb(8z$9+INB9u)l_CHt!Y?B$Eyb{L24wto`Mk%jN50N;qhmI{FKTP?IZqD%X+m*2AhH;0q!P*~PuJoPE!<#DO@b+3L4RpZcds7=B zcLRprbe>v)`w;sZNzWv7_f;4HtjocNSQr%aj!d!8iAs=gL0bYVR8$hHhVwy87fh19 zOThi%-D2=eR$@LQQZ`ue&J;c+2;Tj`!=T4FEgHgj9Rrr+D0xIR?}!4imh+R(y}{?- z1?)E1QSu$Ng|wMpu-2G;#n%Qd7|Tsqgr584@0bl`^6HXLMx*Z zCMZOkgg++}kGt*@T@kPbK%px_5%!>q8mJF?_F;x1!P<^DhD?aC#jb%Z3W2b}i2>3= zl^3g3C$chyPMEPK8gVdQr%!~_Ir^uN-~rR|2G47TeT|NcFT;vaCo>B_rA_d@1Gy7E zuz8-krE;UpvNvIXiM@;`L(jqSEhvJ%#1EqS>JN01Z|ris)pL1^xW8szvh9KCoH2#p`}ngGz!4 z(WO8sUFT;DRkXQNP$M{$jphWqdVuEElFaE^eR1lvA9(%tAe)r|a&fZTyit)Oct!qZ z%=@aKvUItU@@=G2%gQv}O~`FOV2ai8bNNZScDdW|k+X+>uZE1QStOz$m_m;@laX?q zx@dJmxhBb87WlBcWZubOl#^f}#wsQuR3@0=2rM|!t~9jPA;kS$>;__zJ!V9sVI0!A z#>gH^h)SiQm=Uj%9I#d(Dry3ACNEQ4C6{&%XGVyN4sO{;uJ_vS-0N%e81?IPj|9FK zuKDfw_cjwzw#)A5*3*6$(X)1`7@u^F2`iT0ewwJXE*Iz3Yk(B7XijCYF?W6iHHhKK5+h28{!&_LWy%La`7%blHrVyt0B$L z(}gYe{)9()BxQUV$|pr`t1bU+saxa5fJgxg6$mn-n-`+2&W=Igr*cM z{KhF4-AkP*aw=HY)%Ws(?yWjMEC%ze1)-u!#^&%x>4eRo%NIrS8rG(x;X=>cIcCN; z#P@z)FE(FzVdBn!?H2lU6u*LdR4&DVHAz{JPlB0kl(fA|c#@y!QWX%*)bW!(lI*5s z{84dck$ol*m1Yc$JYdTZ$t9=gDJvZd4+2EN-5i7EES)6;M!p*C3R$~-GwHrBJ%vM&S6tG|Hhoo}dTDDh-^^dlBUXE) zI0Urb9B;W-HYGF{5Q$LJTYn#w8bz-wF-_78)swPKFM5u(-9H+QEXFv=RI;l zo;7qiupo2FS|$Ms)N40Y={cR-Yr+CeFfl$p2A=+)6|M%V4~)S$R#wTZrCu2lAO*Ty zMh~}}LS9tJSHWhH!iecGHLJdnv33Tg-Wu(6Cc%;? z(@=8)EyYa;II_*$8kGqy^w$}XkMx;bQZ&#&Wg~67@z18cjLZ#_3x%vOT>)y{a0HMb5Urjd_1O2gFzf?+@O!Jv@BQ5?5T@S7Wf)HsLw^ukbArRF4$%nr9*O`WZIb^65;z-Ti8vF0Uyw_P|@M{ zp>Fg~xkcbd6#45y2B>iNIW|hAHP)TZ6PA6DC%9v@U#_#yqML`s!K(Pis8nSt+S1#596J@3+D~F2JnVFBgKD#tFifU9zb$06V zOv&`Iv`cxiVjNu2Ppt7W|2SOmw<2op?kNQ?4*k-W(&+!$*zgS)Yw3JD8@jrsv6xq( zuCF1GUQWsap2No?k|BH!Gm*o#WUt5uI${`igu=qPNA&`F+_(qSzBGkI(^an`ktXRC zSOp)VGr%F4&RtKcA>k{O#yR-nnP+>u=O*XH12O?FZNl??#T5!8;xM0=c>w03zKWi* z%;i2ML{lMIpJ8;Ke`v&iviSYKi1Yyx zV1HWr{@DrQ|6~2%vHl<2Y~&fl{yhEx;tT)n_}BU!>(h(aS)2S5*+FKe-;MWw@UvqE z2(w(svkZGMCBKa<`8LT!E~y!#E-1NMJ<;rxXAKmUHg`CoLvFAtsH&H;Fw{mg0i z(`)9}d4TfYLT!GP{n3-=_b>Nr@cb3`|7U6dwn0DD0K7v|Pe}RpYf!LC>N@I{N?8(*Q$u}yA{ z6Mh%7sGv;cBaijntX%bk5KYRA{E5<8s!LF5_s1z0k`Powl2(Gzk019-YteB0K9|h5 z(ZmkZket0H&pz|axHd1amC8u`SW{C@`>d4fGE<8~HO=$20vpYua~+Tp*?&K?(I14W z);13)z4kO0s6O9|N~m$m3KV>r{{t-ebnsUZV6&p{xj=0@G{V4Y2+P92NvPOs;6arw z%S5z6R$n43;L1LtYhka2s>Oem+URoxd}bS)9I+y^qj+b+zS!Z-4WWfy$!aN~PU_*H z=KE2 zf1Cwj01|{OfO(LW6Tlw*ED03K>di|BrLPhDxfHC+AHaDn4YO#RRhafT4w zB%y=QI;IXn5a{?wVy3mWl9s(ORFUyP5Z^#-XGkM!CFGII(dAWF3keh(wM82mJ(F4r z)fH8|4EAT91WHO@iU~W;Z0^>d+V5UBUd$e5(f0n;2jE-(77;7C8-5 zylxd;p@1U_NWqx}mA7&>1_jy1X1dc4IvMNd3_8`3@njW#8Qt14$a?l0|288;4dVnI z+=!n*J3!7yl(p0>3nTrxCfdY+wuc?^c65yyh3F$;+=*N^yB3pWPoui^!&3T#K^?TI zu@Z+^y3o+p5@ZJ9N}#f?fS3%5gDnvc-os8=N{2XFxQ=LtvP$R=U@__Wfz%Qv3=|d& zRz@U3xO1sssMAtuDu+0hx~OM(BLf6)pwum5&VYqGqH22SI=6&LygiUdw9m-sy6i{OMLdCdCj-LoiV`noVpT?u@rx=21gzud zC>uokdZSD9iB_$`P&NFo;69J9B1-G`2+u{?ZVxwANk**QW%jN{BtpK|r=U5F8{(qz z*D#$*o0$mjTt29%rWyv<+&jd^DdOxVikm1^s1IEhs_V>4?6xuIk{T{Xz8jM(l~%1- ztey)%=uK*`b&y!H+lBVfa$D#3hS&)R+K0XpBM?K)Pqxqm8UWK_o-P<^R1Xr_8^*pF zfg(2s+F?l7h9A}MMGj%V*p}H0?b5j^0>bRpgsm$i@3zWgJZtA1*Qu#2t@93A=ac5& z$W61LTXuS`Ps!MV56Ed~-Q@MNRW>@~5VY-|nFRGhJUbJsZrbs2F>;-zRa&B}-Qi|$ zL!n#%rN$L;n*-O7RA9oNK9H|n)TG+ro9g#Dl1&8sRcLYjLgPmy)lVhlyN)HOcb;cp^VN}!Ek02{ccY|t56)eH zd^-@^6%>W{JFC7^XDwsb(mTD>n~TB*H=T^Rh@czPrPkyN_MQ3C;}p#jzg{( z_@A-tC@~OO+ZECmbuA%E_XhIyw@TWZOUNNFteV~>Rc&fEGi=1lE06Sz@)Fiiq^y;Zc_t(N?R?Bn%n0#@8UFd^ldPBv~Wnj2`kw( zz6Q^Z5w6~IzqJ{lSkSG|0c0ia8%Q5Rp6{j*rhT~Ulub1nYNWsfB1Cp-iwHbrJIO^O1oN}x-J36&4#CNZ4Y@)iafI?`n1ZiH@EZ0mJ{$W)PG&+!Q}sVkI)*EB zAOixA2wW;+;cyn;n#k=|1~^(=&xVNG(k_hPSR4}!`WU&H{e??S-VC-p>}L~mTV_Fx z1z%w18#20%xk*3q9j$3Cgg&;~DgyJy9-L2RoNIv8`j-H0@|}Txu%I31d(#;y!;=HFSa8WTCxbWJ=w#rT6C`oA$f74Xq89V?nyjy2U>dQ5G{V;Y~hjU)}_pGFqvIjwj zt%YvjWA%2zKQH9~qH_NwE)fy_mo`Xdrr)i^e+NnaVwe0ER>+@fV1Ek$_=~guPe}3; za{Lj3{K^OZ%L4felKg}oKjFsD{l8cp|IX1L0CRrRI{WK6zXAmS@4w{<2UvQ3L80GZ z5Mcl3xj(I%zv}#U?Vk_{pwa&8|NX4<^ZdVuIDfQ`{RQH%0V1{k#}(l}L<@YvJ{A*^ zQqGssFRc823(N%f*Jy!uY1PuGplgS-=gff7oIA!R2Ti?KpR^TGZ9T#0^#q1F7t2K6 z&vQ&>8GSCI?Y?AQi`g06W7ys;dJE1Qu`j!2G)@d}y>Vl0YisMOocME?oli~9me@my z?N2w}HJo8T_}&x_*s3PYT~6BOS=V(~{cI*j_c*S7Hh6T`*GB=zD4@1z0)5w_aS!u>e~J8VM35 zdURR92CZ2PptByyOEPV3Oc*ZYt#%U+6%8K=k6fb7hZ0amKaUb%RlWZaYM}5_cu!lU zks{d8X?Q?A5}Pd37EQPnLJI`ynl`IrFPJPg_%4|0C>0T92V>)?e`YRC0boj=C$2&Mn-iDyEQ21UOBLV0hJ&P_sTvA}(W4PVAXs}09diiqQY zuEzdu;2lz9On5obidu`r)%wvPNmZYhI^{PSixdIGpb#=F2Xr<yx2e4ySXrnY_U>@EFDYmp>1~-$%^<22uUp@cTzF z^?z2+otf!(XM{h3sXxv@zrYk58vq9WvZu1Lv;7BN0U??mnySMqIRI7lTBUIK^kT}1T<@#>DYfwiM)}@Y|(FdWARIiMi72iWGT3B6> z>8xzNy*<((>i40{>%6T#J0Is}J?OY%=yuDf@laJj!3lCD!N_Y&yTFZmFbY0jO3IAy zwsn4?7vX57mcEEcVaEwA<^b2%Tv$TPKBZ2}xRO|$Ie+#VdTcD%Of%A~o{XNt-HRpY z^)yoVHcJT6N0VJld3b+@m6E&i<`n}8!$b9wgQ+Jc{8|8qMJA^DCL+{*)gGg01kSs# zr10hvy@PeNfVx@IaX5hGp~$mjErXHItX$)LGSyk5$dnjd&?>^V#Mc7e^BAR>o7ESl zAqSMdLk9;MtCUji8k~A56ko(gC6y6 zq`@hW_rrNZ2yxgpblZ;dhaqACnczk|c?po+aJ3O&kDg}e5pZf?YsFoNO|0FJ!Vz?C zVskPP7?UsxVlOgLymY7***H$IRIKIP#TiYLn2I=FdwfVd5C39%_W?&~|5;c|B`EEv z4RM>1;Noqt%vjlT%e5(}PfVYKj|{VGyccpR4d9ROGLv)$Y_=He-`j6G2ww5I+g&=k z$8R~?(h@Jxf>|8 zOiYeKdB&*4a}o?^eY(~Ak2Kc(P$f#w-Am%%cQ*Dn&s}fZcXTM97rsU*%PBW31U$4O z7q_hzg@8kNvQIH#MX!$XQ9uw{`*vG9WV7q{$)lO9BEPHpto5(oyP!lCS>tbkE<>zzQVvy-D* zKaG`$V#f)F(+y~R5@8<%=NJ(81s|d*;ocoBPpVbvp%NwScg`T4Q{``pMRfb`K2%U1O1BNft|~oE2rQ zd5;WYYxRQBXBdEJcXv+Ia+i^l>wOB1F?4Gq2(eoYZl ziKR1$sU+nMVuv6Y`4q;6=sqwaU9zQ%CPH)#NECvhTZd!1b09=@ZwE(RMViVOjTM-^(m; ztx2#)?YT=^i3M=Jp^-fjp|qio*lPPX-)o}3qYK6>N^vt-RO>a46n{GYhHZVEP-W7D zoOr|+Me8-*+tj<3$S;N746ExGJMMB|Lk0SO? zPVa^Q!J=3FNtzMZ=(3c?0_0EyQa=Qc4uz<=)TaO&1ei>{Miq-m(U2o~r7Uidx!um$ zuJy|F)4{Ab>D#GpdZTxp_cp4oh9gtQ=WMwhY!cgpZnw?lety0%>j8nm4~h9KgjOXhb^7~j<2yr`!PR-N}n^!x>Vkvg4ZkEUmIc2oOXG(I(&KV%+zTqUB{ zx!dXa%szgIC|wWXK=*>xkSO_jlr5a<@yGq*G-ykN!-QKCe=An$X)NK@+UF}j08R)opnX|ajDsJYZ!)02jK=yueLNsBLI z6M+F~s16SUrL&RO78`JzTNSng*|1eDH|V?(uM`;bJeS-U?oeal)`GDcq#rHa({>b8 z$lPx6qO>~hkqWfQ?JnvTfBAloliMwABIv^bBf96Uk5uftWYk2F(C?PP^jIE4+zzb7 zUCu~`-u_c?4D%oE=k5bkEeErq&p}sIA_nwnRH?MI+YDj$fslFmYuPJX`N??I5;sz# zzg~^G?ef0I%DGvtx~z}yTQspfQK$>X{bB6Gs;a6$l@*l&kwXj32-n

    uP^j^Cg|n zcyQ! zI##wg7@owgMLV_L)=W?FVN>$|@f^=YP7=UqzS}14U zD7Mw^=qs$O=eGo@GNw^vPmS_>v=LQTH8eSvykeQTNlutq=US||Y__`AS3g16zlSPm z@!v?8$u?YAQ<*ykcXVnn%GEoK>s@-34#*b4Fa<+BR3GC;vbz?;|(Q^=8`M zl-)aKa^ra%sl|ujl~6r$))z4misqf`Y%4=LA;J8q9<@$c!s&gU@>~i6k}wE}TZ9pMNZ>f$h;W=G?;_veo71?s&H(8>MwV&>C$D1)y+Jnhp0v;`Bx zcw!9|Gp>?aRz&YgFq6J%Y-ug>Ex4WT+wASw*q?@ky?uG~R(whsDhwMIeB8F*X{s#F zXxh#C=o)@`E+0LkSJE^0glMSiVR2x;@^-VEW!x#yy)xtTvhPw;0QXUSs)sZ#t6|lj zj|zAC-MA!bmzdh;O<(rAmypNI>kqk~T;TW6P*meMQdE&~%l7pphC^5HLcV+B z&N+V2ZQd;9_mW2Db?V=|o{5Np0p;EcIA@2R3EiiqKzCv@mfs41P=%jMNX2}xO+M-^ zDN+>v=&Xj&h{m!szCu!XET-V6t5-NJbCyhy?h~oHz&5pswBJ6FG?8RxI$3qfF@xIA z2Z&WUul;-@MK>Q%2K7;5i|D#6XrKM~@t!yZgRk0M&ez(8n%1UOO=Y1kyw=K&!NZ6} zzz?$B;Kymz>6GAUo2W7!X=x_j{R$vdrB=#D7-Uih?xs^_pzGU9U1-nuq|2#UM*_@& z;|vn3@d;ixE2mrWgfCC_Zfa_KBYB=IYPD3;Yy%uvcFyZR+U8`*gA>$lKg1CQvkLZ+ z)PEsyI>=?dP2J7deOG@idTI2Mx+=Kg-bW^wle)f?nW3tvlEy5&?GA&1M=2iT1$j^> zFR^o@<94>0&#eq1VhiFo?x@POul3DCLwGvA1lJCP9;5CT2y@T3amtWX>}(U_JoeFjvu#*;{rN}G*ol!* zjCx&kx3|uvycC&f&VgDw5#ZP+iba-wOJv$2%E(K%?9W#IPSX|>s=%BhR%y#h^jY=w zO&dCTgv5B|gtT8GBUf+>{S*n)vGYdF%H&gT?}NqYmo+BAgh+q|4Za9~`ku=k#0o&c zBXujX2s|$IyL8+M>PTZ0v!PQ~`_ZOSHy>~JTke4_8JjN~pQah5wK~+%U@(%|gtFFi z-J~rTr(#%nsLq=Y9-Om6SL5^P|&c`-!v@elJxkZ;2(HH(=<%N54M8)Ikd{I^hk7-r_b;P*jDIN*nPT8)yNHO` zxdUv%T!7Rr^emi!FUrKm4iMV=cQ#>w`Izx%bDG*Z1L^>NIHRbcy_Bh?xrH;pCJbnw zzuJT~e=h~x6+jhXSM%Q*4)B`+B;?=hz|21_#y`Ibz?0jQg@_q&1%Oip%pa(W-%5V{ z>R;3a7uU~={)O~o=K@fO|0C;HsqJBpwwUJUGfq4qH7q(QEOyQ9#l;AdB}Aiz&Dlri7D?F` z$bG(o=u#4-(^Y6vr0inVfpoQTx|d{BevPMbk$G2m1>Lu(Z*E}=HV>q=xbLprURTpa zAq8Le?sD$0rc&MpSq%7~m_lp5H_!|))f!O@SUp@#3A`klZq*wJC+|n_DLHrCX}XBD zbhD7)RYpSHz4Htv)gkFSPNCVhuNCG}Lq;~E843y7I}vH&Bf+DJfSNNUgihCwaYfoEr7s4x*GUC;5Chwr8IClP zh8pTf+E=wE8Qgmu&Q!xqa@|fUviTVi?Co>-)+!q1aR(>K@e&rYXZJa3;! z780@*D*<{;}_32X#T`uU3+##Y6KK5Z-~hA4{mmXPO_c+?~fkoWzeDni;T2^eScKB)j|gO1RpOzRoQd z9}=ob*}dXQHR z;(IGf_%pIQ?$(>gakuaW$>K$`7{T!S6qqdXO%g^BfB*%tN3yQ5uL6N;UNtL)W8^ca zM~W_y#wf`Hk2G`<;}GA)}^xj5pdo<>|)!ik4QU0coPoZv-9PM@JNN@5f6`A8dUqgdRSbNUaAg( zKOhRn{@DGnoAE3@5C6@w^>$z5rS z#}g(33&DBRUaSR&KhT0A)mGL?M1~J6PA=K!SMd?L^&F`PLfkl}S-D8U)G4100*cS+ zG#gxHE36oKciu%$8>3-G?S9!eSh@br#cgzw-f=#8O)6M?sG`_Sg$qK$$+ zOFTw;73>rkGb}Jw0;;#}AxL*2`=v(5QL0P1CY=+dzJO9GEZ)OqAUr0Hh(U9NX>wY@ zG9cpPqjw}0))i+!dX0iOe;cM$t+N;|BpNeWWaI`lyxYA=SRWXf7uxA%ud&=T120+F z;T@ychC=at0%(V(3NIrwqU)sBC&>0EZ=|wt#M^s)L+mC7|KJWtr!Me#DgyMVXSjJ4 zGIC^UA#Lo6aoP&yqGm?-aF*i9OB#_7(HfCKOR*?|>+ori@@+br^^Pc*KDa1s{fvte z)}D(VZ1%NGiB4u2n}O(%Njj7iVtah&1>bWNj9Jw3D=dCy0#@8%BYLXu=msg%Cq`J8 zWoZ${$5&g>u>Sf6`}@Xxt`@KgI93s9pyY!SmD{=8*Q+H)){@tG7 z;`A@3cDA3)o7;<44cCwQK{&=)SJ#1qU1^0Tmx)L!&=s3wXCaQmJ;FBM=X9n?q#h

    Pf)hXr)ONh$WsSzo0n>$HeLQsLDfJ?F}<>4C%VI#CKqtWYQ z&lyokl}caojd34fZ;YxXTDi;EsT&o|Xb+Not(usR!?BT2G>22E>?8Z&Ya53%FwgHO zg(ETU!Q;Ji%j}2LfgX#d=3crWFqNpit4ae(inBlJ%2{D4KHu2R`xu`x@2#BjW6%Tf z3??)Xt;&sPZtpNj?Y-ih^GyVVvN~U{5)c}^Z!(t`t6CgxVJ=soBe2tUxoreTmHWJZ(Rfo7akt)Wm>Qudeu+dK=MUS(0t;><%LQIh;}#<1P|(?$(A)=i zPTObO0j!eK0cmvkWoL?-n!L@A)9&< zrMsvwQaRxp^)Nz!^yI^H9p~rw>*2x9ZsmtGfnGW|=dSEvB(-1Anu;(u$OBu+M>dU|AHD+@V#EYq}(&? zySZH)o^M=@!gsgtT=QXup3tM=5~fJYpPjV#z_jR6m5U*ijXX3gB!Oat8p8FJ^}Z94 zr19jXiTE!GCBVY7r?r;B zrZ@s&);pvufzhP;&X-k>eR@N5eSB@HY&g>~pP$FSDM#2S$viXcuS`pSl6Xj#4nraT zUcMyzQe*r(@{U@x0$oF&D5TAb(1g}r>Bqc=TR64Awsc@DQM425YbAJSG2jj8=AfJv zxKd!}J$_*mAEi3L2>&hwi2wv61z2Cif*MNZC~1x17=LH5=kU#=%}-$4rYJT7L1z~h zwOG_F4g{4DHs>kkMj-K5xhZnY-w0Ai3p`NkF0Cp+w4ZcxCXOqd>A-EIZGSmMj!DyR zXOAi=DOt-?aPgaowT{Dsm}hne0+&~7TerY1vpJZj)P-`;7O_xc(T{LP7M%f>rhti0 z$MPhWqA!&;1g?rH)X?n1Dq``BivO~&HRNv2ih4rSOsIhC@ZQlfV_X z_v}h>bf{Y%OGH>to?PNxusUgKJk4#YxWfuaqY^MxRfU-ZEW0?QBx$iJH`+k96;rlZ z$+9W905C^#f5T;o&G`iL(z`D+ zMlq840A;HQmvhGZQU`)8b<`l1f~kqN7c7q>Y@p=S>=`*0@%njhjjyqFiwkV~^@+1M ztSg5_>>tt6&BvEsFJ1A;+QeYj-sLf5!KMJE(C%Xg(aQLTH}s6dXmLnu%f?$gydUs_ zP;O2fi2eSo73_#7%Q!aF!kEpxAx~H=Ae2a0j@Rs3teby?q*MIq`{64%Y{|ehPg2kg z|0@DKr&vf^R!aeX8C80u8^@^HcP}oS*q$9#E9QdWa$j6kRc&K8&Zc2EJdu*(jR00U zn4tE_z1-rgHiZ~&@HFPV+YdTbJ8T>ZrPMB8=p(&qrpkpuBw!0y4xiwp2jmU1~ebkfR*({t^ln8tiO?VG4rTY;yXc57H-@uG9q z_S*66ds+NEllk3KsTyO%_9#5)(J0_J;k4g1PeDb@0Z1IV(l?-?=*bkD{$0vp#yrBK zRS<2K35JwD&pP3B=Qz$wsB=fwdIVDtu(eKj$0vpN$+lzstrTt;j-?QnWn17ZuOUX3 zBg`+?WdvoVuwd(&vuN9T%FCDjYhjpT!yu!%-wGa;xCF9N9Ny(gmWTw~V~#)tRY#4J znt)~~RAJ3Zp>Il3JV1_s3&g@*7LOvmmxoxs^Y`-ydsyEkxq@_rK2U|o&OEq_k9tOD z5jtAN#o`@7dSZ*%zKaVx9~_=o7fS?sI>h8>74YtW)+F;$hVyUtr8DsU^5FM+@xDh9 z(gtoIoH4$3Ah^28fkvKBD~u6MJpqUc`MsHS89257A*8Yh%z1N6^o~w=GPa?aj2_kB2>&Ce_Zvq=W=T9_L8UY^i#RXo${d$ehK`yH;;cHxn68LF9_l?^;nF z-tRL#An^=#eHr}JH=q}nDyXBTq8S%0StPfZ{H3l=#uMSdc0D~~=K`riT4Vd^tDRvC zLft~IpsQl$;P^n@_CT<^z>rkymL#LcgHD>r$M}{3u?RyHSVQ6<)On9BF-upcdiRx>d{V0JPG%s|SJI zePVoU>*o1tM3!2d7eB|{eso22WTcDM!)JRdZEG%x=E>ae+o1%|d#xc0!IzF)1KPWe z(rKQZ@-$^;hNg8B=}DPJiMmw-Xw0Vj-E39%R^Kd6`8Zj{Pu#+x@7YXMK_RBN z^0{qB*6tpfMYY6!&61(Dpr<+>wOh$L+hpU=Y~0!I$})fE4v&J3kM)4;g1I!|?>?r{0-J**C8;1t5J57+E}4aOL4tw_5Rj+@ zMS^5el9F>=1w;YKIS5EjN)B&E@4a4j@B7~O2fRLe>P*+Ep6;#=J2Stk>gI3DT8pjW z^(fb9E>2e$!wz+YDY?R1MM7Ps7A2*n4u+A*0@1gZc_-d-m#mm;4Rv75=>3bF(=s>e zW~8kriX^T-X~P+;y3W+3rFbhL4fYQF=&P8x-q29QD-FudR|I$Ue4~x^Yuj&amr}Y#HpNCZl`2>jW`MY3&K@Q9^ad@x8LrQLOf(| zF`f-e31GcMG50ae5JDGZ;H-r-<@Cn}xgdkr)5v2U!_6tX+a8#-Eow$6qt>Fhju*Ja zSk??gnsbNSD8kJ!{;s*oF5Jh*Nm9ITS{x@g#BwoFt_vT^CL_!^3wes2dt03?VlCJ7 zCT5i0i!WH75=(cPbnk~@m?c8za~^A6mW#GsU#Fvb(CbD6Sx9@yS5pg)QFe9Ew2h{hE!+jRZeRq}9b+)k@o@vxewJ$al~ZTFHrwBmc1bGqVlXJfhkp=WR`+!91b0wPHy_>pRn8%JUBl;7_6?nENWQ( zM@LIb$MoqlaM`E|yjM!$N;WD4CF-QE@leN+$6-V24SPU}&N(igiQ;|MDg=Vhz6$0{ zHH}mcj}OTe$;BSsqX!&0q-Gv?^7G%J*T~s=eWYu|kyRvgX3=pfy{D|i{;^Q>ZO7ZA z`Q=3k^K+CGeIEK_1rCwv+Au4B_B?pbi@K6FaRu&o<-=R!dRCtr;!Ns>C+jyGT`{Pc z`{~Wu-s6!Y-bzN&Nps`lw}W!Dqrwo3%y9t z68G1M*Lr=RHuBmZal@WDgXRf5$e|dWH`08N zxO9*M?cttqOFO%_+$qe!$X!3@Q1Q^%hHm{xXH8Or_$kFOIOh^k!}Wstm!Qqo8p>tc z83FgEjjs9kmJ7+!4tx?D`6HQcCyPs28NP&5&SqXob~zR2UNvTNgKQ453i3Np*?AV% z;`6|b(6EX;Iu`L!<Hr zUYQF?VtZ!!_Q~>nSLK1H%iqSe$E1ve)e*v8x>G9c2jup1J(pofNiVScw&|f0(2up~ zD=G21pj5jg(dlk!?3C%0m2dOGMwq?VaU>zC_LY@vSRc3Upg5s6*DnlOLGdK%-cYN% z7%4C*%$A1^@6iqI%7XD>gugs*yh%128tCoL99_#cnGK$7rPr9{LkEaa@i|XHCj4molY7qOSDe-D*{e+;UI?vtRcVX6f}wTz%fX#rnbSNbDMK zeCqUpnYokfm#b>|$4tKQs2xw#mD54QiB~;jDG)JXI6!_9ez@l(iBaMDN5tj%#^#W% z@H^AFtwX{y^K+$o+RjVuhuD4XoQC6?bMxXBatSLwy6UvXOOZR^&y_~9I zU@^?{E$ETH_ze;M4hnn6)jLj+%u^S7SQO|G;|gNJ!UZ!A%y(B7m7A6pzP+QrnR7>^ z=EvN!Cq8*jli$#|Y5vojhe`P}cTQV#O$wnG0$?xVQvKb>TPt!WRvcFy&-EG1`c5Xe zZi+1@?7U!Az`EAz(sZsUJ_s2^Xp+&HyVmIME<1HnKR)7nFt@3|+<>ITjw?_n&X-$- z_ctqHoBB?|ZrRz>X|D!eJ$p5o);K47q#U{;-Lusx@-Gn0KaF31JBR%j5yfTXgyrzI zX4)3Uw*LX7p@}UHgMk=^I{2NP^d}qxe!Bk~n*lR_K{J4hB9aQa_$Q+D=V$x>KUjcv zSYa?w@qeKmzsL1Afc6V5Q8Bi`1E30!{(U&9ZB+q~2ZjVe9>7Hw1_R(F918ir`u$-6 zpy?kN4+s|hjqyOxSS$lA#)AY9x8E2KU|9MOj0Xn7aoaGSU%s^e#CZN0r0Xw?2Ll6Z z{&TV*VE;|HpQ-E0b}a8%r=M}BIwr6hK=%%me>_uvra!0*&9?t~*$Ind8|^Wr)rjD7 z@lO4X_QOUW(``Od4x&;1`e`S;^fW>=UMT*kJvN{;`QQ)^tBD&9PSFcKt>^ec>F`z+ zVZJ!3hmQG$qGsCT#H2pM(&?qu;^#hWAHR zj#}N`I93+t+w0M}J{eF}f28*YyCjc({}=mlay~_ii*mwBYjAv~lQ%xlX@gfTmG5q_ zwZH@XIOl6o#nW>k&1LBtbDN{n6D#yzOoG?Dmbwz2_icU0a1uAi)`)|y&)pAFIz2`0 z;?ldb*nAClZ`CbFg?y2@CYh(;`!4vXi8Jz;a%1e>i1Tq$3_8)Rg~%J-*kr5XNh@SX zPqOUu3dizlXC!7GlOsIOUl*6(xzpyj5=^!$eM)y6mcye*x4sXl>8d_)W2&DV=gzoc zkh;Hf%gzpAC(o(C`LKldTY?*t%NAX6pUh0|-L2ZEA4cz%?(kdv3~L+e6m8wDT$rt| z#Bb7X#E=&|)$EvYIGN*&_c@I{al{+VOKd+R`-w4Q!exdJ6gISC?ojT3I8cUx-xKd- zX5({g#oo|*Td# zv$$*5P=Ah3P)E*ZYz7Ltn6%p@nw3rRTBj4KE=gf2w5+`l2Q6?ik_ajwOkSd+42>jF zY!OgL8-~a4(VCAhV7OsCuw5oe_Vw*^szzj6#>XN(tDa^nBSRFzh6djcBcpK6U6t)w zU)r-op*%Xr**F^~TrrogWZz1Uh;j*x`LRbb8yXPba{HrZ&&5+;;`Ow8O^z(il5RrL zXKwIm$mE}XW4}sJ5Rm_#Vzg2tl$0VkQ=rxB>?PSOex{f6R_F=F7+uYl61Aih`Cf-- z_12ss>$?SCu4-ixH-cWANbpt&KiSP1JLg9aF*f!JGP8YeLuVejmw7k=5y*(|io-j) zXZejaXTcH?N7oa2Z58gj-!EZajt-I*%UrN#c2Ck~ci&@ZD(}sz9TjP6rhUDm)?mli zff;z7@8tE1XEMG_vVMrHPNuu)qTb5SH7^=LSYQ+1lgub|Y~6Vg>8ju#X(lG~rFb-o ze%iVJHz{?A8Dr~ljo`r_%J7{plsP-E)E#bkP;Qp))Az0tGBGsnaGwaByCLlB*7-E# zr)lTz-HGujuP0G2)@OD~`4v{!Un>0A`$l;|!c|4Mu@mt?_e-lT(O_kcGkq(Ww34lq zUNS#ly?^v`q~t==*s&{-5}A3k6$=YD-yw$8?K_1fZmqY!(|D4$@Ud~dYWdrTpvo)K zCExPY31Y*5uxu^x@0#7LRoDZQe3ab?p>a(7W?t zFPNlH+W+DCt&DrJ{A@(TeZOV5SG6aOHk*D=IdLzsBv)U1kOZ??V7XPRs_B<0XBpc- zFR13lvcN=uhCQ+7NL^+$yk^-D&-bz7vG@*)L>-xup}I+XH{OT)E6zMme57;RrD{$? zM#!$%GXOUmJ~-y)lXSUF|DaUw#U=?l{nHbvq~URMRGc1nG&xOR4lzIe@Ll|AOT(R* zVUd1Zn=%C+?Bjd$W!R&WHx5^LvQB;s=6Eu{`h(GgwWj6d`9XPYUW=DZ<_yB3uX#m; zhEptXYI~)}ed=^R)HI{aW2$9zX9KP7&D*Mo|C z0)@uj!|)DgW;2hPAYHz$v-? zVoN^Pdb|3$jOb-lo?`2Zv;-$)KI5HRt)rKGZ*$lOv)dDXu(v(14H}4Qa7s65(H0}c z=-*@RI4y2n`P^8pmf6*m;gwuyL`fg<8bYU2*yZ6;QedQ;U-jCWQ1&?A+e9(ALFNwc zH^Ls`;u(9S>m=VjWbw+MZNVlP+C4$mRmBepC>B)>G+d@E8Rgg>)$h6G+f=T7FqCpu z{JCvQpw2!uA&t~QMya|}5u&qWf#+~!0h>jY3dVTbq&eQ_2YtC?HRG1NoO1T;voyFQ z$D}9p?qPHY%W-Qt!qOpz^YXGEs-hH!-R3X(uvCOBXiC&oU>6@BChWa%SXvkiFv{-LiCzrrd`^V(GQA4mihPN?c$-*~t)?fUv^5kfDDS`McMo;V^Svuu;O$AHZPbwrHJmAKl|9upCmQ`r{(c?@&sOhS(tBF-k#~kc?H8n#BS6!QpqO6r-8rG_1 z_KkA+YdAe~7?UKwyeFN}=#-f|nv7J&Di&y0RovB8rjF;aFnrAy09@@2+zBOL9t-Q2BV5o=^>V3Y}yxX~p?rGe$yI1{nM4sqa zU4NbzH{RCl<;r1L^Ys9p+_2!OP?4gzOY534vEs1>V&5%?x>Tjg!~JfzG%Zxx_Amuq z3h$U&OO=pN$A)Z8$)0v85(`np5WZIYxLCm@ndHXl2Z zdWY;@zSjra@{+r~Pbpe8kN5Ofhryg7)nGwZqV&nGnk74=XW5Vd9>!N*&_^&;^ctUz zSHXrs_c@Or!SV^Qd{-W0K}G5bn5aMyDxKMNFakGY<01MJ0p_! z2Kp!9gBI)uo&h(EJE)fL*kvEa5yvSyJtkpsTJnv+`dmdN;84RJb6q8%q^)|&9k*=F z*W#R~P|X{F9*7{2P}(K=d!^~CSQX_{DMP`Y8WPXbE(OtPS@#_pZkBo>oviQlohvQ5 zBzlPONo={7r1~roGW*(lk8Q5tn4n=pDc9}g98r?Az5H8f5;?9p{CeJLFZc9Pce2m) zC)~|XyXj_jM%@p3%?Vev(?VCe-+%ckV6bwQZ&%oDiG4@qO_trG&dLXK9`{InbxEPa zg>rCkY$|uHPJj^UJR>)%?Yua7{}#vK*4})#3-bT=;q_; zk!Xd-+u=zNG7(}7xrw(kCK;3QrofGiZ{i_j;~Um^QB$IclMsWLDv4<7U;>B=Zf@xT zu~}?DVhj=!kX!+Y21p>Ji)LrQ3)tU86ud$L#b8j3WOUOD~D0=~yTU>?wp9MElupxjwF6pnx*;V?L09|?o%K%oM_50FzPn*L`IhTpG?P9!`5u-Syc z&;MDi~ zC9z!&Fa-ttD7W+eRSp9N;`uM+uxP*}^Z%AZ!!ZBWH=y_1%L286p{&64w_czK42>MX zL($4%L7($&Wr2?gwEX~vf?ChOU;(4Ez3i{tb{^0Yzz%gg?_cE*FeG&y|0;)sfo8y9 z`rY53x$Qg*7PLg&&VvEAyfi!*7POB3d56E|6%2(zQS14a#CACh4*c@6o%e5YfH^A| z77k2*w;6^*0%pQgJQR*b_b}kY4z*t3NEBdnOjQ;K8tZQ7!I5YzwXP6Q6zEM1roY<) z7+!;h%iDMeC>k(frsAQYFzPW!Krz5Egw%2{nl%WlUeJhpdmU&v7)fCp4+#TomZ^BZ zB(}?85peJj#_eS>Q1DF4Z9KpWnx?-<7~mXDC5OboaMW%2b*Jt31D7vN9SGWe2r%{r z4tv;M7KH#tor;G8c1~)Yq7Ya#Ee`=k4+qoV#wCo`G3K zvlcKoK!H?pSl}^+S}#~Qnr0uz!V!Qo!S*_UrqJwbSOge*gh~#91W!xd);=tfc1>e3 zwDSs!#(+mwZZC_W9ScqhME?1J225JuFPed$U;c^$!~Xy> C^8H=_ literal 0 HcmV?d00001 From 168587e6a041490d3cb3bdd5ae60ef072cbbae2d Mon Sep 17 00:00:00 2001 From: Liangjun He Date: Sat, 6 May 2023 22:19:29 +0800 Subject: [PATCH 16/16] HBASE-27516 Document the table based replication queue storage in ref guide (#5203) Signed-off-by: Duo Zhang --- src/main/asciidoc/_chapters/ops_mgt.adoc | 70 ++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/src/main/asciidoc/_chapters/ops_mgt.adoc b/src/main/asciidoc/_chapters/ops_mgt.adoc index 5132c16ae945..fbde391a0394 100644 --- a/src/main/asciidoc/_chapters/ops_mgt.adoc +++ b/src/main/asciidoc/_chapters/ops_mgt.adoc @@ -2436,10 +2436,13 @@ states. HBASE-15867 is only half done, as although we have abstract these two interfaces, we still only have zookeeper based implementations. + And in HBASE-27110, we have implemented a file system based replication peer storage, to store replication peer state on file system. Of course you can still use the zookeeper based replication peer storage. + And in HBASE-27109, we have changed the replication queue storage from zookeeper based to hbase table based. See the below `Replication Queue State` in hbase:replication table section for more details. + Replication State in ZooKeeper:: By default, the state is contained in the base node _/hbase/replication_. - Usually this nodes contains two child nodes, the `peers` znode is for storing replication peer -state, and the `rs` znodes is for storing replication queue state. + Usually this nodes contains two child nodes, the peers znode is for storing replication peer state, and the rs znodes is for storing replication queue state. And if you choose the file system based replication peer storage, you will not see the peers znode. + And starting from 3.0.0, we have moved the replication queue state to <> table, so you will not see the rs znode. The `Peers` Znode:: The `peers` znode is stored in _/hbase/replication/peers_ by default. @@ -2454,6 +2457,12 @@ The `RS` Znode:: The child znode name is the region server's hostname, client port, and start code. This list includes both live and dead region servers. +[[hbase:replication]] +The hbase:replication Table:: + After 3.0.0, the `Queue` has been stored in the hbase:replication table, where the row key is -[/], the WAL group will be the qualifier, and the serialized ReplicationGroupOffset will be the value. + The ReplicationGroupOffset includes the wal file of the corresponding queue (-[/]) and its offset. + Because we track replication offset per queue instead of per file, we only need to store one replication offset per queue. + Other implementations for `ReplicationPeerStorage`:: Starting from 2.6.0, we introduce a file system based `ReplicationPeerStorage`, which stores the replication peer state with files on HFile file system, instead of znodes on ZooKeeper. @@ -2473,7 +2482,7 @@ A ZooKeeper watcher is placed on the _${zookeeper.znode.parent}/rs_ node of the This watch is used to monitor changes in the composition of the slave cluster. When nodes are removed from the slave cluster, or if nodes go down or come back up, the master cluster's region servers will respond by selecting a new pool of slave region servers to replicate to. -==== Keeping Track of Logs +==== Keeping Track of Logs(based on ZooKeeper) Each master cluster region server has its own znode in the replication znodes hierarchy. It contains one znode per peer cluster (if 5 slave clusters, 5 znodes are created), and each of these contain a queue of WALs to process. @@ -2494,6 +2503,18 @@ If the log is in the queue, the path will be updated in memory. If the log is currently being replicated, the change will be done atomically so that the reader doesn't attempt to open the file when has already been moved. Because moving a file is a NameNode operation , if the reader is currently reading the log, it won't generate any exception. +==== Keeping Track of Logs(based on hbase table) + +After 3.0.0, for table based implementation, we have server name in row key, which means we will have lots of rows for a given peer. + +For a normal replication queue, the WAL files belong to the region server that is still alive, all the WAL files are kept in memory, so we do not need to get the WAL files from replication queue storage. +And for a recovered replication queue, we could get the WAL files of the dead region server by listing the old WAL directory on HDFS. So theoretically, we do not need to store every WAL file in replication queue storage. +And what’s more, we store the created time(usually) in the WAL file name, so for all the WAL files in a WAL group, we can sort them(actually we will sort them in the current replication framework), which means we only need to store one replication offset per queue. +When starting a recovered replication queue, we will skip all the files before this offset, and start replicating from this offset. + +For ReplicationLogCleaner, all the files before this offset can be deleted, otherwise not. + + ==== Reading, Filtering and Sending Edits By default, a source attempts to read from a WAL and ship log entries to a sink as quickly as possible. @@ -2523,8 +2544,8 @@ NOTE: WALs are saved when replication is enabled or disabled as long as peers ex When no region servers are failing, keeping track of the logs in ZooKeeper adds no value. Unfortunately, region servers do fail, and since ZooKeeper is highly available, it is useful for managing the transfer of the queues in the event of a failure. - -Each of the master cluster region servers keeps a watcher on every other region server, in order to be notified when one dies (just as the master does). When a failure happens, they all race to create a znode called `lock` inside the dead region server's znode that contains its queues. +Each of the master cluster region servers keeps a watcher on every other region server, in order to be notified when one dies (just as the master does). +When a failure happens, they all race to create a znode called `lock` inside the dead region server's znode that contains its queues. The region server that creates it successfully then transfers all the queues to its own znode, one at a time since ZooKeeper does not support renaming queues. After queues are all transferred, they are deleted from the old location. The znodes that were recovered are renamed with the ID of the slave cluster appended with the name of the dead server. @@ -2533,6 +2554,11 @@ Next, the master cluster region server creates one new source thread per copied The main difference is that those queues will never receive new data, since they do not belong to their new region server. When the reader hits the end of the last log, the queue's znode is deleted and the master cluster region server closes that replication source. +And starting from 2.5.0, the failover logic has been moved to SCP, where we add a SERVER_CRASH_CLAIM_REPLICATION_QUEUES step in SCP to claim the replication queues for a dead server. +And starting from 3.0.0, where we changed the replication queue storage from zookeeper to table, the update to the replication queue storage is async, so we also need an extra step to add the missing replication queues before claiming. + +==== The replication queue claiming(based on ZooKeeper) + Given a master cluster with 3 region servers replicating to a single slave with id `2`, the following hierarchy represents what the znodes layout could be at some point in time. The region servers' znodes all contain a `peers` znode which contains a single queue. The znode names in the queues represent the actual file names on HDFS in the form `address,port.timestamp`. @@ -2610,6 +2636,32 @@ The new layout will be: 1.1.1.2,60020.1312 (Contains a position) ---- +==== The replication queue claiming(based on hbase table) + +Given a master cluster with 3 region servers replicating to a single slave with id `2`, the following info represents what the storage layout of queue in the hbase:replication at some point in time. +Row key is -[/], and value is WAL && Offset. + +---- + + -[/] WAL && Offset + 2-1.1.1.1,60020,123456780 1.1.1.1,60020.1234 (Contains a position) + 2-1.1.1.2,60020,123456790 1.1.1.2,60020.1214 (Contains a position) + 2-1.1.1.3,60020,123456630 1.1.1.3,60020.1280 (Contains a position) +---- + +Assume that 1.1.1.2 failed. +The survivors will claim queue of that, and, arbitrarily, 1.1.1.3 wins. +It will claim all the queue of 1.1.1.2, including removing the row of a replication queue, and inserting a new row(where we change the server name to the region server which claims the queue). +Finally, the layout will look like the following: + +---- + + -[/] WAL && Offset + 2-1.1.1.1,60020,123456780 1.1.1.1,60020.1234 (Contains a position) + 2-1.1.1.3,60020,123456630 1.1.1.3,60020.1280 (Contains a position) + 2-1.1.1.3,60020,123456630 1.1.1.2,60020,123456790 1.1.1.2,60020.1214 (Contains a position) +---- + === Replication Metrics The following metrics are exposed at the global region server level and at the peer level: @@ -2694,6 +2746,14 @@ The following metrics are exposed at the global region server level and at the p | The directory for storing replication peer state, when filesystem replication peer storage is specified | peers + +| hbase.replication.queue.table.name +| The table for storing replication queue state +| hbase:replication + +| hbase.replication.queue.storage.impl +| The replication queue storage implementation +| TableReplicationQueueStorage |=== === Monitoring Replication Status