From 4e9068e5a3c83731aeb1d7d2e9ebc607261fd8e3 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 20 Apr 2021 15:30:17 -0700 Subject: [PATCH 01/15] HDDS-4330. Bootstrap new OM node --- .../org/apache/hadoop/hdds/NodeDetails.java | 42 ++- .../ozone/om/ha/OMFailoverProxyProvider.java | 54 +-- .../ozone/om/helpers}/OMNodeDetails.java | 86 ++++- .../om/protocol/OMInterServiceProtocol.java | 33 ++ .../om/protocolPB/Hadoop3OmTransport.java | 2 +- .../OMInterServiceProtocolClientSideImpl.java | 121 +++++++ .../protocolPB/OMInterServiceProtocolPB.java | 41 +++ .../om/ha/TestOMFailoverProxyProvider.java | 7 +- .../hadoop/ozone/MiniOzoneChaosCluster.java | 8 +- .../apache/hadoop/ozone/MiniOzoneCluster.java | 5 + .../hadoop/ozone/MiniOzoneClusterImpl.java | 7 +- .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 230 ++++++++++-- .../ozone/MiniOzoneOMHAClusterImpl.java | 5 +- .../ozone/om/TestOzoneManagerBootstrap.java | 212 ++++++++++++ .../om/TestOzoneManagerConfiguration.java | 2 +- .../hadoop/ozone/om/TestOzoneManagerHA.java | 3 + .../main/proto/OmInterServiceProtocol.proto | 64 ++++ .../hadoop/ozone/om/OMStarterInterface.java | 2 + .../apache/hadoop/ozone/om/OzoneManager.java | 326 ++++++++++++----- .../hadoop/ozone/om/OzoneManagerStarter.java | 35 ++ .../hadoop/ozone/om/ha/OMHANodeDetails.java | 1 + .../om/ratis/OzoneManagerRatisServer.java | 327 ++++++++++-------- .../om/ratis/OzoneManagerStateMachine.java | 32 ++ .../ratis/utils/OzoneManagerRatisUtils.java | 85 +++++ .../OzoneManagerSnapshotProvider.java | 11 +- .../OMInterServiceProtocolServerSideImpl.java | 73 ++++ ...ManagerProtocolServerSideTranslatorPB.java | 73 ++-- .../ozone/om/TestOzoneManagerStarter.java | 6 + .../ozone/om/failover/TestOMFailovers.java | 2 +- .../om/ratis/TestOzoneManagerRatisServer.java | 8 +- .../hadoop/fs/ozone/Hadoop27RpcTransport.java | 2 +- 31 files changed, 1562 insertions(+), 343 deletions(-) rename hadoop-ozone/{ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha => common/src/main/java/org/apache/hadoop/ozone/om/helpers}/OMNodeDetails.java (54%) create mode 100644 hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java create mode 100644 hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java create mode 100644 hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java create mode 100644 hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java create mode 100644 hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto create mode 100644 hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java index 27839da1cea6..49f5290bf568 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java @@ -28,20 +28,39 @@ public abstract class NodeDetails { private String serviceId; private String nodeId; - private InetSocketAddress rpcAddress; + private String hostAddress; + private int rpcPort; private int ratisPort; private String httpAddress; private String httpsAddress; + private InetSocketAddress rpcAddress; + /** * Constructs NodeDetails object. */ public NodeDetails(String serviceId, String nodeId, - InetSocketAddress rpcAddr, int ratisPort, - String httpAddress, String httpsAddress) { + InetSocketAddress rpcAddress, int ratisPort, + String httpAddress, String httpsAddress) { this.serviceId = serviceId; this.nodeId = nodeId; - this.rpcAddress = rpcAddr; + this.rpcAddress = rpcAddress; + this.hostAddress = rpcAddress.getHostName(); + this.rpcPort = rpcAddress.getPort(); + this.ratisPort = ratisPort; + this.httpAddress = httpAddress; + this.httpsAddress = httpsAddress; + } + + /** + * Constructs NodeDetails object. + */ + public NodeDetails(String serviceId, String nodeId, String hostAddr, + int rpcPort, int ratisPort, String httpAddress, String httpsAddress) { + this.serviceId = serviceId; + this.nodeId = nodeId; + this.hostAddress = hostAddr; + this.rpcPort = rpcPort; this.ratisPort = ratisPort; this.httpAddress = httpAddress; this.httpsAddress = httpsAddress; @@ -56,19 +75,26 @@ public String getNodeId() { } public InetSocketAddress getRpcAddress() { + if (rpcAddress == null) { + rpcAddress = NetUtils.createSocketAddr(hostAddress, rpcPort); + } return rpcAddress; } public boolean isHostUnresolved() { - return rpcAddress.isUnresolved(); + return getRpcAddress().isUnresolved(); } public InetAddress getInetAddress() { - return rpcAddress.getAddress(); + return getRpcAddress().getAddress(); } public String getHostName() { - return rpcAddress.getHostName(); + return getRpcAddress().getHostName(); + } + + public String getHostAddress() { + return hostAddress; } public String getRatisHostPortStr() { @@ -93,7 +119,7 @@ public int getRatisPort() { } public String getRpcAddressString() { - return NetUtils.getHostPortString(rpcAddress); + return NetUtils.getHostPortString(getRpcAddress()); } public String getHttpAddress() { diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 1eec32e1b630..75c3b32cf0c3 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -68,26 +68,28 @@ * multiple OMs to connect to. In case of OM failover, client can try * connecting to another OM node from the list of proxies. */ -public class OMFailoverProxyProvider implements - FailoverProxyProvider, Closeable { +public class OMFailoverProxyProvider implements + FailoverProxyProvider, Closeable { public static final Logger LOG = LoggerFactory.getLogger(OMFailoverProxyProvider.class); + private final String omServiceId; + private final ConfigurationSource conf; + private final Class protocolClass; + private final long omVersion; + private final UserGroupInformation ugi; + private final Text delegationTokenService; + // Map of OMNodeID to its proxy - private Map> omProxies; + private Map> omProxies; private Map omProxyInfos; private List omNodeIDList; private String currentProxyOMNodeId; private int currentProxyIndex; - private final ConfigurationSource conf; - private final long omVersion; - private final UserGroupInformation ugi; - private final Text delegationTokenService; - - private final String omServiceId; + private List retryExceptions = new ArrayList<>(); // OMFailoverProxyProvider, on encountering certain exception, tries each OM // once in a round robin fashion. After that it waits for configured time @@ -101,11 +103,13 @@ public class OMFailoverProxyProvider implements private Set accessControlExceptionOMs = new HashSet<>(); public OMFailoverProxyProvider(ConfigurationSource configuration, - UserGroupInformation ugi, String omServiceId) throws IOException { + UserGroupInformation ugi, String omServiceId, Class protocol) + throws IOException { this.conf = configuration; - this.omVersion = RPC.getProtocolVersion(OzoneManagerProtocolPB.class); + this.omVersion = RPC.getProtocolVersion(protocol); this.ugi = ugi; this.omServiceId = omServiceId; + this.protocolClass = protocol; loadOMClientConfigs(conf, this.omServiceId); this.delegationTokenService = computeDelegationTokenService(); @@ -171,20 +175,18 @@ public synchronized String getCurrentProxyOMNodeId() { return currentProxyOMNodeId; } - private OzoneManagerProtocolPB createOMProxy(InetSocketAddress omAddress) - throws IOException { + private T createOMProxy(InetSocketAddress omAddress) throws IOException { Configuration hadoopConf = LegacyHadoopConfigurationSource.asHadoopConfiguration(conf); - RPC.setProtocolEngine(hadoopConf, OzoneManagerProtocolPB.class, - ProtobufRpcEngine.class); + RPC.setProtocolEngine(hadoopConf, protocolClass, ProtobufRpcEngine.class); // FailoverOnNetworkException ensures that the IPC layer does not attempt // retries on the same OM in case of connection exception. This retry // policy essentially results in TRY_ONCE_THEN_FAIL. RetryPolicy connectionRetryPolicy = RetryPolicies .failoverOnNetworkException(0); - - return RPC.getProtocolProxy(OzoneManagerProtocolPB.class, omVersion, + + return (T) RPC.getProtocolProxy(protocolClass, omVersion, omAddress, ugi, hadoopConf, NetUtils.getDefaultSocketFactory( hadoopConf), (int) OmUtils.getOMClientRpcTimeOut(conf), connectionRetryPolicy).getProxy(); @@ -197,7 +199,7 @@ private OzoneManagerProtocolPB createOMProxy(InetSocketAddress omAddress) * @return the OM proxy object to invoke methods upon */ @Override - public synchronized ProxyInfo getProxy() { + public synchronized ProxyInfo getProxy() { ProxyInfo currentProxyInfo = omProxies.get(currentProxyOMNodeId); if (currentProxyInfo == null) { currentProxyInfo = createOMProxy(currentProxyOMNodeId); @@ -213,7 +215,7 @@ protected ProxyInfo createOMProxy(String nodeId) { InetSocketAddress address = omProxyInfo.getAddress(); ProxyInfo proxyInfo; try { - OzoneManagerProtocolPB proxy = createOMProxy(address); + T proxy = createOMProxy(address); // Create proxyInfo here, to make it work with all Hadoop versions. proxyInfo = new ProxyInfo<>(proxy, omProxyInfo.toString()); omProxies.put(nodeId, proxyInfo); @@ -328,8 +330,8 @@ protected Text computeDelegationTokenService() { } @Override - public Class getInterface() { - return OzoneManagerProtocolPB.class; + public Class getInterface() { + return protocolClass; } /** @@ -350,7 +352,7 @@ public Class getInterface() { * failover again. */ @Override - public void performFailover(OzoneManagerProtocolPB currentProxy) { + public void performFailover(T currentProxy) { if (LOG.isDebugEnabled()) { int currentIndex = getCurrentProxyIndex(); LOG.debug("Failing over OM proxy to index: {}, nodeId: {}", @@ -483,7 +485,7 @@ public synchronized boolean shouldFailover(Exception ex) { */ @Override public synchronized void close() throws IOException { - for (ProxyInfo proxyInfo : omProxies.values()) { + for (ProxyInfo proxyInfo : omProxies.values()) { if (proxyInfo != null) { RPC.stopProxy(proxyInfo.proxy); } @@ -496,7 +498,7 @@ public List getOMProxies() { } @VisibleForTesting - public Map> getOMProxyMap() { + public Map> getOMProxyMap() { return omProxies; } @@ -511,7 +513,7 @@ public List getOMProxyInfos() { * @param exception * @return OMLeaderNotReadyException */ - private static OMLeaderNotReadyException getLeaderNotReadyException( + public static OMLeaderNotReadyException getLeaderNotReadyException( Exception exception) { Throwable cause = exception.getCause(); if (cause instanceof RemoteException) { @@ -544,7 +546,7 @@ public static OMNotLeaderException getNotLeaderException( @VisibleForTesting protected void setProxiesForTesting( - Map> testOMProxies, + Map> testOMProxies, Map testOMProxyInfos, List testOMNodeIDList) { this.omProxies = testOMProxies; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMNodeDetails.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/OMNodeDetails.java similarity index 54% rename from hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMNodeDetails.java rename to hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/OMNodeDetails.java index 7156381ba2ed..d908b57f2729 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMNodeDetails.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/helpers/OMNodeDetails.java @@ -15,17 +15,23 @@ * the License. */ -package org.apache.hadoop.ozone.om.ha; +package org.apache.hadoop.ozone.om.helpers; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.hdds.server.http.HttpConfig; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.ozone.OmUtils; +import org.apache.hadoop.ozone.ha.ConfUtils; import org.apache.hadoop.hdds.NodeDetails; import java.net.InetSocketAddress; import static org.apache.hadoop.ozone.OzoneConsts.OZONE_DB_CHECKPOINT_REQUEST_FLUSH; import static org.apache.hadoop.ozone.OzoneConsts.OZONE_OM_DB_CHECKPOINT_HTTP_ENDPOINT; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_PORT_DEFAULT; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_PORT_KEY; /** * This class stores OM node details. @@ -43,6 +49,16 @@ private OMNodeDetails(String serviceId, String nodeId, this.rpcPort = rpcPort; } + /** + * Constructs OMNodeDetails object. + */ + private OMNodeDetails(String serviceId, String nodeId, String hostAddr, + int rpcPort, int ratisPort, String httpAddress, String httpsAddress) { + super(serviceId, nodeId, hostAddr, rpcPort, ratisPort, httpAddress, + httpsAddress); + this.rpcPort = rpcPort; + } + @Override public String toString() { return "OMNodeDetails[" @@ -66,18 +82,35 @@ public int getRpcPort() { public static class Builder { private String omServiceId; private String omNodeId; + private String hostAddress; private InetSocketAddress rpcAddress; private int rpcPort; private int ratisPort; private String httpAddr; private String httpsAddr; + public Builder setHostAddress(String hostName) { + this.hostAddress = hostName; + return this; + } + public Builder setRpcAddress(InetSocketAddress rpcAddr) { this.rpcAddress = rpcAddr; this.rpcPort = rpcAddress.getPort(); return this; } + public Builder setRatisAddress(InetSocketAddress ratisAddr) { + this.hostAddress = ratisAddr.getHostName(); + this.ratisPort = ratisAddr.getPort(); + return this; + } + + public Builder setRpcPort(int port) { + this.rpcPort = port; + return this; + } + public Builder setRatisPort(int port) { this.ratisPort = port; return this; @@ -104,13 +137,18 @@ public Builder setHttpsAddress(String httpsAddress) { } public OMNodeDetails build() { - return new OMNodeDetails(omServiceId, omNodeId, rpcAddress, rpcPort, - ratisPort, httpAddr, httpsAddr); + if (rpcAddress != null) { + return new OMNodeDetails(omServiceId, omNodeId, rpcAddress, rpcPort, + ratisPort, httpAddr, httpsAddr); + } else { + return new OMNodeDetails(omServiceId, omNodeId, hostAddress, rpcPort, + ratisPort, httpAddr, httpsAddr); + } } } - public String getOMDBCheckpointEnpointUrl(HttpConfig.Policy httpPolicy) { - if (httpPolicy.isHttpEnabled()) { + public String getOMDBCheckpointEnpointUrl(boolean isHttpPolicy) { + if (isHttpPolicy) { if (StringUtils.isNotEmpty(getHttpAddress())) { return "http://" + getHttpAddress() + OZONE_OM_DB_CHECKPOINT_HTTP_ENDPOINT + @@ -125,4 +163,40 @@ public String getOMDBCheckpointEnpointUrl(HttpConfig.Policy httpPolicy) { } return null; } + + public static OMNodeDetails getOMNodeDetailsFromConf(OzoneConfiguration conf, + String omServiceId, String omNodeId) { + String rpcAddrKey = ConfUtils.addKeySuffixes(OZONE_OM_ADDRESS_KEY, + omServiceId, omNodeId); + String rpcAddrStr = OmUtils.getOmRpcAddress(conf, rpcAddrKey); + if (rpcAddrStr == null || rpcAddrStr.isEmpty()) { + return null; + } + + String ratisPortKey = ConfUtils.addKeySuffixes(OZONE_OM_RATIS_PORT_KEY, + omServiceId, omNodeId); + int ratisPort = conf.getInt(ratisPortKey, OZONE_OM_RATIS_PORT_DEFAULT); + + InetSocketAddress omRpcAddress = null; + try { + omRpcAddress = NetUtils.createSocketAddr(rpcAddrStr); + } catch (Exception e) { + throw new IllegalArgumentException("Couldn't create socket address" + + " for OM " + omNodeId + " at " + rpcAddrStr, e); + } + + String httpAddr = OmUtils.getHttpAddressForOMPeerNode(conf, + omServiceId, omNodeId, omRpcAddress.getHostName()); + String httpsAddr = OmUtils.getHttpsAddressForOMPeerNode(conf, + omServiceId, omNodeId, omRpcAddress.getHostName()); + + return new OMNodeDetails.Builder() + .setOMNodeId(omNodeId) + .setRpcAddress(omRpcAddress) + .setRatisPort(ratisPort) + .setHttpAddress(httpAddr) + .setHttpsAddress(httpsAddr) + .setOMServiceId(omServiceId) + .build(); + } } diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java new file mode 100644 index 000000000000..568e94576c1a --- /dev/null +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.om.protocol; + +import java.io.Closeable; +import java.io.IOException; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; + +/** + * Protocol for inter OM communication. + */ +public interface OMInterServiceProtocol extends Closeable { + + /** + * Bootstrap OM by adding to existing OM Ratis ring. + */ + void bootstrap(OMNodeDetails newOMNode) throws IOException; +} diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java index ea0e534582cd..32d4f54b1f6b 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/Hadoop3OmTransport.java @@ -64,7 +64,7 @@ public Hadoop3OmTransport(ConfigurationSource conf, ProtobufRpcEngine.class); this.omFailoverProxyProvider = new OMFailoverProxyProvider(conf, ugi, - omServiceId); + omServiceId, OzoneManagerProtocolPB.class); int maxFailovers = conf.getInt( OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY, diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java new file mode 100644 index 000000000000..9ccac57b5a78 --- /dev/null +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.om.protocolPB; + +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; +import java.io.IOException; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.io.retry.RetryProxy; +import org.apache.hadoop.ipc.ProtobufHelper; +import org.apache.hadoop.ipc.ProtobufRpcEngine; +import org.apache.hadoop.ipc.RPC; +import org.apache.hadoop.ozone.OzoneConfigKeys; +import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; +import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; +import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; +import org.apache.hadoop.ozone.om.protocol.OMInterServiceProtocol; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapErrorCode; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMRequest; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; +import org.apache.hadoop.security.UserGroupInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Protocol implementation for Inter OM communication + */ +public class OMInterServiceProtocolClientSideImpl implements + OMInterServiceProtocol { + + /** + * RpcController is not used and hence is set to null. + */ + private static final RpcController NULL_RPC_CONTROLLER = null; + + private static final Logger LOG = + LoggerFactory.getLogger(OMInterServiceProtocolClientSideImpl.class); + + private final OMFailoverProxyProvider omFailoverProxyProvider; + + private final OMInterServiceProtocolPB rpcProxy; + + public OMInterServiceProtocolClientSideImpl(ConfigurationSource conf, + UserGroupInformation ugi, String omServiceId) throws IOException { + + RPC.setProtocolEngine(OzoneConfiguration.of(conf), + OMInterServiceProtocolPB.class, ProtobufRpcEngine.class); + + this.omFailoverProxyProvider = new OMFailoverProxyProvider(conf, ugi, + omServiceId, OMInterServiceProtocolPB.class); + + int maxFailovers = conf.getInt( + OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY, + OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT); + + this.rpcProxy = (OMInterServiceProtocolPB) RetryProxy.create( + OMInterServiceProtocolPB.class, omFailoverProxyProvider, + omFailoverProxyProvider.getRetryPolicy(maxFailovers)); + } + + @Override + public void bootstrap(OMNodeDetails newOMNode) throws IOException { + BootstrapOMRequest bootstrapOMRequest = BootstrapOMRequest.newBuilder() + .setNodeId(newOMNode.getNodeId()) + .setHostAddress(newOMNode.getHostAddress()) + .setRatisPort(newOMNode.getRatisPort()) + .build(); + + BootstrapOMResponse response; + try { + response = rpcProxy.bootstrap(NULL_RPC_CONTROLLER, bootstrapOMRequest); + } catch (ServiceException e) { + OMNotLeaderException notLeaderException = + OMFailoverProxyProvider.getNotLeaderException(e); + if (notLeaderException != null) { + throwException(BootstrapErrorCode.LEADER_UNDETERMINED, + notLeaderException.getMessage()); + } + + OMLeaderNotReadyException leaderNotReadyException = + OMFailoverProxyProvider.getLeaderNotReadyException(e); + if (leaderNotReadyException != null) { + throwException(BootstrapErrorCode.LEADER_NOT_READY, + leaderNotReadyException.getMessage()); + } + throw ProtobufHelper.getRemoteException(e); + } + + if (!response.getSuccess()) { + throwException(response.getErrorCode(), response.getErrorMsg()); + } + } + + private void throwException(BootstrapErrorCode errorCode, String errorMsg) + throws IOException { + throw new IOException("Failed to Bootstrap OM. Error Code: " + errorCode + + ", Error Message: " + errorMsg); + } + + @Override + public void close() throws IOException { + omFailoverProxyProvider.close(); + } +} diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java new file mode 100644 index 000000000000..e3a60a394b4c --- /dev/null +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.om.protocolPB; + +import org.apache.hadoop.hdds.annotation.InterfaceAudience; +import org.apache.hadoop.ozone.om.OMConfigKeys; +import org.apache.hadoop.ipc.ProtocolInfo; +import org.apache.hadoop.ozone.protocol.proto + .OzoneManagerInterServiceProtocolProtos.OzoneManagerInterService; +import org.apache.hadoop.security.KerberosInfo; +import org.apache.hadoop.security.token.TokenInfo; +import org.apache.hadoop.ozone.security.OzoneDelegationTokenSelector; + +/** + * Protocol used for communication between OMs. + */ +@ProtocolInfo(protocolName = + "org.apache.hadoop.ozone.om.protocol.OMInterServiceProtocol", + protocolVersion = 1) +@KerberosInfo( + serverPrincipal = OMConfigKeys.OZONE_OM_KERBEROS_PRINCIPAL_KEY) +@TokenInfo(OzoneDelegationTokenSelector.class) +@InterfaceAudience.Private +public interface OMInterServiceProtocolPB + extends OzoneManagerInterService.BlockingInterface { +} diff --git a/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMFailoverProxyProvider.java b/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMFailoverProxyProvider.java index 21589174de39..12b0d408654c 100644 --- a/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestOMFailoverProxyProvider.java @@ -26,6 +26,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.ozone.ha.ConfUtils; +import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.junit.Assert; import org.junit.Test; import org.junit.Before; @@ -68,7 +69,8 @@ public void init() throws Exception { config.set(ConfUtils.addKeySuffixes(OZONE_OM_NODES_KEY, OM_SERVICE_ID), allNodeIds.toString()); provider = new OMFailoverProxyProvider(config, - UserGroupInformation.getCurrentUser(), OM_SERVICE_ID); + UserGroupInformation.getCurrentUser(), OM_SERVICE_ID, + OzoneManagerProtocolPB.class); } /** @@ -184,7 +186,8 @@ public void testCanonicalTokenServiceName() throws IOException { ozoneConf.set(ConfUtils.addKeySuffixes(OZONE_OM_NODES_KEY, OM_SERVICE_ID), allNodeIds.toString()); OMFailoverProxyProvider prov = new OMFailoverProxyProvider(ozoneConf, - UserGroupInformation.getCurrentUser(), OM_SERVICE_ID); + UserGroupInformation.getCurrentUser(), OM_SERVICE_ID, + OzoneManagerProtocolPB.class); Text dtService = prov.getCurrentProxyDelegationToken(); diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java index 1773b6d290ce..a8c588dd30ec 100644 --- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java +++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java @@ -103,8 +103,10 @@ public static FailureService of(String serviceName) { public MiniOzoneChaosCluster(OzoneConfiguration conf, List ozoneManagers, List scms, List hddsDatanodes, String omServiceID, - String scmServiceId, Set> clazzes) { - super(conf, ozoneManagers, scms, hddsDatanodes, omServiceID, scmServiceId); + String scmServiceId, String clusterPath, + Set> clazzes) { + super(conf, ozoneManagers, scms, hddsDatanodes, omServiceID, scmServiceId, + clusterPath); this.numDatanodes = getHddsDatanodes().size(); this.numOzoneManagers = ozoneManagers.size(); this.numStorageContainerManagers = scms.size(); @@ -327,7 +329,7 @@ public MiniOzoneChaosCluster build() throws IOException { MiniOzoneChaosCluster cluster = new MiniOzoneChaosCluster(conf, omList, scmList, hddsDatanodes, - omServiceId, scmServiceId, clazzes); + omServiceId, scmServiceId, path, clazzes); if (startDataNodes) { cluster.startHddsDatanodes(); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneCluster.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneCluster.java index 309fb9cbe48d..0263fcfd7fae 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneCluster.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneCluster.java @@ -75,6 +75,11 @@ static Builder newHABuilder(OzoneConfiguration conf) { */ OzoneConfiguration getConf(); + /** + * Set the configuration for the MiniOzoneCluster. + */ + void setConf(OzoneConfiguration newConf); + /** * Waits for the cluster to be ready, this call blocks till all the * configured {@link HddsDatanodeService} registers with diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java index 738c8d8f8cf9..edda93971c2c 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java @@ -110,7 +110,7 @@ public class MiniOzoneClusterImpl implements MiniOzoneCluster { private static final Logger LOG = LoggerFactory.getLogger(MiniOzoneClusterImpl.class); - private final OzoneConfiguration conf; + private OzoneConfiguration conf; private StorageContainerManager scm; private OzoneManager ozoneManager; private final List hddsDatanodes; @@ -189,6 +189,11 @@ public OzoneConfiguration getConf() { return conf; } + @Override + public void setConf(OzoneConfiguration newConf) { + this.conf = newConf; + } + @Override public String getOMServiceId() { // Non-HA cluster doesn't have OM Service Id. diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index 82b31e4f9472..227ef72e5329 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -68,11 +68,13 @@ public class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl { private final OMHAService omhaService; private final SCMHAService scmhaService; + private final String clusterMetaPath; + private int waitForClusterToBeReadyTimeout = 120000; // 2 min private static final Random RANDOM = new Random(); - private static final int RATIS_RPC_TIMEOUT = 1000; // 1 second - public static final int NODE_FAILURE_TIMEOUT = 2000; // 2 seconds + private static final int RATIS_RPC_TIMEOUT = 1000; // 10 second + public static final int NODE_FAILURE_TIMEOUT = 2000; // 20 seconds /** * Creates a new MiniOzoneCluster. @@ -89,12 +91,14 @@ public MiniOzoneHAClusterImpl( List hddsDatanodes, String omServiceId, String scmServiceId, + String clusterPath, ReconServer reconServer) { super(conf, hddsDatanodes, reconServer); omhaService = new OMHAService(activeOMList, inactiveOMList, omServiceId); scmhaService = new SCMHAService(activeSCMList, inactiveSCMList, scmServiceId); + this.clusterMetaPath = clusterPath; } /** @@ -107,9 +111,10 @@ protected MiniOzoneHAClusterImpl( List scmList, List hddsDatanodes, String omServiceId, - String scmServiceId) { + String scmServiceId, + String clusterPath) { this(conf, omList, null, scmList, null, hddsDatanodes, - omServiceId, scmServiceId, null); + omServiceId, scmServiceId, clusterPath, null); } @Override @@ -181,7 +186,7 @@ public StorageContainerManager getStorageContainerManager(int index) { */ public OzoneManager getOMLeader() { OzoneManager res = null; - for (OzoneManager ozoneManager : this.omhaService.getServices()) { + for (OzoneManager ozoneManager : this.omhaService.getActiveServices()) { if (ozoneManager.isLeaderReady()) { if (res != null) { // Found more than one leader @@ -239,7 +244,7 @@ public void shutdownStorageContainerManager(StorageContainerManager scm) { LOG.info("Shutting down StorageContainerManager " + scm.getScmId()); scm.stop(); - scmhaService.removeInstance(scm); + scmhaService.deactivate(scm); } public void restartStorageContainerManager( @@ -251,7 +256,7 @@ public void restartStorageContainerManager( shutdownStorageContainerManager(scm); scm.join(); scm = TestUtils.getScmSimple(scmConf); - scmhaService.addInstance(scm); + scmhaService.activate(scm); scm.start(); if (waitForSCM) { waitForClusterToBeReady(); @@ -305,13 +310,17 @@ public void stop() { } public void stopOzoneManager(int index) { - omhaService.getServices().get(index).stop(); - omhaService.getServices().get(index).join(); + OzoneManager om = omhaService.getServices().get(index); + om.stop(); + om.join(); + omhaService.deactivate(om); } public void stopOzoneManager(String omNodeId) { - omhaService.getServiceById(omNodeId).stop(); - omhaService.getServiceById(omNodeId).join(); + OzoneManager om = omhaService.getServiceById(omNodeId); + om.stop(); + om.join(); + omhaService.deactivate(om); } /** @@ -356,7 +365,12 @@ public MiniOzoneCluster build() throws IOException { numOfActiveOMs = numOfOMs; } - // If num of ActiveOMs is not set, set it to numOfOMs. + // If num of OMs it not set, set it to 1. + if (numOfSCMs == 0) { + numOfSCMs = 1; + } + + // If num of ActiveSCMs is not set, set it to numOfSCMs. if (numOfActiveSCMs == ACTIVE_SCMS_NOT_SET) { numOfActiveSCMs = numOfSCMs; } @@ -383,7 +397,7 @@ public MiniOzoneCluster build() throws IOException { MiniOzoneHAClusterImpl cluster = new MiniOzoneHAClusterImpl(conf, activeOMs, inactiveOMs, activeSCMs, inactiveSCMs, - hddsDatanodes, omServiceId, scmServiceId, reconServer); + hddsDatanodes, omServiceId, scmServiceId, path, reconServer); if (startDataNodes) { cluster.startHddsDatanodes(); @@ -430,7 +444,7 @@ protected List createOMService() throws IOException, List omList = Lists.newArrayList(); int retryCount = 0; - int basePort = 10000; + int basePort; while (true) { try { @@ -466,7 +480,7 @@ protected List createOMService() throws IOException, if (i <= numOfActiveOMs) { om.start(); activeOMs.add(om); - LOG.info("Started OzoneManager RPC server at {}", + LOG.info("Started OzoneManager {} RPC server at {}", nodeId, om.getOmRpcServerAddr()); } else { inactiveOMs.add(om); @@ -642,13 +656,14 @@ private void initOMHAConfig(int basePort) throws IOException { conf.set(OMConfigKeys.OZONE_OM_INTERNAL_SERVICE_ID, omServiceId); String omNodesKey = ConfUtils.addKeySuffixes( OMConfigKeys.OZONE_OM_NODES_KEY, omServiceId); - StringBuilder omNodesKeyValue = new StringBuilder(); + List omNodeIds = new ArrayList<>(); int port = basePort; for (int i = 1; i <= numOfOMs; i++, port+=6) { String omNodeId = OM_NODE_ID_PREFIX + i; - omNodesKeyValue.append(",").append(omNodeId); + omNodeIds.add(omNodeId); + String omAddrKey = ConfUtils.addKeySuffixes( OMConfigKeys.OZONE_OM_ADDRESS_KEY, omServiceId, omNodeId); String omHttpAddrKey = ConfUtils.addKeySuffixes( @@ -664,8 +679,161 @@ private void initOMHAConfig(int basePort) throws IOException { conf.setInt(omRatisPortKey, port + 4); } - conf.set(omNodesKey, omNodesKeyValue.substring(1)); + conf.set(omNodesKey, String.join(",", omNodeIds)); + } + } + + /** + * Bootstrap new OM and add to existing OM HA service ring. + * @return new OM nodeId + */ + public void bootstrapOzoneManager(String omNodeId) throws Exception { + + int basePort; + int retryCount = 0; + + OzoneManager om = null; + + long leaderSnapshotIndex = getOMLeader().getRatisSnapshotIndex(); + + while (true) { + try { + basePort = 10000 + RANDOM.nextInt(1000) * 4; + OzoneConfiguration newConf = addNewOMToConfig(getOMServiceId(), + omNodeId, basePort); + + om = bootstrapNewOM(omNodeId); + + // Get the CertClient from an existing OM and set for new OM + if (omhaService.getServiceByIndex(0).getCertificateClient() != null) { + om.setCertClient( + omhaService.getServiceByIndex(0).getCertificateClient()); + } + + LOG.info("Bootstrapped OzoneManager {} RPC server at {}", omNodeId, + om.getOmRpcServerAddr()); + + // Add new OMs to cluster's in memory map and update existing OMs conf. + setConf(newConf); + + omhaService.addInstance(om, true); + break; + } catch (IOException e) { + // Existing OM config could have been updated with new conf. Reset it. + for (OzoneManager existingOM : omhaService.getServices()) { + existingOM.setConfiguration(getConf()); + } + if (e instanceof BindException || + e.getCause() instanceof BindException) { + if (om != null) { + om.stop(); + om.join(); + LOG.info("Stopping OzoneManager server at {}", + om.getOmRpcServerAddr()); + } + ++retryCount; + LOG.info("MiniOzoneHACluster port conflicts, retried {} times", + retryCount); + } else { + throw e; + } + } } + + waitForBootstrappedNodeToBeReady(om, leaderSnapshotIndex); + waitForConfigUpdateOnAllOMs(omNodeId); + } + + /** + * Set the configs for new OMs. + */ + private OzoneConfiguration addNewOMToConfig(String omServiceId, + String omNodeId, int basePort) { + OzoneConfiguration newConf = getConf(); + String omNodesKey = ConfUtils.addKeySuffixes( + OMConfigKeys.OZONE_OM_NODES_KEY, omServiceId); + StringBuilder omNodesKeyValue = new StringBuilder(); + omNodesKeyValue.append(newConf.get(omNodesKey)) + .append(",").append(omNodeId); + + String omAddrKey = ConfUtils.addKeySuffixes( + OMConfigKeys.OZONE_OM_ADDRESS_KEY, omServiceId, omNodeId); + String omHttpAddrKey = ConfUtils.addKeySuffixes( + OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY, omServiceId, omNodeId); + String omHttpsAddrKey = ConfUtils.addKeySuffixes( + OMConfigKeys.OZONE_OM_HTTPS_ADDRESS_KEY, omServiceId, omNodeId); + String omRatisPortKey = ConfUtils.addKeySuffixes( + OMConfigKeys.OZONE_OM_RATIS_PORT_KEY, omServiceId, omNodeId); + + newConf.set(omAddrKey, "127.0.0.1:" + basePort); + newConf.set(omHttpAddrKey, "127.0.0.1:" + (basePort + 2)); + newConf.set(omHttpsAddrKey, "127.0.0.1:" + (basePort + 3)); + newConf.setInt(omRatisPortKey, basePort + 4); + + newConf.set(omNodesKey, omNodesKeyValue.toString()); + + return newConf; + } + + /** + * Start a new OM in Bootstrap mode. Configs for the new OM must already be + * set. + */ + private OzoneManager bootstrapNewOM(String nodeId) + throws IOException, AuthenticationException { + OzoneConfiguration config = new OzoneConfiguration(getConf()); + config.set(OMConfigKeys.OZONE_OM_NODE_ID_KEY, nodeId); + // Set the OM rpc and http(s) address to null so that the cluster picks + // up the address set with service ID and node ID + config.set(OMConfigKeys.OZONE_OM_ADDRESS_KEY, ""); + config.set(OMConfigKeys.OZONE_OM_HTTP_ADDRESS_KEY, ""); + config.set(OMConfigKeys.OZONE_OM_HTTPS_ADDRESS_KEY, ""); + + // Set metadata/DB dir base path + String metaDirPath = clusterMetaPath + "/" + nodeId; + config.set(OZONE_METADATA_DIRS, metaDirPath); + + // Update existing OMs config + for (OzoneManager existingOM : omhaService.getServices()) { + existingOM.setConfiguration(config); + } + + OzoneManager.omInit(config); + OzoneManager om = OzoneManager.createOm(config, + OzoneManager.StartupOption.BOOTSTRAP); + om.start(); + return om; + + } + + /** + * Wait for AddOM command to execute on all OMs. + */ + private void waitForBootstrappedNodeToBeReady(OzoneManager newOM, + long leaderSnapshotIndex) throws Exception { + // Wait for bootstrapped nodes to catch up with others + GenericTestUtils.waitFor(() -> { + try { + if (newOM.getRatisSnapshotIndex() >= leaderSnapshotIndex) { + return true; + } + } catch (IOException e) { + return false; + } + return false; + }, 1000, waitForClusterToBeReadyTimeout); + } + + private void waitForConfigUpdateOnAllOMs(String newOMNodeId) + throws Exception { + GenericTestUtils.waitFor(() -> { + for (OzoneManager om : omhaService.getActiveServices()) { + if (!om.doesPeerExist(newOMNodeId)) { + return false; + } + } + return true; + }, 1000, waitForClusterToBeReadyTimeout); } /** @@ -683,11 +851,15 @@ static class MiniOzoneHAService { private List activeServices; private List inactiveServices; + // Function to extract the Id from service + Function serviceIdProvider; + MiniOzoneHAService(String name, List activeList, List inactiveList, String serviceId, Function idProvider) { this.serviceName = name; this.serviceMap = Maps.newHashMap(); + this.serviceIdProvider = idProvider; if (activeList != null) { for (Type service : activeList) { this.serviceMap.put(idProvider.apply(service), service); @@ -717,12 +889,30 @@ public List getServices() { return services; } + public List getActiveServices() { + return activeServices; + } + public boolean removeInstance(Type t) { return services.remove(t); } - public boolean addInstance(Type t) { - return services.add(t); + public void addInstance(Type t, boolean isActive) { + services.add(t); + serviceMap.put(serviceIdProvider.apply(t), t); + if (isActive) { + activeServices.add(t); + } + } + + public void activate(Type t) { + activeServices.add(t); + inactiveServices.remove(t); + } + + public void deactivate(Type t) { + activeServices.remove(t); + inactiveServices.add(t); } public boolean isServiceActive(String id) { diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMHAClusterImpl.java index 30a934540cd1..2f2b77341481 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMHAClusterImpl.java @@ -50,9 +50,10 @@ private MiniOzoneOMHAClusterImpl( StorageContainerManager scm, List hddsDatanodes, String omServiceId, + String metaPath, ReconServer reconServer) { super(conf, activeOMList, inactiveOMList, Collections.singletonList(scm), - null, hddsDatanodes, omServiceId, null, reconServer); + null, hddsDatanodes, omServiceId, null, metaPath, reconServer); } /** @@ -104,7 +105,7 @@ public MiniOzoneCluster build() throws IOException { MiniOzoneClusterImpl cluster = new MiniOzoneOMHAClusterImpl(conf, getActiveOMs(), getInactiveOMs(), scm, hddsDatanodes, - omServiceId, reconServer); + omServiceId, path, reconServer); if (startDataNodes) { cluster.startHddsDatanodes(); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java new file mode 100644 index 000000000000..9da4327028c3 --- /dev/null +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -0,0 +1,212 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.om; + +import com.google.common.base.Supplier; +import java.io.File; +import java.io.FileFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl; +import org.apache.hadoop.ozone.client.ObjectStore; +import org.apache.hadoop.ozone.client.OzoneBucket; +import org.apache.hadoop.ozone.client.OzoneClientFactory; +import org.apache.hadoop.ozone.client.OzoneVolume; +import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; +import org.apache.hadoop.test.GenericTestUtils; +import org.junit.After; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.junit.rules.Timeout; + +import static org.apache.hadoop.ozone.OzoneConsts.SCM_DUMMY_SERVICE_ID; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_SERVER_REQUEST_TIMEOUT_DEFAULT; +import static org.apache.hadoop.ozone.om.TestOzoneManagerHA.createKey; + +public class TestOzoneManagerBootstrap { + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Rule + public Timeout timeout = new Timeout(500_000); + + private MiniOzoneHAClusterImpl cluster = null; + private ObjectStore objectStore; + private OzoneConfiguration conf; + private final String clusterId = UUID.randomUUID().toString(); + private final String scmId = UUID.randomUUID().toString(); + private final String omServiceId = "om-bootstrap"; + + private static final int NUM_INITIAL_OMS = 3; + + private static final String VOLUME_NAME; + private static final String BUCKET_NAME; + + private long lastTransactionIndex; + + static { + VOLUME_NAME = "volume" + RandomStringUtils.randomNumeric(5); + BUCKET_NAME = "bucket" + RandomStringUtils.randomNumeric(5); + } + + private void setupCluster() throws Exception { + setupCluster(NUM_INITIAL_OMS); + } + + private void setupCluster(int numInitialOMs) throws Exception { + conf = new OzoneConfiguration(); + cluster = (MiniOzoneHAClusterImpl) MiniOzoneCluster.newHABuilder(conf) + .setClusterId(clusterId) + .setScmId(scmId) + .setSCMServiceId(SCM_DUMMY_SERVICE_ID) + .setOMServiceId(omServiceId) + .setNumOfOzoneManagers(numInitialOMs) + .build(); + cluster.waitForClusterToBeReady(); + objectStore = OzoneClientFactory.getRpcClient(omServiceId, conf) + .getObjectStore(); + + // Perform some transactions + objectStore.createVolume(VOLUME_NAME); + OzoneVolume volume = objectStore.getVolume(VOLUME_NAME); + volume.createBucket(BUCKET_NAME); + OzoneBucket bucket = volume.getBucket(BUCKET_NAME); + createKey(bucket); + + lastTransactionIndex = cluster.getOMLeader().getRatisSnapshotIndex(); + } + + @After + public void shutdown() throws Exception { + if (cluster != null) { + cluster.shutdown(); + } + } + + private void assertNewOMExistsInPeerList(String nodeId) throws Exception { + for (OzoneManager om : cluster.getOzoneManagersList()) { + Assert.assertTrue("New OM node " + nodeId + " not present in Peer list " + + "of OM " + om.getOMNodeId(), om.doesPeerExist(nodeId)); + } + OzoneManager newOM = cluster.getOzoneManager(nodeId); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + try { + return newOM.getRatisSnapshotIndex() >= lastTransactionIndex; + } catch (IOException e) { + return false; + } + } + }, 100, 10000); + + // Check Ratis Dir for log files + File[] logFiles = getRatisLogFiles(newOM); + Assert.assertTrue("There are no ratis logs in new OM ", + logFiles.length > 0); + } + + private File[] getRatisLogFiles(OzoneManager om) { + OzoneManagerRatisServer newOMRatisServer = om.getOmRatisServer(); + File ratisDir = new File(newOMRatisServer.getRatisStorageDir(), + newOMRatisServer.getRaftGroupId().getUuid().toString()); + File ratisLogDir = new File(ratisDir, Storage.STORAGE_DIR_CURRENT); + return ratisLogDir.listFiles(new FileFilter() { + @Override + public boolean accept(File pathname) { + return pathname.getName().startsWith("log"); + } + }); + } + + private List testBootstrapOMs(int numNewOMs) throws Exception { + List newOMNodeIds = new ArrayList<>(numNewOMs); + for (int i = 1; i <= numNewOMs; i++) { + String nodeId = "omNode-bootstrap-" + i; + cluster.bootstrapOzoneManager(nodeId); + assertNewOMExistsInPeerList(nodeId); + newOMNodeIds.add(nodeId); + } + return newOMNodeIds; + } + + /** + * Add 1 new OM to cluster. + * @throws Exception + */ + @Test + public void testBootstrapOneNewOM() throws Exception { + setupCluster(); + testBootstrapOMs(1); + } + + /** + * Add 2 new OMs to cluster. + * @throws Exception + */ + @Test + public void testBootstrapTwoNewOMs() throws Exception { + setupCluster(); + testBootstrapOMs(2); + } + + /** + * Add 2 new OMs to a 1 node OM cluster. Verify that one of the new OMs + * must becomes the leader by stopping the old OM. + */ + @Test + public void testLeaderChangeToNewOM() throws Exception { + setupCluster(1); + OzoneManager oldOM = cluster.getOzoneManager(); + List newOMNodeIds = testBootstrapOMs(2); + + // Stop old OM + cluster.stopOzoneManager(oldOM.getOMNodeId()); + + // Wait for Leader Election timeout + Thread.sleep(OZONE_OM_RATIS_SERVER_REQUEST_TIMEOUT_DEFAULT + .toLong(TimeUnit.MILLISECONDS) * 3); + + // Verify that one of the new OMs is the leader + GenericTestUtils.waitFor(() -> cluster.getOMLeader() != null, 500, 30000); + OzoneManager omLeader = cluster.getOMLeader(); + + Assert.assertTrue("New Bootstrapped OM not elected Leader even though " + + "other OMs are down", newOMNodeIds.contains(omLeader.getOMNodeId())); + + // Perform some read and write operations with new OM leader + objectStore = OzoneClientFactory.getRpcClient(omServiceId, conf) + .getObjectStore(); + OzoneVolume volume = objectStore.getVolume(VOLUME_NAME); + OzoneBucket bucket = volume.getBucket(BUCKET_NAME); + String key = createKey(bucket); + + Assert.assertNotNull(bucket.getKey(key)); + } +} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerConfiguration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerConfiguration.java index 2d96923e1f10..6e57445ee924 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerConfiguration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerConfiguration.java @@ -34,7 +34,7 @@ import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.OzoneIllegalArgumentException; import org.apache.hadoop.ozone.ha.ConfUtils; -import org.apache.hadoop.ozone.om.ha.OMNodeDetails; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; import org.apache.ozone.test.GenericTestUtils; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java index 6c8bac278c3e..ffc1643f6e1a 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHA.java @@ -73,6 +73,7 @@ public abstract class TestOzoneManagerHA { private OzoneConfiguration conf; private String clusterId; private String scmId; + private String omId; private String omServiceId; private static int numOfOMs = 3; private static final int LOG_PURGE_GAP = 50; @@ -137,6 +138,7 @@ public void init() throws Exception { clusterId = UUID.randomUUID().toString(); scmId = UUID.randomUUID().toString(); omServiceId = "om-service-test1"; + omId = UUID.randomUUID().toString(); conf.setBoolean(OZONE_ACL_ENABLED, true); conf.set(OzoneConfigKeys.OZONE_ADMINISTRATORS, OZONE_ADMINISTRATORS_WILDCARD); @@ -168,6 +170,7 @@ public void init() throws Exception { .setClusterId(clusterId) .setScmId(scmId) .setOMServiceId(omServiceId) + .setOmId(omId) .setNumOfOzoneManagers(numOfOMs) .build(); cluster.waitForClusterToBeReady(); diff --git a/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto b/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto new file mode 100644 index 000000000000..bcd07ab827c7 --- /dev/null +++ b/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * These .proto interfaces are private and unstable. + * Please see http://wiki.apache.org/hadoop/Compatibility + * for what changes are allowed for a *unstable* .proto interface. + */ + +syntax = "proto2"; +option java_package = "org.apache.hadoop.ozone.protocol.proto"; +option java_outer_classname = "OzoneManagerInterServiceProtocolProtos"; +option java_generic_services = true; +option java_generate_equals_and_hash = true; +package hadoop.ozone; + +/** +This file contains the protocol for communication between Ozone Managers in +an HA setup. +*/ + +message BootstrapOMRequest { + required string nodeId = 1; + required string hostAddress = 2; + required uint32 ratisPort = 3; +} + +message BootstrapOMResponse { + required bool success = 1; + optional BootstrapErrorCode errorCode = 2; + optional string errorMsg = 3; +} + +enum BootstrapErrorCode { + RATIS_NOT_ENABLED = 1; + LEADER_UNDETERMINED = 2; + LEADER_NOT_READY = 3; + RATIS_BOOTSTRAP_ERROR = 4; + UNDEFINED_ERROR = 5; +} + +/** + The OM service for OM-OM communication. +*/ +service OzoneManagerInterService { + // RPC request from new OM to existing OM ring to bootstrap itself + rpc bootstrap(BootstrapOMRequest) + returns(BootstrapOMResponse); +} \ No newline at end of file diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStarterInterface.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStarterInterface.java index 1ffdaa206bd7..bafeec337ecf 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStarterInterface.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStarterInterface.java @@ -30,6 +30,8 @@ void start(OzoneConfiguration conf) throws IOException, AuthenticationException; boolean init(OzoneConfiguration conf) throws IOException, AuthenticationException; + void bootstrap(OzoneConfiguration conf) throws IOException, + AuthenticationException; void startAndCancelPrepare(OzoneConfiguration conf) throws IOException, AuthenticationException; } \ No newline at end of file diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 82eb2f8f7b7a..8c4f46d0a96f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -97,6 +97,7 @@ import org.apache.hadoop.ozone.OzoneAcl; import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.ozone.OzoneIllegalArgumentException; import org.apache.hadoop.ozone.OzoneSecurityUtil; import org.apache.hadoop.ozone.audit.AuditAction; import org.apache.hadoop.ozone.audit.AuditEventStatus; @@ -111,7 +112,7 @@ import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.ha.OMHANodeDetails; -import org.apache.hadoop.ozone.om.ha.OMNodeDetails; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.apache.hadoop.ozone.om.helpers.DBUpdates; import org.apache.hadoop.ozone.om.helpers.OmBucketArgs; import org.apache.hadoop.ozone.om.helpers.OmBucketInfo; @@ -133,7 +134,10 @@ import org.apache.hadoop.ozone.om.helpers.S3SecretValue; import org.apache.hadoop.ozone.om.helpers.ServiceInfo; import org.apache.hadoop.ozone.om.helpers.ServiceInfoEx; +import org.apache.hadoop.ozone.om.protocol.OMInterServiceProtocol; +import org.apache.hadoop.ozone.om.protocolPB.OMInterServiceProtocolClientSideImpl; import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol; +import org.apache.hadoop.ozone.om.protocolPB.OMInterServiceProtocolPB; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.ozone.common.ha.ratis.RatisSnapshotInfo; import org.apache.hadoop.hdds.security.OzoneSecurityException; @@ -150,6 +154,7 @@ import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRoleInfo; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneAclInfo; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.ServicePort; +import org.apache.hadoop.ozone.protocolPB.OMInterServiceProtocolServerSideImpl; import org.apache.hadoop.ozone.storage.proto.OzoneManagerStorageProtos.PersistedUserVolumeInfo; import org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB; import org.apache.hadoop.ozone.security.OzoneBlockTokenSecretManager; @@ -246,12 +251,12 @@ import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.TOKEN_ERROR_OTHER; import static org.apache.hadoop.ozone.om.lock.OzoneManagerLock.Resource.VOLUME_LOCK; import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.LEADER_AND_READY; +import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.OzoneManagerInterService; +import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneManagerService; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneManagerService.newReflectiveBlockingService; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.PrepareStatusResponse.PrepareStatus; - import org.apache.ratis.proto.RaftProtos.RaftPeerRole; import org.apache.ratis.server.protocol.TermIndex; -import org.apache.ratis.util.ExitUtils; import org.apache.ratis.util.FileUtils; import org.apache.ratis.util.LifeCycle; import org.bouncycastle.pkcs.PKCS10CertificationRequest; @@ -263,7 +268,8 @@ */ @InterfaceAudience.LimitedPrivate({"HDFS", "CBLOCK", "OZONE", "HBASE"}) public final class OzoneManager extends ServiceRuntimeInfoImpl - implements OzoneManagerProtocol, OMMXBean, Auditor { + implements OzoneManagerProtocol, OMInterServiceProtocol, + OMMXBean, Auditor { public static final Logger LOG = LoggerFactory.getLogger(OzoneManager.class); @@ -280,7 +286,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private List caCertPemList = new ArrayList<>(); private static boolean testSecureOmFlag = false; private final Text omRpcAddressTxt; - private final OzoneConfiguration configuration; + private OzoneConfiguration configuration; private RPC.Server omRpcServer; private InetSocketAddress omRpcAddress; private String omId; @@ -358,12 +364,20 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private OzoneManagerPrepareState prepareState; + public enum StartupOption { + REGUALR, + BOOTSTRAP + } + private enum State { INITIALIZED, + BOOTSTRAPPING, RUNNING, STOPPED } + private boolean isBootstrapping = false; + // Used in MiniOzoneCluster testing private State omState; private Thread emptier; @@ -371,11 +385,11 @@ private enum State { private static final int MSECS_PER_MINUTE = 60 * 1000; @SuppressWarnings("methodlength") - private OzoneManager(OzoneConfiguration conf) throws IOException, - AuthenticationException { + private OzoneManager(OzoneConfiguration conf, StartupOption startupOption) + throws IOException, AuthenticationException { super(OzoneVersionInfo.OZONE_VERSION_INFO); Preconditions.checkNotNull(conf); - configuration = conf; + setConfiguration(conf); // Load HA related configurations OMHANodeDetails omhaNodeDetails = OMHANodeDetails.loadOMHAConfig(configuration); @@ -389,6 +403,17 @@ private OzoneManager(OzoneConfiguration conf) throws IOException, versionManager = new OMLayoutVersionManager(omStorage.getLayoutVersion()); upgradeFinalizer = new OMUpgradeFinalizer(versionManager); + exitManager = new ExitManager(); + + // In case of single OM Node Service there will be no OM Node ID + // specified, set it to value from om storage + if (this.omNodeDetails.getNodeId() == null) { + this.omNodeDetails = OMHANodeDetails.getOMNodeDetails(conf, + omNodeDetails.getServiceId(), + omStorage.getOmId(), omNodeDetails.getRpcAddress(), + omNodeDetails.getRatisPort()); + } + loginOMUserIfSecurityEnabled(conf); this.allowListAllVolumes = conf.getBoolean(OZONE_OM_VOLUME_LISTALL_ALLOWED, @@ -478,40 +503,14 @@ private OzoneManager(OzoneConfiguration conf) throws IOException, // Create special volume s3v which is required for S3G. addS3GVolumeToDB(); - this.omRatisSnapshotInfo = new RatisSnapshotInfo(); - - if (isRatisEnabled) { - // Create Ratis storage dir - String omRatisDirectory = - OzoneManagerRatisServer.getOMRatisDirectory(configuration); - if (omRatisDirectory == null || omRatisDirectory.isEmpty()) { - throw new IllegalArgumentException(HddsConfigKeys.OZONE_METADATA_DIRS + - " must be defined."); - } - OmUtils.createOMDir(omRatisDirectory); - - // Create Ratis snapshot dir - omRatisSnapshotDir = OmUtils.createOMDir( - OzoneManagerRatisServer.getOMRatisSnapshotDirectory(configuration)); - - // Before starting ratis server, check if previous installation has - // snapshot directory in Ratis storage directory. if yes, move it to - // new snapshot directory. - - File snapshotDir = new File(omRatisDirectory, OM_RATIS_SNAPSHOT_DIR); - - if (snapshotDir.isDirectory()) { - FileUtils.moveDirectory(snapshotDir.toPath(), - omRatisSnapshotDir.toPath()); - } - - if (peerNodes != null && !peerNodes.isEmpty()) { - this.omSnapshotProvider = new OzoneManagerSnapshotProvider( - configuration, omRatisSnapshotDir, peerNodes); - } + if (startupOption == StartupOption.BOOTSTRAP) { + isBootstrapping = true; } - initializeRatisServer(); + this.omRatisSnapshotInfo = new RatisSnapshotInfo(); + + initializeRatisDirs(conf); + initializeRatisServer(isBootstrapping); metrics = OMMetrics.create(); omClientProtocolMetrics = ProtocolMessageMetrics @@ -531,6 +530,24 @@ private OzoneManager(OzoneConfiguration conf) throws IOException, omState = State.INITIALIZED; } + /** + * Constructs OM instance based on the configuration. + * + * @param conf OzoneConfiguration + * @return OM instance + * @throws IOException, AuthenticationException in case OM instance + * creation fails. + */ + public static OzoneManager createOm(OzoneConfiguration conf) + throws IOException, AuthenticationException { + return new OzoneManager(conf, StartupOption.REGUALR); + } + + public static OzoneManager createOm(OzoneConfiguration conf, + StartupOption startupOption) throws IOException, AuthenticationException { + return new OzoneManager(conf, startupOption); + } + private void logVersionMismatch(OzoneConfiguration conf, ScmInfo scmInfo) { List scmNodeInfoList = SCMNodeInfo.buildNodeInfo(conf); StringBuilder scmBlockAddressBuilder = new StringBuilder(""); @@ -692,6 +709,13 @@ public void close() throws IOException { stop(); } + public void shutdown(Exception ex) throws IOException { + if (omState != State.STOPPED) { + stop(); + exitManager.exitSystem(1, ex.getLocalizedMessage(), ex, LOG); + } + } + /** * Class which schedule saving metrics to a file. */ @@ -901,22 +925,57 @@ private static StorageContainerLocationProtocol getScmContainerClient( } /** - * Starts an RPC server, if configured. + * Creates a new instance of rpc server. If an earlier instance is already + * running then returns the same. + */ + private RPC.Server getRpcServer(OzoneConfiguration conf) throws IOException { + if (isOmRpcServerRunning) { + return omRpcServer; + } + + InetSocketAddress omNodeRpcAddr = OmUtils.getOmAddress(conf); + + final int handlerCount = conf.getInt(OZONE_OM_HANDLER_COUNT_KEY, + OZONE_OM_HANDLER_COUNT_DEFAULT); + RPC.setProtocolEngine(configuration, OzoneManagerProtocolPB.class, + ProtobufRpcEngine.class); + + this.omServerProtocol = new OzoneManagerProtocolServerSideTranslatorPB( + this, omRatisServer, omClientProtocolMetrics, isRatisEnabled, + getLastTrxnIndexForNonRatis()); + BlockingService omService = + OzoneManagerService.newReflectiveBlockingService(omServerProtocol); + + OMInterServiceProtocolServerSideImpl omInterServerProtocol = + new OMInterServiceProtocolServerSideImpl(omRatisServer, + isRatisEnabled); + BlockingService omInterService = + OzoneManagerInterService.newReflectiveBlockingService( + omInterServerProtocol); + + return startRpcServer(configuration, omNodeRpcAddr, omService, + omInterService, handlerCount); + } + + /** * - * @param conf configuration - * @param addr configured address of RPC server - * @param protocol RPC protocol provided by RPC server - * @param instance RPC protocol implementation instance + * @param conf configuration + * @param addr configured address of RPC server + * @param clientProtocolService RPC protocol for client communication + * (OzoneManagerProtocolPB impl) + * @param interOMProtocolService RPC protocol for inter OM communication + * (OMInterServiceProtocolPB impl) * @param handlerCount RPC server handler count * @return RPC server * @throws IOException if there is an I/O error while creating RPC server */ private RPC.Server startRpcServer(OzoneConfiguration conf, - InetSocketAddress addr, Class protocol, BlockingService instance, - int handlerCount) throws IOException { + InetSocketAddress addr, BlockingService clientProtocolService, + BlockingService interOMProtocolService, int handlerCount) + throws IOException { RPC.Server rpcServer = new RPC.Builder(conf) - .setProtocol(protocol) - .setInstance(instance) + .setProtocol(OzoneManagerProtocolPB.class) + .setInstance(clientProtocolService) .setBindAddress(addr.getHostString()) .setPort(addr.getPort()) .setNumHandlers(handlerCount) @@ -924,7 +983,8 @@ private RPC.Server startRpcServer(OzoneConfiguration conf, .setSecretManager(delegationTokenMgr) .build(); - HddsServerUtil.addPBProtocol(conf, protocol, instance, rpcServer); + HddsServerUtil.addPBProtocol(conf, OMInterServiceProtocolPB.class, + interOMProtocolService, rpcServer); if (conf.getBoolean(CommonConfigurationKeys.HADOOP_SECURITY_AUTHORIZATION, false)) { @@ -941,19 +1001,6 @@ private static boolean isOzoneSecurityEnabled() { return securityEnabled; } - /** - * Constructs OM instance based on the configuration. - * - * @param conf OzoneConfiguration - * @return OM instance - * @throws IOException, AuthenticationException in case OM instance - * creation fails. - */ - public static OzoneManager createOm(OzoneConfiguration conf) - throws IOException, AuthenticationException { - return new OzoneManager(conf); - } - /** * Logs in the OM user if security is enabled in the configuration. * @@ -1067,6 +1114,39 @@ public static void initializeSecurity(OzoneConfiguration conf, } } + private void initializeRatisDirs(OzoneConfiguration conf) throws IOException { + if (isRatisEnabled) { + // Create Ratis storage dir + String omRatisDirectory = + OzoneManagerRatisUtils.getOMRatisDirectory(conf); + if (omRatisDirectory == null || omRatisDirectory.isEmpty()) { + throw new IllegalArgumentException(HddsConfigKeys.OZONE_METADATA_DIRS + + " must be defined."); + } + OmUtils.createOMDir(omRatisDirectory); + + // Create Ratis snapshot dir + omRatisSnapshotDir = OmUtils.createOMDir( + OzoneManagerRatisUtils.getOMRatisSnapshotDirectory(conf)); + + // Before starting ratis server, check if previous installation has + // snapshot directory in Ratis storage directory. if yes, move it to + // new snapshot directory. + + File snapshotDir = new File(omRatisDirectory, OM_RATIS_SNAPSHOT_DIR); + + if (snapshotDir.isDirectory()) { + FileUtils.moveDirectory(snapshotDir.toPath(), + omRatisSnapshotDir.toPath()); + } + + if (peerNodes != null && !peerNodes.isEmpty()) { + this.omSnapshotProvider = new OzoneManagerSnapshotProvider( + configuration, omRatisSnapshotDir, peerNodes); + } + } + } + /** * Builds a message for logging startup information about an RPC server. * @@ -1223,6 +1303,11 @@ public void start() throws IOException { startJVMPauseMonitor(); setStartTime(); + if (isBootstrapping) { + omState = State.BOOTSTRAPPING; + bootstrap(omNodeDetails); + } + omState = State.RUNNING; } @@ -1261,7 +1346,7 @@ public void restart() throws IOException { metricsTimer = new Timer(); metricsTimer.schedule(scheduleOMMetricsWriteTask, 0, period); - initializeRatisServer(); + initializeRatisServer(false); if (omRatisServer != null) { omRatisServer.start(); } @@ -1286,6 +1371,76 @@ public void restart() throws IOException { omState = State.RUNNING; } + @Override + public void bootstrap(OMNodeDetails newOMNode) throws IOException { + // Create InterOmServiceProtocol client to send request to other OMs + if (isRatisEnabled) { + try (OMInterServiceProtocolClientSideImpl omInterServiceProtocol = + new OMInterServiceProtocolClientSideImpl(configuration, + getRemoteUser(), getOMServiceId())) { + + omInterServiceProtocol.bootstrap(omNodeDetails); + + LOG.info("Successfully bootstrapped OM {} and joined the Ratis group " + + "{}", getOMNodeId(), omRatisServer.getRaftGroup()); + } + } else { + throw new IOException("OzoneManager can be bootstrapped only when ratis" + + " is enabled and there is atleast one OzoneManager to bootstrap" + + " from."); + } + } + + /** + * Add a new OM Node to the HA cluster. This call comes from OMRatisServer + * after a SetConfiguration request has been successfully executed by the + * Ratis server. + */ + public void addOMNodeToPeers(String newOMNodeId) { + OMNodeDetails newOMNodeDetails = OMNodeDetails.getOMNodeDetailsFromConf( + getConfiguration(), getOMServiceId(), newOMNodeId); + if (newOMNodeDetails == null) { + // Load new configuration object to read in new peer information + setConfiguration(new OzoneConfiguration()); + newOMNodeDetails = OMNodeDetails.getOMNodeDetailsFromConf( + getConfiguration(), getOMServiceId(), newOMNodeId); + + if (newOMNodeDetails == null) { + // If new node information is not present in the newly loaded + // configuration also, throw an exception + throw new OzoneIllegalArgumentException("There is no OM configuration " + + "for node ID " + newOMNodeId + " in ozone-site.xml."); + } + } + + peerNodes.add(newOMNodeDetails); + if (omSnapshotProvider == null) { + omSnapshotProvider = new OzoneManagerSnapshotProvider( + configuration, omRatisSnapshotDir, peerNodes); + } else { + omSnapshotProvider.addNewPeerNode(newOMNodeDetails); + } + omRatisServer.addRaftPeer(newOMNodeDetails); + } + + /** + * Check if the input nodeId exists in the peers list. + * @return true if the nodeId is self or it exists in peer node list, + * false otherwsie. + */ + public boolean doesPeerExist(String omNodeId) { + if (getOMNodeId().equals(omNodeId)) { + return true; + } + if (peerNodes != null && !peerNodes.isEmpty()) { + for (OMNodeDetails peerNode : peerNodes) { + if (peerNode.getNodeId().contains(omNodeId)) { + return true; + } + } + } + return false; + } /** * Starts a Trash Emptier thread that does an fs.trashRoots and performs @@ -1325,35 +1480,17 @@ public FileSystem run() throws IOException { } /** - * Creates a new instance of rpc server. If an earlier instance is already - * running then returns the same. + * Creates an instance of ratis server. */ - private RPC.Server getRpcServer(OzoneConfiguration conf) throws IOException { - if (isOmRpcServerRunning) { - return omRpcServer; - } - - InetSocketAddress omNodeRpcAddr = OmUtils.getOmAddress(conf); - - final int handlerCount = conf.getInt(OZONE_OM_HANDLER_COUNT_KEY, - OZONE_OM_HANDLER_COUNT_DEFAULT); - RPC.setProtocolEngine(configuration, OzoneManagerProtocolPB.class, - ProtobufRpcEngine.class); - this.omServerProtocol = new OzoneManagerProtocolServerSideTranslatorPB( - this, omRatisServer, omClientProtocolMetrics, isRatisEnabled, - getLastTrxnIndexForNonRatis()); - - BlockingService omService = newReflectiveBlockingService(omServerProtocol); - - return startRpcServer(configuration, omNodeRpcAddr, - OzoneManagerProtocolPB.class, omService, - handlerCount); - } - /** * Creates an instance of ratis server. + * @param shouldBootstrap If OM is started in Bootstrap mode, then Ratis + * server will be initialized without adding self to + * Ratis group + * @throws IOException */ - private void initializeRatisServer() throws IOException { + private void initializeRatisServer(boolean shouldBootstrap) + throws IOException { if (isRatisEnabled) { if (omRatisServer == null) { // This needs to be done before initializing Ratis. @@ -1361,7 +1498,7 @@ private void initializeRatisServer() throws IOException { registerRatisMetricReporters(ratisMetricsMap); omRatisServer = OzoneManagerRatisServer.newOMRatisServer( configuration, this, omNodeDetails, peerNodes, - secConfig, certClient); + secConfig, certClient, shouldBootstrap); } LOG.info("OzoneManager Ratis server initialized at port {}", omRatisServer.getServerPort()); @@ -1422,7 +1559,9 @@ public long getRatisSnapshotIndex() throws IOException { * Stop service. */ public void stop() { + LOG.info("Stopping Ozone Manager"); try { + omState = State.STOPPED; // Cancel the metrics timer and set to null. if (metricsTimer != null) { metricsTimer.cancel(); @@ -3546,7 +3685,7 @@ File replaceOMDBWithCheckpoint(long lastAppliedIndex, File oldDB, } catch (IOException ex) { String errorMsg = "Failed to reset to original DB. OM is in an " + "inconsistent state."; - ExitUtils.terminate(1, errorMsg, ex, LOG); + exitManager.exitSystem(1, errorMsg, ex, LOG); } throw e; } @@ -3594,6 +3733,11 @@ public OzoneConfiguration getConfiguration() { return configuration; } + @VisibleForTesting + public void setConfiguration(OzoneConfiguration conf) { + this.configuration = conf; + } + public static void setTestSecureOmFlag(boolean testSecureOmFlag) { OzoneManager.testSecureOmFlag = testSecureOmFlag; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManagerStarter.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManagerStarter.java index 39069a85b804..e087fdbe32bc 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManagerStarter.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManagerStarter.java @@ -123,6 +123,26 @@ public void startOmUpgrade() throws Exception { } } + /** + * This function implements a sub-command to allow the OM to be bootstrapped + * initialized from the command line. After OM is initialized, it will + * contact the leader OM to add itself to the ring. Once the leader OM + * responds back affirmatively, bootstrap step is complete and the OM is + * functional. + */ + @CommandLine.Command(name = "--bootstrap", + customSynopsis = "ozone om [global options] --bootstrap", + hidden = false, + description = "Initialize if not already initialized and Bootstrap " + + "the Ozone Manager", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class) + public void bootstrapOM() + throws Exception { + commonInit(); + receiver.bootstrap(conf); + } + /** * This function should be called by each command to ensure the configuration * is set and print the startup banner message. @@ -165,6 +185,21 @@ public boolean init(OzoneConfiguration conf) throws IOException, return OzoneManager.omInit(conf); } + @Override + public void bootstrap(OzoneConfiguration conf) + throws IOException, AuthenticationException { + // Initialize the Ozone Manager, if not already initialized. + boolean initialize = OzoneManager.omInit(conf); + if (!initialize) { + throw new IOException("OM Init failed."); + } + // Bootstrap the OM + OzoneManager om = OzoneManager.createOm(conf, + OzoneManager.StartupOption.BOOTSTRAP); + om.start(); + om.join(); + } + @Override public void startAndCancelPrepare(OzoneConfiguration conf) throws IOException, AuthenticationException { diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java index dd44868a56f1..30166bad72d3 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java @@ -25,6 +25,7 @@ import org.apache.hadoop.ozone.OzoneIllegalArgumentException; import org.apache.hadoop.ozone.ha.ConfUtils; import org.apache.hadoop.ozone.om.OMConfigKeys; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 551a2847dc11..1f845cb81b9f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -18,25 +18,29 @@ package org.apache.hadoop.ozone.om.ratis; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.protobuf.InvalidProtocolBufferException; +import com.google.protobuf.ServiceException; + import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.nio.charset.StandardCharsets; -import java.nio.file.Paths; import java.security.cert.X509Certificate; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.TimeUnit; -import com.google.common.base.Preconditions; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.conf.StorageUnit; import org.apache.hadoop.hdds.security.x509.SecurityConfig; import org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient; -import org.apache.hadoop.hdds.server.ServerUtils; import org.apache.hadoop.hdds.tracing.TracingUtil; import org.apache.hadoop.hdds.utils.HAUtils; import org.apache.hadoop.ipc.ProtobufRpcEngine.Server; @@ -45,17 +49,14 @@ import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; -import org.apache.hadoop.ozone.om.ha.OMNodeDetails; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.apache.hadoop.ozone.om.helpers.OMRatisHelper; +import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMResponse; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Status; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Strings; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.ServiceException; import org.apache.ratis.RaftConfigKeys; import org.apache.ratis.conf.Parameters; import org.apache.ratis.conf.RaftProperties; @@ -63,6 +64,7 @@ import org.apache.ratis.grpc.GrpcTlsConfig; import org.apache.ratis.netty.NettyConfigKeys; import org.apache.ratis.protocol.ClientId; +import org.apache.ratis.protocol.SetConfigurationRequest; import org.apache.ratis.protocol.exceptions.LeaderNotReadyException; import org.apache.ratis.protocol.exceptions.NotLeaderException; import org.apache.ratis.protocol.exceptions.StateMachineException; @@ -85,12 +87,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS; import static org.apache.hadoop.ipc.RpcConstants.DUMMY_CLIENT_ID; import static org.apache.hadoop.ipc.RpcConstants.INVALID_CALL_ID; -import static org.apache.hadoop.ozone.OzoneConsts.OM_RATIS_SNAPSHOT_DIR; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HA_PREFIX; -import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_DIR; /** * Creates a Ratis server endpoint for OM. @@ -105,9 +104,127 @@ public final class OzoneManagerRatisServer { private final RaftGroupId raftGroupId; private final RaftGroup raftGroup; private final RaftPeerId raftPeerId; + private final List raftPeers; private final OzoneManager ozoneManager; private final OzoneManagerStateMachine omStateMachine; + private final String ratisStorageDir; + + private final ClientId clientId = ClientId.randomId(); + private static final AtomicLong CALL_ID_COUNTER = new AtomicLong(); + + private static long nextCallId() { + return CALL_ID_COUNTER.getAndIncrement() & Long.MAX_VALUE; + } + + /** + * Returns an OM Ratis server. + * @param conf configuration + * @param om the OM instance starting the ratis server + * @param raftGroupIdStr raft group id string + * @param localRaftPeerId raft peer id of this Ratis server + * @param addr address of the ratis server + * @param peers peer nodes in the raft ring + * @throws IOException + */ + @SuppressWarnings({"parameternumber", "java:S107"}) + private OzoneManagerRatisServer(ConfigurationSource conf, OzoneManager om, + String raftGroupIdStr, RaftPeerId localRaftPeerId, + InetSocketAddress addr, List peers, + SecurityConfig secConfig, CertificateClient certClient) + throws IOException { + this.ozoneManager = om; + this.omRatisAddress = addr; + this.port = addr.getPort(); + this.ratisStorageDir = OzoneManagerRatisUtils.getOMRatisDirectory(conf); + RaftProperties serverProperties = newRaftProperties(conf); + + this.raftPeerId = localRaftPeerId; + this.raftGroupId = RaftGroupId.valueOf( + getRaftGroupIdFromOmServiceId(raftGroupIdStr)); + this.raftPeers = Lists.newArrayList(); + this.raftPeers.addAll(peers); + this.raftGroup = RaftGroup.valueOf(raftGroupId, peers); + + StringBuilder raftPeersStr = new StringBuilder(); + for (RaftPeer peer : peers) { + raftPeersStr.append(", ").append(peer.getAddress()); + } + LOG.info("Instantiating OM Ratis server with groupID: {} and " + + "peers: {}", raftGroupIdStr, raftPeersStr.toString().substring(2)); + + this.omStateMachine = getStateMachine(conf); + + Parameters parameters = createServerTlsParameters(secConfig, certClient); + this.server = RaftServer.newBuilder() + .setServerId(this.raftPeerId) + .setGroup(this.raftGroup) + .setProperties(serverProperties) + .setParameters(parameters) + .setStateMachine(omStateMachine) + .build(); + } + + /** + * Creates an instance of OzoneManagerRatisServer. + */ + public static OzoneManagerRatisServer newOMRatisServer( + ConfigurationSource ozoneConf, OzoneManager omProtocol, + OMNodeDetails omNodeDetails, List peerNodes, + SecurityConfig secConfig, CertificateClient certClient, + boolean isBootstrapping) throws IOException { + + // RaftGroupId is the omServiceId + String omServiceId = omNodeDetails.getServiceId(); + + String omNodeId = omNodeDetails.getNodeId(); + RaftPeerId localRaftPeerId = RaftPeerId.getRaftPeerId(omNodeId); + + InetSocketAddress ratisAddr = new InetSocketAddress( + omNodeDetails.getInetAddress(), omNodeDetails.getRatisPort()); + + RaftPeer localRaftPeer = RaftPeer.newBuilder() + .setId(localRaftPeerId) + .setAddress(ratisAddr) + .build(); + + List raftPeers = new ArrayList<>(); + + // If the OM is started in bootstrap mode, do not add it to the ratis ring. + // It will be added later using SetConfiguration from the leader OM. + if (isBootstrapping) { + LOG.debug("OM started in Bootstrap mode and hence will not be added " + + "to Ratis group during startup."); + } else { + // On regular startup, add current OM to Ratis ring + raftPeers.add(localRaftPeer); + } + + for (OMNodeDetails peerInfo : peerNodes) { + String peerNodeId = peerInfo.getNodeId(); + RaftPeerId raftPeerId = RaftPeerId.valueOf(peerNodeId); + RaftPeer raftPeer; + if (peerInfo.isHostUnresolved()) { + raftPeer = RaftPeer.newBuilder() + .setId(raftPeerId) + .setAddress(peerInfo.getRatisHostPortStr()) + .build(); + } else { + InetSocketAddress peerRatisAddr = new InetSocketAddress( + peerInfo.getInetAddress(), peerInfo.getRatisPort()); + raftPeer = RaftPeer.newBuilder() + .setId(raftPeerId) + .setAddress(peerRatisAddr) + .build(); + } + + // Add other OM nodes belonging to the same OM service to the Ratis ring + raftPeers.add(raftPeer); + } + + return new OzoneManagerRatisServer(ozoneConf, omProtocol, omServiceId, + localRaftPeerId, ratisAddr, raftPeers, secConfig, certClient); + } /** * Submit request to Ratis server. @@ -165,6 +282,65 @@ private RaftClientReply submitRequestToRatis( } } + /** + * Add new OM to the Ratis ring. + */ + public void addOMToRatisRing(OMNodeDetails newOMNode) throws IOException { + + Preconditions.checkNotNull(newOMNode); + + String newOMNodeId = newOMNode.getNodeId(); + RaftPeerId newOMRaftPeerId = RaftPeerId.valueOf(newOMNodeId); + InetSocketAddress newOMRatisAddr = new InetSocketAddress( + newOMNode.getHostAddress(), newOMNode.getRatisPort()); + RaftPeer newRaftPeer = RaftPeer.newBuilder() + .setId(newOMRaftPeerId) + .setAddress(newOMRatisAddr) + .build(); + + LOG.info("{}: Submitting SetConfiguration request to Ratis server to add" + + " new OM peer {} to the Ratis group {}", ozoneManager.getOMNodeId(), + newRaftPeer, raftGroup); + + List newPeersList = new ArrayList<>(); + newPeersList.addAll(raftPeers); + newPeersList.add(newRaftPeer); + + checkLeaderStatus(); + SetConfigurationRequest request = new SetConfigurationRequest(clientId, + server.getId(), raftGroupId, nextCallId(), newPeersList); + + try { + RaftClientReply raftClientReply = server.setConfiguration(request); + if (raftClientReply.isSuccess()) { + LOG.info("Added OM {} to Ratis group {}.", newOMNodeId, raftGroupId); + } else { + LOG.error("Failed to add OM {} to Ratis group {}. Ratis " + + "SetConfiguration reply: {}", newOMNodeId, raftGroupId, + raftClientReply); + throw new IOException("Failed to add OM " + newOMNodeId + " to Ratis " + + "ring."); + } + } catch (IOException e) { + LOG.error("Failed to update Ratis configuration and add OM {} to " + + "Ratis group {}", newOMNodeId, raftGroupId, e); + throw e; + } + } + + public void addRaftPeer(OMNodeDetails omNodeDetails) { + InetSocketAddress newOMRatisAddr = new InetSocketAddress( + omNodeDetails.getHostAddress(), omNodeDetails.getRatisPort()); + + raftPeers.add(RaftPeer.newBuilder() + .setId(RaftPeerId.valueOf(omNodeDetails.getNodeId())) + .setAddress(newOMRatisAddr) + .build()); + + LOG.info("Added OM {} to Ratis Peers list. New OM Ratis Peers list: {}", + omNodeDetails.getNodeId(), raftPeers); + } + /** * Create Write RaftClient request from OMRequest. * @param omRequest @@ -273,106 +449,6 @@ private OzoneManagerProtocolProtos.Status exceptionToResponseStatus( } } - - /** - * Returns an OM Ratis server. - * @param conf configuration - * @param om the OM instance starting the ratis server - * @param raftGroupIdStr raft group id string - * @param localRaftPeerId raft peer id of this Ratis server - * @param addr address of the ratis server - * @param raftPeers peer nodes in the raft ring - * @throws IOException - */ - @SuppressWarnings({"parameternumber", "java:S107"}) - private OzoneManagerRatisServer(ConfigurationSource conf, - OzoneManager om, - String raftGroupIdStr, RaftPeerId localRaftPeerId, - InetSocketAddress addr, List raftPeers, - SecurityConfig secConfig, CertificateClient certClient) - throws IOException { - this.ozoneManager = om; - this.omRatisAddress = addr; - this.port = addr.getPort(); - RaftProperties serverProperties = newRaftProperties(conf); - - this.raftPeerId = localRaftPeerId; - this.raftGroupId = RaftGroupId.valueOf( - getRaftGroupIdFromOmServiceId(raftGroupIdStr)); - this.raftGroup = RaftGroup.valueOf(raftGroupId, raftPeers); - - StringBuilder raftPeersStr = new StringBuilder(); - for (RaftPeer peer : raftPeers) { - raftPeersStr.append(", ").append(peer.getAddress()); - } - LOG.info("Instantiating OM Ratis server with GroupID: {} and " + - "Raft Peers: {}", raftGroupIdStr, raftPeersStr.toString().substring(2)); - - this.omStateMachine = getStateMachine(conf); - - Parameters parameters = createServerTlsParameters(secConfig, certClient); - this.server = RaftServer.newBuilder() - .setServerId(this.raftPeerId) - .setGroup(this.raftGroup) - .setProperties(serverProperties) - .setParameters(parameters) - .setStateMachine(omStateMachine) - .build(); - } - - /** - * Creates an instance of OzoneManagerRatisServer. - */ - public static OzoneManagerRatisServer newOMRatisServer( - ConfigurationSource ozoneConf, OzoneManager omProtocol, - OMNodeDetails omNodeDetails, List peerNodes, - SecurityConfig secConfig, CertificateClient certClient) - throws IOException { - - // RaftGroupId is the omServiceId - String omServiceId = omNodeDetails.getServiceId(); - - String omNodeId = omNodeDetails.getNodeId(); - RaftPeerId localRaftPeerId = RaftPeerId.getRaftPeerId(omNodeId); - - InetSocketAddress ratisAddr = new InetSocketAddress( - omNodeDetails.getInetAddress(), omNodeDetails.getRatisPort()); - - RaftPeer localRaftPeer = RaftPeer.newBuilder() - .setId(localRaftPeerId) - .setAddress(ratisAddr) - .build(); - - List raftPeers = new ArrayList<>(); - // Add this Ratis server to the Ratis ring - raftPeers.add(localRaftPeer); - - for (OMNodeDetails peerInfo : peerNodes) { - String peerNodeId = peerInfo.getNodeId(); - RaftPeerId raftPeerId = RaftPeerId.valueOf(peerNodeId); - RaftPeer raftPeer; - if (peerInfo.isHostUnresolved()) { - raftPeer = RaftPeer.newBuilder() - .setId(raftPeerId) - .setAddress(peerInfo.getRatisHostPortStr()) - .build(); - } else { - InetSocketAddress peerRatisAddr = new InetSocketAddress( - peerInfo.getInetAddress(), peerInfo.getRatisPort()); - raftPeer = RaftPeer.newBuilder() - .setId(raftPeerId) - .setAddress(peerRatisAddr) - .build(); - } - - // Add other OM nodes belonging to the same OM service to the Ratis ring - raftPeers.add(raftPeer); - } - - return new OzoneManagerRatisServer(ozoneConf, omProtocol, omServiceId, - localRaftPeerId, ratisAddr, raftPeers, secConfig, certClient); - } - public RaftGroup getRaftGroup() { return this.raftGroup; } @@ -439,9 +515,8 @@ private RaftProperties newRaftProperties(ConfigurationSource conf) { } // Set Ratis storage directory - String storageDir = OzoneManagerRatisServer.getOMRatisDirectory(conf); RaftServerConfigKeys.setStorageDir(properties, - Collections.singletonList(new File(storageDir))); + Collections.singletonList(new File(ratisStorageDir))); // Disable the pre vote feature in Ratis RaftServerConfigKeys.LeaderElection.setPreVote(properties, false); @@ -634,30 +709,8 @@ private UUID getRaftGroupIdFromOmServiceId(String omServiceId) { return UUID.nameUUIDFromBytes(omServiceId.getBytes(StandardCharsets.UTF_8)); } - /** - * Get the local directory where ratis logs will be stored. - */ - public static String getOMRatisDirectory(ConfigurationSource conf) { - String storageDir = conf.get(OMConfigKeys.OZONE_OM_RATIS_STORAGE_DIR); - - if (Strings.isNullOrEmpty(storageDir)) { - storageDir = ServerUtils.getDefaultRatisDirectory(conf); - } - return storageDir; - } - - public static String getOMRatisSnapshotDirectory(ConfigurationSource conf) { - String snapshotDir = conf.get(OZONE_OM_RATIS_SNAPSHOT_DIR); - - // If ratis snapshot directory is not set, fall back to ozone.metadata.dir. - if (Strings.isNullOrEmpty(snapshotDir)) { - LOG.warn("{} is not configured. Falling back to {} config", - OZONE_OM_RATIS_SNAPSHOT_DIR, OZONE_METADATA_DIRS); - File metaDirPath = ServerUtils.getOzoneMetaDirPath(conf); - snapshotDir = Paths.get(metaDirPath.getPath(), - OM_RATIS_SNAPSHOT_DIR).toString(); - } - return snapshotDir; + public String getRatisStorageDir() { + return ratisStorageDir; } public TermIndex getLastAppliedTermIndex() { diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index e51d40db439f..a40dbe553cdb 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -53,6 +53,7 @@ import org.apache.ratis.protocol.RaftClientRequest; import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.protocol.RaftPeerId; +import org.apache.ratis.protocol.exceptions.RaftException; import org.apache.ratis.protocol.exceptions.StateMachineException; import org.apache.ratis.server.RaftServer; import org.apache.ratis.server.protocol.TermIndex; @@ -175,6 +176,28 @@ public void notifyTermIndexUpdated(long currentTerm, long index) { computeAndUpdateLastAppliedIndex(index, currentTerm, null, false); } + /** + * Called to notify state machine about configuration changes. + * Configurations changes include addition of newly bootstrapped OM. + */ + @Override + public void notifyConfigurationChanged(long term, long index, + RaftProtos.RaftConfigurationProto newRaftConfiguration) { + List newPeers = newRaftConfiguration.getPeersList(); + LOG.info("Received Configuration change notification from Ratis. New Peer" + + " list:\n{}", newPeers); + for (RaftProtos.RaftPeerProto raftPeerProto : newPeers) { + String omNodeId = RaftPeerId.valueOf(raftPeerProto.getId()).toString(); + if (!ozoneManager.doesPeerExist(omNodeId)) { + LOG.info("Adding new OM {} to the cluster.", omNodeId); + ozoneManager.addOMNodeToPeers(omNodeId); + } else { + LOG.debug("Not adding OM {} to cluster as it is already present in " + + "peer list.", omNodeId); + } + } + } + /** * Validate/pre-process the incoming update request in the state machine. * @return the content to be written to the log entry. Null means the request @@ -444,6 +467,15 @@ public String toStateMachineLogEntryString(StateMachineLogEntryProto proto) { return OMRatisHelper.smProtoToString(proto); } + @Override + public void close() throws IOException { + // OM should be shutdown as the StateMachine has shutdown. + LOG.info("StateMachine has shutdown. Shutdown OzoneManager if not " + + "already shutdown."); + ozoneManager.shutdown(new RaftException("RaftServer called shutdown on " + + "StateMachine")); + } + /** * Handle the RaftClientRequest and return TransactionContext object. * @param raftClientRequest diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index d4822cd45940..a4f9c139bb06 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -18,12 +18,22 @@ package org.apache.hadoop.ozone.om.ratis.utils; import com.google.common.base.Preconditions; +import com.google.common.base.Strings; +import com.google.protobuf.ServiceException; +import java.io.File; +import java.nio.file.Paths; +import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.server.ServerUtils; import org.apache.hadoop.hdds.utils.HAUtils; +import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.OzoneManager; import org.apache.hadoop.ozone.om.codec.OMDBDefinition; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.hdds.utils.TransactionInfo; +import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; +import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; +import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus; import org.apache.hadoop.ozone.om.request.bucket.OMBucketCreateRequest; import org.apache.hadoop.ozone.om.request.bucket.OMBucketDeleteRequest; import org.apache.hadoop.ozone.om.request.bucket.OMBucketSetPropertyRequest; @@ -87,11 +97,17 @@ import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneObj.ObjectType; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Status; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Type; +import org.apache.ratis.protocol.RaftPeerId; import org.rocksdb.RocksDBException; import java.io.IOException; import java.nio.file.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS; +import static org.apache.hadoop.ozone.OzoneConsts.OM_RATIS_SNAPSHOT_DIR; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_DIR; /** * Utility class used by OzoneManager HA. @@ -102,6 +118,9 @@ public final class OzoneManagerRatisUtils { // upgrade HDDS-3698 story reaches consensus. private static boolean isBucketFSOptimized = false; + private static final Logger LOG = LoggerFactory + .getLogger(OzoneManagerRatisUtils.class); + private OzoneManagerRatisUtils() { } @@ -349,4 +368,70 @@ public static boolean isBucketFSOptimized() { return isBucketFSOptimized; } + /** + * Get the local directory where ratis logs will be stored. + */ + public static String getOMRatisDirectory(ConfigurationSource conf) { + String storageDir = conf.get(OMConfigKeys.OZONE_OM_RATIS_STORAGE_DIR); + + if (Strings.isNullOrEmpty(storageDir)) { + storageDir = ServerUtils.getDefaultRatisDirectory(conf); + } + return storageDir; + } + + public static String getOMRatisSnapshotDirectory(ConfigurationSource conf) { + String snapshotDir = conf.get(OZONE_OM_RATIS_SNAPSHOT_DIR); + + // If ratis snapshot directory is not set, fall back to ozone.metadata.dir. + if (Strings.isNullOrEmpty(snapshotDir)) { + LOG.warn("{} is not configured. Falling back to {} config", + OZONE_OM_RATIS_SNAPSHOT_DIR, OZONE_METADATA_DIRS); + File metaDirPath = ServerUtils.getOzoneMetaDirPath(conf); + snapshotDir = Paths.get(metaDirPath.getPath(), + OM_RATIS_SNAPSHOT_DIR).toString(); + } + return snapshotDir; + } + + public static void checkLeaderStatus(RaftServerStatus raftServerStatus, + RaftPeerId raftPeerId) throws ServiceException { + switch (raftServerStatus) { + case LEADER_AND_READY: return; + + case LEADER_AND_NOT_READY: throw createLeaderNotReadyException(raftPeerId); + + case NOT_LEADER: throw createNotLeaderException(raftPeerId); + + default: throw new IllegalStateException( + "Unknown Ratis Server state: " + raftServerStatus); + } + } + + private static ServiceException createNotLeaderException( + RaftPeerId raftPeerId) { + + // TODO: Set suggest leaderID. Right now, client is not using suggest + // leaderID. Need to fix this. + + OMNotLeaderException notLeaderException = + new OMNotLeaderException(raftPeerId); + + LOG.debug(notLeaderException.getMessage()); + + return new ServiceException(notLeaderException); + } + + private static ServiceException createLeaderNotReadyException( + RaftPeerId raftPeerId) { + + OMLeaderNotReadyException leaderNotReadyException = + new OMLeaderNotReadyException(raftPeerId.toString() + " is Leader " + + "but not ready to process request yet."); + + LOG.debug(leaderNotReadyException.getMessage()); + + return new ServiceException(leaderNotReadyException); + } + } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java index 097d1421d5c7..7d45d35c2d0f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java @@ -37,7 +37,7 @@ import org.apache.hadoop.hdds.utils.db.DBCheckpoint; import org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint; import org.apache.hadoop.hdfs.web.URLConnectionFactory; -import org.apache.hadoop.ozone.om.ha.OMNodeDetails; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import static java.net.HttpURLConnection.HTTP_CREATED; import static java.net.HttpURLConnection.HTTP_OK; @@ -120,7 +120,7 @@ public DBCheckpoint getOzoneManagerDBSnapshot(String leaderOMNodeID) File targetFile = new File(snapshotFileName + ".tar.gz"); String omCheckpointUrl = peerNodesMap.get(leaderOMNodeID) - .getOMDBCheckpointEnpointUrl(httpPolicy); + .getOMDBCheckpointEnpointUrl(httpPolicy.isHttpEnabled()); LOG.info("Downloading latest checkpoint from Leader OM {}. Checkpoint " + "URL: {}", leaderOMNodeID, omCheckpointUrl); @@ -159,4 +159,11 @@ public void stop() { connectionFactory.destroy(); } } + + /** + * When a new OM is bootstrapped, add it to the peerNode map. + */ + public void addNewPeerNode(OMNodeDetails newOMNode) { + peerNodesMap.put(newOMNode.getNodeId(), newOMNode); + } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java new file mode 100644 index 000000000000..6f529eb1f096 --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java @@ -0,0 +1,73 @@ +package org.apache.hadoop.ozone.protocolPB; + +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; +import java.io.IOException; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; +import org.apache.hadoop.ozone.om.protocolPB.OMInterServiceProtocolPB; +import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; +import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapErrorCode; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMRequest; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class OMInterServiceProtocolServerSideImpl implements + OMInterServiceProtocolPB { + + private static final Logger LOG = LoggerFactory.getLogger( + OMInterServiceProtocolServerSideImpl.class); + + private final OzoneManagerRatisServer omRatisServer; + private final boolean isRatisEnabled; + + public OMInterServiceProtocolServerSideImpl( + OzoneManagerRatisServer ratisServer, boolean enableRatis) { + this.omRatisServer = ratisServer; + this.isRatisEnabled = enableRatis; + } + + @Override + public BootstrapOMResponse bootstrap(RpcController controller, + BootstrapOMRequest request) throws ServiceException { + if (request == null) { + return null; + } + if (!isRatisEnabled) { + return BootstrapOMResponse.newBuilder() + .setSuccess(false) + .setErrorCode(BootstrapErrorCode.RATIS_NOT_ENABLED) + .setErrorMsg("New OM node cannot be bootstrapped as Ratis " + + "is not enabled on existing OM") + .build(); + } + + checkLeaderStatus(); + + OMNodeDetails newOmNode = new OMNodeDetails.Builder() + .setOMNodeId(request.getNodeId()) + .setHostAddress(request.getHostAddress()) + .setRatisPort(request.getRatisPort()) + .build(); + + try { + omRatisServer.addOMToRatisRing(newOmNode); + } catch (IOException ex) { + return BootstrapOMResponse.newBuilder() + .setSuccess(false) + .setErrorCode(BootstrapErrorCode.RATIS_BOOTSTRAP_ERROR) + .setErrorMsg(ex.getMessage()) + .build(); + } + + return BootstrapOMResponse.newBuilder() + .setSuccess(true) + .build(); + } + + private void checkLeaderStatus() throws ServiceException { + OzoneManagerRatisUtils.checkLeaderStatus(omRatisServer.checkLeaderStatus(), + omRatisServer.getRaftPeerId()); + } +} diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java index d9dcece8addd..68f4d10bdc9e 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java @@ -28,8 +28,6 @@ import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics; import org.apache.hadoop.ozone.OmUtils; import org.apache.hadoop.ozone.om.OzoneManager; -import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; -import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; @@ -133,48 +131,21 @@ private OMResponse processRequest(OMRequest request) throws if (OmUtils.isReadOnly(request)) { return submitReadRequestToOM(request); } else { - raftServerStatus = omRatisServer.checkLeaderStatus(); - if (raftServerStatus == LEADER_AND_READY) { - try { - OMClientRequest omClientRequest = createClientRequest(request); - request = omClientRequest.preExecute(ozoneManager); - } catch (IOException ex) { - // As some of the preExecute returns error. So handle here. - return createErrorResponse(request, ex); - } - return submitRequestToRatis(request); - } else { - throw createLeaderErrorException(raftServerStatus); + checkLeaderStatus(); + try { + OMClientRequest omClientRequest = createClientRequest(request); + request = omClientRequest.preExecute(ozoneManager); + } catch (IOException ex) { + // As some of the preExecute returns error. So handle here. + return createErrorResponse(request, ex); } + return submitRequestToRatis(request); } } else { return submitRequestDirectlyToOM(request); } } - /** - * Create OMResponse from the specified OMRequest and exception. - * - * @param omRequest - * @param exception - * @return OMResponse - */ - private OMResponse createErrorResponse( - OMRequest omRequest, IOException exception) { - // Added all write command types here, because in future if any of the - // preExecute is changed to return IOException, we can return the error - // OMResponse to the client. - OMResponse.Builder omResponse = OMResponse.newBuilder() - .setStatus(OzoneManagerRatisUtils.exceptionToResponseStatus(exception)) - .setCmdType(omRequest.getCmdType()) - .setTraceID(omRequest.getTraceID()) - .setSuccess(false); - if (exception.getMessage() != null) { - omResponse.setMessage(exception.getMessage()); - } - return omResponse.build(); - } - /** * Submits request to OM's Ratis server. */ @@ -264,6 +235,34 @@ private OMResponse submitRequestDirectlyToOM(OMRequest request) { return omClientResponse.getOMResponse(); } + private void checkLeaderStatus() throws ServiceException { + OzoneManagerRatisUtils.checkLeaderStatus(omRatisServer.checkLeaderStatus(), + omRatisServer.getRaftPeerId()); + } + + /** + * Create OMResponse from the specified OMRequest and exception. + * + * @param omRequest + * @param exception + * @return OMResponse + */ + private OMResponse createErrorResponse( + OMRequest omRequest, IOException exception) { + // Added all write command types here, because in future if any of the + // preExecute is changed to return IOException, we can return the error + // OMResponse to the client. + OMResponse.Builder omResponse = OMResponse.newBuilder() + .setStatus(OzoneManagerRatisUtils.exceptionToResponseStatus(exception)) + .setCmdType(omRequest.getCmdType()) + .setTraceID(omRequest.getTraceID()) + .setSuccess(false); + if (exception.getMessage() != null) { + omResponse.setMessage(exception.getMessage()); + } + return omResponse.build(); + } + public void stop() { if (!isRatisEnabled) { ozoneManagerDoubleBuffer.stop(); diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerStarter.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerStarter.java index b64edce5ec1b..e140fd50dcf4 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerStarter.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerStarter.java @@ -177,6 +177,12 @@ public boolean init(OzoneConfiguration conf) throws IOException, return initStatus; } + @Override + public void bootstrap(OzoneConfiguration conf) throws IOException, + AuthenticationException { + //TODO: Add test for bootstrap + } + @Override public void startAndCancelPrepare(OzoneConfiguration conf) throws IOException, AuthenticationException { diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java index 84b207ef5c8d..7d7c310f6858 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/failover/TestOMFailovers.java @@ -118,7 +118,7 @@ private final class MockFailoverProxyProvider private MockFailoverProxyProvider(ConfigurationSource configuration) throws IOException { - super(configuration, null, null); + super(configuration, null, null, OzoneManagerProtocolPB.class); } @Override diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java index 07455edbc899..86252d002164 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java @@ -36,7 +36,7 @@ import org.apache.hadoop.ozone.common.ha.ratis.RatisSnapshotInfo; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.OMMetadataManager; -import org.apache.hadoop.ozone.om.ha.OMNodeDetails; +import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.apache.hadoop.ozone.om.OmMetadataManagerImpl; import org.apache.hadoop.ozone.om.OzoneManager; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos @@ -115,7 +115,7 @@ public void init() throws Exception { secConfig = new SecurityConfig(conf); certClient = new OMCertificateClient(secConfig); omRatisServer = OzoneManagerRatisServer.newOMRatisServer(conf, ozoneManager, - omNodeDetails, Collections.emptyList(), secConfig, certClient); + omNodeDetails, Collections.emptyList(), secConfig, certClient, false); omRatisServer.start(); } @@ -155,7 +155,7 @@ public void testLoadSnapshotInfoOnStart() throws Exception { // Start new Ratis server. It should pick up and load the new SnapshotInfo omRatisServer = OzoneManagerRatisServer.newOMRatisServer(conf, ozoneManager, - omNodeDetails, Collections.emptyList(), secConfig, certClient); + omNodeDetails, Collections.emptyList(), secConfig, certClient, false); omRatisServer.start(); TermIndex lastAppliedTermIndex = omRatisServer.getLastAppliedTermIndex(); @@ -225,7 +225,7 @@ public void verifyRaftGroupIdGenerationWithCustomOmServiceId() throws omRatisServer.stop(); OzoneManagerRatisServer newOmRatisServer = OzoneManagerRatisServer .newOMRatisServer(newConf, ozoneManager, nodeDetails, - Collections.emptyList(), secConfig, certClient); + Collections.emptyList(), secConfig, certClient, false); newOmRatisServer.start(); UUID uuid = UUID.nameUUIDFromBytes(customOmServiceId.getBytes(UTF_8)); diff --git a/hadoop-ozone/ozonefs-hadoop2/src/main/java/org/apache/hadoop/fs/ozone/Hadoop27RpcTransport.java b/hadoop-ozone/ozonefs-hadoop2/src/main/java/org/apache/hadoop/fs/ozone/Hadoop27RpcTransport.java index b2a35704a8fb..c0de6d632544 100644 --- a/hadoop-ozone/ozonefs-hadoop2/src/main/java/org/apache/hadoop/fs/ozone/Hadoop27RpcTransport.java +++ b/hadoop-ozone/ozonefs-hadoop2/src/main/java/org/apache/hadoop/fs/ozone/Hadoop27RpcTransport.java @@ -57,7 +57,7 @@ public Hadoop27RpcTransport(ConfigurationSource conf, ProtobufRpcEngine.class); this.omFailoverProxyProvider = new OMFailoverProxyProvider(conf, ugi, - omServiceId); + omServiceId, OzoneManagerProtocolPB.class); int maxFailovers = conf.getInt( OzoneConfigKeys.OZONE_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY, From 369b5ccb9b288a7b7578d5f8341e4099b0754448 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 23 Apr 2021 10:55:34 -0700 Subject: [PATCH 02/15] CI fixes 1 --- .../ozone/om/ha/OMFailoverProxyProvider.java | 1 - .../OMInterServiceProtocolClientSideImpl.java | 2 +- .../hadoop/ozone/MiniOzoneChaosCluster.java | 1 + .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 8 +------ .../ozone/om/TestOzoneManagerBootstrap.java | 8 +++---- .../om/ratis/OzoneManagerStateMachine.java | 5 +++-- .../OMInterServiceProtocolServerSideImpl.java | 22 ++++++++++++++----- ...ManagerProtocolServerSideTranslatorPB.java | 4 ---- 8 files changed, 27 insertions(+), 24 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java index 75c3b32cf0c3..219edc261662 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProvider.java @@ -54,7 +54,6 @@ import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolClientSideTranslatorPB; -import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.security.UserGroupInformation; import org.apache.ratis.protocol.exceptions.StateMachineException; diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java index 9ccac57b5a78..74a3b8b43afe 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java @@ -40,7 +40,7 @@ import org.slf4j.LoggerFactory; /** - * Protocol implementation for Inter OM communication + * Protocol implementation for Inter OM communication. */ public class OMInterServiceProtocolClientSideImpl implements OMInterServiceProtocol { diff --git a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java index a8c588dd30ec..728ded031e44 100644 --- a/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java +++ b/hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneChaosCluster.java @@ -100,6 +100,7 @@ public static FailureService of(String serviceName) { } } + @SuppressWarnings("parameternumber") public MiniOzoneChaosCluster(OzoneConfiguration conf, List ozoneManagers, List scms, List hddsDatanodes, String omServiceID, diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index 227ef72e5329..ede4b549a0fe 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -725,12 +725,6 @@ public void bootstrapOzoneManager(String omNodeId) throws Exception { } if (e instanceof BindException || e.getCause() instanceof BindException) { - if (om != null) { - om.stop(); - om.join(); - LOG.info("Stopping OzoneManager server at {}", - om.getOmRpcServerAddr()); - } ++retryCount; LOG.info("MiniOzoneHACluster port conflicts, retried {} times", retryCount); @@ -852,7 +846,7 @@ static class MiniOzoneHAService { private List inactiveServices; // Function to extract the Id from service - Function serviceIdProvider; + private Function serviceIdProvider; MiniOzoneHAService(String name, List activeList, List inactiveList, String serviceId, diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java index 9da4327028c3..6e094beed2a6 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -61,10 +61,10 @@ public class TestOzoneManagerBootstrap { private OzoneConfiguration conf; private final String clusterId = UUID.randomUUID().toString(); private final String scmId = UUID.randomUUID().toString(); - private final String omServiceId = "om-bootstrap"; private static final int NUM_INITIAL_OMS = 3; + private static final String OM_SERVICE_ID = "om-bootstrap"; private static final String VOLUME_NAME; private static final String BUCKET_NAME; @@ -85,11 +85,11 @@ private void setupCluster(int numInitialOMs) throws Exception { .setClusterId(clusterId) .setScmId(scmId) .setSCMServiceId(SCM_DUMMY_SERVICE_ID) - .setOMServiceId(omServiceId) + .setOMServiceId(OM_SERVICE_ID) .setNumOfOzoneManagers(numInitialOMs) .build(); cluster.waitForClusterToBeReady(); - objectStore = OzoneClientFactory.getRpcClient(omServiceId, conf) + objectStore = OzoneClientFactory.getRpcClient(OM_SERVICE_ID, conf) .getObjectStore(); // Perform some transactions @@ -201,7 +201,7 @@ public void testLeaderChangeToNewOM() throws Exception { "other OMs are down", newOMNodeIds.contains(omLeader.getOMNodeId())); // Perform some read and write operations with new OM leader - objectStore = OzoneClientFactory.getRpcClient(omServiceId, conf) + objectStore = OzoneClientFactory.getRpcClient(OM_SERVICE_ID, conf) .getObjectStore(); OzoneVolume volume = objectStore.getVolume(VOLUME_NAME); OzoneBucket bucket = volume.getBucket(BUCKET_NAME); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index a40dbe553cdb..8cea2c488245 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -183,13 +183,14 @@ public void notifyTermIndexUpdated(long currentTerm, long index) { @Override public void notifyConfigurationChanged(long term, long index, RaftProtos.RaftConfigurationProto newRaftConfiguration) { - List newPeers = newRaftConfiguration.getPeersList(); + List newPeers = + newRaftConfiguration.getPeersList(); LOG.info("Received Configuration change notification from Ratis. New Peer" + " list:\n{}", newPeers); for (RaftProtos.RaftPeerProto raftPeerProto : newPeers) { String omNodeId = RaftPeerId.valueOf(raftPeerProto.getId()).toString(); if (!ozoneManager.doesPeerExist(omNodeId)) { - LOG.info("Adding new OM {} to the cluster.", omNodeId); + LOG.info("Adding new OM {} to the Peer list.", omNodeId); ozoneManager.addOMNodeToPeers(omNodeId); } else { LOG.debug("Not adding OM {} to cluster as it is already present in " + diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java index 6f529eb1f096..e97cebf5142d 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS,WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package org.apache.hadoop.ozone.protocolPB; import com.google.protobuf.RpcController; @@ -10,15 +27,10 @@ import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapErrorCode; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMRequest; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class OMInterServiceProtocolServerSideImpl implements OMInterServiceProtocolPB { - private static final Logger LOG = LoggerFactory.getLogger( - OMInterServiceProtocolServerSideImpl.class); - private final OzoneManagerRatisServer omRatisServer; private final boolean isRatisEnabled; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java index 68f4d10bdc9e..845d5528d040 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java @@ -41,14 +41,10 @@ import com.google.protobuf.ProtocolMessageEnum; import com.google.protobuf.RpcController; import com.google.protobuf.ServiceException; -import org.apache.ratis.protocol.RaftPeerId; import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.LEADER_AND_READY; -import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.NOT_LEADER; - /** * This class is the server-side translator that forwards requests received on * {@link OzoneManagerProtocolPB} From f4c147b2ff799a3715944e00216530fdcd062cdc Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 23 Apr 2021 15:35:20 -0700 Subject: [PATCH 03/15] CI fixes 2 --- .../src/main/java/org/apache/hadoop/hdds/NodeDetails.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java index 49f5290bf568..5c86b2e93357 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/NodeDetails.java @@ -45,8 +45,10 @@ public NodeDetails(String serviceId, String nodeId, this.serviceId = serviceId; this.nodeId = nodeId; this.rpcAddress = rpcAddress; - this.hostAddress = rpcAddress.getHostName(); - this.rpcPort = rpcAddress.getPort(); + if (rpcAddress != null) { + this.hostAddress = rpcAddress.getHostName(); + this.rpcPort = rpcAddress.getPort(); + } this.ratisPort = ratisPort; this.httpAddress = httpAddress; this.httpsAddress = httpsAddress; From 6532008e1aa14b40d4997d45bf9b42c77602f0a0 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Wed, 26 May 2021 10:47:27 -0700 Subject: [PATCH 04/15] Generalize Bootstrap error code --- .../protocolPB/OMInterServiceProtocolClientSideImpl.java | 8 ++++---- .../src/main/proto/OmInterServiceProtocol.proto | 4 ++-- .../protocolPB/OMInterServiceProtocolServerSideImpl.java | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java index 74a3b8b43afe..a81a19bfd679 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolClientSideImpl.java @@ -32,9 +32,9 @@ import org.apache.hadoop.ozone.om.ha.OMFailoverProxyProvider; import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; import org.apache.hadoop.ozone.om.protocol.OMInterServiceProtocol; -import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapErrorCode; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMRequest; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.ErrorCode; import org.apache.hadoop.security.UserGroupInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,14 +90,14 @@ public void bootstrap(OMNodeDetails newOMNode) throws IOException { OMNotLeaderException notLeaderException = OMFailoverProxyProvider.getNotLeaderException(e); if (notLeaderException != null) { - throwException(BootstrapErrorCode.LEADER_UNDETERMINED, + throwException(ErrorCode.LEADER_UNDETERMINED, notLeaderException.getMessage()); } OMLeaderNotReadyException leaderNotReadyException = OMFailoverProxyProvider.getLeaderNotReadyException(e); if (leaderNotReadyException != null) { - throwException(BootstrapErrorCode.LEADER_NOT_READY, + throwException(ErrorCode.LEADER_NOT_READY, leaderNotReadyException.getMessage()); } throw ProtobufHelper.getRemoteException(e); @@ -108,7 +108,7 @@ public void bootstrap(OMNodeDetails newOMNode) throws IOException { } } - private void throwException(BootstrapErrorCode errorCode, String errorMsg) + private void throwException(ErrorCode errorCode, String errorMsg) throws IOException { throw new IOException("Failed to Bootstrap OM. Error Code: " + errorCode + ", Error Message: " + errorMsg); diff --git a/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto b/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto index bcd07ab827c7..49ef56357726 100644 --- a/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto +++ b/hadoop-ozone/interface-client/src/main/proto/OmInterServiceProtocol.proto @@ -42,11 +42,11 @@ message BootstrapOMRequest { message BootstrapOMResponse { required bool success = 1; - optional BootstrapErrorCode errorCode = 2; + optional ErrorCode errorCode = 2; optional string errorMsg = 3; } -enum BootstrapErrorCode { +enum ErrorCode { RATIS_NOT_ENABLED = 1; LEADER_UNDETERMINED = 2; LEADER_NOT_READY = 3; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java index e97cebf5142d..d0636a066fcf 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java @@ -24,9 +24,9 @@ import org.apache.hadoop.ozone.om.protocolPB.OMInterServiceProtocolPB; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils; -import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapErrorCode; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMRequest; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; +import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.ErrorCode; public class OMInterServiceProtocolServerSideImpl implements OMInterServiceProtocolPB { @@ -49,7 +49,7 @@ public BootstrapOMResponse bootstrap(RpcController controller, if (!isRatisEnabled) { return BootstrapOMResponse.newBuilder() .setSuccess(false) - .setErrorCode(BootstrapErrorCode.RATIS_NOT_ENABLED) + .setErrorCode(ErrorCode.RATIS_NOT_ENABLED) .setErrorMsg("New OM node cannot be bootstrapped as Ratis " + "is not enabled on existing OM") .build(); @@ -68,7 +68,7 @@ public BootstrapOMResponse bootstrap(RpcController controller, } catch (IOException ex) { return BootstrapOMResponse.newBuilder() .setSuccess(false) - .setErrorCode(BootstrapErrorCode.RATIS_BOOTSTRAP_ERROR) + .setErrorCode(ErrorCode.RATIS_BOOTSTRAP_ERROR) .setErrorMsg(ex.getMessage()) .build(); } From 64974a84c4cfe11e347b7a3b1074c25d468fc3eb Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Thu, 27 May 2021 15:06:07 -0700 Subject: [PATCH 05/15] Review comments fixes --- .../common/src/main/resources/ozone-default.xml | 9 +++++++++ .../org/apache/hadoop/ozone/om/OMConfigKeys.java | 7 +++++++ .../om/protocol/OMInterServiceProtocol.java | 7 +++++++ .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 12 +++--------- .../ozone/om/TestOzoneManagerBootstrap.java | 11 +++++++++-- .../apache/hadoop/ozone/om/OMPolicyProvider.java | 4 ++++ .../ozone/om/ratis/OzoneManagerRatisServer.java | 16 ++++++++++++++++ .../OMInterServiceProtocolServerSideImpl.java | 5 +++++ 8 files changed, 60 insertions(+), 11 deletions(-) diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index 6dcb81b1ffad..7c75b6394cde 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -2228,6 +2228,15 @@ client ozone manager protocol. + + ozone.om.security.admin.protocol.acl + * + SECURITY + + Comma separated list of users and groups allowed to access ozone + manager admin protocol. + + hdds.datanode.http.auth.kerberos.principal diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java index c0ada6aeb733..563f99cceca7 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java @@ -221,6 +221,13 @@ private OMConfigKeys() { public static final String OZONE_OM_SECURITY_CLIENT_PROTOCOL_ACL = "ozone.om.security.client.protocol.acl"; + // Comma separated acls (users, groups) allowing clients accessing + // OM admin protocol. + // When hadoop.security.authorization is true, this needs to be set in + // hadoop-policy.xml, "*" allows all users/groups to access. + public static final String OZONE_OM_SECURITY_ADMIN_PROTOCOL_ACL = + "ozone.om.security.admin.protocol.acl"; + public static final String OZONE_OM_KEYNAME_CHARACTER_CHECK_ENABLED_KEY = "ozone.om.keyname.character.check.enabled"; public static final boolean OZONE_OM_KEYNAME_CHARACTER_CHECK_ENABLED_DEFAULT = diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java index 568e94576c1a..250287a16ad8 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java @@ -19,11 +19,18 @@ import java.io.Closeable; import java.io.IOException; +import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; +import org.apache.hadoop.ozone.security.OzoneDelegationTokenSelector; +import org.apache.hadoop.security.KerberosInfo; +import org.apache.hadoop.security.token.TokenInfo; /** * Protocol for inter OM communication. */ +@KerberosInfo( + serverPrincipal = OMConfigKeys.OZONE_OM_KERBEROS_PRINCIPAL_KEY) +@TokenInfo(OzoneDelegationTokenSelector.class) public interface OMInterServiceProtocol extends Closeable { /** diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index ede4b549a0fe..f45a96c88442 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -73,8 +73,8 @@ public class MiniOzoneHAClusterImpl extends MiniOzoneClusterImpl { private int waitForClusterToBeReadyTimeout = 120000; // 2 min private static final Random RANDOM = new Random(); - private static final int RATIS_RPC_TIMEOUT = 1000; // 10 second - public static final int NODE_FAILURE_TIMEOUT = 2000; // 20 seconds + private static final int RATIS_RPC_TIMEOUT = 1000; // 1 second + private static final int NODE_FAILURE_TIMEOUT = 2000; // 2 seconds /** * Creates a new MiniOzoneCluster. @@ -365,7 +365,7 @@ public MiniOzoneCluster build() throws IOException { numOfActiveOMs = numOfOMs; } - // If num of OMs it not set, set it to 1. + // If num of SCMs it not set, set it to 1. if (numOfSCMs == 0) { numOfSCMs = 1; } @@ -704,12 +704,6 @@ public void bootstrapOzoneManager(String omNodeId) throws Exception { om = bootstrapNewOM(omNodeId); - // Get the CertClient from an existing OM and set for new OM - if (omhaService.getServiceByIndex(0).getCertificateClient() != null) { - om.setCertClient( - omhaService.getServiceByIndex(0).getCertificateClient()); - } - LOG.info("Bootstrapped OzoneManager {} RPC server at {}", omNodeId, om.getOmRpcServerAddr()); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java index 6e094beed2a6..2c8b3b248ed6 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -99,7 +99,8 @@ private void setupCluster(int numInitialOMs) throws Exception { OzoneBucket bucket = volume.getBucket(BUCKET_NAME); createKey(bucket); - lastTransactionIndex = cluster.getOMLeader().getRatisSnapshotIndex(); + lastTransactionIndex = cluster.getOMLeader().getOmRatisServer() + .getOmStateMachine().getLastAppliedTermIndex().getIndex(); } @After @@ -110,10 +111,16 @@ public void shutdown() throws Exception { } private void assertNewOMExistsInPeerList(String nodeId) throws Exception { + // Check that new peer exists in all OMs peers list and also in their Ratis + // server's peer list for (OzoneManager om : cluster.getOzoneManagersList()) { Assert.assertTrue("New OM node " + nodeId + " not present in Peer list " + "of OM " + om.getOMNodeId(), om.doesPeerExist(nodeId)); + Assert.assertTrue("New OM node " + nodeId + " not present in " + + "RaftServer Peer list of OM " + om.getOMNodeId(), + om.getOmRatisServer().getCurrentPeersFromRaftConf().contains(nodeId)); } + OzoneManager newOM = cluster.getOzoneManager(nodeId); GenericTestUtils.waitFor(new Supplier() { @Override @@ -124,7 +131,7 @@ public Boolean get() { return false; } } - }, 100, 10000); + }, 100, 100000); // Check Ratis Dir for log files File[] logFiles = getRatisLogFiles(newOM); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMPolicyProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMPolicyProvider.java index 18a288f3cea4..f8f265e3c26f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMPolicyProvider.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMPolicyProvider.java @@ -21,12 +21,14 @@ import org.apache.hadoop.hdds.annotation.InterfaceAudience.Private; import org.apache.hadoop.hdds.annotation.InterfaceStability; import org.apache.hadoop.hdds.annotation.InterfaceStability.Unstable; +import org.apache.hadoop.ozone.om.protocol.OMInterServiceProtocol; import org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol; import org.apache.hadoop.security.authorize.PolicyProvider; import org.apache.hadoop.security.authorize.Service; import java.util.concurrent.atomic.AtomicReference; +import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SECURITY_ADMIN_PROTOCOL_ACL; import static org.apache.hadoop.ozone.om.OMConfigKeys .OZONE_OM_SECURITY_CLIENT_PROTOCOL_ACL; @@ -56,6 +58,8 @@ public static OMPolicyProvider getInstance() { new Service[]{ new Service(OZONE_OM_SECURITY_CLIENT_PROTOCOL_ACL, OzoneManagerProtocol.class), + new Service(OZONE_OM_SECURITY_ADMIN_PROTOCOL_ACL, + OMInterServiceProtocol.class) }; @SuppressFBWarnings("EI_EXPOSE_REP") diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 1f845cb81b9f..4efbca0e0344 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -30,6 +30,7 @@ import java.nio.charset.StandardCharsets; import java.security.cert.X509Certificate; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; @@ -691,6 +692,21 @@ public RaftServerStatus checkLeaderStatus() { return RaftServerStatus.NOT_LEADER; } + @VisibleForTesting + public List getCurrentPeersFromRaftConf() { + try { + Collection currentPeers = + server.getDivision(raftGroupId).getRaftConf().getCurrentPeers(); + List currentPeerList = new ArrayList<>(); + currentPeers.forEach(e -> currentPeerList.add(e.getId().toString())); + return currentPeerList; + } catch (IOException e) { + // In this case we return not a leader. + LOG.error("Failed to get RaftServer information. ", e); + } + return null; + } + public int getServerPort() { return port; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java index d0636a066fcf..d5ceb4e8b853 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OMInterServiceProtocolServerSideImpl.java @@ -28,6 +28,11 @@ import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.BootstrapOMResponse; import org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.ErrorCode; +/** + * This class is the server-side translator that forwards requests received on + * {@link OMInterServiceProtocolPB} + * to the OzoneManagerInterService server implementation. + */ public class OMInterServiceProtocolServerSideImpl implements OMInterServiceProtocolPB { From aada9bbc654419173ec9ddd8eba073d207b6cc2d Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Wed, 9 Jun 2021 10:37:33 -0700 Subject: [PATCH 06/15] Review comment fix --- .../hadoop/ozone/om/ratis/OzoneManagerRatisServer.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 4efbca0e0344..7fac1d0d33bd 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -692,8 +692,12 @@ public RaftServerStatus checkLeaderStatus() { return RaftServerStatus.NOT_LEADER; } + /** + * Get list of peer NodeIds from Ratis. + * @return List of Peer NodeId's. + */ @VisibleForTesting - public List getCurrentPeersFromRaftConf() { + public List getCurrentPeersFromRaftConf() throws IOException { try { Collection currentPeers = server.getDivision(raftGroupId).getRaftConf().getCurrentPeers(); @@ -702,9 +706,8 @@ public List getCurrentPeersFromRaftConf() { return currentPeerList; } catch (IOException e) { // In this case we return not a leader. - LOG.error("Failed to get RaftServer information. ", e); + throw new IOException("Failed to get peer information from Ratis.", e); } - return null; } public int getServerPort() { From e8bd425d9f09e73bc5ee00f676bf7de692e70edf Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Thu, 10 Jun 2021 21:30:45 -0700 Subject: [PATCH 07/15] CI fixes + Remove Token authorization for the new protocol --- hadoop-ozone/common/pom.xml | 4 ++++ .../om/protocol/OMInterServiceProtocol.java | 1 - .../protocolPB/OMInterServiceProtocolPB.java | 1 - .../ozone/om/TestOzoneManagerBootstrap.java | 18 +++++++++--------- .../apache/hadoop/ozone/om/OzoneManager.java | 3 +++ 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/hadoop-ozone/common/pom.xml b/hadoop-ozone/common/pom.xml index 7f7576f0cebe..67217c21ae56 100644 --- a/hadoop-ozone/common/pom.xml +++ b/hadoop-ozone/common/pom.xml @@ -72,6 +72,10 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> spotbugs-annotations provided + + org.apache.ozone + hdds-server-framework + diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java index 250287a16ad8..883aba4661fe 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java @@ -30,7 +30,6 @@ */ @KerberosInfo( serverPrincipal = OMConfigKeys.OZONE_OM_KERBEROS_PRINCIPAL_KEY) -@TokenInfo(OzoneDelegationTokenSelector.class) public interface OMInterServiceProtocol extends Closeable { /** diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java index e3a60a394b4c..eaf2ca819468 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java @@ -34,7 +34,6 @@ protocolVersion = 1) @KerberosInfo( serverPrincipal = OMConfigKeys.OZONE_OM_KERBEROS_PRINCIPAL_KEY) -@TokenInfo(OzoneDelegationTokenSelector.class) @InterfaceAudience.Private public interface OMInterServiceProtocolPB extends OzoneManagerInterService.BlockingInterface { diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java index 2c8b3b248ed6..9696aed3050b 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -36,7 +36,7 @@ import org.apache.hadoop.ozone.client.OzoneClientFactory; import org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; -import org.apache.hadoop.test.GenericTestUtils; +import org.apache.ozone.test.GenericTestUtils; import org.junit.After; import org.junit.Assert; import org.junit.Rule; @@ -48,6 +48,9 @@ import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_RATIS_SERVER_REQUEST_TIMEOUT_DEFAULT; import static org.apache.hadoop.ozone.om.TestOzoneManagerHA.createKey; +/** + * Test for OM bootstrap process. + */ public class TestOzoneManagerBootstrap { @Rule @@ -122,14 +125,11 @@ private void assertNewOMExistsInPeerList(String nodeId) throws Exception { } OzoneManager newOM = cluster.getOzoneManager(nodeId); - GenericTestUtils.waitFor(new Supplier() { - @Override - public Boolean get() { - try { - return newOM.getRatisSnapshotIndex() >= lastTransactionIndex; - } catch (IOException e) { - return false; - } + GenericTestUtils.waitFor((Supplier) () -> { + try { + return newOM.getRatisSnapshotIndex() >= lastTransactionIndex; + } catch (IOException e) { + return false; } }, 100, 100000); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 8c4f46d0a96f..09c905db5231 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -364,6 +364,9 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private OzoneManagerPrepareState prepareState; + /** + * OM Startup mode. + */ public enum StartupOption { REGUALR, BOOTSTRAP From 522023112e681a00bb95bfb4e73c8acef71e88ba Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 11 Jun 2021 09:53:03 -0700 Subject: [PATCH 08/15] checkstyle fix --- .../apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java | 2 -- .../hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java | 2 -- 2 files changed, 4 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java index 883aba4661fe..165cd0332366 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocol/OMInterServiceProtocol.java @@ -21,9 +21,7 @@ import java.io.IOException; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.helpers.OMNodeDetails; -import org.apache.hadoop.ozone.security.OzoneDelegationTokenSelector; import org.apache.hadoop.security.KerberosInfo; -import org.apache.hadoop.security.token.TokenInfo; /** * Protocol for inter OM communication. diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java index eaf2ca819468..90f773be814e 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OMInterServiceProtocolPB.java @@ -23,8 +23,6 @@ import org.apache.hadoop.ozone.protocol.proto .OzoneManagerInterServiceProtocolProtos.OzoneManagerInterService; import org.apache.hadoop.security.KerberosInfo; -import org.apache.hadoop.security.token.TokenInfo; -import org.apache.hadoop.ozone.security.OzoneDelegationTokenSelector; /** * Protocol used for communication between OMs. From fbcba69d07be24819758ecbd960bddfa2303f221 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 22 Jun 2021 12:18:01 -0700 Subject: [PATCH 09/15] Do not add peers to bootstrapping OM ratis server during initialization --- .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 9 ++ .../ozone/om/TestOzoneManagerBootstrap.java | 15 ++- .../apache/hadoop/ozone/om/OzoneManager.java | 37 +++++- .../om/ratis/OzoneManagerRatisServer.java | 109 ++++++++++++------ .../om/ratis/OzoneManagerStateMachine.java | 13 +-- .../ratis/utils/OzoneManagerRatisUtils.java | 3 + 6 files changed, 130 insertions(+), 56 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index f45a96c88442..b387a56ebbc1 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -34,6 +34,7 @@ import org.apache.hadoop.ozone.ha.ConfUtils; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.OzoneManager; +import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; import org.apache.hadoop.ozone.recon.ReconServer; import org.apache.hadoop.security.authentication.client.AuthenticationException; import org.apache.ozone.test.GenericTestUtils; @@ -814,11 +815,19 @@ private void waitForBootstrappedNodeToBeReady(OzoneManager newOM, private void waitForConfigUpdateOnAllOMs(String newOMNodeId) throws Exception { + OzoneManagerRatisServer newOMRatisServer = + omhaService.getServiceById(newOMNodeId).getOmRatisServer(); GenericTestUtils.waitFor(() -> { + // Each existing active OM should contain the new OM in its peerList. + // Also, the new OM should contain each existing active OM in it's + // RatisServer peerList. for (OzoneManager om : omhaService.getActiveServices()) { if (!om.doesPeerExist(newOMNodeId)) { return false; } + if (!newOMRatisServer.doesPeerExist(om.getOMNodeId())) { + return false; + } } return true; }, 1000, waitForClusterToBeReadyTimeout); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java index 9696aed3050b..0d6674f4d11b 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -119,19 +119,18 @@ private void assertNewOMExistsInPeerList(String nodeId) throws Exception { for (OzoneManager om : cluster.getOzoneManagersList()) { Assert.assertTrue("New OM node " + nodeId + " not present in Peer list " + "of OM " + om.getOMNodeId(), om.doesPeerExist(nodeId)); + Assert.assertTrue("New OM node " + nodeId + " not present in Peer list " + + "of OM " + om.getOMNodeId() + " RatisServer", + om.getOmRatisServer().doesPeerExist(nodeId)); Assert.assertTrue("New OM node " + nodeId + " not present in " + - "RaftServer Peer list of OM " + om.getOMNodeId(), + "OM " + om.getOMNodeId() + "RatisServer's RaftConf", om.getOmRatisServer().getCurrentPeersFromRaftConf().contains(nodeId)); } OzoneManager newOM = cluster.getOzoneManager(nodeId); - GenericTestUtils.waitFor((Supplier) () -> { - try { - return newOM.getRatisSnapshotIndex() >= lastTransactionIndex; - } catch (IOException e) { - return false; - } - }, 100, 100000); + GenericTestUtils.waitFor(() -> + newOM.getOmRatisServer().getLastAppliedTermIndex().getIndex() + >= lastTransactionIndex, 100, 100000); // Check Ratis Dir for log files File[] logFiles = getRatisLogFiles(newOM); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 09c905db5231..cc44063d203a 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1395,7 +1395,38 @@ public void bootstrap(OMNodeDetails newOMNode) throws IOException { } /** - * Add a new OM Node to the HA cluster. This call comes from OMRatisServer + * When OMStateMachine receives a configuration change update, it calls + * this function to update the peers list, if required. + */ + public void updatePeerList(List omNodeIds) { + List ratisServerPeerIdsList = omRatisServer.getPeerIds(); + for (String omNodeId : omNodeIds) { + if (getOMNodeId().equals(omNodeId)) { + continue; + } + boolean isPresentInOMPeers = false; + for (OMNodeDetails peerNode : peerNodes) { + if (peerNode.getNodeId().contains(omNodeId)) { + isPresentInOMPeers = true; + if (!ratisServerPeerIdsList.contains(omNodeId)) { + // This can happen on a bootstrapping OM. The peer information + // would be present in OzoneManager but OMRatisServer peer list + // would not have the peers list. OMRatisServer peer list of + // bootstrapping node should be updated after it get the RaftConf + // through Ratis. + omRatisServer.addRaftPeer(peerNode); + } + } + } + // Add the node to OM peers list if it is not present already + if (!isPresentInOMPeers) { + addOMNodeToPeers(omNodeId); + } + } + } + + /** + * Add an OM Node to the peers list. This call comes from OMStateMachine * after a SetConfiguration request has been successfully executed by the * Ratis server. */ @@ -1424,13 +1455,15 @@ public void addOMNodeToPeers(String newOMNodeId) { omSnapshotProvider.addNewPeerNode(newOMNodeDetails); } omRatisServer.addRaftPeer(newOMNodeDetails); + LOG.info("Added new OM {} to the Peer list.", newOMNodeId); } /** * Check if the input nodeId exists in the peers list. * @return true if the nodeId is self or it exists in peer node list, - * false otherwsie. + * false otherwise. */ + @VisibleForTesting public boolean doesPeerExist(String omNodeId) { if (getOMNodeId().equals(omNodeId)) { return true; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 7fac1d0d33bd..5c60aae48500 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -131,7 +131,7 @@ private static long nextCallId() { @SuppressWarnings({"parameternumber", "java:S107"}) private OzoneManagerRatisServer(ConfigurationSource conf, OzoneManager om, String raftGroupIdStr, RaftPeerId localRaftPeerId, - InetSocketAddress addr, List peers, + InetSocketAddress addr, List peers, boolean isBootstrapping, SecurityConfig secConfig, CertificateClient certClient) throws IOException { this.ozoneManager = om; @@ -147,13 +147,17 @@ private OzoneManagerRatisServer(ConfigurationSource conf, OzoneManager om, this.raftPeers.addAll(peers); this.raftGroup = RaftGroup.valueOf(raftGroupId, peers); - StringBuilder raftPeersStr = new StringBuilder(); - for (RaftPeer peer : peers) { - raftPeersStr.append(", ").append(peer.getAddress()); + if (isBootstrapping) { + LOG.info("OM started in Bootstrap mode. Instantiating OM Ratis server " + + "with groupID: {}", raftGroupIdStr); + } else { + StringBuilder raftPeersStr = new StringBuilder(); + for (RaftPeer peer : peers) { + raftPeersStr.append(", ").append(peer.getAddress()); + } + LOG.info("Instantiating OM Ratis server with groupID: {} and peers: {}", + raftGroupIdStr, raftPeersStr.toString().substring(2)); } - LOG.info("Instantiating OM Ratis server with groupID: {} and " + - "peers: {}", raftGroupIdStr, raftPeersStr.toString().substring(2)); - this.omStateMachine = getStateMachine(conf); Parameters parameters = createServerTlsParameters(secConfig, certClient); @@ -189,42 +193,40 @@ public static OzoneManagerRatisServer newOMRatisServer( .setAddress(ratisAddr) .build(); + // If OM is started in bootstrap mode, do not add peers to the RaftGroup. + // Raft peers will be added after SetConfiguration transaction is + // committed by leader and propagated to followers. List raftPeers = new ArrayList<>(); - - // If the OM is started in bootstrap mode, do not add it to the ratis ring. - // It will be added later using SetConfiguration from the leader OM. - if (isBootstrapping) { - LOG.debug("OM started in Bootstrap mode and hence will not be added " + - "to Ratis group during startup."); - } else { - // On regular startup, add current OM to Ratis ring + if (!isBootstrapping) { + // On regular startup, add all OMs to Ratis ring raftPeers.add(localRaftPeer); - } - for (OMNodeDetails peerInfo : peerNodes) { - String peerNodeId = peerInfo.getNodeId(); - RaftPeerId raftPeerId = RaftPeerId.valueOf(peerNodeId); - RaftPeer raftPeer; - if (peerInfo.isHostUnresolved()) { - raftPeer = RaftPeer.newBuilder() - .setId(raftPeerId) - .setAddress(peerInfo.getRatisHostPortStr()) - .build(); - } else { - InetSocketAddress peerRatisAddr = new InetSocketAddress( - peerInfo.getInetAddress(), peerInfo.getRatisPort()); - raftPeer = RaftPeer.newBuilder() - .setId(raftPeerId) - .setAddress(peerRatisAddr) - .build(); - } + for (OMNodeDetails peerInfo : peerNodes) { + String peerNodeId = peerInfo.getNodeId(); + RaftPeerId raftPeerId = RaftPeerId.valueOf(peerNodeId); + RaftPeer raftPeer; + if (peerInfo.isHostUnresolved()) { + raftPeer = RaftPeer.newBuilder() + .setId(raftPeerId) + .setAddress(peerInfo.getRatisHostPortStr()) + .build(); + } else { + InetSocketAddress peerRatisAddr = new InetSocketAddress( + peerInfo.getInetAddress(), peerInfo.getRatisPort()); + raftPeer = RaftPeer.newBuilder() + .setId(raftPeerId) + .setAddress(peerRatisAddr) + .build(); + } - // Add other OM nodes belonging to the same OM service to the Ratis ring - raftPeers.add(raftPeer); + // Add other OM nodes belonging to the same OM service to the Ratis ring + raftPeers.add(raftPeer); + } } return new OzoneManagerRatisServer(ozoneConf, omProtocol, omServiceId, - localRaftPeerId, ratisAddr, raftPeers, secConfig, certClient); + localRaftPeerId, ratisAddr, raftPeers, isBootstrapping, secConfig, + certClient); } /** @@ -329,6 +331,38 @@ public void addOMToRatisRing(OMNodeDetails newOMNode) throws IOException { } } + /** + * Return a list of peer NodeIds. + */ + public List getPeerIds() { + List peerIds = new ArrayList<>(); + for (RaftPeer raftPeer : raftPeers) { + peerIds.add(raftPeer.getId().toString()); + } + return peerIds; + } + + /** + * Check if the input peerId exists in the peers list. + * @return true if the nodeId is self or it exists in peer node list, + * false otherwise. + */ + @VisibleForTesting + public boolean doesPeerExist(String peerId) { + if (peerId.equals(raftPeerId.toString())) { + return true; + } + for (RaftPeer raftPeer : raftPeers) { + if (raftPeer.getId().toString().equals(peerId)) { + return true; + } + } + return false; + } + + /** + * Add given node to list of RaftPeers. + */ public void addRaftPeer(OMNodeDetails omNodeDetails) { InetSocketAddress newOMRatisAddr = new InetSocketAddress( omNodeDetails.getHostAddress(), omNodeDetails.getRatisPort()); @@ -338,8 +372,7 @@ public void addRaftPeer(OMNodeDetails omNodeDetails) { .setAddress(newOMRatisAddr) .build()); - LOG.info("Added OM {} to Ratis Peers list. New OM Ratis Peers list: {}", - omNodeDetails.getNodeId(), raftPeers); + LOG.info("Added OM {} to Ratis Peers list.", omNodeDetails.getNodeId()); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index 8cea2c488245..1b7dc467dc7c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -187,16 +187,13 @@ public void notifyConfigurationChanged(long term, long index, newRaftConfiguration.getPeersList(); LOG.info("Received Configuration change notification from Ratis. New Peer" + " list:\n{}", newPeers); + + List newPeerIds = new ArrayList<>(); for (RaftProtos.RaftPeerProto raftPeerProto : newPeers) { - String omNodeId = RaftPeerId.valueOf(raftPeerProto.getId()).toString(); - if (!ozoneManager.doesPeerExist(omNodeId)) { - LOG.info("Adding new OM {} to the Peer list.", omNodeId); - ozoneManager.addOMNodeToPeers(omNodeId); - } else { - LOG.debug("Not adding OM {} to cluster as it is already present in " + - "peer list.", omNodeId); - } + newPeerIds.add(RaftPeerId.valueOf(raftPeerProto.getId()).toString()); } + // Check and update the peer list in OzoneManager + ozoneManager.updatePeerList(newPeerIds); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index a4f9c139bb06..e655b2887cfb 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -380,6 +380,9 @@ public static String getOMRatisDirectory(ConfigurationSource conf) { return storageDir; } + /** + * Get the local directory where ratis snapshots will be stored. + */ public static String getOMRatisSnapshotDirectory(ConfigurationSource conf) { String snapshotDir = conf.get(OZONE_OM_RATIS_SNAPSHOT_DIR); From cdb5ac7e2bc8f65c011bc4aa630698cb33125024 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 22 Jun 2021 16:25:49 -0700 Subject: [PATCH 10/15] checkstyle fix --- .../org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java | 2 -- .../apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java index 0d6674f4d11b..a71b2ea1a238 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerBootstrap.java @@ -18,10 +18,8 @@ package org.apache.hadoop.ozone.om; -import com.google.common.base.Supplier; import java.io.File; import java.io.FileFilter; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.UUID; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 5c60aae48500..a8ceaf59660a 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -151,7 +151,7 @@ private OzoneManagerRatisServer(ConfigurationSource conf, OzoneManager om, LOG.info("OM started in Bootstrap mode. Instantiating OM Ratis server " + "with groupID: {}", raftGroupIdStr); } else { - StringBuilder raftPeersStr = new StringBuilder(); + StringBuilder raftPeersStr = new StringBuilder(); for (RaftPeer peer : peers) { raftPeersStr.append(", ").append(peer.getAddress()); } From c6f0011e4a97e918da367c9fa9747c4c0c2f369c Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Wed, 14 Jul 2021 13:05:20 -0700 Subject: [PATCH 11/15] Rebase changes --- .../OzoneManagerProtocolServerSideTranslatorPB.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java index 845d5528d040..7a256343f15e 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/protocolPB/OzoneManagerProtocolServerSideTranslatorPB.java @@ -16,6 +16,8 @@ */ package org.apache.hadoop.ozone.protocolPB; +import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.LEADER_AND_READY; +import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.NOT_LEADER; import static org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils.createClientRequest; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.Type.PrepareStatus; @@ -28,6 +30,8 @@ import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics; import org.apache.hadoop.ozone.OmUtils; import org.apache.hadoop.ozone.om.OzoneManager; +import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException; +import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException; import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB; import org.apache.hadoop.ozone.om.ratis.OzoneManagerDoubleBuffer; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; @@ -41,6 +45,7 @@ import com.google.protobuf.ProtocolMessageEnum; import com.google.protobuf.RpcController; import com.google.protobuf.ServiceException; +import org.apache.ratis.protocol.RaftPeerId; import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From c8b89425e7b7e7a2222215c94970d4427556b6d0 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Wed, 14 Jul 2021 15:44:32 -0700 Subject: [PATCH 12/15] CI fixes --- .../src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index cc44063d203a..a679f69189b4 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -253,7 +253,6 @@ import static org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.RaftServerStatus.LEADER_AND_READY; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerInterServiceProtocolProtos.OzoneManagerInterService; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneManagerService; -import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OzoneManagerService.newReflectiveBlockingService; import static org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.PrepareStatusResponse.PrepareStatus; import org.apache.ratis.proto.RaftProtos.RaftPeerRole; import org.apache.ratis.server.protocol.TermIndex; From fdfe7a0291c188e7a89d479c5250fd0cd8a8a68c Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 16 Jul 2021 16:02:30 -0700 Subject: [PATCH 13/15] Add bootstrapping node to peer list on receiving notifyConfigurationChange --- .../apache/hadoop/ozone/om/OzoneManager.java | 59 ++++++++----------- .../hadoop/ozone/om/ha/OMHANodeDetails.java | 10 +++- .../om/ratis/OzoneManagerRatisServer.java | 13 ++-- .../OzoneManagerSnapshotProvider.java | 6 +- .../om/ratis/TestOzoneManagerRatisServer.java | 6 +- 5 files changed, 45 insertions(+), 49 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index a679f69189b4..616512d39c68 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -333,7 +333,7 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private OzoneManagerRatisServer omRatisServer; private OzoneManagerSnapshotProvider omSnapshotProvider; private OMNodeDetails omNodeDetails; - private List peerNodes; + private Map peerNodesMap; private File omRatisSnapshotDir; private final RatisSnapshotInfo omRatisSnapshotInfo; private final Map ratisMetricsMap = @@ -396,7 +396,7 @@ private OzoneManager(OzoneConfiguration conf, StartupOption startupOption) OMHANodeDetails omhaNodeDetails = OMHANodeDetails.loadOMHAConfig(configuration); - this.peerNodes = omhaNodeDetails.getPeerNodeDetails(); + this.peerNodesMap = omhaNodeDetails.getPeerNodesMap(); this.omNodeDetails = omhaNodeDetails.getLocalNodeDetails(); omStorage = new OMStorage(conf); @@ -1142,9 +1142,9 @@ private void initializeRatisDirs(OzoneConfiguration conf) throws IOException { omRatisSnapshotDir.toPath()); } - if (peerNodes != null && !peerNodes.isEmpty()) { + if (peerNodesMap != null && !peerNodesMap.isEmpty()) { this.omSnapshotProvider = new OzoneManagerSnapshotProvider( - configuration, omRatisSnapshotDir, peerNodes); + configuration, omRatisSnapshotDir, peerNodesMap); } } } @@ -1400,26 +1400,21 @@ public void bootstrap(OMNodeDetails newOMNode) throws IOException { public void updatePeerList(List omNodeIds) { List ratisServerPeerIdsList = omRatisServer.getPeerIds(); for (String omNodeId : omNodeIds) { - if (getOMNodeId().equals(omNodeId)) { - continue; - } - boolean isPresentInOMPeers = false; - for (OMNodeDetails peerNode : peerNodes) { - if (peerNode.getNodeId().contains(omNodeId)) { - isPresentInOMPeers = true; - if (!ratisServerPeerIdsList.contains(omNodeId)) { - // This can happen on a bootstrapping OM. The peer information - // would be present in OzoneManager but OMRatisServer peer list - // would not have the peers list. OMRatisServer peer list of - // bootstrapping node should be updated after it get the RaftConf - // through Ratis. - omRatisServer.addRaftPeer(peerNode); - } - } - } - // Add the node to OM peers list if it is not present already - if (!isPresentInOMPeers) { + // Check if the OM NodeID is already present in the peer list or its + // the local NodeID. + if (!peerNodesMap.containsKey(omNodeId) && + !getOMNodeId().equals(omNodeId)) { addOMNodeToPeers(omNodeId); + } else { + // Check if the OMNodeID is present in the RatisServer's peer list + if (!ratisServerPeerIdsList.contains(omNodeId)) { + // This can happen on a bootstrapping OM. The peer information + // would be present in OzoneManager but OMRatisServer peer list + // would not have the peers list. OMRatisServer peer list of + // bootstrapping node should be updated after it gets the RaftConf + // through Ratis. + omRatisServer.addRaftPeer(peerNodesMap.get(omNodeId)); + } } } } @@ -1446,10 +1441,10 @@ public void addOMNodeToPeers(String newOMNodeId) { } } - peerNodes.add(newOMNodeDetails); + peerNodesMap.put(newOMNodeId, newOMNodeDetails); if (omSnapshotProvider == null) { omSnapshotProvider = new OzoneManagerSnapshotProvider( - configuration, omRatisSnapshotDir, peerNodes); + configuration, omRatisSnapshotDir, peerNodesMap); } else { omSnapshotProvider.addNewPeerNode(newOMNodeDetails); } @@ -1467,12 +1462,8 @@ public boolean doesPeerExist(String omNodeId) { if (getOMNodeId().equals(omNodeId)) { return true; } - if (peerNodes != null && !peerNodes.isEmpty()) { - for (OMNodeDetails peerNode : peerNodes) { - if (peerNode.getNodeId().contains(omNodeId)) { - return true; - } - } + if (peerNodesMap != null && !peerNodesMap.isEmpty()) { + return peerNodesMap.containsKey(omNodeId); } return false; } @@ -1532,7 +1523,7 @@ private void initializeRatisServer(boolean shouldBootstrap) RatisDropwizardExports. registerRatisMetricReporters(ratisMetricsMap); omRatisServer = OzoneManagerRatisServer.newOMRatisServer( - configuration, this, omNodeDetails, peerNodes, + configuration, this, omNodeDetails, peerNodesMap, secConfig, certClient, shouldBootstrap); } LOG.info("OzoneManager Ratis server initialized at port {}", @@ -2921,7 +2912,7 @@ public List getServiceList() throws IOException { .build()); } - for (OMNodeDetails peerNode : peerNodes) { + for (OMNodeDetails peerNode : peerNodesMap.values()) { ServiceInfo.Builder peerOmServiceInfoBuilder = ServiceInfo.newBuilder() .setNodeType(HddsProtos.NodeType.OM) .setHostname(peerNode.getHostName()) @@ -3787,7 +3778,7 @@ public String getOMServiceId() { @VisibleForTesting public List getPeerNodes() { - return peerNodes; + return new ArrayList<>(peerNodesMap.values()); } @VisibleForTesting diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java index 30166bad72d3..63fdf2b6b819 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ha/OMHANodeDetails.java @@ -18,6 +18,8 @@ package org.apache.hadoop.ozone.om.ha; import com.google.common.base.Preconditions; +import java.util.HashMap; +import java.util.Map; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.ozone.OmUtils; @@ -72,8 +74,12 @@ public OMNodeDetails getLocalNodeDetails() { return localNodeDetails; } - public List< OMNodeDetails > getPeerNodeDetails() { - return peerNodeDetails; + public Map getPeerNodesMap() { + Map peerNodesMap = new HashMap<>(); + for (OMNodeDetails peeNode : peerNodeDetails) { + peerNodesMap.put(peeNode.getNodeId(), peeNode); + } + return peerNodesMap; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index a8ceaf59660a..6c20f54bed9f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -175,7 +175,7 @@ private OzoneManagerRatisServer(ConfigurationSource conf, OzoneManager om, */ public static OzoneManagerRatisServer newOMRatisServer( ConfigurationSource ozoneConf, OzoneManager omProtocol, - OMNodeDetails omNodeDetails, List peerNodes, + OMNodeDetails omNodeDetails, Map peerNodes, SecurityConfig secConfig, CertificateClient certClient, boolean isBootstrapping) throws IOException { @@ -201,18 +201,19 @@ public static OzoneManagerRatisServer newOMRatisServer( // On regular startup, add all OMs to Ratis ring raftPeers.add(localRaftPeer); - for (OMNodeDetails peerInfo : peerNodes) { - String peerNodeId = peerInfo.getNodeId(); + for (Map.Entry peerInfo : peerNodes.entrySet()) { + String peerNodeId = peerInfo.getKey(); + OMNodeDetails peerNode = peerInfo.getValue(); RaftPeerId raftPeerId = RaftPeerId.valueOf(peerNodeId); RaftPeer raftPeer; - if (peerInfo.isHostUnresolved()) { + if (peerNode.isHostUnresolved()) { raftPeer = RaftPeer.newBuilder() .setId(raftPeerId) - .setAddress(peerInfo.getRatisHostPortStr()) + .setAddress(peerNode.getRatisHostPortStr()) .build(); } else { InetSocketAddress peerRatisAddr = new InetSocketAddress( - peerInfo.getInetAddress(), peerInfo.getRatisPort()); + peerNode.getInetAddress(), peerNode.getRatisPort()); raftPeer = RaftPeer.newBuilder() .setId(raftPeerId) .setAddress(peerRatisAddr) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java index 7d45d35c2d0f..5bc6f477c9a3 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java @@ -72,15 +72,13 @@ public class OzoneManagerSnapshotProvider { private static final String OM_SNAPSHOT_DB = "om.snapshot.db"; public OzoneManagerSnapshotProvider(MutableConfigurationSource conf, - File omRatisSnapshotDir, List peerNodes) { + File omRatisSnapshotDir, Map peerNodeDetails) { LOG.info("Initializing OM Snapshot Provider"); this.omSnapshotDir = omRatisSnapshotDir; this.peerNodesMap = new HashMap<>(); - for (OMNodeDetails peerNode : peerNodes) { - this.peerNodesMap.put(peerNode.getNodeId(), peerNode); - } + peerNodesMap.putAll(peerNodeDetails); this.httpPolicy = HttpConfig.getHttpPolicy(conf); this.spnegoEnabled = conf.get(OZONE_OM_HTTP_AUTH_TYPE, "simple") diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java index 86252d002164..2b2b75af240f 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerRatisServer.java @@ -115,7 +115,7 @@ public void init() throws Exception { secConfig = new SecurityConfig(conf); certClient = new OMCertificateClient(secConfig); omRatisServer = OzoneManagerRatisServer.newOMRatisServer(conf, ozoneManager, - omNodeDetails, Collections.emptyList(), secConfig, certClient, false); + omNodeDetails, Collections.emptyMap(), secConfig, certClient, false); omRatisServer.start(); } @@ -155,7 +155,7 @@ public void testLoadSnapshotInfoOnStart() throws Exception { // Start new Ratis server. It should pick up and load the new SnapshotInfo omRatisServer = OzoneManagerRatisServer.newOMRatisServer(conf, ozoneManager, - omNodeDetails, Collections.emptyList(), secConfig, certClient, false); + omNodeDetails, Collections.emptyMap(), secConfig, certClient, false); omRatisServer.start(); TermIndex lastAppliedTermIndex = omRatisServer.getLastAppliedTermIndex(); @@ -225,7 +225,7 @@ public void verifyRaftGroupIdGenerationWithCustomOmServiceId() throws omRatisServer.stop(); OzoneManagerRatisServer newOmRatisServer = OzoneManagerRatisServer .newOMRatisServer(newConf, ozoneManager, nodeDetails, - Collections.emptyList(), secConfig, certClient, false); + Collections.emptyMap(), secConfig, certClient, false); newOmRatisServer.start(); UUID uuid = UUID.nameUUIDFromBytes(customOmServiceId.getBytes(UTF_8)); From 4c8f4ce00f8d34264d25b5324f7f1979cd10d55a Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Mon, 19 Jul 2021 07:58:55 -0700 Subject: [PATCH 14/15] Checkstyle fix --- .../hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java index 5bc6f477c9a3..b24ca4b08fde 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java @@ -26,7 +26,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; From ca81e1f9e92ac8e0c3e90f64f3657d579942f0e8 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 20 Jul 2021 15:32:06 -0700 Subject: [PATCH 15/15] Fixed TestOMBootstrap --- .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 11 +++++---- .../apache/hadoop/ozone/om/OzoneManager.java | 23 +++++++++++++++---- .../om/ratis/OzoneManagerRatisServer.java | 3 --- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index b387a56ebbc1..af62ed08faec 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -815,16 +815,19 @@ private void waitForBootstrappedNodeToBeReady(OzoneManager newOM, private void waitForConfigUpdateOnAllOMs(String newOMNodeId) throws Exception { - OzoneManagerRatisServer newOMRatisServer = - omhaService.getServiceById(newOMNodeId).getOmRatisServer(); + OzoneManager newOMNode = omhaService.getServiceById(newOMNodeId); + OzoneManagerRatisServer newOMRatisServer = newOMNode.getOmRatisServer(); GenericTestUtils.waitFor(() -> { // Each existing active OM should contain the new OM in its peerList. - // Also, the new OM should contain each existing active OM in it's - // RatisServer peerList. + // Also, the new OM should contain each existing active OM in it's OM + // peer list and RatisServer peerList. for (OzoneManager om : omhaService.getActiveServices()) { if (!om.doesPeerExist(newOMNodeId)) { return false; } + if (!newOMNode.doesPeerExist(om.getOMNodeId())) { + return false; + } if (!newOMRatisServer.doesPeerExist(om.getOMNodeId())) { return false; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 616512d39c68..393632e2880a 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -1402,8 +1402,7 @@ public void updatePeerList(List omNodeIds) { for (String omNodeId : omNodeIds) { // Check if the OM NodeID is already present in the peer list or its // the local NodeID. - if (!peerNodesMap.containsKey(omNodeId) && - !getOMNodeId().equals(omNodeId)) { + if (!peerNodesMap.containsKey(omNodeId) && !isCurrentNode(omNodeId)) { addOMNodeToPeers(omNodeId); } else { // Check if the OMNodeID is present in the RatisServer's peer list @@ -1413,12 +1412,26 @@ public void updatePeerList(List omNodeIds) { // would not have the peers list. OMRatisServer peer list of // bootstrapping node should be updated after it gets the RaftConf // through Ratis. - omRatisServer.addRaftPeer(peerNodesMap.get(omNodeId)); + if (isCurrentNode(omNodeId)) { + // OM Ratis server has the current node also in the peer list as + // this is the Raft Group peers list. Hence, add the current node + // also to Ratis peers list if not present. + omRatisServer.addRaftPeer(omNodeDetails); + } else { + omRatisServer.addRaftPeer(peerNodesMap.get(omNodeId)); + } } } } } + /** + * Check if the given nodeId is the current nodeId. + */ + private boolean isCurrentNode(String omNodeID) { + return getOMNodeId().equals(omNodeID); + } + /** * Add an OM Node to the peers list. This call comes from OMStateMachine * after a SetConfiguration request has been successfully executed by the @@ -1441,7 +1454,6 @@ public void addOMNodeToPeers(String newOMNodeId) { } } - peerNodesMap.put(newOMNodeId, newOMNodeDetails); if (omSnapshotProvider == null) { omSnapshotProvider = new OzoneManagerSnapshotProvider( configuration, omRatisSnapshotDir, peerNodesMap); @@ -1449,7 +1461,8 @@ public void addOMNodeToPeers(String newOMNodeId) { omSnapshotProvider.addNewPeerNode(newOMNodeDetails); } omRatisServer.addRaftPeer(newOMNodeDetails); - LOG.info("Added new OM {} to the Peer list.", newOMNodeId); + peerNodesMap.put(newOMNodeId, newOMNodeDetails); + LOG.info("Added OM {} to the Peer list.", newOMNodeId); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java index 6c20f54bed9f..c187d6d5358d 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java @@ -350,9 +350,6 @@ public List getPeerIds() { */ @VisibleForTesting public boolean doesPeerExist(String peerId) { - if (peerId.equals(raftPeerId.toString())) { - return true; - } for (RaftPeer raftPeer : raftPeers) { if (raftPeer.getId().toString().equals(peerId)) { return true;