From 7ebc81d9af5e1f5060526fcbc5e04a49491e6d34 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Fri, 22 Jan 2021 18:33:25 -0300 Subject: [PATCH 01/40] HA Agent check regardless of NFS Currently, KVM HA implementation works only if the KVM cluster shares an NFS storage pool. This implementation adds health checks that work without NFS storage. This is done via a Java client that checks Agent status via a webserver. The additional webserver exposes a simple JSON API which returns a list of Virtual Machines that are running on that host according to libvirt. This way, KVM HA can verify, via libvirt, VM status with an HTTP call to this simple webserver and determine if the host is actually down or if it is just the Java Agent which has crashed. --- .../java/com/cloud/ha/KVMInvestigator.java | 55 ++++--- .../kvm/resource/KvmAgentHaClient.java | 139 ++++++++++++++++++ .../kvm/resource/KvmAgentHaClientTest.java | 31 ++++ 3 files changed, 208 insertions(+), 17 deletions(-) create mode 100644 plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java create mode 100644 plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index a76b56a1a4de..a154e5d2179c 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -33,6 +33,7 @@ import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; import javax.inject.Inject; import java.util.List; @@ -77,29 +78,49 @@ public Status isAgentAlive(Host agent) { return haManager.getHostStatus(agent); } - List clusterPools = 
_storagePoolDao.listPoolsByCluster(agent.getClusterId()); - boolean hasNfs = false; - for (StoragePoolVO pool : clusterPools) { - if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { - hasNfs = true; - break; - } + Status agentStatus = Status.Disconnected; + boolean hasNfs = isHostServedByNfsPool(agent); + if (hasNfs) { + s_logger.debug("Agent investigation was requested on host " + agent + ", checking agent status via NFS storage."); + agentStatus = checkAgentStatusViaNfs(agent); + } else { + s_logger.debug( + "Agent investigation was requested on host " + agent + ", but host has no NFS storage. Skipping investigation via NFS."); } + + return agentStatus; + } + + private boolean isHostServedByNfsPool(Host agent) { + boolean hasNfs = hasNfsPoolClusterWideForHost(agent); if (!hasNfs) { - List zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType()); - for (StoragePoolVO pool : zonePools) { - if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { - hasNfs = true; - break; - } + hasNfs = hasNfsPoolZoneWideForHost(agent); + } + return hasNfs; + } + + private boolean hasNfsPoolZoneWideForHost(Host agent) { + List zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType()); + for (StoragePoolVO pool : zonePools) { + if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { + return true; } } - if (!hasNfs) { - s_logger.warn( - "Agent investigation was requested on host " + agent + ", but host does not support investigation because it has no NFS storage. 
Skipping investigation."); - return Status.Disconnected; + return false; + } + + private boolean hasNfsPoolClusterWideForHost(Host agent) { + List clusterPools = _storagePoolDao.listPoolsByCluster(agent.getClusterId()); + for (StoragePoolVO pool : clusterPools) { + if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { + return true; + } } + return false; + } + @NotNull + private Status checkAgentStatusViaNfs(Host agent) { Status hostStatus = null; Status neighbourStatus = null; CheckOnHostCommand cmd = new CheckOnHostCommand(agent); diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java new file mode 100644 index 000000000000..fe7dfd979a66 --- /dev/null +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java @@ -0,0 +1,139 @@ +/* + * Copyright 2021 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.cloud.hypervisor.kvm.resource; + +import com.cloud.utils.exception.CloudRuntimeException; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to libvirt. + * + * This way, KVM HA can verify, via libvirt, VMs status with a HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. + */ +public class KvmAgentHaClient { + + private static final Logger LOGGER = Logger.getLogger(KvmAgentHaClient.class); + private final static int WAIT_FOR_REQUEST_RETRY = 2; + private String agentIpAddress; + private int port; + private int requestMaxRetries = 0; //TODO + + public KvmAgentHaClient(String agentIpAddress, int port) { + this.agentIpAddress = agentIpAddress; + this.port = port; + } + + /** + * TODO + * Returns the System ID. Used when sending Computer System requests (e.g. ComputerSystem.Reset request). 
+ */ + public String checkVmsRunningOnAgent() { + String url = String.format("http://%s:%d", agentIpAddress, port); + + URIBuilder builder = null; + HttpGet httpReq = null; + try { + builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + throw new CloudRuntimeException(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + } + + HttpClient client = HttpClientBuilder.create().build(); + + HttpResponse response = null; + + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (requestMaxRetries == 0) { + throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + } + retryHttpRequest(url, httpReq, client); + } + + return processHttpResponseIntoJson(response); + } + + /** + * TODO + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. 
Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = null; + for (int attempt = 1; attempt < requestMaxRetries + 1; attempt++) { + try { + TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, requestMaxRetries)); + response = client.execute(httpReq); + } catch (IOException | InterruptedException e) { + if (attempt == requestMaxRetries) { + throw new CloudRuntimeException( + String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, requestMaxRetries, + url, e)); + } else { + LOGGER.warn( + String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, requestMaxRetries, + url, e)); + } + } + } + + if (response == null) { + throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + /** + * TODO + * Processes the response of request GET System ID as a JSON object. 
+ */ + protected String processHttpResponseIntoJson(HttpResponse response) { + InputStream in; + String jsonString; + try { + in = response.getEntity().getContent(); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + jsonString = streamReader.readLine(); + } catch (UnsupportedOperationException | IOException e) { + throw new CloudRuntimeException("Failed to process system Response", e); + } + return jsonString; + } + +} diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java new file mode 100644 index 000000000000..2e7fde2f4f8a --- /dev/null +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java @@ -0,0 +1,31 @@ +/* + * Copyright 2021 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.cloud.hypervisor.kvm.resource; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.junit.MockitoJUnitRunner; + +@RunWith(MockitoJUnitRunner.class) +public class KvmAgentHaClientTest { + + //TODO + @Test + public void checkHostStatusTest() { + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient("host", 8080); + System.out.println(kvmAgentHaClient.checkVmsRunningOnAgent()); + } +} From 44a320a31d92f47a49b122861ffefae6fc26c21c Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 25 Jan 2021 18:33:21 -0300 Subject: [PATCH 02/40] Add KVM HA Client validation on HA execution flow --- .../java/com/cloud/ha/KVMInvestigator.java | 17 +++- .../kvm/resource/KvmAgentHaClient.java | 92 +++++++++++++------ .../kvm/ha/KVMHostActivityChecker.java | 50 ++++++++-- .../kvm/resource/KvmAgentHaClientTest.java | 32 +++++-- 4 files changed, 146 insertions(+), 45 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index a154e5d2179c..7f19cb9f1002 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -26,6 +26,7 @@ import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; +import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient; import com.cloud.resource.ResourceManager; import com.cloud.storage.Storage.StoragePoolType; import com.cloud.utils.component.AdapterBase; @@ -33,7 +34,6 @@ import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import org.apache.log4j.Logger; -import org.jetbrains.annotations.NotNull; import javax.inject.Inject; import java.util.List; @@ -81,11 +81,19 @@ public Status isAgentAlive(Host agent) { Status agentStatus = Status.Disconnected; boolean hasNfs = 
isHostServedByNfsPool(agent); if (hasNfs) { - s_logger.debug("Agent investigation was requested on host " + agent + ", checking agent status via NFS storage."); agentStatus = checkAgentStatusViaNfs(agent); + s_logger.debug(String.format("Agent investigation was requested on host %s, agent status via NFS storage is %s.", agent, agentStatus)); } else { - s_logger.debug( - "Agent investigation was requested on host " + agent + ", but host has no NFS storage. Skipping investigation via NFS."); + s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent)); + } + + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent.getPrivateIpAddress()); + boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); + if(isKvmAgentRunning) { + agentStatus = Status.Up; + s_logger.debug(String.format("Checking agent %s status; KVM HA webserver is Running as expected.")); + } else { + s_logger.warn(String.format("Checking agent %s status. 
Failed to check host status via KVM Agent HA webserver")); } return agentStatus; @@ -119,7 +127,6 @@ private boolean hasNfsPoolClusterWideForHost(Host agent) { return false; } - @NotNull private Status checkAgentStatusViaNfs(Host agent) { Status hostStatus = null; Status neighbourStatus = null; diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java index fe7dfd979a66..501856ae00ba 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java @@ -16,6 +16,9 @@ package com.cloud.hypervisor.kvm.resource; import com.cloud.utils.exception.CloudRuntimeException; +import com.google.gson.JsonParser; +import org.apache.cloudstack.utils.redfish.RedfishException; +import org.apache.commons.httpclient.HttpStatus; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; @@ -23,6 +26,7 @@ import org.apache.http.client.utils.URIBuilder; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; import java.io.BufferedReader; import java.io.IOException; @@ -46,73 +50,102 @@ public class KvmAgentHaClient { private static final Logger LOGGER = Logger.getLogger(KvmAgentHaClient.class); private final static int WAIT_FOR_REQUEST_RETRY = 2; + private final static String VM_COUNT = "count"; + private final static int ERROR_CODE = -1; + private final static String EXPECTED_HTTP_STATUS = "2XX"; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int DEFAULT_PORT = 8080; private String agentIpAddress; private int port; - private int requestMaxRetries = 0; //TODO - public KvmAgentHaClient(String agentIpAddress, int port) { + /** + * 
Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running + * @param agentIpAddress address of the KVM host running the webserver + */ + public KvmAgentHaClient(String agentIpAddress) { this.agentIpAddress = agentIpAddress; - this.port = port; + } + + public boolean isKvmHaAgentRunning() { + if (countRunningVmsOnAgent() < 0) { + return false; + } + return true; } /** - * TODO - * Returns the System ID. Used when sending Computer System requests (e.g. ComputerSystem.Reset request). + * Returns the number of VMs running on the KVM host according to libvirt. */ - public String checkVmsRunningOnAgent() { - String url = String.format("http://%s:%d", agentIpAddress, port); + public int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agentIpAddress, DEFAULT_PORT); + HttpResponse response = executeHttpRequest(url); - URIBuilder builder = null; + if (response == null) + return ERROR_CODE; + + return Integer.valueOf(processHttpResponseIntoJson(response)); + } + + /** + * Executes a GET request for the given URL address. 
+ */ + @Nullable + protected HttpResponse executeHttpRequest(String url) { HttpGet httpReq = null; try { - builder = new URIBuilder(url); + URIBuilder builder = new URIBuilder(url); httpReq = new HttpGet(builder.build()); } catch (URISyntaxException e) { - throw new CloudRuntimeException(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; } HttpClient client = HttpClientBuilder.create().build(); - HttpResponse response = null; - try { response = client.execute(httpReq); } catch (IOException e) { - if (requestMaxRetries == 0) { - throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; } retryHttpRequest(url, httpReq, client); } - - return processHttpResponseIntoJson(response); + return response; } /** - * TODO + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries (#MAX_REQUEST_RETRIES) */ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. 
Executing the request again.", httpReq.getMethod(), url)); HttpResponse response = null; - for (int attempt = 1; attempt < requestMaxRetries + 1; attempt++) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { try { TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); - LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, requestMaxRetries)); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); response = client.execute(httpReq); } catch (IOException | InterruptedException e) { - if (attempt == requestMaxRetries) { - throw new CloudRuntimeException( - String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, requestMaxRetries, + if (attempt == MAX_REQUEST_RETRIES) { + LOGGER.error( + String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e)); } else { - LOGGER.warn( - String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, requestMaxRetries, + LOGGER.error( + String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e)); } } } if (response == null) { - throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + throw new RedfishException(String.format("Failed to get VMs information with a %s request to URL '%s'. 
The expected HTTP status code is '%s' but it got '%s'.", + HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); } LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); @@ -126,14 +159,19 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt protected String processHttpResponseIntoJson(HttpResponse response) { InputStream in; String jsonString; + if (response == null) { + return Integer.toString(ERROR_CODE); + } try { in = response.getEntity().getContent(); BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); jsonString = streamReader.readLine(); } catch (UnsupportedOperationException | IOException e) { - throw new CloudRuntimeException("Failed to process system Response", e); + throw new CloudRuntimeException("Failed to process response", e); } - return jsonString; + + String vmsCount = new JsonParser().parse(jsonString).getAsJsonObject().get(VM_COUNT).getAsString(); + return vmsCount; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index e5752cb97da2..f0cbcc047eb5 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -26,7 +26,9 @@ import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.hypervisor.Hypervisor; +import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient; import com.cloud.resource.ResourceManager; +import com.cloud.storage.Storage; import com.cloud.storage.StorageManager; import com.cloud.storage.StoragePool; import com.cloud.storage.Volume; @@ -81,7 +83,29 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) 
{ - return isAgentActive(r); + boolean isHealthy = false; + HashMap> poolVolMap = getVolumeUuidOnHost(r); + isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); + + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(r.getPrivateIpAddress()); + boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); + + if(!isHealthy && isKvmAgentRunning) { + isHealthy = true; + } + + return isHealthy; + } + + private boolean isHealthyCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { + for (StoragePool pool : poolVolMap.keySet()) { + if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() + || Storage.StoragePoolType.ManagedNFS == pool.getPoolType() + || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { + isHealthy = isAgentActive(r); + } + } + return isHealthy; } private boolean isAgentActive(Host agent) { @@ -151,19 +175,33 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); } - boolean activityStatus = true; + boolean activityStatus = false; HashMap> poolVolMap = getVolumeUuidOnHost(agent); for (StoragePool pool : poolVolMap.keySet()) { - activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus); - if (!activityStatus) { - LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); - break; + if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() + || Storage.StoragePoolType.ManagedNFS == pool.getPoolType() + || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { + activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus); + if (!activityStatus) { + 
LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); + break; + } } } + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent.getPrivateIpAddress()); + boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); + + if(!activityStatus && isKvmAgentRunning) { + activityStatus = true; + } else { + LOG.warn(String.format("No VM activity detected on %s. This might trigger HA Host Recovery and/or Fence.", agent.toString())); + } + return activityStatus; } + protected boolean verifyActivityOfStorageOnHost(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { List volume_list = poolVolMap.get(pool); final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime); diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java index 2e7fde2f4f8a..b8bba2774b31 100644 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java @@ -15,17 +15,35 @@ */ package com.cloud.hypervisor.kvm.resource; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mockito.junit.MockitoJUnitRunner; +import org.apache.http.StatusLine; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.mockito.Mockito; -@RunWith(MockitoJUnitRunner.class) +//@RunWith(MockitoJUnitRunner.class) public class KvmAgentHaClientTest { + private static final String AGENT_ADDRESS = "kvm-agent.domain.name"; + + private KvmAgentHaClient kvmAgentHaClient = Mockito.spy(new KvmAgentHaClient(AGENT_ADDRESS)); + //TODO - @Test +// @test public void 
checkHostStatusTest() { - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient("host", 8080); - System.out.println(kvmAgentHaClient.checkVmsRunningOnAgent()); + int kvmAgentResponse = kvmAgentHaClient.countRunningVmsOnAgent(); + } + +// @Test + public void isKvmHaAgentRunningTest() { + boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); } + + private CloseableHttpResponse mockResponse(int httpStatusCode) { + StatusLine statusLine = Mockito.mock(StatusLine.class); + Mockito.doReturn(httpStatusCode).when(statusLine).getStatusCode(); + CloseableHttpResponse response = Mockito.mock(CloseableHttpResponse.class); + Mockito.doReturn(statusLine).when(response).getStatusLine(); + Mockito.doReturn(response).when(kvmAgentHaClient).executeHttpRequest(Mockito.anyString()); + return response; + } + } From 895421ee6b7d597d2a08a560c8d28609c56cafc6 Mon Sep 17 00:00:00 2001 From: Wido den Hollander Date: Fri, 22 Jan 2021 16:56:33 +0100 Subject: [PATCH 03/40] kvm: Add Agent Helper to investigate if a Host is truly down The additional webserver on port 8080 exposes a very simple JSON API which returns the amount and list of Virtual Machines still running on that host according to libvirt. In case of HA the KVMHAInvestigator can perform a HTTP-call to this simple webserver and determine if the host is actually down or if it is just the Java Agent which has crashed. 
--- debian/rules | 4 +- packaging/centos7/cloud.spec | 2 + packaging/centos8/cloud.spec | 2 + .../cloudstack-agent-ha-helper.default | 18 +++++ .../cloudstack-agent-ha-helper.service | 36 +++++++++ scripts/vm/hypervisor/kvm/agent-ha-helper.py | 81 +++++++++++++++++++ 6 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 packaging/systemd/cloudstack-agent-ha-helper.default create mode 100644 packaging/systemd/cloudstack-agent-ha-helper.service create mode 100755 scripts/vm/hypervisor/kvm/agent-ha-helper.py diff --git a/debian/rules b/debian/rules index ed1559a46dfe..3b2de29130fa 100755 --- a/debian/rules +++ b/debian/rules @@ -45,6 +45,8 @@ override_dh_auto_install: install -d -m0755 debian/$(PACKAGE)-agent/lib/systemd/system install -m0644 packaging/systemd/$(PACKAGE)-agent.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-agent.service install -m0644 packaging/systemd/$(PACKAGE)-agent.default $(DESTDIR)/$(SYSCONFDIR)/default/$(PACKAGE)-agent + install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-agent-ha-helper.service + install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.default $(DESTDIR)/$(SYSCONFDIR)/default/$(PACKAGE)-agent-ha-helper install -m0644 packaging/systemd/$(PACKAGE)-rolling-maintenance@.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-rolling-maintenance@.service install -D -m0644 agent/target/transformed/cloudstack-agent.logrotate $(DESTDIR)/$(SYSCONFDIR)/logrotate.d/cloudstack-agent @@ -159,7 +161,7 @@ override_dh_auto_install: cp -r test/integration/* $(DESTDIR)/usr/share/$(PACKAGE)-integration-tests/ override_dh_systemd_enable: - dh_systemd_enable -pcloudstack-management -pcloudstack-agent -pcloudstack-usage + dh_systemd_enable -pcloudstack-management -pcloudstack-agent -pcloudstack-usage -pcloudstack-agent-ha-helper override_dh_strip_nondeterminism: # Disable dh_strip_nondeterminism to speed up the build diff --git 
a/packaging/centos7/cloud.spec b/packaging/centos7/cloud.spec index 604e853e12ae..ee7c5170461a 100644 --- a/packaging/centos7/cloud.spec +++ b/packaging/centos7/cloud.spec @@ -331,8 +331,10 @@ mkdir -p ${RPM_BUILD_ROOT}%{_localstatedir}/log/%{name}/agent mkdir -p ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib mkdir -p ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/plugins install -D packaging/systemd/cloudstack-agent.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent.service +install -D packaging/systemd/cloudstack-agent-ha-helper.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent-ha-helper.service install -D packaging/systemd/cloudstack-rolling-maintenance@.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-rolling-maintenance@.service install -D packaging/systemd/cloudstack-agent.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent +install -D packaging/systemd/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/agent.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/agent.properties install -D agent/target/transformed/environment.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/environment.properties install -D agent/target/transformed/log4j-cloud.xml ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/log4j-cloud.xml diff --git a/packaging/centos8/cloud.spec b/packaging/centos8/cloud.spec index cec88a20a72d..b7aabc2fdb98 100644 --- a/packaging/centos8/cloud.spec +++ b/packaging/centos8/cloud.spec @@ -324,8 +324,10 @@ mkdir -p ${RPM_BUILD_ROOT}%{_localstatedir}/log/%{name}/agent mkdir -p ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib mkdir -p ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/plugins install -D packaging/systemd/cloudstack-agent.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent.service +install -D packaging/systemd/cloudstack-agent-ha-helper.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent-ha-helper.service install -D 
packaging/systemd/cloudstack-rolling-maintenance@.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-rolling-maintenance@.service install -D packaging/systemd/cloudstack-agent.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent +install -D packaging/systemd/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/agent.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/agent.properties install -D agent/target/transformed/environment.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/environment.properties install -D agent/target/transformed/log4j-cloud.xml ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/log4j-cloud.xml diff --git a/packaging/systemd/cloudstack-agent-ha-helper.default b/packaging/systemd/cloudstack-agent-ha-helper.default new file mode 100644 index 000000000000..e98e162a23ea --- /dev/null +++ b/packaging/systemd/cloudstack-agent-ha-helper.default @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +PORT=8080 diff --git a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service new file mode 100644 index 000000000000..eb01bcdc82b0 --- /dev/null +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Do not modify this file as your changes will be lost in the next CSM update. 
+# If you need to add specific dependencies to this service unit do it in the +# /etc/systemd/system/cloudstack-agent-ha-helper.service.d/ directory + +[Unit] +Description=CloudStack Agent HA Helper +Documentation=http://www.cloudstack.org/ +Requires=libvirtd.service +After=libvirtd.service + +[Service] +Type=simple +EnvironmentFile=/etc/default/cloudstack-agent-ha-helper +ExecStart=/usr/share/cloudstack-common/scripts/hypervisor/kvm/agent-ha-helper.py $PORT +Restart=always +RestartSec=10s + +[Install] +WantedBy=multi-user.target diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py new file mode 100755 index 000000000000..d8d4ebfbd495 --- /dev/null +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +import libvirt +import socket +import json +from http.server import BaseHTTPRequestHandler, HTTPServer + +class Libvirt(): + def __init__(self): + self.conn = libvirt.openReadOnly("qemu:///system") + if not self.conn: + raise Exception('Failed to open connection to libvirt') + + def running_vms(self): + alldomains = [domain for domain in map(self.conn.lookupByID, self.conn.listDomainsID())] + + domains = [] + for domain in alldomains: + if domain.info()[0] == libvirt.VIR_DOMAIN_RUNNING: + domains.append(domain.name()) + + self.conn.close() + + return domains + +class HTTPServerV6(HTTPServer): + address_family = socket.AF_INET6 + +class CloudStackAgentHAHelper(BaseHTTPRequestHandler): + def do_GET(self): + + if self.path != "/": + self.send_response(404) + self.end_headers() + return + + libvirt = Libvirt() + + running_vms = libvirt.running_vms() + + output = { + 'count': len(running_vms), + 'virtualmachines': running_vms + } + + self.send_response(200) + self.send_header("Content-type", "application/json") + self.end_headers() + self.wfile.write(json.dumps(output).encode()) + +def run(port=8080): + server_address = ('', port) + httpd = HTTPServerV6((server_address), CloudStackAgentHAHelper) + httpd.serve_forever() + +if __name__ == "__main__": + from sys import argv + + try: + if len(argv) == 2: + run(port=int(argv[1])) + else: + run() + except KeyboardInterrupt: + pass From f97100d1ce7b02041a4653380bdccae265a9172e Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 27 Jan 2021 18:43:20 -0300 Subject: [PATCH 04/40] Integrating Agent HA Helper --- debian/cloudstack-agent.install | 1 + debian/control | 7 +++++++ debian/rules | 4 ++-- packaging/systemd/cloudstack-agent-ha-helper.service | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/debian/cloudstack-agent.install b/debian/cloudstack-agent.install index 58715e0746ba..b540119b2c9c 100644 --- a/debian/cloudstack-agent.install +++ b/debian/cloudstack-agent.install @@ -19,6 +19,7 @@ 
/etc/cloudstack/agent/environment.properties /etc/cloudstack/agent/log4j-cloud.xml /etc/default/cloudstack-agent +/etc/default/cloudstack-agent-ha-helper /etc/profile.d/cloudstack-agent-profile.sh /etc/logrotate.d/cloudstack-agent /usr/bin/cloudstack-setup-agent diff --git a/debian/control b/debian/control index 066994785b38..07773a04d40b 100644 --- a/debian/control +++ b/debian/control @@ -56,3 +56,10 @@ Package: cloudstack-integration-tests Architecture: all Depends: ${misc:Depends}, cloudstack-marvin (= ${source:Version}) Description: The CloudStack Marvin integration tests + +Package: cloudstack-agent-ha-helper +Architecture: all +Depends: ${python3:Depends}, libvirt-bin (>= 1.3) | libvirt-daemon-system (>= 3.0), python3-libvirt +Description: The CloudStack Agent HA Helper is a (Python) webserver on port 8080; + it exposes a simple JSON API which returns a list of Virtual Machines + running in the host according to libvirt. diff --git a/debian/rules b/debian/rules index 3b2de29130fa..b0de48f7c401 100755 --- a/debian/rules +++ b/debian/rules @@ -45,8 +45,8 @@ override_dh_auto_install: install -d -m0755 debian/$(PACKAGE)-agent/lib/systemd/system install -m0644 packaging/systemd/$(PACKAGE)-agent.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-agent.service install -m0644 packaging/systemd/$(PACKAGE)-agent.default $(DESTDIR)/$(SYSCONFDIR)/default/$(PACKAGE)-agent - install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-agent-ha-helper.service - install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.default $(DESTDIR)/$(SYSCONFDIR)/default/$(PACKAGE)-agent-ha-helper + install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-agent-ha-helper.service + install -m0644 packaging/systemd/$(PACKAGE)-agent-ha-helper.default $(DESTDIR)/$(SYSCONFDIR)/default/$(PACKAGE)-agent-ha-helper install -m0644 
packaging/systemd/$(PACKAGE)-rolling-maintenance@.service debian/$(PACKAGE)-agent/lib/systemd/system/$(PACKAGE)-rolling-maintenance@.service install -D -m0644 agent/target/transformed/cloudstack-agent.logrotate $(DESTDIR)/$(SYSCONFDIR)/logrotate.d/cloudstack-agent diff --git a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service index eb01bcdc82b0..34ad6a37caaa 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.service +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -28,7 +28,8 @@ After=libvirtd.service [Service] Type=simple EnvironmentFile=/etc/default/cloudstack-agent-ha-helper -ExecStart=/usr/share/cloudstack-common/scripts/hypervisor/kvm/agent-ha-helper.py $PORT +ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $PORT + Restart=always RestartSec=10s From 2e8b563884d4f5d94409b3f76d9a6ea95f747d95 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 28 Jan 2021 17:38:22 -0300 Subject: [PATCH 05/40] Count number of VMs running on Define if a host is healthy by counting the number of VMs on Agent and comparing with DB --- .../java/com/cloud/ha/KVMInvestigator.java | 11 +++- .../kvm/resource/KvmAgentHaClient.java | 59 +++++++++++-------- .../kvm/ha/KVMHostActivityChecker.java | 14 +++-- .../kvm/resource/KvmAgentHaClientTest.java | 16 +---- 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 7f19cb9f1002..e0b84550a5f3 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -30,6 +30,8 @@ import com.cloud.resource.ResourceManager; import com.cloud.storage.Storage.StoragePoolType; import com.cloud.utils.component.AdapterBase; +import com.cloud.vm.VMInstanceVO; +import 
com.cloud.vm.dao.VMInstanceDao; import org.apache.cloudstack.ha.HAManager; import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; @@ -50,6 +52,8 @@ public class KVMInvestigator extends AdapterBase implements Investigator { private PrimaryDataStoreDao _storagePoolDao; @Inject private HAManager haManager; + @Inject + private VMInstanceDao vmInstanceDao; @Override public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM { @@ -87,9 +91,10 @@ public Status isAgentAlive(Host agent) { s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent)); } - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent.getPrivateIpAddress()); - boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); - if(isKvmAgentRunning) { + List vmsOnHost = vmInstanceDao.listByHostId(agent.getId()); + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent); + boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); + if(isVmsOnKvmMatchingWithDatabase) { agentStatus = Status.Up; s_logger.debug(String.format("Checking agent %s status; KVM HA webserver is Running as expected.")); } else { diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java index 501856ae00ba..17f24aaecca2 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java @@ -15,9 +15,10 @@ */ package com.cloud.hypervisor.kvm.resource; +import com.cloud.host.Host; import com.cloud.utils.exception.CloudRuntimeException; +import com.google.gson.JsonObject; import 
com.google.gson.JsonParser; -import org.apache.cloudstack.utils.redfish.RedfishException; import org.apache.commons.httpclient.HttpStatus; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; @@ -26,7 +27,6 @@ import org.apache.http.client.utils.URIBuilder; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.log4j.Logger; -import org.jetbrains.annotations.Nullable; import java.io.BufferedReader; import java.io.IOException; @@ -55,41 +55,44 @@ public class KvmAgentHaClient { private final static String EXPECTED_HTTP_STATUS = "2XX"; private static final int MAX_REQUEST_RETRIES = 2; private static final int DEFAULT_PORT = 8080; - private String agentIpAddress; - private int port; + private Host agent; /** * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running - * @param agentIpAddress address of the KVM host running the webserver */ - public KvmAgentHaClient(String agentIpAddress) { - this.agentIpAddress = agentIpAddress; - } - - public boolean isKvmHaAgentRunning() { - if (countRunningVmsOnAgent() < 0) { - return false; - } - return true; + public KvmAgentHaClient(Host agent) { + this.agent = agent; } /** * Returns the number of VMs running on the KVM host according to libvirt. 
*/ - public int countRunningVmsOnAgent() { - String url = String.format("http://%s:%d", agentIpAddress, DEFAULT_PORT); + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), DEFAULT_PORT); HttpResponse response = executeHttpRequest(url); if (response == null) return ERROR_CODE; - return Integer.valueOf(processHttpResponseIntoJson(response)); + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return Integer.valueOf(responseInJson.get(VM_COUNT).getAsString()); + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to libvirt. If the ammount of VMs are not the same then it is assumed that + */ + public boolean checkAgentHealthAndRunningVms(int expectedNumberOfVms) { + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + return expectedNumberOfVms == numberOfVmsOnAgent; } /** * Executes a GET request for the given URL address. */ - @Nullable protected HttpResponse executeHttpRequest(String url) { HttpGet httpReq = null; try { @@ -140,12 +143,14 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt if (response == null) { LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; } int statusCode = response.getStatusLine().getStatusCode(); if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { - throw new RedfishException(String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", - HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); + throw new CloudRuntimeException( + String.format("Failed to get VMs information with a %s request to URL '%s'. 
The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); } LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); @@ -153,14 +158,17 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt } /** - * TODO - * Processes the response of request GET System ID as a JSON object. + * Processes the response of request GET System ID as a JSON object.

+ * + * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}

+ * + * Note: this method can return NULL JsonObject in case HttpResponse is NULL. */ - protected String processHttpResponseIntoJson(HttpResponse response) { + protected JsonObject processHttpResponseIntoJson(HttpResponse response) { InputStream in; String jsonString; if (response == null) { - return Integer.toString(ERROR_CODE); + return null; } try { in = response.getEntity().getContent(); @@ -170,8 +178,7 @@ protected String processHttpResponseIntoJson(HttpResponse response) { throw new CloudRuntimeException("Failed to process response", e); } - String vmsCount = new JsonParser().parse(jsonString).getAsJsonObject().get(VM_COUNT).getAsString(); - return vmsCount; + return new JsonParser().parse(jsonString).getAsJsonObject(); } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index f0cbcc047eb5..08a8fac99d54 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -87,10 +87,11 @@ public boolean isHealthy(Host r) { HashMap> poolVolMap = getVolumeUuidOnHost(r); isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(r.getPrivateIpAddress()); - boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(r); + List vmsOnHost = vmInstanceDao.listByHostId(r.getId()); + boolean checkKvmHeatlh = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); - if(!isHealthy && isKvmAgentRunning) { + if(!isHealthy && checkKvmHeatlh) { isHealthy = true; } @@ -189,10 +190,11 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - KvmAgentHaClient kvmAgentHaClient = new 
KvmAgentHaClient(agent.getPrivateIpAddress()); - boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); + KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent); + List vmsOnHost = vmInstanceDao.listByHostId(agent.getId()); + boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); - if(!activityStatus && isKvmAgentRunning) { + if(!activityStatus && isVmsOnKvmMatchingWithDatabase) { activityStatus = true; } else { LOG.warn(String.format("No VM activity detected on %s. This might trigger HA Host Recovery and/or Fence.", agent.toString())); diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java index b8bba2774b31..539c7c07453e 100644 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java @@ -15,6 +15,7 @@ */ package com.cloud.hypervisor.kvm.resource; +import com.cloud.host.HostVO; import org.apache.http.StatusLine; import org.apache.http.client.methods.CloseableHttpResponse; import org.mockito.Mockito; @@ -23,19 +24,8 @@ public class KvmAgentHaClientTest { private static final String AGENT_ADDRESS = "kvm-agent.domain.name"; - - private KvmAgentHaClient kvmAgentHaClient = Mockito.spy(new KvmAgentHaClient(AGENT_ADDRESS)); - - //TODO -// @test - public void checkHostStatusTest() { - int kvmAgentResponse = kvmAgentHaClient.countRunningVmsOnAgent(); - } - -// @Test - public void isKvmHaAgentRunningTest() { - boolean isKvmAgentRunning = kvmAgentHaClient.isKvmHaAgentRunning(); - } + private HostVO agent = Mockito.mock(HostVO.class); + private KvmAgentHaClient kvmAgentHaClient = Mockito.spy(new KvmAgentHaClient(agent)); private CloseableHttpResponse mockResponse(int httpStatusCode) { StatusLine 
statusLine = Mockito.mock(StatusLine.class); From 4bafec0e02324acc725366a2797854b2f60762f3 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 11 Feb 2021 19:24:30 -0300 Subject: [PATCH 06/40] Add global settings configurations; enhance documentation; work on tests Change VMs counting Fix test case issues and enhance documentations and code --- .../java/com/cloud/ha/KVMInvestigator.java | 27 +- .../kvm/resource/KvmAgentHaClient.java | 184 ------------ .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 7 + .../cloudstack/kvm/ha/KVMHAProvider.java | 4 +- .../kvm/ha/KVMHostActivityChecker.java | 54 ++-- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 273 ++++++++++++++++++ .../spring-kvm-compute-context.xml | 1 - .../kvm/resource/KvmAgentHaClientTest.java | 39 --- .../kvm/ha/KvmHaAgentClientTest.java | 252 ++++++++++++++++ .../cloudstack/ha/provider/HAProvider.java | 4 +- 10 files changed, 591 insertions(+), 254 deletions(-) delete mode 100644 plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java create mode 100644 plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java delete mode 100644 plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java create mode 100644 plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index e0b84550a5f3..21257481630a 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -26,13 +26,12 @@ import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; -import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient; import com.cloud.resource.ResourceManager; import 
com.cloud.storage.Storage.StoragePoolType; import com.cloud.utils.component.AdapterBase; -import com.cloud.vm.VMInstanceVO; import com.cloud.vm.dao.VMInstanceDao; import org.apache.cloudstack.ha.HAManager; +import org.apache.cloudstack.kvm.ha.KvmHaAgentClient; import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import org.apache.log4j.Logger; @@ -86,21 +85,29 @@ public Status isAgentAlive(Host agent) { boolean hasNfs = isHostServedByNfsPool(agent); if (hasNfs) { agentStatus = checkAgentStatusViaNfs(agent); - s_logger.debug(String.format("Agent investigation was requested on host %s, agent status via NFS storage is %s.", agent, agentStatus)); + s_logger.debug(String.format("Agent investigation was requested on host %s. Agent status via NFS heartbeat is %s.", agent, agentStatus)); } else { s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent)); } - List vmsOnHost = vmInstanceDao.listByHostId(agent.getId()); - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent); - boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); - if(isVmsOnKvmMatchingWithDatabase) { + agentStatus = checkAgentStatusViaKvmHaAgent(agent, agentStatus); + + return agentStatus; + } + + /** + * It checks the KVM node health via the KVM HA Agent. If the agent is healthy it returns Status.Up; otherwise it keeps the provided Status as it is.
+ */ + private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { + KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent); + + boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); + if(isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; - s_logger.debug(String.format("Checking agent %s status; KVM HA webserver is Running as expected.")); + s_logger.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agent)); } else { - s_logger.warn(String.format("Checking agent %s status. Failed to check host status via KVM Agent HA webserver")); + s_logger.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent", agent)); } - return agentStatus; } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java deleted file mode 100644 index 17f24aaecca2..000000000000 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClient.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright 2021 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package com.cloud.hypervisor.kvm.resource; - -import com.cloud.host.Host; -import com.cloud.utils.exception.CloudRuntimeException; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import org.apache.commons.httpclient.HttpStatus; -import org.apache.http.HttpResponse; -import org.apache.http.client.HttpClient; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpRequestBase; -import org.apache.http.client.utils.URIBuilder; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.log4j.Logger; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.TimeUnit; - -/** - * This class provides a client that checks Agent status via a webserver. - * - * The additional webserver exposes a simple JSON API which returns a list - * of Virtual Machines that are running on that host according to libvirt. - * - * This way, KVM HA can verify, via libvirt, VMs status with a HTTP-call - * to this simple webserver and determine if the host is actually down - * or if it is just the Java Agent which has crashed. - */ -public class KvmAgentHaClient { - - private static final Logger LOGGER = Logger.getLogger(KvmAgentHaClient.class); - private final static int WAIT_FOR_REQUEST_RETRY = 2; - private final static String VM_COUNT = "count"; - private final static int ERROR_CODE = -1; - private final static String EXPECTED_HTTP_STATUS = "2XX"; - private static final int MAX_REQUEST_RETRIES = 2; - private static final int DEFAULT_PORT = 8080; - private Host agent; - - /** - * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running - */ - public KvmAgentHaClient(Host agent) { - this.agent = agent; - } - - /** - * Returns the number of VMs running on the KVM host according to libvirt. 
- */ - protected int countRunningVmsOnAgent() { - String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), DEFAULT_PORT); - HttpResponse response = executeHttpRequest(url); - - if (response == null) - return ERROR_CODE; - - JsonObject responseInJson = processHttpResponseIntoJson(response); - if (responseInJson == null) { - return ERROR_CODE; - } - - return Integer.valueOf(responseInJson.get(VM_COUNT).getAsString()); - } - - /** - * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to libvirt. If the ammount of VMs are not the same then it is assumed that - */ - public boolean checkAgentHealthAndRunningVms(int expectedNumberOfVms) { - int numberOfVmsOnAgent = countRunningVmsOnAgent(); - return expectedNumberOfVms == numberOfVmsOnAgent; - } - - /** - * Executes a GET request for the given URL address. - */ - protected HttpResponse executeHttpRequest(String url) { - HttpGet httpReq = null; - try { - URIBuilder builder = new URIBuilder(url); - httpReq = new HttpGet(builder.build()); - } catch (URISyntaxException e) { - LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); - return null; - } - - HttpClient client = HttpClientBuilder.create().build(); - HttpResponse response = null; - try { - response = client.execute(httpReq); - } catch (IOException e) { - if (MAX_REQUEST_RETRIES == 0) { - LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); - return null; - } - retryHttpRequest(url, httpReq, client); - } - return response; - } - - /** - * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries (#MAX_REQUEST_RETRIES) - */ - protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { - LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. 
Executing the request again.", httpReq.getMethod(), url)); - HttpResponse response = null; - for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { - try { - TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); - LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); - response = client.execute(httpReq); - } catch (IOException | InterruptedException e) { - if (attempt == MAX_REQUEST_RETRIES) { - LOGGER.error( - String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, - url, e)); - } else { - LOGGER.error( - String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, - url, e)); - } - } - } - - if (response == null) { - LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); - return response; - } - - int statusCode = response.getStatusLine().getStatusCode(); - if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { - throw new CloudRuntimeException( - String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, - EXPECTED_HTTP_STATUS, statusCode)); - } - - LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); - return response; - } - - /** - * Processes the response of request GET System ID as a JSON object.

- * - * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}

- * - * Note: this method can return NULL JsonObject in case HttpResponse is NULL. - */ - protected JsonObject processHttpResponseIntoJson(HttpResponse response) { - InputStream in; - String jsonString; - if (response == null) { - return null; - } - try { - in = response.getEntity().getContent(); - BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); - jsonString = streamReader.readLine(); - } catch (UnsupportedOperationException | IOException e) { - throw new CloudRuntimeException("Failed to process response", e); - } - - return new JsonParser().parse(jsonString).getAsJsonObject(); - } - -} diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 59ea720328f5..6e9b4a180305 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -53,4 +53,11 @@ public class KVMHAConfig { public static final ConfigKey KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60", "The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster); + public static final ConfigKey KVM_HA_WEBSERVICE_PORT = new ConfigKey("Advanced", Integer.class, "kvm.ha.webservice.port", "8080", + "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. Default value is 8080.", + true, ConfigKey.Scope.Cluster); + + public static final ConfigKey KVM_HA_WEBSERVICE_ENABLED = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "true", + "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. One can enable (set to 'true') or disable it ('false'). 
If disabled then CloudStack ignores HA validation via this agent.", + true, ConfigKey.Scope.Cluster); } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index 5399fd23a1cd..f1160b30b52a 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -151,7 +151,9 @@ public ConfigKey[] getConfigKeys() { KVMHAConfig.KvmHAActivityCheckFailureThreshold, KVMHAConfig.KvmHADegradedMaxPeriod, KVMHAConfig.KvmHARecoverWaitPeriod, - KVMHAConfig.KvmHARecoverAttemptThreshold + KVMHAConfig.KvmHARecoverAttemptThreshold, + KVMHAConfig.KVM_HA_WEBSERVICE_PORT, + KVMHAConfig.KVM_HA_WEBSERVICE_ENABLED }; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 08a8fac99d54..4690077bd318 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -21,12 +21,13 @@ import com.cloud.agent.api.Answer; import com.cloud.agent.api.CheckOnHostCommand; import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand; +import com.cloud.dc.ClusterVO; +import com.cloud.dc.dao.ClusterDao; import com.cloud.exception.StorageUnavailableException; import com.cloud.host.Host; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.hypervisor.Hypervisor; -import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient; import com.cloud.resource.ResourceManager; import com.cloud.storage.Storage; import com.cloud.storage.StorageManager; @@ -51,6 +52,7 @@ import java.util.HashMap; import java.util.List; + public class 
KVMHostActivityChecker extends AdapterBase implements ActivityCheckerInterface, HealthCheckerInterface { private final static Logger LOG = Logger.getLogger(KVMHostActivityChecker.class); @@ -66,6 +68,8 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck private StorageManager storageManager; @Inject private ResourceManager resourceManager; + @Inject + private ClusterDao clusterDao; @Override public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { @@ -87,21 +91,41 @@ public boolean isHealthy(Host r) { HashMap> poolVolMap = getVolumeUuidOnHost(r); isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(r); - List vmsOnHost = vmInstanceDao.listByHostId(r.getId()); - boolean checkKvmHeatlh = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); + isHealthy = checkHealthViaKvmHaWebservice(r, isHealthy); - if(!isHealthy && checkKvmHeatlh) { - isHealthy = true; + return isHealthy; + } + + /** + * Checks the host healthy via an web-service that retrieves Running KVM instances via libvirt.
+ * The health-check is executed on the KVM node and verifies the number of VMs running and whether the libvirt service is running.

+ * + * One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. + */ + private boolean checkHealthViaKvmHaWebservice(Host r, boolean isHealthy) { + KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(r); + if(!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { + ClusterVO cluster = clusterDao.findById(r.getClusterId()); + LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].", + r.toString(), cluster.getId(), cluster.getName())); + return isHealthy; } +// List vmsOnHost = kvmHaAgentClient.listVmsRunningMigratingStopping(r); +// List vmsOnHost = vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Running); +// vmsOnHost.addAll(vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Stopping)); +// vmsOnHost.addAll(vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Migrating)); + boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(r, vmInstanceDao); + + if (!isHealthy && isKvmHaAgentHealthy) { + isHealthy = true; + } return isHealthy; } private boolean isHealthyCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { for (StoragePool pool : poolVolMap.keySet()) { if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() - || Storage.StoragePoolType.ManagedNFS == pool.getPoolType() || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { isHealthy = isAgentActive(r); } @@ -176,13 +200,12 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); } - boolean activityStatus = false; + boolean activityStatus = true; HashMap> poolVolMap = getVolumeUuidOnHost(agent); for (StoragePool pool : 
poolVolMap.keySet()) { if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() - || Storage.StoragePoolType.ManagedNFS == pool.getPoolType() || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { - activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus); + activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); if (!activityStatus) { LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); break; @@ -190,21 +213,16 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent); - List vmsOnHost = vmInstanceDao.listByHostId(agent.getId()); - boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size()); + activityStatus = checkHealthViaKvmHaWebservice(agent, activityStatus); - if(!activityStatus && isVmsOnKvmMatchingWithDatabase) { - activityStatus = true; - } else { + if(!activityStatus){ LOG.warn(String.format("No VM activity detected on %s. 
This might trigger HA Host Recovery and/or Fence.", agent.toString())); } return activityStatus; } - - protected boolean verifyActivityOfStorageOnHost(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { + private boolean checkVmActivityOnStoragePool(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { List volume_list = poolVolMap.get(pool); final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java new file mode 100644 index 000000000000..974d2528a5e6 --- /dev/null +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -0,0 +1,273 @@ +/* + * Copyright 2021 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.host.Host; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.apache.commons.httpclient.HttpStatus; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.client.utils.URIBuilder; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.log4j.Logger; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class provides a client that checks Agent status via a webserver. + * + * The additional webserver exposes a simple JSON API which returns a list + * of Virtual Machines that are running on that host according to libvirt. + * + * This way, KVM HA can verify, via libvirt, VMs status with a HTTP-call + * to this simple webserver and determine if the host is actually down + * or if it is just the Java Agent which has crashed. 
+ */ +public class KvmHaAgentClient { + + private static final Logger LOGGER = Logger.getLogger(KvmHaAgentClient.class); + private static final int ERROR_CODE = -1; + private static final String EXPECTED_HTTP_STATUS = "2XX"; + private static final String VM_COUNT = "count"; + private static final int WAIT_FOR_REQUEST_RETRY = 2; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private Host agent; + + /** + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running + */ + public KvmHaAgentClient(Host agent) { + this.agent = agent; + } + + /** + * Returns the number of VMs running on the KVM host according to libvirt. + */ + protected int countRunningVmsOnAgent() { + String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return ERROR_CODE; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return ERROR_CODE; + } + + return responseInJson.get(VM_COUNT).getAsInt(); + } + + protected int getKvmHaMicroservicePortValue() { + Integer haAgentPort = KVMHAConfig.KVM_HA_WEBSERVICE_PORT.value(); + if (haAgentPort == null) { + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", KVMHAConfig.KVM_HA_WEBSERVICE_PORT.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KVM_HA_WEBSERVICE_PORT.defaultValue()); + } + return haAgentPort; + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. + */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.KVM_HA_WEBSERVICE_ENABLED.value(); + } + + /** + * Lists VMs on host according to vm_instance DB table. 
The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. + *
+ *
+ * Note that VMs on state 'Starting' are not common to be at the host, therefore this method does not list them. + * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', + * but that's not likely and thus it is not relevant for this very context. + */ + protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + List listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); + List listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); + List listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); + + List listByHostAndState = new ArrayList<>(); + listByHostAndState.addAll(listByHostAndStateRunning); + listByHostAndState.addAll(listByHostAndStateStopping); + listByHostAndState.addAll(listByHostAndStateMigrating); + + if (LOGGER.isTraceEnabled()) { + List listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); + int startingVMs = listByHostAndStateStarting.size(); + int runningVMs = listByHostAndStateRunning.size(); + int stoppingVms = listByHostAndStateStopping.size(); + int migratingVms = listByHostAndStateMigrating.size(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(); + LOGGER.trace( + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, stoppingVms, + migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + } + + return listByHostAndState; + } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt.

+ * + * IF:
+ * (i) KVM HA agent finds 0 VMs running but CloudStack considers that the host has 1 or more VMs running: returns false, as it could not find any running VM although at least 1 was expected; fencing/recovering the host would avoid downtime to VMs in this case.
+ * (ii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + int numberOfVmsOnHostAccordingToDB = listVmsOnHost(host, vmInstanceDao).size(); + + int numberOfVmsOnAgent = countRunningVmsOnAgent(); + + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed", agent)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); + return false; + } + if (numberOfVmsOnHostAccordingToDB == numberOfVmsOnAgent) { + return true; + } + if (numberOfVmsOnAgent == 0) { + // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDB)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); + return false; + } + // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs + // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs + // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDB)); + return true; + } + + /** + * Executes a GET request for the given URL address. 
+ */ + protected HttpResponse executeHttpRequest(String url) { + HttpGet httpReq = prepareHttpRequestForUrl(url); + if (httpReq == null) { + return null; + } + + HttpClient client = HttpClientBuilder.create().build(); + HttpResponse response = null; + try { + response = client.execute(httpReq); + } catch (IOException e) { + if (MAX_REQUEST_RETRIES == 0) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); + return null; + } + retryHttpRequest(url, httpReq, client); + } + return response; + } + + @Nullable + private HttpGet prepareHttpRequestForUrl(String url) { + HttpGet httpReq = null; + try { + URIBuilder builder = new URIBuilder(url); + httpReq = new HttpGet(builder.build()); + } catch (URISyntaxException e) { + LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); + return null; + } + return httpReq; + } + + /** + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + */ + protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { + LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); + HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); + + if (response == null) { + LOGGER.error(String.format("Failed to execute HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + int statusCode = response.getStatusLine().getStatusCode(); + if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + throw new CloudRuntimeException( + String.format("Failed to get VMs information with a %s request to URL '%s'. 
The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, + EXPECTED_HTTP_STATUS, statusCode)); + } + + LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); + return response; + } + + protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { + for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { + try { + TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); + LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); + return client.execute(httpReq); + } catch (IOException | InterruptedException e) { + if (attempt == MAX_REQUEST_RETRIES) { + throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e)); + } else { + LOGGER.error( + String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, + url, e)); + } + } + } + return null; + } + + /** + * Processes the response of request GET System ID as a JSON object.
+ * Json example: {"count": 3, "virtualmachines": ["r-123-VM", "v-134-VM", "s-111-VM"]}

+ * + * Note: this method can return NULL JsonObject in case HttpResponse is NULL. + */ + protected JsonObject processHttpResponseIntoJson(HttpResponse response) { + InputStream in; + String jsonString; + if (response == null) { + return null; + } + try { + in = response.getEntity().getContent(); + BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + jsonString = streamReader.readLine(); + } catch (UnsupportedOperationException | IOException e) { + throw new CloudRuntimeException("Failed to process response", e); + } + + return new JsonParser().parse(jsonString).getAsJsonObject(); + } +} diff --git a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml index 9bcfdd9c3061..031593c5c15e 100644 --- a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml +++ b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml @@ -37,5 +37,4 @@ - diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java deleted file mode 100644 index 539c7c07453e..000000000000 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/KvmAgentHaClientTest.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2021 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.cloud.hypervisor.kvm.resource; - -import com.cloud.host.HostVO; -import org.apache.http.StatusLine; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.mockito.Mockito; - -//@RunWith(MockitoJUnitRunner.class) -public class KvmAgentHaClientTest { - - private static final String AGENT_ADDRESS = "kvm-agent.domain.name"; - private HostVO agent = Mockito.mock(HostVO.class); - private KvmAgentHaClient kvmAgentHaClient = Mockito.spy(new KvmAgentHaClient(agent)); - - private CloseableHttpResponse mockResponse(int httpStatusCode) { - StatusLine statusLine = Mockito.mock(StatusLine.class); - Mockito.doReturn(httpStatusCode).when(statusLine).getStatusCode(); - CloseableHttpResponse response = Mockito.mock(CloseableHttpResponse.class); - Mockito.doReturn(statusLine).when(response).getStatusLine(); - Mockito.doReturn(response).when(kvmAgentHaClient).executeHttpRequest(Mockito.anyString()); - return response; - } - -} diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java new file mode 100644 index 000000000000..cba7f5ff683f --- /dev/null +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -0,0 +1,252 @@ +/* + * Copyright 2021 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; +import org.apache.http.ProtocolVersion; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpRequestBase; +import org.apache.http.entity.InputStreamEntity; +import org.apache.http.message.BasicStatusLine; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.MockitoJUnitRunner; + +import com.cloud.host.HostVO; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.dao.VMInstanceDaoImpl; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +@RunWith(MockitoJUnitRunner.class) +public class KvmHaAgentClientTest { + + private static final int ERROR_CODE = -1; + private HostVO agent = Mockito.mock(HostVO.class); + private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient(agent)); + private static final int DEFAULT_PORT = 8080; + private static final String PRIVATE_IP_ADDRESS = "1.2.3.4"; + 
private static final String JSON_STRING_EXAMPLE_3VMs = "{\"count\":3,\"virtualmachines\":[\"r-123-VM\",\"v-134-VM\",\"s-111-VM\"]}"; + private static final int EXPECTED_RUNNING_VMS_EXAMPLE_3VMs = 3; + private static final String JSON_STRING_EXAMPLE_0VMs = "{\"count\":0,\"virtualmachines\":[]}"; + private static final int EXPECTED_RUNNING_VMS_EXAMPLE_0VMs = 0; + private static final String EXPECTED_URL = String.format("http://%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT); + private static final HttpRequestBase HTTP_REQUEST_BASE = new HttpGet(EXPECTED_URL); + private static final String VMS_COUNT = "count"; + private static final String VIRTUAL_MACHINES = "virtualmachines"; + private static final int MAX_REQUEST_RETRIES = 2; + private static final int KVM_HA_WEBSERVICE_PORT = 8080; + + @Mock + HttpClient client; + + @Mock + VMInstanceDaoImpl vmInstanceDao; + + @Test + public void isKvmHaAgentHealthyTestAllGood() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestVMsDoNotMatchButDoNotReturnFalse() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListed() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0); + Assert.assertFalse(result); + } + + @Test + public void isKvmHaAgentHealthyTestReceivedErrorCode() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE); + Assert.assertFalse(result); + } + + private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunningOnAgent) { + List vmsOnHostList = new ArrayList<>(); + for (int i = 0; i < expectedNumberOfVms; i++) { + VMInstanceVO vmInstance = Mockito.mock(VMInstanceVO.class); + vmsOnHostList.add(vmInstance); + } + + 
Mockito.doReturn(vmsOnHostList).when(kvmHaAgentClient).listVmsOnHost(Mockito.any(), Mockito.any()); + Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(); + + return kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); + } + + @Test + public void processHttpResponseIntoJsonTestNull() { + JsonObject responseJson = kvmHaAgentClient.processHttpResponseIntoJson(null); + Assert.assertNull(responseJson); + } + + @Test + public void processHttpResponseIntoJsonTest() throws IOException { + prepareAndTestProcessHttpResponseIntoJson(JSON_STRING_EXAMPLE_3VMs, 3l); + } + + @Test + public void processHttpResponseIntoJsonTestOtherJsonExample() throws IOException { + prepareAndTestProcessHttpResponseIntoJson(JSON_STRING_EXAMPLE_0VMs, 0l); + } + + private void prepareAndTestProcessHttpResponseIntoJson(String jsonString, long expectedVmsCount) throws IOException { + CloseableHttpResponse mockedResponse = mockResponse(HttpStatus.SC_OK, jsonString); + JsonObject responseJson = kvmHaAgentClient.processHttpResponseIntoJson(mockedResponse); + + Assert.assertNotNull(responseJson); + JsonElement jsonElementVmsCount = responseJson.get(VMS_COUNT); + JsonElement jsonElementVmsArray = responseJson.get(VIRTUAL_MACHINES); + JsonArray jsonArray = jsonElementVmsArray.getAsJsonArray(); + + Assert.assertEquals(expectedVmsCount, jsonArray.size()); + Assert.assertEquals(expectedVmsCount, jsonElementVmsCount.getAsLong()); + Assert.assertEquals(jsonString, responseJson.toString()); + } + + private CloseableHttpResponse mockResponse(int httpStatusCode, String jsonString) throws IOException { + BasicStatusLine basicStatusLine = new BasicStatusLine(new ProtocolVersion("HTTP", 1000, 123), httpStatusCode, "Status"); + CloseableHttpResponse response = Mockito.mock(CloseableHttpResponse.class); + InputStream in = IOUtils.toInputStream(jsonString, StandardCharsets.UTF_8); + Mockito.when(response.getStatusLine()).thenReturn(basicStatusLine); + HttpEntity httpEntity = new 
InputStreamEntity(in); + Mockito.when(response.getEntity()).thenReturn(httpEntity); + return response; + } + + @Test + public void countRunningVmsOnAgentTest() throws IOException { + prepareAndRunCountRunningVmsOnAgent(JSON_STRING_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs); + } + + @Test + public void countRunningVmsOnAgentTestBlankNoVmsListed() throws IOException { + prepareAndRunCountRunningVmsOnAgent(JSON_STRING_EXAMPLE_0VMs, EXPECTED_RUNNING_VMS_EXAMPLE_0VMs); + } + + private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) throws IOException { + Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); + + JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); + Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); + + int result = kvmHaAgentClient.countRunningVmsOnAgent(); + Assert.assertEquals(expectedListedVms, result); + } + + @Test + public void retryHttpRequestTest() throws IOException { + kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(1)).execute(Mockito.any()); + Mockito.verify(kvmHaAgentClient, Mockito.times(1)).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + } + + @Test + public void retryHttpRequestTestNullResponse() throws IOException { + Mockito.doReturn(null).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + HttpResponse response = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Assert.assertNull(response); + } + + @Test(expected = CloudRuntimeException.class) + public void retryHttpRequestTestForbidden() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_FORBIDDEN); + 
} + + @Test(expected = CloudRuntimeException.class) + public void retryHttpRequestTestMultipleChoices() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_MULTIPLE_CHOICES); + } + + @Test(expected = CloudRuntimeException.class) + public void retryHttpRequestTestProcessing() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_PROCESSING); + } + + @Test(expected = CloudRuntimeException.class) + public void retryHttpRequestTestTimeout() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_GATEWAY_TIMEOUT); + } + + @Test(expected = CloudRuntimeException.class) + public void retryHttpRequestTestVersionNotSupported() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED); + } + + private void prepareAndRunRetryHttpRequestTest(int scMultipleChoices) throws IOException { + HttpResponse mockedResponse = mockResponse(scMultipleChoices, JSON_STRING_EXAMPLE_3VMs); + Mockito.doReturn(mockedResponse).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + } + + @Test + public void retryHttpRequestTestHttpOk() throws IOException { + HttpResponse mockedResponse = mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs); + Mockito.doReturn(mockedResponse).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + HttpResponse result = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(kvmHaAgentClient, Mockito.times(1)).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); + Assert.assertEquals(mockedResponse, result); + } + + @Test + public void retryUntilGetsHttpResponseTestOneIOException() throws IOException { + Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenReturn(mockResponse(HttpStatus.SC_OK, 
JSON_STRING_EXAMPLE_3VMs)); + kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + } + + @Test(expected = CloudRuntimeException.class) + public void retryUntilGetsHttpResponseTestTwoIOException() throws IOException { + Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenThrow(IOException.class); + kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + } + + @Test + public void isKvmHaWebserviceEnabledTestDefault() { + Assert.assertTrue(kvmHaAgentClient.isKvmHaWebserviceEnabled()); + } + + @Test + public void getKvmHaMicroservicePortValueTestDefault() { + Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue()); + } + +} diff --git a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java index 9a7f27c003ec..737c141e81cc 100644 --- a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java +++ b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java @@ -35,7 +35,9 @@ enum HAProviderConfig { MaxRecoveryAttempts, MaxActivityCheckInterval, MaxDegradedWaitTimeout, - RecoveryWaitTimeout + RecoveryWaitTimeout, + KVM_HA_WEBSERVICE_PORT, + IS_KVM_HA_WEBSERVICE_ENABLED }; HAResource.ResourceType resourceType(); From fa998e194de4a460081e09a85825935c9431bae1 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 29 Apr 2021 03:07:27 -0300 Subject: [PATCH 07/40] Remove commented lines --- .../kvm/src/main/java/com/cloud/ha/KVMInvestigator.java | 1 - .../org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java | 4 ---- 2 files changed, 5 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java 
b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 21257481630a..cb1145ab6835 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -100,7 +100,6 @@ public Status isAgentAlive(Host agent) { */ private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent); - boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); if(isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 4690077bd318..2eab959fc553 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -111,10 +111,6 @@ private boolean checkHealthViaKvmHaWebservice(Host r, boolean isHealthy) { return isHealthy; } -// List vmsOnHost = kvmHaAgentClient.listVmsRunningMigratingStopping(r); -// List vmsOnHost = vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Running); -// vmsOnHost.addAll(vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Stopping)); -// vmsOnHost.addAll(vmInstanceDao.listByHostAndState(r.getId(), VirtualMachine.State.Migrating)); boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(r, vmInstanceDao); if (!isHealthy && isKvmHaAgentHealthy) { From ee4c28cd58c0461efb4d43feeb614ec40be76f48 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Fri, 30 Apr 2021 00:17:57 -0300 Subject: [PATCH 08/40] Consider Paused instances to count when a VM is being migrated and already "placed" on the host (memory is already being allocated but 
it is not processing) but not yet resumed. --- .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 4 ++-- .../cloudstack/kvm/ha/KVMHAProvider.java | 4 ++-- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 23 +++++++++++-------- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 2 ++ .../cloudstack/ha/provider/HAProvider.java | 4 ++-- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 6e9b4a180305..61b0476f2e35 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -53,11 +53,11 @@ public class KVMHAConfig { public static final ConfigKey KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60", "The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KVM_HA_WEBSERVICE_PORT = new ConfigKey("Advanced", Integer.class, "kvm.ha.webservice.port", "8080", + public static final ConfigKey KvmHaWebservicePort = new ConfigKey("Advanced", Integer.class, "kvm.ha.webservice.port", "8080", "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. Default value is 8080.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KVM_HA_WEBSERVICE_ENABLED = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "true", + public static final ConfigKey IsKvmHaWebserviceEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "true", "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. One can enable (set to 'true') or disable it ('false'). 
If disabled then CloudStack ignores HA validation via this agent.", true, ConfigKey.Scope.Cluster); } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index f1160b30b52a..5358bd1d6656 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -152,8 +152,8 @@ public ConfigKey[] getConfigKeys() { KVMHAConfig.KvmHADegradedMaxPeriod, KVMHAConfig.KvmHARecoverWaitPeriod, KVMHAConfig.KvmHARecoverAttemptThreshold, - KVMHAConfig.KVM_HA_WEBSERVICE_PORT, - KVMHAConfig.KVM_HA_WEBSERVICE_ENABLED + KVMHAConfig.KvmHaWebservicePort, + KVMHAConfig.IsKvmHaWebserviceEnabled }; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 974d2528a5e6..689dff916861 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -89,10 +89,10 @@ protected int countRunningVmsOnAgent() { } protected int getKvmHaMicroservicePortValue() { - Integer haAgentPort = KVMHAConfig.KVM_HA_WEBSERVICE_PORT.value(); + Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); if (haAgentPort == null) { - LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", KVMHAConfig.KVM_HA_WEBSERVICE_PORT.defaultValue(), agent.getClusterId(), agent)); - haAgentPort = Integer.parseInt(KVMHAConfig.KVM_HA_WEBSERVICE_PORT.defaultValue()); + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", 
KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); } return haAgentPort; } @@ -101,7 +101,7 @@ protected int getKvmHaMicroservicePortValue() { * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. */ public boolean isKvmHaWebserviceEnabled() { - return KVMHAConfig.KVM_HA_WEBSERVICE_ENABLED.value(); + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); } /** @@ -141,24 +141,27 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt.

* * IF:
- * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.
- * (ii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.
+ * (ii) KVM HA agent finds 0 VMs running but CloudStack considers that the host has 1 VM running: returns true, logs WARN messages, and avoids triggering HA recovery/fencing + * as it could be an inconsistency when migrating a VM.
+ * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly */ public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { int numberOfVmsOnHostAccordingToDB = listVmsOnHost(host, vmInstanceDao).size(); - int numberOfVmsOnAgent = countRunningVmsOnAgent(); - if (numberOfVmsOnAgent < 0) { - LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed", agent)); + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); return false; } if (numberOfVmsOnHostAccordingToDB == numberOfVmsOnAgent) { return true; } - if (numberOfVmsOnAgent == 0) { + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDB > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. + // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. + // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. 
LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDB)); LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); return false; diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index d8d4ebfbd495..113c70bdd86c 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -34,6 +34,8 @@ def running_vms(self): for domain in alldomains: if domain.info()[0] == libvirt.VIR_DOMAIN_RUNNING: domains.append(domain.name()) + elif domain.info()[0] == libvirt.VIR_DOMAIN_PAUSED: + domains.append(domain.name()) self.conn.close() diff --git a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java index 737c141e81cc..f0fca12c6819 100644 --- a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java +++ b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java @@ -36,8 +36,8 @@ enum HAProviderConfig { MaxActivityCheckInterval, MaxDegradedWaitTimeout, RecoveryWaitTimeout, - KVM_HA_WEBSERVICE_PORT, - IS_KVM_HA_WEBSERVICE_ENABLED + KvmHaWebservicePort, + IsKvmHaWebserviceEnabled }; HAResource.ResourceType resourceType(); From 6953b952d1172d4583d13c2e07717a684239db6c Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 3 May 2021 10:33:37 -0300 Subject: [PATCH 09/40] Final adjustments before calling for review --- .../cloudstack-agent-ha-helper.service | 1 - .../java/com/cloud/ha/KVMInvestigator.java | 2 +- .../kvm/ha/KVMHostActivityChecker.java | 47 +++++++++++++------ .../cloudstack/kvm/ha/KvmHaAgentClient.java | 2 - .../kvm/ha/KvmHaAgentClientTest.java | 2 - 5 files changed, 34 insertions(+), 20 deletions(-) diff --git 
a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service index 34ad6a37caaa..696379ea1fc5 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.service +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -29,7 +29,6 @@ After=libvirtd.service Type=simple EnvironmentFile=/etc/default/cloudstack-agent-ha-helper ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $PORT - Restart=always RestartSec=10s diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index cb1145ab6835..72384c46740c 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -96,7 +96,7 @@ public Status isAgentAlive(Host agent) { } /** - * It checks the KVM node healthy via KVM HA Agent. If the agent is healthy it returns Status.Up, otherwise it relies keeps the provided Status as it is. + * It checks the KVM node healthy via KVM HA Agent. If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. 
*/ private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 2eab959fc553..fbf659cb9cff 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -32,8 +32,10 @@ import com.cloud.storage.Storage; import com.cloud.storage.StorageManager; import com.cloud.storage.StoragePool; +import com.cloud.storage.StoragePoolHostVO; import com.cloud.storage.Volume; import com.cloud.storage.VolumeVO; +import com.cloud.storage.dao.StoragePoolHostDao; import com.cloud.storage.dao.VolumeDao; import com.cloud.utils.component.AdapterBase; import com.cloud.vm.VMInstanceVO; @@ -43,6 +45,7 @@ import org.apache.cloudstack.ha.provider.HACheckerException; import org.apache.cloudstack.ha.provider.HealthCheckerInterface; import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; +import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; @@ -52,7 +55,6 @@ import java.util.HashMap; import java.util.List; - public class KVMHostActivityChecker extends AdapterBase implements ActivityCheckerInterface, HealthCheckerInterface { private final static Logger LOG = Logger.getLogger(KVMHostActivityChecker.class); @@ -63,13 +65,15 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck @Inject private AgentManager agentMgr; @Inject - private PrimaryDataStoreDao storagePool; - @Inject private StorageManager storageManager; @Inject + private PrimaryDataStoreDao storagePool; + @Inject private ResourceManager resourceManager; @Inject private 
ClusterDao clusterDao; + @Inject + private StoragePoolHostDao storagePoolHostDao; @Override public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { @@ -87,9 +91,11 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { - boolean isHealthy = false; - HashMap> poolVolMap = getVolumeUuidOnHost(r); - isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); + boolean isHealthy = true; + if (isHostServedByNfsPool(r)) { + HashMap> poolVolMap = getVolumeUuidOnHost(r); + isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); + } isHealthy = checkHealthViaKvmHaWebservice(r, isHealthy); @@ -197,14 +203,15 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); } boolean activityStatus = true; - HashMap> poolVolMap = getVolumeUuidOnHost(agent); - for (StoragePool pool : poolVolMap.keySet()) { - if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() - || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { - activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); - if (!activityStatus) { - LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); - break; + if (isHostServedByNfsPool(agent)) { + HashMap> poolVolMap = getVolumeUuidOnHost(agent); + for (StoragePool pool : poolVolMap.keySet()) { + if (Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { + activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); + if (!activityStatus) { + LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); + break; + 
} } } } @@ -267,6 +274,18 @@ private HashMap> getVolumeUuidOnHost(Host agent) { return poolVolMap; } + private boolean isHostServedByNfsPool(Host agent) { + List storagesOnHost = storagePoolHostDao.listByHostId(agent.getId()); + for (StoragePoolHostVO storagePoolHostRef : storagesOnHost) { + StoragePoolVO storagePool = this.storagePool.findById(storagePoolHostRef.getPoolId()); + if(Storage.StoragePoolType.NetworkFilesystem == storagePool.getPoolType() + || Storage.StoragePoolType.ManagedNFS == storagePool.getPoolType()) { + return true; + } + } + return false; + } + public long[] getNeighbors(Host agent) { List neighbors = new ArrayList(); List cluster_hosts = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 689dff916861..62206d3cdeb6 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -1,6 +1,4 @@ /* - * Copyright 2021 The Apache Software Foundation. - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index cba7f5ff683f..1b86e102eeea 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -1,6 +1,4 @@ /* - * Copyright 2021 The Apache Software Foundation. 
- * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at From 35449e8db91c1afbb9c488d80e92ca604f2aadd6 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 5 May 2021 04:55:16 -0300 Subject: [PATCH 10/40] Refactoring tests and adding small improvements --- .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 2 +- .../kvm/ha/KVMHostActivityChecker.java | 38 +++++++++------- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 13 +++--- .../kvm/ha/KvmHaAgentClientTest.java | 45 ++++++++++++------- 4 files changed, 55 insertions(+), 43 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 61b0476f2e35..68d570823ce9 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -57,7 +57,7 @@ public class KVMHAConfig { "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. Default value is 8080.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey IsKvmHaWebserviceEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "true", + public static final ConfigKey IsKvmHaWebserviceEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "false", "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. One can enable (set to 'true') or disable it ('false'). 
If disabled then CloudStack ignores HA validation via this agent.", true, ConfigKey.Scope.Cluster); } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index fbf659cb9cff..40d8c30e2e21 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -91,13 +91,21 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { - boolean isHealthy = true; + boolean isHealthy = false; if (isHostServedByNfsPool(r)) { HashMap> poolVolMap = getVolumeUuidOnHost(r); isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); + if(!isHealthy){ + LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); + } } - isHealthy = checkHealthViaKvmHaWebservice(r, isHealthy); + boolean isKvmHaAgentHealthy = checkHealthViaKvmHaWebservice(r); + + if (!isHealthy && isKvmHaAgentHealthy) { + isHealthy = true; + LOG.warn(String.format("KVM HA Agent health check could not detect activity on %s. This might trigger HA Host Recovery and/or Fence", r.toString())); + } return isHealthy; } @@ -108,21 +116,14 @@ public boolean isHealthy(Host r) { * * One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. 
*/ - private boolean checkHealthViaKvmHaWebservice(Host r, boolean isHealthy) { - KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(r); + private boolean checkHealthViaKvmHaWebservice(Host host) { + KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(host); if(!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { - ClusterVO cluster = clusterDao.findById(r.getClusterId()); + ClusterVO cluster = clusterDao.findById(host.getClusterId()); LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].", - r.toString(), cluster.getId(), cluster.getName())); - return isHealthy; - } - - boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(r, vmInstanceDao); - - if (!isHealthy && isKvmHaAgentHealthy) { - isHealthy = true; + host.toString(), cluster.getId(), cluster.getName())); } - return isHealthy; + return kvmHaAgentClient.isKvmHaAgentHealthy(host, vmInstanceDao); } private boolean isHealthyCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { @@ -202,7 +203,7 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); } - boolean activityStatus = true; + boolean activityStatus = false; if (isHostServedByNfsPool(agent)) { HashMap> poolVolMap = getVolumeUuidOnHost(agent); for (StoragePool pool : poolVolMap.keySet()) { @@ -216,10 +217,13 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - activityStatus = checkHealthViaKvmHaWebservice(agent, activityStatus); + boolean isKvmHaAgentHealthy = checkHealthViaKvmHaWebservice(agent); + if (!activityStatus && isKvmHaAgentHealthy) { + activityStatus = true; + } if(!activityStatus){ - 
LOG.warn(String.format("No VM activity detected on %s. This might trigger HA Host Recovery and/or Fence.", agent.toString())); + LOG.warn(String.format("KVM HA Agent health check could not detect activity on %s. This might trigger HA Host Recovery and/or Fence.", agent.toString())); } return activityStatus; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 62206d3cdeb6..d21b44230108 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -221,9 +221,10 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt int statusCode = response.getStatusLine().getStatusCode(); if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { - throw new CloudRuntimeException( + LOGGER.error( String.format("Failed to get VMs information with a %s request to URL '%s'. 
The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); + return null; } LOGGER.debug(String.format("Successfully executed HTTP %s request [URL: %s].", httpReq.getMethod(), url)); @@ -237,13 +238,9 @@ protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase ht LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); return client.execute(httpReq); } catch (IOException | InterruptedException e) { - if (attempt == MAX_REQUEST_RETRIES) { - throw new CloudRuntimeException(String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e)); - } else { - LOGGER.error( - String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, - url, e)); - } + String errorMessage = String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", + httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e); + LOGGER.error(errorMessage); } } return null; diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index 1b86e102eeea..786c1d5175e8 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -38,7 +38,6 @@ import org.mockito.junit.MockitoJUnitRunner; import com.cloud.host.HostVO; -import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.dao.VMInstanceDaoImpl; import com.google.gson.JsonArray; @@ -183,35 +182,45 @@ public void retryHttpRequestTestNullResponse() 
throws IOException { Assert.assertNull(response); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryHttpRequestTestForbidden() throws IOException { - prepareAndRunRetryHttpRequestTest(HttpStatus.SC_FORBIDDEN); + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_FORBIDDEN, true); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryHttpRequestTestMultipleChoices() throws IOException { - prepareAndRunRetryHttpRequestTest(HttpStatus.SC_MULTIPLE_CHOICES); + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_MULTIPLE_CHOICES, true); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryHttpRequestTestProcessing() throws IOException { - prepareAndRunRetryHttpRequestTest(HttpStatus.SC_PROCESSING); + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_PROCESSING, true); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryHttpRequestTestTimeout() throws IOException { - prepareAndRunRetryHttpRequestTest(HttpStatus.SC_GATEWAY_TIMEOUT); + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_GATEWAY_TIMEOUT, true); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryHttpRequestTestVersionNotSupported() throws IOException { - prepareAndRunRetryHttpRequestTest(HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED); + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED, true); + } + + @Test + public void retryHttpRequestTestOk() throws IOException { + prepareAndRunRetryHttpRequestTest(HttpStatus.SC_OK, false); } - private void prepareAndRunRetryHttpRequestTest(int scMultipleChoices) throws IOException { + private void prepareAndRunRetryHttpRequestTest(int scMultipleChoices, boolean expectNull) throws IOException { HttpResponse mockedResponse = mockResponse(scMultipleChoices, JSON_STRING_EXAMPLE_3VMs); Mockito.doReturn(mockedResponse).when(kvmHaAgentClient).retryUntilGetsHttpResponse(Mockito.anyString(), Mockito.any(), Mockito.any()); - 
kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + HttpResponse response = kvmHaAgentClient.retryHttpRequest(EXPECTED_URL, HTTP_REQUEST_BASE, client); + if (expectNull) { + Assert.assertNull(response); + } else { + Assert.assertEquals(mockedResponse, response); + } } @Test @@ -226,20 +235,22 @@ public void retryHttpRequestTestHttpOk() throws IOException { @Test public void retryUntilGetsHttpResponseTestOneIOException() throws IOException { Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)); - kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + HttpResponse result = kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + Assert.assertNotNull(result); } - @Test(expected = CloudRuntimeException.class) + @Test public void retryUntilGetsHttpResponseTestTwoIOException() throws IOException { Mockito.when(client.execute(HTTP_REQUEST_BASE)).thenThrow(IOException.class).thenThrow(IOException.class); - kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); + HttpResponse result = kvmHaAgentClient.retryUntilGetsHttpResponse(EXPECTED_URL, HTTP_REQUEST_BASE, client); Mockito.verify(client, Mockito.times(MAX_REQUEST_RETRIES)).execute(Mockito.any()); + Assert.assertNull(result); } @Test public void isKvmHaWebserviceEnabledTestDefault() { - Assert.assertTrue(kvmHaAgentClient.isKvmHaWebserviceEnabled()); + Assert.assertFalse(kvmHaAgentClient.isKvmHaWebserviceEnabled()); } @Test From 0506649fdd211115fc93c60795237559150e7705 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 6 May 2021 10:43:27 -0300 Subject: [PATCH 11/40] Fix missing parameter on log String.format --- .../kvm/src/main/java/com/cloud/ha/KVMInvestigator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 72384c46740c..87f69a536588 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -103,9 +103,9 @@ private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); if(isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; - s_logger.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.")); + s_logger.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); } else { - s_logger.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent")); + s_logger.warn(String.format("Checking agent %s status. 
Failed to check host status via KVM HA Agent", agentStatus)); } return agentStatus; } From 4f7936e496e2aa39a16e94314a2d60f8a5757c11 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Tue, 11 May 2021 11:30:26 -0300 Subject: [PATCH 12/40] Update code addressing reviewers --- .../java/com/cloud/ha/KVMInvestigator.java | 3 +- .../kvm/ha/KVMHostActivityChecker.java | 17 ++++++---- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 33 ++++++++++--------- .../spring-kvm-compute-context.xml | 1 + 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 87f69a536588..631c35f08e9b 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -96,7 +96,8 @@ public Status isAgentAlive(Host agent) { } /** - * It checks the KVM node healthy via KVM HA Agent. If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. + * It checks the KVM node status via KVM HA Agent. + * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. 
*/ private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 40d8c30e2e21..a35aea0fcfda 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -52,8 +52,12 @@ import javax.inject.Inject; import java.util.ArrayList; import org.joda.time.DateTime; + +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Set; public class KVMHostActivityChecker extends AdapterBase implements ActivityCheckerInterface, HealthCheckerInterface { private final static Logger LOG = Logger.getLogger(KVMHostActivityChecker.class); @@ -75,6 +79,8 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck @Inject private StoragePoolHostDao storagePoolHostDao; + private static final Set NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); + @Override public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { try { @@ -94,7 +100,7 @@ public boolean isHealthy(Host r) { boolean isHealthy = false; if (isHostServedByNfsPool(r)) { HashMap> poolVolMap = getVolumeUuidOnHost(r); - isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap); + isHealthy = isHealthCheckViaNfs(r, isHealthy, poolVolMap); if(!isHealthy){ LOG.warn(String.format("NFS storage health check failed for %s. 
It seems that a storage does not have activity.", r.toString())); } @@ -111,7 +117,7 @@ public boolean isHealthy(Host r) { } /** - * Checks the host healthy via an web-service that retrieves Running KVM instances via libvirt.
+ * Checks the host health via an web-service that retrieves Running KVM instances via libvirt.
* The health-check is executed on the KVM node and verifies the amount of VMs running and if the libvirt service is running.

* * One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. @@ -126,7 +132,7 @@ private boolean checkHealthViaKvmHaWebservice(Host host) { return kvmHaAgentClient.isKvmHaAgentHealthy(host, vmInstanceDao); } - private boolean isHealthyCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { + private boolean isHealthCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { for (StoragePool pool : poolVolMap.keySet()) { if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { @@ -207,7 +213,7 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe if (isHostServedByNfsPool(agent)) { HashMap> poolVolMap = getVolumeUuidOnHost(agent); for (StoragePool pool : poolVolMap.keySet()) { - if (Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { + if (NFS_POOL_TYPE.contains(pool.getPoolType())) { activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); if (!activityStatus) { LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); @@ -282,8 +288,7 @@ private boolean isHostServedByNfsPool(Host agent) { List storagesOnHost = storagePoolHostDao.listByHostId(agent.getId()); for (StoragePoolHostVO storagePoolHostRef : storagesOnHost) { StoragePoolVO storagePool = this.storagePool.findById(storagePoolHostRef.getPoolId()); - if(Storage.StoragePoolType.NetworkFilesystem == storagePool.getPoolType() - || Storage.StoragePoolType.ManagedNFS == storagePool.getPoolType()) { + if (NFS_POOL_TYPE.contains(storagePool.getPoolType())) { return true; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index d21b44230108..3e044033a59a 100644 
--- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -42,11 +42,11 @@ /** * This class provides a client that checks Agent status via a webserver. - * + *
* The additional webserver exposes a simple JSON API which returns a list - * of Virtual Machines that are running on that host according to libvirt. - * - * This way, KVM HA can verify, via libvirt, VMs status with a HTTP-call + * of Virtual Machines that are running on that host according to Libvirt. + *
+ * This way, KVM HA can verify, via Libvirt, VMs status with an HTTP-call * to this simple webserver and determine if the host is actually down * or if it is just the Java Agent which has crashed. */ @@ -62,14 +62,14 @@ public class KvmHaAgentClient { private Host agent; /** - * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running + * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt */ public KvmHaAgentClient(Host agent) { this.agent = agent; } /** - * Returns the number of VMs running on the KVM host according to libvirt. + * Returns the number of VMs running on the KVM host according to Libvirt. */ protected int countRunningVmsOnAgent() { String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); @@ -89,7 +89,8 @@ protected int countRunningVmsOnAgent() { protected int getKvmHaMicroservicePortValue() { Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); if (haAgentPort == null) { - LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", + KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); } return haAgentPort; @@ -128,8 +129,8 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa int migratingVms = listByHostAndStateMigrating.size(); int countRunningVmsOnAgent = countRunningVmsOnAgent(); LOGGER.trace( - String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. 
Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, stoppingVms, - migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); } return listByHostAndState; @@ -146,28 +147,28 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly */ public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { - int numberOfVmsOnHostAccordingToDB = listVmsOnHost(host, vmInstanceDao).size(); + int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); int numberOfVmsOnAgent = countRunningVmsOnAgent(); if (numberOfVmsOnAgent < 0) { LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); - LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); return false; } - if (numberOfVmsOnHostAccordingToDB == numberOfVmsOnAgent) { + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { return true; } - if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDB > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. 
// There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. - LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDB)); - LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDB)); + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); return false; } // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host - LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDB)); + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); return true; } diff --git a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml index 031593c5c15e..bedaf1851d18 100644 --- 
a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml +++ b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml @@ -37,4 +37,5 @@ + From 6d59a81b7a8e9878c76cc5fcfe4781d6e17ca1b3 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Tue, 11 May 2021 17:32:41 -0300 Subject: [PATCH 13/40] Simplify code flow, enhance Python, address reviewers. --- .../java/com/cloud/ha/KVMInvestigator.java | 3 + .../kvm/ha/KVMHostActivityChecker.java | 76 +++++++++++-------- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 23 ++++++ .../kvm/ha/KvmHaAgentClientTest.java | 17 +++++ scripts/vm/hypervisor/kvm/agent-ha-helper.py | 75 ++++++++++++++---- 5 files changed, 148 insertions(+), 46 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 631c35f08e9b..ef6fe1050200 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -139,6 +139,9 @@ private boolean hasNfsPoolClusterWideForHost(Host agent) { return false; } + /** + * Checks the Agent Status sending command CheckOnHostCommand to the Agent, which verifies host Status via NFS Heart Beat script + */ private Status checkAgentStatusViaNfs(Host agent) { Status hostStatus = null; Status neighbourStatus = null; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index a35aea0fcfda..32985e3d1a46 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -51,6 +51,7 @@ import javax.inject.Inject; import 
java.util.ArrayList; + import org.joda.time.DateTime; import java.util.Arrays; @@ -88,7 +89,7 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException } catch (HACheckerException e) { //Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown throw e; - } catch (Exception e){ + } catch (Exception e) { String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString()); LOG.warn(message, e); throw new HACheckerException(message, e); @@ -97,46 +98,60 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { - boolean isHealthy = false; - if (isHostServedByNfsPool(r)) { - HashMap> poolVolMap = getVolumeUuidOnHost(r); - isHealthy = isHealthCheckViaNfs(r, isHealthy, poolVolMap); - if(!isHealthy){ - LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); - } + boolean isHealthy = true; + boolean isHostServedByNfsPool = isHostServedByNfsPool(r); + boolean isKvmHaWebserviceEnabled = isKvmHaWebserviceEnabled(r); + + isHealthy = isHealthViaNfs(r); + + if (!isKvmHaWebserviceEnabled) { + return isHealthy; } - boolean isKvmHaAgentHealthy = checkHealthViaKvmHaWebservice(r); + //TODO + - if (!isHealthy && isKvmHaAgentHealthy) { + if (isVmActivtyOnHostViaKvmHaWebservice(r) && !isHealthy) { isHealthy = true; - LOG.warn(String.format("KVM HA Agent health check could not detect activity on %s. This might trigger HA Host Recovery and/or Fence", r.toString())); } return isHealthy; } /** - * Checks the host health via an web-service that retrieves Running KVM instances via libvirt.
- * The health-check is executed on the KVM node and verifies the amount of VMs running and if the libvirt service is running.

- * - * One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. + * Checks the host health via an web-service that retrieves Running KVM instances via Libvirt.
+ * The health-check is executed on the KVM node and verifies the amount of VMs running and if the Libvirt service is running. */ - private boolean checkHealthViaKvmHaWebservice(Host host) { + private boolean isVmActivtyOnHostViaKvmHaWebservice(Host host) { KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(host); - if(!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { + return kvmHaAgentClient.isKvmHaAgentHealthy(host, vmInstanceDao); + } + + //TODO + private boolean isNeigbourReachable(Host host) { + return true; + } + + /** + * Checks if the KVM HA webservice is enabled. One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. + */ + private boolean isKvmHaWebserviceEnabled(Host host) { + KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(host); + if (!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { ClusterVO cluster = clusterDao.findById(host.getClusterId()); - LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].", - host.toString(), cluster.getId(), cluster.getName())); + LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].", host, + cluster.getId(), cluster.getName())); + return false; } - return kvmHaAgentClient.isKvmHaAgentHealthy(host, vmInstanceDao); + return true; } - private boolean isHealthCheckViaNfs(Host r, boolean isHealthy, HashMap> poolVolMap) { - for (StoragePool pool : poolVolMap.keySet()) { - if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType() - || Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) { - isHealthy = isAgentActive(r); + private boolean isHealthViaNfs(Host r) { + boolean isHealthy = true; + if (isHostServedByNfsPool(r)) { + isHealthy = isAgentActive(r); + if (!isHealthy) { + LOG.warn(String.format("NFS storage health check failed for %s. 
It seems that a storage does not have activity.", r.toString())); } } return isHealthy; @@ -214,7 +229,7 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe HashMap> poolVolMap = getVolumeUuidOnHost(agent); for (StoragePool pool : poolVolMap.keySet()) { if (NFS_POOL_TYPE.contains(pool.getPoolType())) { - activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); + activityStatus = isVmActivtyOnHostViaNfsStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus); if (!activityStatus) { LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString())); break; @@ -223,19 +238,20 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - boolean isKvmHaAgentHealthy = checkHealthViaKvmHaWebservice(agent); + boolean isKvmHaAgentHealthy = isVmActivtyOnHostViaKvmHaWebservice(agent); if (!activityStatus && isKvmHaAgentHealthy) { activityStatus = true; } - if(!activityStatus){ + if (!activityStatus) { LOG.warn(String.format("KVM HA Agent health check could not detect activity on %s. 
This might trigger HA Host Recovery and/or Fence.", agent.toString())); } return activityStatus; } - private boolean checkVmActivityOnStoragePool(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { + private boolean isVmActivtyOnHostViaNfsStoragePool(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) + throws HACheckerException, IllegalStateException { List volume_list = poolVolMap.get(pool); final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime); @@ -251,7 +267,7 @@ private boolean checkVmActivityOnStoragePool(HashMap> LOG.debug(message); throw new IllegalStateException(message); } - } catch (StorageUnavailableException e){ + } catch (StorageUnavailableException e) { String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool.getId(), agent.toString()); LOG.warn(message, e); throw new HACheckerException(message, e); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 3e044033a59a..1a031d8e4ff9 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -56,6 +56,9 @@ public class KvmHaAgentClient { private static final int ERROR_CODE = -1; private static final String EXPECTED_HTTP_STATUS = "2XX"; private static final String VM_COUNT = "count"; + private static final String STATUS = "status"; + private static final String CHECK = "check"; + private static final String UP = "Up"; private static final int WAIT_FOR_REQUEST_RETRY = 2; private static final int MAX_REQUEST_RETRIES = 2; private static final int 
CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; @@ -86,6 +89,26 @@ protected int countRunningVmsOnAgent() { return responseInJson.get(VM_COUNT).getAsInt(); } + /** + * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. + * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP + */ + protected boolean isTargetHostReachable(String ipAddress) { + int port = getKvmHaMicroservicePortValue(); + String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) { + return false; + } + + return UP.equals(responseInJson.get(STATUS).getAsString()); + } + protected int getKvmHaMicroservicePortValue() { Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); if (haAgentPort == null) { diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index 786c1d5175e8..bf07313b5133 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -20,6 +20,7 @@ import java.util.List; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; @@ -258,4 +259,20 @@ public void getKvmHaMicroservicePortValueTestDefault() { Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue()); } +// private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) 
throws IOException { +// Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); +// Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); +// +// JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); +// Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); +// +// int result = kvmHaAgentClient.countRunningVmsOnAgent(); +// Assert.assertEquals(expectedListedVms, result); +// } +//TODO +// @Test +// public void isTargetHostReachableTest() { +// kvmHaAgentClient.isTargetHostReachable(PRIVATE_IP_ADDRESS); +// } + } diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index 113c70bdd86c..eec5a00a6303 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -16,11 +16,21 @@ # specific language governing permissions and limitations # under the License. 
+import logging import libvirt import socket import json +import requests from http.server import BaseHTTPRequestHandler, HTTPServer +log_folder = "/var/log/cloudstack/agent/" +log_path = "/var/log/cloudstack/agent/agent-ha-helper.log" +root_path = "/" +check_path = "/check-neighbour/" +http_ok = 200 +http_multiple_choices = 300 +http_not_found = 404 + class Libvirt(): def __init__(self): self.conn = libvirt.openReadOnly("qemu:///system") @@ -46,34 +56,67 @@ class HTTPServerV6(HTTPServer): class CloudStackAgentHAHelper(BaseHTTPRequestHandler): def do_GET(self): + if self.path == root_path: + libvirt = Libvirt() - if self.path != "/": - self.send_response(404) - self.end_headers() - return - - libvirt = Libvirt() + running_vms = libvirt.running_vms() - running_vms = libvirt.running_vms() + output = { + 'count': len(running_vms), + 'virtualmachines': running_vms + } - output = { - 'count': len(running_vms), - 'virtualmachines': running_vms - } + self.send_response(http_ok) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(output).encode()) + + elif check_path in self.path: + host_and_port = self.path.partition(check_path)[2] + request_url = 'http://{}/'.format(host_and_port) + logging.debug('Check if Host {} is reachable via HTTP GET request to agent-ha-helper.'.format(request_url)) + logging.debug('GET request: {}'.format(request_url)) + try: + response = requests.get(url = request_url) + if http_ok <= response.status_code < http_multiple_choices: + request_response = 'Up' + else: + request_response = 'Down' + except: + logging.error('GET Request {} failed.'.format(request_url)) + output = { + 'status': 'Down' + } + logging.debug('Neighbour host status: {}'.format(output)) + self.send_response(http_not_found) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(output).encode()) + return + + logging.debug('Neighbour host status: {}'.format(request_response)) 
+ output = { + 'status': request_response, + } + self.send_response(http_ok) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(output).encode()) - self.send_response(200) - self.send_header("Content-type", "application/json") - self.end_headers() - self.wfile.write(json.dumps(output).encode()) + else: + self.send_response(http_not_found) + self.end_headers() + return def run(port=8080): server_address = ('', port) httpd = HTTPServerV6((server_address), CloudStackAgentHAHelper) httpd.serve_forever() + if __name__ == "__main__": from sys import argv - + logging.basicConfig(filename='/var/log/cloudstack/agent/agent-ha-helper.log', format='%(asctime)s - %(message)s', level=logging.DEBUG) try: if len(argv) == 2: run(port=int(argv[1])) From c37b39a681aff5c98d0985240c1e9acb15970d9f Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 19 May 2021 00:43:34 -0300 Subject: [PATCH 14/40] Create KvmHaHelper class. Enhance HA validations by checking if host is reachable by neighbour hosts as well as validating if the cluster is "problematic" --- .../java/com/cloud/ha/KVMInvestigator.java | 47 +++---- .../kvm/ha/KVMHostActivityChecker.java | 47 ++----- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 127 +++++++++-------- .../apache/cloudstack/kvm/ha/KvmHaHelper.java | 130 ++++++++++++++++++ .../spring-kvm-compute-context.xml | 5 +- .../kvm/ha/KvmHaAgentClientTest.java | 67 +++++---- 6 files changed, 269 insertions(+), 154 deletions(-) create mode 100644 plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index ef6fe1050200..313631339697 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -24,14 +24,12 @@ import 
com.cloud.host.Host; import com.cloud.host.HostVO; import com.cloud.host.Status; -import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; import com.cloud.resource.ResourceManager; import com.cloud.storage.Storage.StoragePoolType; import com.cloud.utils.component.AdapterBase; -import com.cloud.vm.dao.VMInstanceDao; import org.apache.cloudstack.ha.HAManager; -import org.apache.cloudstack.kvm.ha.KvmHaAgentClient; +import org.apache.cloudstack.kvm.ha.KvmHaHelper; import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import org.apache.log4j.Logger; @@ -42,8 +40,6 @@ public class KVMInvestigator extends AdapterBase implements Investigator { private final static Logger s_logger = Logger.getLogger(KVMInvestigator.class); @Inject - private HostDao _hostDao; - @Inject private AgentManager _agentMgr; @Inject private ResourceManager _resourceMgr; @@ -52,7 +48,7 @@ public class KVMInvestigator extends AdapterBase implements Investigator { @Inject private HAManager haManager; @Inject - private VMInstanceDao vmInstanceDao; + private KvmHaHelper kvmHaHelper; @Override public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM { @@ -90,24 +86,8 @@ public Status isAgentAlive(Host agent) { s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent)); } - agentStatus = checkAgentStatusViaKvmHaAgent(agent, agentStatus); - - return agentStatus; - } + agentStatus = kvmHaHelper.checkAgentStatusViaKvmHaAgent(agent, agentStatus); - /** - * It checks the KVM node status via KVM HA Agent. - * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. 
- */ - private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) { - KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent); - boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); - if(isVmsCountOnKvmMatchingWithDatabase) { - agentStatus = Status.Up; - s_logger.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); - } else { - s_logger.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent", agentStatus)); - } return agentStatus; } @@ -153,24 +133,29 @@ private Status checkAgentStatusViaNfs(Host agent) { hostStatus = answer.getResult() ? Status.Down : Status.Up; } } catch (Exception e) { - s_logger.debug("Failed to send command to host: " + agent.getId()); + s_logger.debug(String.format("Failed to send command to %s", agent)); } + if (hostStatus == null) { hostStatus = Status.Disconnected; } + if (Status.Up == hostStatus) { + return hostStatus; + } + List neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { if (neighbor.getId() == agent.getId() || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { continue; } - s_logger.debug("Investigating host:" + agent.getId() + " via neighbouring host:" + neighbor.getId()); + s_logger.debug(String.format("Investigating %s via neighbouring %s ", agent, neighbor)); try { Answer answer = _agentMgr.easySend(neighbor.getId(), cmd); if (answer != null) { neighbourStatus = answer.getResult() ? 
Status.Down : Status.Up; - s_logger.debug("Neighbouring host:" + neighbor.getId() + " returned status:" + neighbourStatus + " for the investigated host:" + agent.getId()); + s_logger.debug(String.format("Neighbouring %s returned status: %s for the investigated %s", neighbor, neighbourStatus, agent)); if (neighbourStatus == Status.Up) { break; } @@ -179,13 +164,15 @@ private Status checkAgentStatusViaNfs(Host agent) { s_logger.debug("Failed to send command to host: " + neighbor.getId()); } } - if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { + + if (neighbourStatus == Status.Up) { hostStatus = Status.Disconnected; - } - if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { + } else if (neighbourStatus == Status.Down) { hostStatus = Status.Down; } - s_logger.debug("HA: HOST is ineligible legacy state " + hostStatus + " for host " + agent.getId()); + + s_logger.debug(String.format("HA: HOST is ineligible legacy state %s for %s", hostStatus, agent)); return hostStatus; } + } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 32985e3d1a46..9a8e72364b8b 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -21,8 +21,6 @@ import com.cloud.agent.api.Answer; import com.cloud.agent.api.CheckOnHostCommand; import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand; -import com.cloud.dc.ClusterVO; -import com.cloud.dc.dao.ClusterDao; import com.cloud.exception.StorageUnavailableException; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -76,9 +74,9 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck @Inject 
private ResourceManager resourceManager; @Inject - private ClusterDao clusterDao; - @Inject private StoragePoolHostDao storagePoolHostDao; + @Inject + private KvmHaHelper kvmHaHelper; private static final Set NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); @@ -100,52 +98,23 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException public boolean isHealthy(Host r) { boolean isHealthy = true; boolean isHostServedByNfsPool = isHostServedByNfsPool(r); - boolean isKvmHaWebserviceEnabled = isKvmHaWebserviceEnabled(r); + boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(r); - isHealthy = isHealthViaNfs(r); + if(isHostServedByNfsPool) { + isHealthy = isHealthViaNfs(r); + } if (!isKvmHaWebserviceEnabled) { return isHealthy; } - //TODO - - - if (isVmActivtyOnHostViaKvmHaWebservice(r) && !isHealthy) { + if (kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(r) && !isHealthy) { isHealthy = true; } return isHealthy; } - /** - * Checks the host health via an web-service that retrieves Running KVM instances via Libvirt.
- * The health-check is executed on the KVM node and verifies the amount of VMs running and if the Libvirt service is running. - */ - private boolean isVmActivtyOnHostViaKvmHaWebservice(Host host) { - KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(host); - return kvmHaAgentClient.isKvmHaAgentHealthy(host, vmInstanceDao); - } - - //TODO - private boolean isNeigbourReachable(Host host) { - return true; - } - - /** - * Checks if the KVM HA webservice is enabled. One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. - */ - private boolean isKvmHaWebserviceEnabled(Host host) { - KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(host); - if (!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { - ClusterVO cluster = clusterDao.findById(host.getClusterId()); - LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].", host, - cluster.getId(), cluster.getName())); - return false; - } - return true; - } - private boolean isHealthViaNfs(Host r) { boolean isHealthy = true; if (isHostServedByNfsPool(r)) { @@ -238,7 +207,7 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - boolean isKvmHaAgentHealthy = isVmActivtyOnHostViaKvmHaWebservice(agent); + boolean isKvmHaAgentHealthy = kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(agent); if (!activityStatus && isKvmHaAgentHealthy) { activityStatus = true; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 1a031d8e4ff9..6a28e976d138 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -14,6 +14,7 @@ package org.apache.cloudstack.kvm.ha; import com.cloud.host.Host; +import 
com.cloud.host.Status; import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.VirtualMachine; @@ -30,6 +31,7 @@ import org.apache.log4j.Logger; import org.jetbrains.annotations.Nullable; +import javax.inject.Inject; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -57,25 +59,20 @@ public class KvmHaAgentClient { private static final String EXPECTED_HTTP_STATUS = "2XX"; private static final String VM_COUNT = "count"; private static final String STATUS = "status"; - private static final String CHECK = "check"; - private static final String UP = "Up"; + private static final String CHECK_NEIGHBOUR = "check-neighbour"; private static final int WAIT_FOR_REQUEST_RETRY = 2; private static final int MAX_REQUEST_RETRIES = 2; private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; - private Host agent; + private static final JsonParser JSON_PARSER = new JsonParser(); - /** - * Instantiates a webclient that checks, via a webserver running on the KVM host, the VMs running according to the Libvirt - */ - public KvmHaAgentClient(Host agent) { - this.agent = agent; - } + @Inject + private VMInstanceDao vmInstanceDao; /** * Returns the number of VMs running on the KVM host according to Libvirt. */ - protected int countRunningVmsOnAgent() { - String url = String.format("http://%s:%d", agent.getPrivateIpAddress(), getKvmHaMicroservicePortValue()); + protected int countRunningVmsOnAgent(Host host) { + String url = String.format("http://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); HttpResponse response = executeHttpRequest(url); if (response == null) @@ -89,31 +86,11 @@ protected int countRunningVmsOnAgent() { return responseInJson.get(VM_COUNT).getAsInt(); } - /** - * Executes ping command from the host executing the KVM HA Agent webservice to a target IP Address. 
- * The webserver serves a JSON Object such as {"status": "Up"} if the IP address is reachable OR {"status": "Down"} if could not ping the IP - */ - protected boolean isTargetHostReachable(String ipAddress) { - int port = getKvmHaMicroservicePortValue(); - String url = String.format("http://%s:%d/%s/%s:%d", agent.getPrivateIpAddress(), port, CHECK, ipAddress, port); - HttpResponse response = executeHttpRequest(url); - - if (response == null) - return false; - - JsonObject responseInJson = processHttpResponseIntoJson(response); - if (responseInJson == null) { - return false; - } - - return UP.equals(responseInJson.get(STATUS).getAsString()); - } - - protected int getKvmHaMicroservicePortValue() { + protected int getKvmHaMicroservicePortValue(Host host) { Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); if (haAgentPort == null) { LOGGER.warn(String.format("Using default kvm.ha.webservice.port: %s as it was set to NULL for the cluster [id: %d] from %s.", - KVMHAConfig.KvmHaWebservicePort.defaultValue(), agent.getClusterId(), agent)); + KVMHAConfig.KvmHaWebservicePort.defaultValue(), host.getClusterId(), host)); haAgentPort = Integer.parseInt(KVMHAConfig.KvmHaWebservicePort.defaultValue()); } return haAgentPort; @@ -150,9 +127,9 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa int runningVMs = listByHostAndStateRunning.size(); int stoppingVms = listByHostAndStateStopping.size(); int migratingVms = listByHostAndStateMigrating.size(); - int countRunningVmsOnAgent = countRunningVmsOnAgent(); + int countRunningVmsOnAgent = countRunningVmsOnAgent(host); LOGGER.trace( - String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", agent.getName(), startingVMs, runningVMs, + String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. 
Total listed via DB %d / %d (via libvirt)", host.getName(), startingVMs, runningVMs, stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); } @@ -169,35 +146,67 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa * when it could be a inconsistency when migrating a VM.
* (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly */ - public boolean isKvmHaAgentHealthy(Host host, VMInstanceDao vmInstanceDao) { + public boolean isKvmHaAgentHealthy(Host host) { int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); - int numberOfVmsOnAgent = countRunningVmsOnAgent(); + int numberOfVmsOnAgent = countRunningVmsOnAgent(host); if (numberOfVmsOnAgent < 0) { - LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", agent)); - LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", host)); + logIfFencingOrRecoveringMightBeTriggered(host); return false; } if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { return true; } if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { - // Return false as could not find VMs running but it expected at least one VM running, fencing/recovering host would avoid downtime to VMs in this case. - // There is cautious margin added on the conditional. This avoids fencing/recovering hosts when there is one VM migrating to a host that had zero VMs. - // If there are more VMs than the CAUTIOUS_MARGIN_OF_VMS_ON_HOST) the Host should be treated as not healthy and fencing/recovering process might be triggered. 
- LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", agent, numberOfVmsOnHostAccordingToDb)); - LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName(), numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", host, numberOfVmsOnHostAccordingToDb)); + logIfFencingOrRecoveringMightBeTriggered(host); return false; } - // In order to have a less "aggressive" health-check, the KvmHaAgentClient will not return false; fencing/recovering could bring downtime to existing VMs - // Additionally, the inconsistency can also be due to jobs in progress to migrate/stop/start VMs - // Either way, WARN messages should be presented to Admins so they can look closely to what is happening on the host - LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", agent, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", host, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); return true; } + private void logIfFencingOrRecoveringMightBeTriggered(Host agent) { + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName())); + } + + /** + * Sends a HTTP GET request from the host executing the KVM HA Agent webservice to a target Host (expected to also be running the KVM HA Agent). 
+ * The webserver serves a JSON Object such as {"status": "Up"} if the request gets a HTTP_OK OR {"status": "Down"} if HTTP GET failed + */ + public boolean isHostReachableByNeighbour(Host neighbour, Host target) { + String neighbourHostAddress = neighbour.getPrivateIpAddress(); + String targetHostAddress = target.getPrivateIpAddress(); + int port = getKvmHaMicroservicePortValue(neighbour); + String url = String.format("http://%s:%d/%s/%s:%d", neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); + HttpResponse response = executeHttpRequest(url); + + if (response == null) + return false; + + JsonObject responseInJson = processHttpResponseIntoJson(response); + if (responseInJson == null) + return false; + + int statusCode = response.getStatusLine().getStatusCode(); + if (isHttpStatusCodNotOk(statusCode)) { + LOGGER.error( + String.format("Failed HTTP %s Request %s; the expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); + return false; + } + + String hostStatusFromJson = responseInJson.get(STATUS).getAsString(); + return Status.Up.toString().equals(hostStatusFromJson); + } + + private boolean isHttpStatusCodNotOk(int statusCode) { + return statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES; + } + /** * Executes a GET request for the given URL address. 
*/ + @Nullable protected HttpResponse executeHttpRequest(String url) { HttpGet httpReq = prepareHttpRequestForUrl(url); if (httpReq == null) { @@ -213,7 +222,7 @@ protected HttpResponse executeHttpRequest(String url) { LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); return null; } - retryHttpRequest(url, httpReq, client); + response = retryHttpRequest(url, httpReq, client); } return response; } @@ -232,8 +241,9 @@ private HttpGet prepareHttpRequestForUrl(String url) { } /** - * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES} + * Re-executes the HTTP GET request until it gets a response or it reaches the maximum request retries {@link #MAX_REQUEST_RETRIES}. */ + @Nullable protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, HttpClient client) { LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s]. Executing the request again.", httpReq.getMethod(), url)); HttpResponse response = retryUntilGetsHttpResponse(url, httpReq, client); @@ -244,7 +254,7 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt } int statusCode = response.getStatusLine().getStatusCode(); - if (statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES) { + if (isHttpStatusCodNotOk(statusCode)) { LOGGER.error( String.format("Failed to get VMs information with a %s request to URL '%s'. The expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); @@ -255,8 +265,12 @@ protected HttpResponse retryHttpRequest(String url, HttpRequestBase httpReq, Htt return response; } + /** + * Retry HTTP Request until success or it reaches {@link #MAX_REQUEST_RETRIES} retries. It can return null. 
+ */ + @Nullable protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase httpReq, HttpClient client) { - for (int attempt = 1; attempt < MAX_REQUEST_RETRIES + 1; attempt++) { + for (int attempt = 1; attempt <= MAX_REQUEST_RETRIES; attempt++) { try { TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY); LOGGER.debug(String.format("Retry HTTP %s request [URL: %s], attempt %d/%d.", httpReq.getMethod(), url, attempt, MAX_REQUEST_RETRIES)); @@ -276,20 +290,17 @@ protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase ht * * Note: this method can return NULL JsonObject in case HttpResponse is NULL. */ + @Nullable protected JsonObject processHttpResponseIntoJson(HttpResponse response) { - InputStream in; - String jsonString; if (response == null) { return null; } try { - in = response.getEntity().getContent(); + InputStream in = response.getEntity().getContent(); BufferedReader streamReader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)); - jsonString = streamReader.readLine(); + return JSON_PARSER.parse(streamReader.readLine()).getAsJsonObject(); } catch (UnsupportedOperationException | IOException e) { throw new CloudRuntimeException("Failed to process response", e); } - - return new JsonParser().parse(jsonString).getAsJsonObject(); } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java new file mode 100644 index 000000000000..1c44270fdf67 --- /dev/null +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import com.cloud.dc.ClusterVO; +import com.cloud.dc.dao.ClusterDao; +import com.cloud.host.Host; +import com.cloud.host.HostVO; +import com.cloud.host.Status; +import com.cloud.resource.ResourceManager; +import org.apache.log4j.Logger; + +import javax.inject.Inject; +import java.util.List; + +/** + * This class provides methods that help the KVM HA process on checking hosts status as well as deciding if a host should be fenced/recovered or not. + */ +public class KvmHaHelper { + + @Inject + private ResourceManager resourceManager; + @Inject + private KvmHaAgentClient kvmHaAgentClient; + @Inject + private ClusterDao clusterDao; + + private final static Logger LOGGER = Logger.getLogger(KvmHaHelper.class); + private final static double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; + + /** + * It checks the KVM node status via KVM HA Agent. + * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. 
+ */ + public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { + boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(host); + if(isVmsCountOnKvmMatchingWithDatabase) { + agentStatus = Status.Up; + LOGGER.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); + } else { + LOGGER.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent", agentStatus)); + } + return agentStatus; + } + + /** + * Returns false if the cluster has no problematic hosts or a small fraction of it.

+ * Returns true if the cluster is problematic. A cluster is problematic if many hosts are in Down or Disconnected states, in such case it should not recover/fence.
+ * Instead, Admins should be warned and check as it could be networking problems and also might not even have resources capacity on the few Healthy hosts at the cluster. + */ + private boolean isClusteProblematic(Host host) { + List hostsInCluster = resourceManager.listAllHostsInCluster(host.getClusterId()); + List problematicNeighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Down); + problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Disconnected)); + problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Alert)); + problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Error)); + int problematicHosts = problematicNeighbors.size(); + + int problematicHostsRatioAccepted = (int)(hostsInCluster.size() * PROBLEMATIC_HOSTS_RATIO_ACCEPTED); + if (problematicHosts >= problematicHostsRatioAccepted) { + ClusterVO cluster = clusterDao.findById(host.getClusterId()); + LOGGER.warn(String.format("%s is problematic but HA will not fence/recover due to its cluster [id: %d, name: %s] containing %d problematic hosts (Down, Disconnected, " + + "Alert or Error states). Maximum problematic hosts accepted for this cluster is %d.", + host, cluster.getId(), cluster.getName(), problematicHosts, problematicHostsRatioAccepted)); + return true; + } + + return false; + } + + private boolean isHostAgentReachableByNeighbour(Host host) { + List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); + for (HostVO neighbor : neighbors) { + boolean isVmActivtyOnNeighborHost = kvmHaAgentClient.isKvmHaAgentHealthy(neighbor); + if(isVmActivtyOnNeighborHost) { + boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); + if (isReachable) { + String.format( "%s is reachable by neighbour %s. 
If CloudStack is failing to reach the respective host then it is probably a network issue between the host " + + "and CloudStack management server.", host, neighbor); + return true; + } + } + } + return false; + } + + /** + * Returns true if the host is healthy. The health-check is performed via HTTP GET request to a service that retrieves Running KVM instances via Libvirt.
+ * The health-check is executed on the KVM node and verifies the amount of VMs running and if the Libvirt service is running. + */ + public boolean isVmActivtyOnHostViaKvmHaWebservice(Host host) { + boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(host); + + if (!isKvmHaAgentHealthy) { + if (isClusteProblematic(host) || isHostAgentReachableByNeighbour(host)) { + return true; + } + } + + return isKvmHaAgentHealthy; + } + + /** + * Checks if the KVM HA webservice is enabled. One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. + */ + public boolean isKvmHaWebserviceEnabled(Host host) { + KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(); + if (!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { + LOGGER.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled.", host)); + return false; + } + return true; + } +} diff --git a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml index bedaf1851d18..fb68f78523d6 100644 --- a/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml +++ b/plugins/hypervisors/kvm/src/main/resources/META-INF/cloudstack/kvm-compute/spring-kvm-compute-context.xml @@ -31,11 +31,12 @@ - - + + + diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index bf07313b5133..e7d7b24815e4 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -20,7 +20,6 @@ import java.util.List; import org.apache.commons.io.IOUtils; -import 
org.apache.commons.lang3.math.NumberUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; @@ -50,15 +49,20 @@ public class KvmHaAgentClientTest { private static final int ERROR_CODE = -1; - private HostVO agent = Mockito.mock(HostVO.class); - private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient(agent)); + private HostVO host = Mockito.mock(HostVO.class); + private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient()); + private static final String CHECK_NEIGHBOUR = "check-neighbour"; private static final int DEFAULT_PORT = 8080; private static final String PRIVATE_IP_ADDRESS = "1.2.3.4"; private static final String JSON_STRING_EXAMPLE_3VMs = "{\"count\":3,\"virtualmachines\":[\"r-123-VM\",\"v-134-VM\",\"s-111-VM\"]}"; private static final int EXPECTED_RUNNING_VMS_EXAMPLE_3VMs = 3; private static final String JSON_STRING_EXAMPLE_0VMs = "{\"count\":0,\"virtualmachines\":[]}"; + private static final String JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP = "{\"status\": \"Up\"}"; + private static final String JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_DOWN = "{\"status\": \"Down\"}"; private static final int EXPECTED_RUNNING_VMS_EXAMPLE_0VMs = 0; private static final String EXPECTED_URL = String.format("http://%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT); + private static final String EXPECTED_URL_CHECK_NEIGHBOUR = String + .format("http://%s:%d/%s/%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT, CHECK_NEIGHBOUR, PRIVATE_IP_ADDRESS, DEFAULT_PORT); private static final HttpRequestBase HTTP_REQUEST_BASE = new HttpGet(EXPECTED_URL); private static final String VMS_COUNT = "count"; private static final String VIRTUAL_MACHINES = "virtualmachines"; @@ -103,9 +107,9 @@ private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunning } Mockito.doReturn(vmsOnHostList).when(kvmHaAgentClient).listVmsOnHost(Mockito.any(), Mockito.any()); - 
Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(); + Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(Mockito.any()); - return kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao); + return kvmHaAgentClient.isKvmHaAgentHealthy(host); } @Test @@ -159,13 +163,13 @@ public void countRunningVmsOnAgentTestBlankNoVmsListed() throws IOException { } private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) throws IOException { - Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + Mockito.when(host.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); - int result = kvmHaAgentClient.countRunningVmsOnAgent(); + int result = kvmHaAgentClient.countRunningVmsOnAgent(host); Assert.assertEquals(expectedListedVms, result); } @@ -256,23 +260,36 @@ public void isKvmHaWebserviceEnabledTestDefault() { @Test public void getKvmHaMicroservicePortValueTestDefault() { - Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue()); - } - -// private void prepareAndRunCountRunningVmsOnAgent(String jsonStringExample, int expectedListedVms) throws IOException { -// Mockito.when(agent.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); -// Mockito.doReturn(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_3VMs)).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL); -// -// JsonObject jObject = new JsonParser().parse(jsonStringExample).getAsJsonObject(); -// Mockito.doReturn(jObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); -// -// int result = 
kvmHaAgentClient.countRunningVmsOnAgent(); -// Assert.assertEquals(expectedListedVms, result); -// } -//TODO -// @Test -// public void isTargetHostReachableTest() { -// kvmHaAgentClient.isTargetHostReachable(PRIVATE_IP_ADDRESS); -// } + Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue(host)); + } + + @Test + public void isTargetHostReachableTestIsUp() throws IOException { + prepareAndRunisTargetHostReachableTest(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP), true); + } + + @Test + public void isTargetHostReachableTestIsDown() throws IOException { + prepareAndRunisTargetHostReachableTest(mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_DOWN), false); + } + + @Test + public void isTargetHostReachableTestNotFound() throws IOException { + prepareAndRunisTargetHostReachableTest(mockResponse(HttpStatus.SC_NOT_FOUND, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP), false); + } + + @Test + public void isTargetHostReachableTestNullResponse() throws IOException { + prepareAndRunisTargetHostReachableTest(null, false); + } + + private void prepareAndRunisTargetHostReachableTest(CloseableHttpResponse response, boolean expected) throws IOException { + Mockito.when(host.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + Mockito.when(kvmHaAgentClient.getKvmHaMicroservicePortValue(Mockito.any())).thenReturn(8080); + Mockito.doReturn(response).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL_CHECK_NEIGHBOUR); + + boolean result = kvmHaAgentClient.isHostReachableByNeighbour(host, host); + Assert.assertEquals(expected, result); + } } From b5191e883fec80abc0b96c85eb77a984d6fd78e6 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 19 May 2021 04:50:16 -0300 Subject: [PATCH 15/40] Fix checkstyle --- .../kvm/ha/KVMHostActivityChecker.java | 16 +++++++------- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 22 ++++++------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git 
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 9a8e72364b8b..1e85c91dda06 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -81,34 +81,34 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck private static final Set NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); @Override - public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { + public boolean isActive(Host host, DateTime suspectTime) throws HACheckerException { try { - return isVMActivtyOnHost(r, suspectTime); + return isVMActivtyOnHost(host, suspectTime); } catch (HACheckerException e) { //Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown throw e; } catch (Exception e) { - String message = String.format("Operation timed out, probably the %s is not reachable.", r.toString()); + String message = String.format("Operation timed out, probably the %s is not reachable.", host.toString()); LOG.warn(message, e); throw new HACheckerException(message, e); } } @Override - public boolean isHealthy(Host r) { + public boolean isHealthy(Host host) { boolean isHealthy = true; - boolean isHostServedByNfsPool = isHostServedByNfsPool(r); - boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(r); + boolean isHostServedByNfsPool = isHostServedByNfsPool(host); + boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(host); if(isHostServedByNfsPool) { - isHealthy = isHealthViaNfs(r); + isHealthy = isHealthViaNfs(host); } if (!isKvmHaWebserviceEnabled) { return isHealthy; } - if 
(kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(r) && !isHealthy) { + if (kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(host) && !isHealthy) { isHealthy = true; } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 6a28e976d138..df873ef28185 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -38,7 +38,6 @@ import java.io.InputStreamReader; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; @@ -112,28 +111,21 @@ public boolean isKvmHaWebserviceEnabled() { * but that's not likely and thus it is not relevant for this very context. */ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { - List listByHostAndStateRunning = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running); - List listByHostAndStateStopping = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Stopping); - List listByHostAndStateMigrating = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Migrating); - - List listByHostAndState = new ArrayList<>(); - listByHostAndState.addAll(listByHostAndStateRunning); - listByHostAndState.addAll(listByHostAndStateStopping); - listByHostAndState.addAll(listByHostAndStateMigrating); + List listByHostAndStates = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running, VirtualMachine.State.Stopping, VirtualMachine.State.Migrating); if (LOGGER.isTraceEnabled()) { List listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); int startingVMs = listByHostAndStateStarting.size(); - int runningVMs = 
listByHostAndStateRunning.size(); - int stoppingVms = listByHostAndStateStopping.size(); - int migratingVms = listByHostAndStateMigrating.size(); + long runningVMs = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Running).count(); + long stoppingVms = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Stopping).count(); + long migratingVms = listByHostAndStates.stream().filter(vm -> vm.getState() == VirtualMachine.State.Migrating).count(); int countRunningVmsOnAgent = countRunningVmsOnAgent(host); LOGGER.trace( String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. Total listed via DB %d / %d (via libvirt)", host.getName(), startingVMs, runningVMs, - stoppingVms, migratingVms, listByHostAndState.size(), countRunningVmsOnAgent)); + stoppingVms, migratingVms, listByHostAndStates.size(), countRunningVmsOnAgent)); } - return listByHostAndState; + return listByHostAndStates; } /** @@ -171,7 +163,7 @@ private void logIfFencingOrRecoveringMightBeTriggered(Host agent) { } /** - * Sends a HTTP GET request from the host executing the KVM HA Agent webservice to a target Host (expected to also be running the KVM HA Agent). + * Sends HTTP GET request from the host executing the KVM HA Agent webservice to a target Host (expected to also be running the KVM HA Agent). 
* The webserver serves a JSON Object such as {"status": "Up"} if the request gets a HTTP_OK OR {"status": "Down"} if HTTP GET failed */ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { From e4c11fa80b8c861829618bf79ea5afd6770b224b Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 19 May 2021 12:12:44 -0300 Subject: [PATCH 16/40] Add tests and enhance HA flow via new KvmHaHelper --- .../kvm/ha/KVMHostActivityChecker.java | 4 +- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 45 +------- .../apache/cloudstack/kvm/ha/KvmHaHelper.java | 71 ++++++++++-- .../kvm/ha/KvmHaAgentClientTest.java | 46 -------- .../cloudstack/kvm/ha/KvmHaHelperTest.java | 107 ++++++++++++++++++ 5 files changed, 171 insertions(+), 102 deletions(-) create mode 100644 plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 1e85c91dda06..1354046cc459 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -108,7 +108,7 @@ public boolean isHealthy(Host host) { return isHealthy; } - if (kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(host) && !isHealthy) { + if (kvmHaHelper.isKvmHealthyCheckViaLibvirt(host) && !isHealthy) { isHealthy = true; } @@ -207,7 +207,7 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe } } - boolean isKvmHaAgentHealthy = kvmHaHelper.isVmActivtyOnHostViaKvmHaWebservice(agent); + boolean isKvmHaAgentHealthy = kvmHaHelper.isKvmHealthyCheckViaLibvirt(agent); if (!activityStatus && isKvmHaAgentHealthy) { activityStatus = true; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java 
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index df873ef28185..d39940716d07 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -61,7 +61,6 @@ public class KvmHaAgentClient { private static final String CHECK_NEIGHBOUR = "check-neighbour"; private static final int WAIT_FOR_REQUEST_RETRY = 2; private static final int MAX_REQUEST_RETRIES = 2; - private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; private static final JsonParser JSON_PARSER = new JsonParser(); @Inject @@ -70,7 +69,7 @@ public class KvmHaAgentClient { /** * Returns the number of VMs running on the KVM host according to Libvirt. */ - protected int countRunningVmsOnAgent(Host host) { + public int countRunningVmsOnAgent(Host host) { String url = String.format("http://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); HttpResponse response = executeHttpRequest(url); @@ -95,12 +94,6 @@ protected int getKvmHaMicroservicePortValue(Host host) { return haAgentPort; } - /** - * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. - */ - public boolean isKvmHaWebserviceEnabled() { - return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); - } /** * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. @@ -110,7 +103,7 @@ public boolean isKvmHaWebserviceEnabled() { * However, there is still a probability of a VM in 'Starting' state be already listed on the KVM via '$virsh list', * but that's not likely and thus it is not relevant for this very context. 
*/ - protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDao) { + public List listVmsOnHost(Host host) { List listByHostAndStates = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running, VirtualMachine.State.Stopping, VirtualMachine.State.Migrating); if (LOGGER.isTraceEnabled()) { @@ -128,40 +121,6 @@ protected List listVmsOnHost(Host host, VMInstanceDao vmInstanceDa return listByHostAndStates; } - /** - * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt.

- * - * IF:
- * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least - * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.
- * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing - * when it could be a inconsistency when migrating a VM.
- * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly - */ - public boolean isKvmHaAgentHealthy(Host host) { - int numberOfVmsOnHostAccordingToDb = listVmsOnHost(host, vmInstanceDao).size(); - int numberOfVmsOnAgent = countRunningVmsOnAgent(host); - if (numberOfVmsOnAgent < 0) { - LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", host)); - logIfFencingOrRecoveringMightBeTriggered(host); - return false; - } - if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { - return true; - } - if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { - LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", host, numberOfVmsOnHostAccordingToDb)); - logIfFencingOrRecoveringMightBeTriggered(host); - return false; - } - LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", host, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); - return true; - } - - private void logIfFencingOrRecoveringMightBeTriggered(Host agent) { - LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName())); - } - /** * Sends HTTP GET request from the host executing the KVM HA Agent webservice to a target Host (expected to also be running the KVM HA Agent). 
* The webserver serves a JSON Object such as {"status": "Up"} if the request gets a HTTP_OK OR {"status": "Down"} if HTTP GET failed diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 1c44270fdf67..c67ed62069eb 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -35,21 +35,22 @@ public class KvmHaHelper { @Inject - private ResourceManager resourceManager; + protected ResourceManager resourceManager; @Inject - private KvmHaAgentClient kvmHaAgentClient; + protected KvmHaAgentClient kvmHaAgentClient; @Inject - private ClusterDao clusterDao; + protected ClusterDao clusterDao; - private final static Logger LOGGER = Logger.getLogger(KvmHaHelper.class); - private final static double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; + private static final Logger LOGGER = Logger.getLogger(KvmHaHelper.class); + private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; + private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; /** * It checks the KVM node status via KVM HA Agent. * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. 
*/ public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { - boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(host); + boolean isVmsCountOnKvmMatchingWithDatabase = isKvmHaAgentHealthy(host); if(isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; LOGGER.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); @@ -84,10 +85,10 @@ private boolean isClusteProblematic(Host host) { return false; } - private boolean isHostAgentReachableByNeighbour(Host host) { + protected boolean isHostAgentReachableByNeighbour(Host host) { List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { - boolean isVmActivtyOnNeighborHost = kvmHaAgentClient.isKvmHaAgentHealthy(neighbor); + boolean isVmActivtyOnNeighborHost = isKvmHaAgentHealthy(neighbor); if(isVmActivtyOnNeighborHost) { boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); if (isReachable) { @@ -104,8 +105,8 @@ private boolean isHostAgentReachableByNeighbour(Host host) { * Returns true if the host is healthy. The health-check is performed via HTTP GET request to a service that retrieves Running KVM instances via Libvirt.
* The health-check is executed on the KVM node and verifies the amount of VMs running and if the Libvirt service is running. */ - public boolean isVmActivtyOnHostViaKvmHaWebservice(Host host) { - boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(host); + public boolean isKvmHealthyCheckViaLibvirt(Host host) { + boolean isKvmHaAgentHealthy = isKvmHaAgentHealthy(host); if (!isKvmHaAgentHealthy) { if (isClusteProblematic(host) || isHostAgentReachableByNeighbour(host)) { @@ -121,10 +122,58 @@ public boolean isVmActivtyOnHostViaKvmHaWebservice(Host host) { */ public boolean isKvmHaWebserviceEnabled(Host host) { KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(); - if (!kvmHaAgentClient.isKvmHaWebserviceEnabled()) { + if (!isKvmHaWebserviceEnabled(host)) { LOGGER.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled.", host)); return false; } return true; } + + /** + * Returns true in case of the expected number of VMs matches with the VMs running on the KVM host according to Libvirt.

+ * + * IF:
+ * (i) KVM HA agent finds 0 running but CloudStack considers that the host has 2 or more VMs running: returns false as could not find VMs running but it expected at least + * 2 VMs running, fencing/recovering host would avoid downtime to VMs in this case.
+ * (ii) KVM HA agent finds 0 VM running but CloudStack considers that the host has 1 VM running: return true and log WARN messages and avoids triggering HA recovery/fencing + * when it could be a inconsistency when migrating a VM.
+ * (iii) amount of listed VMs is different than expected: return true and print WARN messages so Admins can monitor and react accordingly + */ + public boolean isKvmHaAgentHealthy(Host host) { + int numberOfVmsOnHostAccordingToDb = kvmHaAgentClient.listVmsOnHost(host).size(); + int numberOfVmsOnAgent = kvmHaAgentClient.countRunningVmsOnAgent(host); + + if (numberOfVmsOnAgent < 0) { + LOGGER.error(String.format("KVM HA Agent health check failed, either the KVM Agent %s is unreachable or Libvirt validation failed.", host)); + if (isHostAgentReachableByNeighbour(host)) { + return true; + } + logIfFencingOrRecoveringMightBeTriggered(host); + return false; + } + + if (numberOfVmsOnHostAccordingToDb == numberOfVmsOnAgent) { + return true; + } + + if (numberOfVmsOnAgent == 0 && numberOfVmsOnHostAccordingToDb > CAUTIOUS_MARGIN_OF_VMS_ON_HOST) { + LOGGER.warn(String.format("KVM HA Agent %s could not find VMs; it was expected to list %d VMs.", host, numberOfVmsOnHostAccordingToDb)); + logIfFencingOrRecoveringMightBeTriggered(host); + return false; + } + + LOGGER.warn(String.format("KVM HA Agent %s listed %d VMs; however, it was expected %d VMs.", host, numberOfVmsOnAgent, numberOfVmsOnHostAccordingToDb)); + return true; + } + + private void logIfFencingOrRecoveringMightBeTriggered(Host agent) { + LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName())); + } + + /** + * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. 
+ */ + public boolean isKvmHaWebserviceEnabled() { + return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + } } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index e7d7b24815e4..8d0fddbc61d2 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -16,8 +16,6 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; @@ -38,7 +36,6 @@ import org.mockito.junit.MockitoJUnitRunner; import com.cloud.host.HostVO; -import com.cloud.vm.VMInstanceVO; import com.cloud.vm.dao.VMInstanceDaoImpl; import com.google.gson.JsonArray; import com.google.gson.JsonElement; @@ -48,7 +45,6 @@ @RunWith(MockitoJUnitRunner.class) public class KvmHaAgentClientTest { - private static final int ERROR_CODE = -1; private HostVO host = Mockito.mock(HostVO.class); private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient()); private static final String CHECK_NEIGHBOUR = "check-neighbour"; @@ -75,43 +71,6 @@ public class KvmHaAgentClientTest { @Mock VMInstanceDaoImpl vmInstanceDao; - @Test - public void isKvmHaAgentHealthyTestAllGood() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs); - Assert.assertTrue(result); - } - - @Test - public void isKvmHaAgentHealthyTestVMsDoNotMatchButDoNotReturnFalse() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1); - Assert.assertTrue(result); - } - - @Test - public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListed() { - boolean result = 
isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0); - Assert.assertFalse(result); - } - - @Test - public void isKvmHaAgentHealthyTestReceivedErrorCode() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE); - Assert.assertFalse(result); - } - - private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunningOnAgent) { - List vmsOnHostList = new ArrayList<>(); - for (int i = 0; i < expectedNumberOfVms; i++) { - VMInstanceVO vmInstance = Mockito.mock(VMInstanceVO.class); - vmsOnHostList.add(vmInstance); - } - - Mockito.doReturn(vmsOnHostList).when(kvmHaAgentClient).listVmsOnHost(Mockito.any(), Mockito.any()); - Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(Mockito.any()); - - return kvmHaAgentClient.isKvmHaAgentHealthy(host); - } - @Test public void processHttpResponseIntoJsonTestNull() { JsonObject responseJson = kvmHaAgentClient.processHttpResponseIntoJson(null); @@ -253,11 +212,6 @@ public void retryUntilGetsHttpResponseTestTwoIOException() throws IOException { Assert.assertNull(result); } - @Test - public void isKvmHaWebserviceEnabledTestDefault() { - Assert.assertFalse(kvmHaAgentClient.isKvmHaWebserviceEnabled()); - } - @Test public void getKvmHaMicroservicePortValueTestDefault() { Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue(host)); diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java new file mode 100644 index 000000000000..30748e76e684 --- /dev/null +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java @@ -0,0 +1,107 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cloudstack.kvm.ha; + +import java.util.ArrayList; +import java.util.List; + +import com.cloud.host.HostVO; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.Mockito; + +import com.cloud.vm.VMInstanceVO; +import org.mockito.MockitoAnnotations; +import org.mockito.Spy; +import org.mockito.junit.MockitoJUnitRunner; + +@RunWith(MockitoJUnitRunner.class) +public class KvmHaHelperTest { + + private static final int ERROR_CODE = -1; + private static final int EXPECTED_RUNNING_VMS_EXAMPLE_3VMs = 3; + + @Spy + @InjectMocks + private KvmHaHelper kvmHaHelper; + @Mock + private KvmHaAgentClient kvmHaAgentClient; + @Mock + private HostVO host; + + @Before + public void setup() { + MockitoAnnotations.initMocks(this); + } + + @Test + public void isKvmHaAgentHealthyTestAllGood() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, true); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestVMsDoNotMatchButDoNotReturnFalse() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1, true); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListed() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, true); + Assert.assertFalse(result); + } + + @Test + public void 
isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListedUnreachable() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, false); + Assert.assertFalse(result); + } + + @Test + public void isKvmHaAgentHealthyTestReceivedErrorCode() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, true); + Assert.assertTrue(result); + } + + @Test + public void isKvmHaAgentHealthyTestReceivedErrorCodeHostUnreachable() { + boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, false); + Assert.assertFalse(result); + } + + private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunningOnAgent, boolean isHostAgentReachableByNeighbour) { + List vmsOnHostList = new ArrayList<>(); + for (int i = 0; i < expectedNumberOfVms; i++) { + VMInstanceVO vmInstance = Mockito.mock(VMInstanceVO.class); + vmsOnHostList.add(vmInstance); + } + + Mockito.doReturn(vmsOnHostList).when(kvmHaAgentClient).listVmsOnHost(Mockito.any()); + Mockito.doReturn(vmsRunningOnAgent).when(kvmHaAgentClient).countRunningVmsOnAgent(Mockito.any()); + Mockito.doReturn(isHostAgentReachableByNeighbour).when(kvmHaHelper).isHostAgentReachableByNeighbour(Mockito.any()); + + return kvmHaHelper.isKvmHaAgentHealthy(host); + } + + @Test + public void isKvmHaWebserviceEnabledTestDefault() { + Assert.assertFalse(kvmHaHelper.isKvmHaWebserviceEnabled(Mockito.any())); + } + +} From de8f68e32027984fab941ea202c18975c8b4dc9c Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 19 May 2021 12:30:51 -0300 Subject: [PATCH 17/40] Fix isKvmHaWebserviceEnabled --- .../src/main/java/com/cloud/ha/KVMInvestigator.java | 5 ++++- .../apache/cloudstack/kvm/ha/KvmHaAgentClient.java | 1 - .../org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 11 ++--------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java 
b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 313631339697..099a81504f2a 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -86,7 +86,10 @@ public Status isAgentAlive(Host agent) { s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent)); } - agentStatus = kvmHaHelper.checkAgentStatusViaKvmHaAgent(agent, agentStatus); + boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(agent); + if(isKvmHaWebserviceEnabled) { + agentStatus = kvmHaHelper.checkAgentStatusViaKvmHaAgent(agent, agentStatus); + } return agentStatus; } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index d39940716d07..dedc7ca483df 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -94,7 +94,6 @@ protected int getKvmHaMicroservicePortValue(Host host) { return haAgentPort; } - /** * Lists VMs on host according to vm_instance DB table. The states considered for such listing are: 'Running', 'Stopping', 'Migrating'. *
diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index c67ed62069eb..5a43ae5b2474 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -121,8 +121,8 @@ public boolean isKvmHealthyCheckViaLibvirt(Host host) { * Checks if the KVM HA webservice is enabled. One can enable or disable it via global settings 'kvm.ha.webservice.enabled'. */ public boolean isKvmHaWebserviceEnabled(Host host) { - KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(); - if (!isKvmHaWebserviceEnabled(host)) { + boolean isKvmHaWebserviceEnabled = KVMHAConfig.IsKvmHaWebserviceEnabled.value(); + if (!isKvmHaWebserviceEnabled) { LOGGER.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled.", host)); return false; } @@ -169,11 +169,4 @@ public boolean isKvmHaAgentHealthy(Host host) { private void logIfFencingOrRecoveringMightBeTriggered(Host agent) { LOGGER.warn(String.format("Host %s is not considered healthy and HA fencing/recovering process might be triggered.", agent.getName())); } - - /** - * Checks if the KVM HA Webservice is enabled or not; if disabled then CloudStack ignores HA validation via the webservice. 
- */ - public boolean isKvmHaWebserviceEnabled() { - return KVMHAConfig.IsKvmHaWebserviceEnabled.value(); - } } From 7b4e086d81034db790cccddce68c1f7d30a05fef Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Sat, 22 May 2021 01:26:55 -0300 Subject: [PATCH 18/40] Add test cases and config key --- .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 8 +- .../cloudstack/kvm/ha/KVMHAProvider.java | 3 +- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 2 +- .../apache/cloudstack/kvm/ha/KvmHaHelper.java | 28 +++--- .../kvm/ha/KvmHaAgentClientTest.java | 75 ++++++++++++++-- .../cloudstack/kvm/ha/KvmHaHelperTest.java | 85 +++++++++++++++++-- .../cloudstack/ha/provider/HAProvider.java | 4 +- 7 files changed, 176 insertions(+), 29 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 68d570823ce9..46d1da1f412a 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -58,6 +58,12 @@ public class KVMHAConfig { true, ConfigKey.Scope.Cluster); public static final ConfigKey IsKvmHaWebserviceEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "false", - "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. One can enable (set to 'true') or disable it ('false'). If disabled then CloudStack ignores HA validation via this agent.", + "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. " + + "One can enable (set to 'true') or disable it ('false'). 
If disabled then CloudStack ignores HA validation via this agent.", + true, ConfigKey.Scope.Cluster); + + public static final ConfigKey KvmHaAcceptedProblematicHostsRatio = new ConfigKey("Advanced", Double.class, "kvm.ha.accepted.problematic.hosts.ratio", "0.3", + "The ratio of problematic Hosts accepted on a Cluster. If a cluster has more than the accepted ratio, HA will not be Fence/Recover Hosts and Admins will be notified to check the cluster healthy. " + + "A Host is considered problematic if in one of the following states: Error, Alert, Down, Disconnected. Default value is '0.3' (30%).", true, ConfigKey.Scope.Cluster); } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index 5358bd1d6656..69f3f2aa07bd 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -153,7 +153,8 @@ public ConfigKey[] getConfigKeys() { KVMHAConfig.KvmHARecoverWaitPeriod, KVMHAConfig.KvmHARecoverAttemptThreshold, KVMHAConfig.KvmHaWebservicePort, - KVMHAConfig.IsKvmHaWebserviceEnabled + KVMHAConfig.IsKvmHaWebserviceEnabled, + KVMHAConfig.KvmHaAcceptedProblematicHostsRatio }; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index dedc7ca483df..8b55435781ef 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -149,7 +149,7 @@ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { return Status.Up.toString().equals(hostStatusFromJson); } - private boolean isHttpStatusCodNotOk(int statusCode) { + protected boolean 
isHttpStatusCodNotOk(int statusCode) { return statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES; } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 5a43ae5b2474..e5ab1316426b 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -25,9 +25,14 @@ import com.cloud.host.Status; import com.cloud.resource.ResourceManager; import org.apache.log4j.Logger; +import org.jetbrains.annotations.NotNull; import javax.inject.Inject; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; /** * This class provides methods that help the KVM HA process on checking hosts status as well as deciding if a host should be fenced/recovered or not. @@ -45,6 +50,8 @@ public class KvmHaHelper { private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private static final Set PROBLEMATIC_HOST_STATUS = new HashSet<>(Arrays.asList(Status.Alert, Status.Disconnected, Status.Down, Status.Error)); + /** * It checks the KVM node status via KVM HA Agent. * If the agent is healthy it returns Status.Up, otherwise it keeps the provided Status as it is. @@ -60,28 +67,29 @@ public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { return agentStatus; } + @NotNull + protected List listProblematicHosts(List hostsInCluster) { + return hostsInCluster.stream().filter(neighbour -> PROBLEMATIC_HOST_STATUS.contains(neighbour.getStatus())).collect(Collectors.toList()); + } + /** * Returns false if the cluster has no problematic hosts or a small fraction of it.

* Returns true if the cluster is problematic. A cluster is problematic if many hosts are in Down or Disconnected states, in such case it should not recover/fence.
* Instead, Admins should be warned and check as it could be networking problems and also might not even have resources capacity on the few Healthy hosts at the cluster. */ - private boolean isClusteProblematic(Host host) { + protected boolean isClusteProblematic(Host host) { List hostsInCluster = resourceManager.listAllHostsInCluster(host.getClusterId()); - List problematicNeighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Down); - problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Disconnected)); - problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Alert)); - problematicNeighbors.addAll(resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Error)); + List problematicNeighbors = listProblematicHosts(hostsInCluster); int problematicHosts = problematicNeighbors.size(); + int problematicHostsRatioAccepted = (int) (hostsInCluster.size() * KVMHAConfig.KvmHaAcceptedProblematicHostsRatio.value()); - int problematicHostsRatioAccepted = (int)(hostsInCluster.size() * PROBLEMATIC_HOSTS_RATIO_ACCEPTED); - if (problematicHosts >= problematicHostsRatioAccepted) { + if (problematicHosts > problematicHostsRatioAccepted) { ClusterVO cluster = clusterDao.findById(host.getClusterId()); LOGGER.warn(String.format("%s is problematic but HA will not fence/recover due to its cluster [id: %d, name: %s] containing %d problematic hosts (Down, Disconnected, " + "Alert or Error states). 
Maximum problematic hosts accepted for this cluster is %d.", host, cluster.getId(), cluster.getName(), problematicHosts, problematicHostsRatioAccepted)); return true; } - return false; } @@ -89,10 +97,10 @@ protected boolean isHostAgentReachableByNeighbour(Host host) { List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { boolean isVmActivtyOnNeighborHost = isKvmHaAgentHealthy(neighbor); - if(isVmActivtyOnNeighborHost) { + if (isVmActivtyOnNeighborHost) { boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); if (isReachable) { - String.format( "%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " + String.format("%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " + "and CloudStack management server.", host, neighbor); return true; } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index 8d0fddbc61d2..85a2e3cbd052 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -31,12 +31,13 @@ import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; +import org.mockito.InjectMocks; import org.mockito.Mock; import org.mockito.Mockito; +import org.mockito.Spy; import org.mockito.junit.MockitoJUnitRunner; import com.cloud.host.HostVO; -import com.cloud.vm.dao.VMInstanceDaoImpl; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; @@ -46,7 +47,6 @@ public class KvmHaAgentClientTest { private HostVO host = 
Mockito.mock(HostVO.class); - private KvmHaAgentClient kvmHaAgentClient = Mockito.spy(new KvmHaAgentClient()); private static final String CHECK_NEIGHBOUR = "check-neighbour"; private static final int DEFAULT_PORT = 8080; private static final String PRIVATE_IP_ADDRESS = "1.2.3.4"; @@ -65,11 +65,11 @@ public class KvmHaAgentClientTest { private static final int MAX_REQUEST_RETRIES = 2; private static final int KVM_HA_WEBSERVICE_PORT = 8080; + @Spy + @InjectMocks + private KvmHaAgentClient kvmHaAgentClient; @Mock - HttpClient client; - - @Mock - VMInstanceDaoImpl vmInstanceDao; + private HttpClient client; @Test public void processHttpResponseIntoJsonTestNull() { @@ -246,4 +246,67 @@ private void prepareAndRunisTargetHostReachableTest(CloseableHttpResponse respon Assert.assertEquals(expected, result); } + @Test + public void isHostReachableByNeighbourTest() throws IOException { + JsonObject jsonObject = new JsonParser().parse(JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP).getAsJsonObject(); + CloseableHttpResponse response = mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP); + prepareAndTestIsHostReachableByNeighbour(response, jsonObject, true); + } + + @Test + public void isHostReachableByNeighbourTestHttp300() throws IOException { + JsonObject jsonObject = new JsonParser().parse(JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP).getAsJsonObject(); + CloseableHttpResponse response = mockResponse(HttpStatus.SC_MULTIPLE_CHOICES, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP); + prepareAndTestIsHostReachableByNeighbour(response, jsonObject, false); + } + + @Test + public void isHostReachableByNeighbourTestHttp404() throws IOException { + JsonObject jsonObject = new JsonParser().parse(JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP).getAsJsonObject(); + CloseableHttpResponse response = mockResponse(HttpStatus.SC_NOT_FOUND, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP); + prepareAndTestIsHostReachableByNeighbour(response, jsonObject, false); + } + + @Test + public void 
isHostReachableByNeighbourTestNullResponse() throws IOException { + JsonObject jsonObject = new JsonParser().parse(JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP).getAsJsonObject(); + prepareAndTestIsHostReachableByNeighbour(null, jsonObject, false); + } + + @Test + public void isHostReachableByNeighbourTestNullJson() throws IOException { + CloseableHttpResponse response = mockResponse(HttpStatus.SC_OK, JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP); + prepareAndTestIsHostReachableByNeighbour(response, null, false); + } + + private void prepareAndTestIsHostReachableByNeighbour(CloseableHttpResponse response, JsonObject jsonObject, boolean expected) throws IOException { + HostVO neighbour = Mockito.mock(HostVO.class); + Mockito.when(neighbour.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + HostVO target = Mockito.mock(HostVO.class); + Mockito.when(target.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); + + Mockito.doReturn(response).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL_CHECK_NEIGHBOUR); + Mockito.doReturn(jsonObject).when(kvmHaAgentClient).processHttpResponseIntoJson(Mockito.any(HttpResponse.class)); + + boolean result = kvmHaAgentClient.isHostReachableByNeighbour(neighbour, target); + Assert.assertEquals(expected, result); + } + + @Test + public void isHttpStatusCodNotOkTestHttp200() { + boolean result = kvmHaAgentClient.isHttpStatusCodNotOk(HttpStatus.SC_OK); + Assert.assertFalse(result); + } + + @Test + public void isHttpStatusCodNotOkTestHttp300() { + boolean result = kvmHaAgentClient.isHttpStatusCodNotOk(HttpStatus.SC_MULTIPLE_CHOICES); + Assert.assertTrue(result); + } + + @Test + public void isHttpStatusCodNotOkTestHttp404() { + boolean result = kvmHaAgentClient.isHttpStatusCodNotOk(HttpStatus.SC_NOT_FOUND); + Assert.assertTrue(result); + } } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java index 
30748e76e684..9a62e9f0c1c2 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java @@ -16,7 +16,11 @@ import java.util.ArrayList; import java.util.List; +import com.cloud.dc.ClusterVO; +import com.cloud.dc.dao.ClusterDao; import com.cloud.host.HostVO; +import com.cloud.host.Status; +import com.cloud.resource.ResourceManager; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -43,6 +47,10 @@ public class KvmHaHelperTest { private KvmHaAgentClient kvmHaAgentClient; @Mock private HostVO host; + @Mock + private ResourceManager resourceManager; + @Mock + private ClusterDao clusterDao; @Before public void setup() { @@ -51,41 +59,41 @@ public void setup() { @Test public void isKvmHaAgentHealthyTestAllGood() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, true); + boolean result = prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, true); Assert.assertTrue(result); } @Test public void isKvmHaAgentHealthyTestVMsDoNotMatchButDoNotReturnFalse() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1, true); + boolean result = prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 1, true); Assert.assertTrue(result); } @Test public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListed() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, true); + boolean result = prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, true); Assert.assertFalse(result); } @Test public void isKvmHaAgentHealthyTestExpectedRunningVmsButNoneListedUnreachable() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, false); + boolean result = 
prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, 0, false); Assert.assertFalse(result); } @Test public void isKvmHaAgentHealthyTestReceivedErrorCode() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, true); + boolean result = prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, true); Assert.assertTrue(result); } @Test public void isKvmHaAgentHealthyTestReceivedErrorCodeHostUnreachable() { - boolean result = isKvmHaAgentHealthyTests(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, false); + boolean result = prepareAndTestIsKvmHaAgentHealthy(EXPECTED_RUNNING_VMS_EXAMPLE_3VMs, ERROR_CODE, false); Assert.assertFalse(result); } - private boolean isKvmHaAgentHealthyTests(int expectedNumberOfVms, int vmsRunningOnAgent, boolean isHostAgentReachableByNeighbour) { + private boolean prepareAndTestIsKvmHaAgentHealthy(int expectedNumberOfVms, int vmsRunningOnAgent, boolean isHostAgentReachableByNeighbour) { List vmsOnHostList = new ArrayList<>(); for (int i = 0; i < expectedNumberOfVms; i++) { VMInstanceVO vmInstance = Mockito.mock(VMInstanceVO.class); @@ -104,4 +112,67 @@ public void isKvmHaWebserviceEnabledTestDefault() { Assert.assertFalse(kvmHaHelper.isKvmHaWebserviceEnabled(Mockito.any())); } + @Test + public void listProblematicHostsTest() { + List hostsInCluster = mockProblematicCluster(); + List problematicNeighbors = kvmHaHelper.listProblematicHosts(hostsInCluster); + Assert.assertEquals(5, hostsInCluster.size()); + Assert.assertEquals(4, problematicNeighbors.size()); + } + + private List mockProblematicCluster() { + HostVO hostDown = Mockito.mock(HostVO.class); + Mockito.doReturn(Status.Down).when(hostDown).getStatus(); + HostVO hostDisconnected = Mockito.mock(HostVO.class); + Mockito.doReturn(Status.Disconnected).when(hostDisconnected).getStatus(); + HostVO hostError = Mockito.mock(HostVO.class); + Mockito.doReturn(Status.Error).when(hostError).getStatus(); + HostVO hostAlert = 
Mockito.mock(HostVO.class); + Mockito.doReturn(Status.Alert).when(hostAlert).getStatus(); + List hostsInCluster = mockHealthyCluster(1); + hostsInCluster.add(hostAlert); + hostsInCluster.add(hostDown); + hostsInCluster.add(hostDisconnected); + hostsInCluster.add(hostError); + return hostsInCluster; + } + + private List mockHealthyCluster(int healthyHosts) { + HostVO hostUp = Mockito.mock(HostVO.class); + Mockito.doReturn(Status.Up).when(hostUp).getStatus(); + List hostsInCluster = new ArrayList<>(); + for (int i = 0; i < healthyHosts; i++) { + hostsInCluster.add(hostUp); + } + return hostsInCluster; + } + + @Test + public void isClusteProblematicTestProblematicCluster() { + prepareAndTestIsClusteProblematicTest(mockProblematicCluster(), true); + } + + @Test + public void isClusteProblematicTestProblematicCluster10Healthy4ProblematicHosts() { + List hostsInCluster = mockHealthyCluster(9); + hostsInCluster.addAll(mockProblematicCluster()); + prepareAndTestIsClusteProblematicTest(hostsInCluster, false); + } + + @Test + public void isClusteProblematicTestHealthyCluster() { + List hostsInCluster = mockHealthyCluster(20); + hostsInCluster.addAll(mockProblematicCluster()); + prepareAndTestIsClusteProblematicTest(hostsInCluster, false); + } + + private void prepareAndTestIsClusteProblematicTest(List problematicCluster, boolean expectedProblematicCluster) { + ClusterVO cluster = Mockito.mock(ClusterVO.class); + Mockito.doReturn(0l).when(cluster).getId(); + Mockito.doReturn("cluster-name").when(cluster).getName(); + Mockito.doReturn(problematicCluster).when(resourceManager).listAllHostsInCluster(Mockito.anyLong()); + Mockito.doReturn(cluster).when(clusterDao).findById(Mockito.anyLong()); + boolean isClusteProblematic = kvmHaHelper.isClusteProblematic(host); + Assert.assertEquals(expectedProblematicCluster, isClusteProblematic); + } } diff --git a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java 
b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java index f0fca12c6819..9a7f27c003ec 100644 --- a/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java +++ b/server/src/main/java/org/apache/cloudstack/ha/provider/HAProvider.java @@ -35,9 +35,7 @@ enum HAProviderConfig { MaxRecoveryAttempts, MaxActivityCheckInterval, MaxDegradedWaitTimeout, - RecoveryWaitTimeout, - KvmHaWebservicePort, - IsKvmHaWebserviceEnabled + RecoveryWaitTimeout }; HAResource.ResourceType resourceType(); From 891a0958d9405be29da2055a33263705b6b84215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Beims=20Br=C3=A4scher?= Date: Wed, 26 May 2021 17:23:02 -0300 Subject: [PATCH 19/40] Change KvmHaAcceptedProblematicHostsRatio config key description --- .../src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 46d1da1f412a..e2640a2eb994 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -63,7 +63,7 @@ public class KVMHAConfig { true, ConfigKey.Scope.Cluster); public static final ConfigKey KvmHaAcceptedProblematicHostsRatio = new ConfigKey("Advanced", Double.class, "kvm.ha.accepted.problematic.hosts.ratio", "0.3", - "The ratio of problematic Hosts accepted on a Cluster. If a cluster has more than the accepted ratio, HA will not be Fence/Recover Hosts and Admins will be notified to check the cluster healthy. " + "The ratio of problematic Hosts accepted on a Cluster. If a cluster has more than the accepted ratio, HA will not Fence/Recover Hosts; instead, it will notify Admins to check the cluster healthy. 
" + "A Host is considered problematic if in one of the following states: Error, Alert, Down, Disconnected. Default value is '0.3' (30%).", true, ConfigKey.Scope.Cluster); } From e2f20e36731f91b626b336661a9176cf9bf45f25 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 2 Jun 2021 10:17:32 -0300 Subject: [PATCH 20/40] Address reviewer enhancing log and 'if' conditional --- .../apache/cloudstack/kvm/ha/KVMHostActivityChecker.java | 7 ++++--- .../org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 1354046cc459..119a7294341c 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -79,6 +79,7 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck private KvmHaHelper kvmHaHelper; private static final Set NFS_POOL_TYPE = new HashSet<>(Arrays.asList(Storage.StoragePoolType.NetworkFilesystem, Storage.StoragePoolType.ManagedNFS)); + private static final Set KVM_OR_LXC = new HashSet<>(Arrays.asList(Hypervisor.HypervisorType.KVM, Hypervisor.HypervisorType.LXC)); @Override public boolean isActive(Host host, DateTime suspectTime) throws HACheckerException { @@ -100,7 +101,7 @@ public boolean isHealthy(Host host) { boolean isHostServedByNfsPool = isHostServedByNfsPool(host); boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(host); - if(isHostServedByNfsPool) { + if (isHostServedByNfsPool) { isHealthy = isHealthViaNfs(host); } @@ -109,7 +110,7 @@ public boolean isHealthy(Host host) { } if (kvmHaHelper.isKvmHealthyCheckViaLibvirt(host) && !isHealthy) { - isHealthy = true; + return true; } return isHealthy; @@ 
-190,7 +191,7 @@ private boolean isAgentActive(Host agent) { } private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HACheckerException { - if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { + if (!KVM_OR_LXC.contains(agent.getHypervisorType())) { throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); } boolean activityStatus = false; diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 8b55435781ef..95167ed46e23 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -228,7 +228,7 @@ protected HttpResponse retryUntilGetsHttpResponse(String url, HttpRequestBase ht } catch (IOException | InterruptedException e) { String errorMessage = String.format("Failed to execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(), attempt, MAX_REQUEST_RETRIES, url, e); - LOGGER.error(errorMessage); + LOGGER.error(errorMessage, e); } } return null; From 10cd13b2395d4c3bfb31189d7c440caf7e843cef Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 21 Jun 2021 15:08:18 -0300 Subject: [PATCH 21/40] Enhance Documentation & remove unnecessary Catch (them all) --- .../main/java/com/cloud/ha/KVMInvestigator.java | 14 +++++--------- .../org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 9 +++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 099a81504f2a..9ae627038b3f 100644 --- 
a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -87,7 +87,7 @@ public Status isAgentAlive(Host agent) { } boolean isKvmHaWebserviceEnabled = kvmHaHelper.isKvmHaWebserviceEnabled(agent); - if(isKvmHaWebserviceEnabled) { + if (isKvmHaWebserviceEnabled) { agentStatus = kvmHaHelper.checkAgentStatusViaKvmHaAgent(agent, agentStatus); } @@ -130,13 +130,9 @@ private Status checkAgentStatusViaNfs(Host agent) { Status neighbourStatus = null; CheckOnHostCommand cmd = new CheckOnHostCommand(agent); - try { - Answer answer = _agentMgr.easySend(agent.getId(), cmd); - if (answer != null) { - hostStatus = answer.getResult() ? Status.Down : Status.Up; - } - } catch (Exception e) { - s_logger.debug(String.format("Failed to send command to %s", agent)); + Answer answer = _agentMgr.easySend(agent.getId(), cmd); + if (answer != null) { + hostStatus = answer.getResult() ? Status.Down : Status.Up; } if (hostStatus == null) { @@ -155,7 +151,7 @@ private Status checkAgentStatusViaNfs(Host agent) { } s_logger.debug(String.format("Investigating %s via neighbouring %s ", agent, neighbor)); try { - Answer answer = _agentMgr.easySend(neighbor.getId(), cmd); + answer = _agentMgr.easySend(neighbor.getId(), cmd); if (answer != null) { neighbourStatus = answer.getResult() ? 
Status.Down : Status.Up; s_logger.debug(String.format("Neighbouring %s returned status: %s for the investigated %s", neighbor, neighbourStatus, agent)); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index e5ab1316426b..1c223579402e 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -67,6 +67,15 @@ public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { return agentStatus; } + /** + * Given a List of Hosts, it lists Hosts that are in the following states: + *
    + *
  • Status.Alert; + *
  • Status.Disconnected; + *
  • Status.Down; + *
  • Status.Error. + *
+ */ @NotNull protected List listProblematicHosts(List hostsInCluster) { return hostsInCluster.stream().filter(neighbour -> PROBLEMATIC_HOST_STATUS.contains(neighbour.getStatus())).collect(Collectors.toList()); From 839d13c63462f5fa50ebc8ab0c1b12aaec24460c Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 23 Jun 2021 02:19:24 -0300 Subject: [PATCH 22/40] Fine adjustments --- .../kvm/src/main/java/com/cloud/ha/KVMInvestigator.java | 1 + .../java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 9ae627038b3f..2c26d4b34237 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -79,6 +79,7 @@ public Status isAgentAlive(Host agent) { Status agentStatus = Status.Disconnected; boolean hasNfs = isHostServedByNfsPool(agent); + List clusterPools = _storagePoolDao.listPoolsByCluster(agent.getClusterId()); if (hasNfs) { agentStatus = checkAgentStatusViaNfs(agent); s_logger.debug(String.format("Agent investigation was requested on host %s. 
Agent status via NFS heartbeat is %s.", agent, agentStatus)); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 1c223579402e..e48636853810 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -58,7 +58,7 @@ public class KvmHaHelper { */ public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { boolean isVmsCountOnKvmMatchingWithDatabase = isKvmHaAgentHealthy(host); - if(isVmsCountOnKvmMatchingWithDatabase) { + if (isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; LOGGER.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); } else { @@ -85,6 +85,8 @@ protected List listProblematicHosts(List hostsInCluster) { * Returns false if the cluster has no problematic hosts or a small fraction of it.

* Returns true if the cluster is problematic. A cluster is problematic if many hosts are in Down or Disconnected states, in such case it should not recover/fence.
* Instead, Admins should be warned and check as it could be networking problems and also might not even have resources capacity on the few Healthy hosts at the cluster. + *

+ * Admins can change the accepted ratio of problematic hosts via global settings by updating configuration: "kvm.ha.accepted.problematic.hosts.ratio". */ protected boolean isClusteProblematic(Host host) { List hostsInCluster = resourceManager.listAllHostsInCluster(host.getClusterId()); @@ -102,6 +104,9 @@ protected boolean isClusteProblematic(Host host) { return false; } + /** + * Returns true if the given Host KVM-HA-Helper is reachable by another host in the same cluster. + */ protected boolean isHostAgentReachableByNeighbour(Host host) { List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { From a9e9a8e2cfe0ba956434db8c0d448b43bdb7f0bc Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 24 Jun 2021 02:14:58 -0300 Subject: [PATCH 23/40] Re-check packaging --- packaging/centos7/cloud.spec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packaging/centos7/cloud.spec b/packaging/centos7/cloud.spec index ee7c5170461a..b50ed09dce8f 100644 --- a/packaging/centos7/cloud.spec +++ b/packaging/centos7/cloud.spec @@ -334,7 +334,6 @@ install -D packaging/systemd/cloudstack-agent.service ${RPM_BUILD_ROOT}%{_unitdi install -D packaging/systemd/cloudstack-agent-ha-helper.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent-ha-helper.service install -D packaging/systemd/cloudstack-rolling-maintenance@.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-rolling-maintenance@.service install -D packaging/systemd/cloudstack-agent.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent -install -D packaging/systemd/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/agent.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/agent.properties install -D agent/target/transformed/environment.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/environment.properties install -D
agent/target/transformed/log4j-cloud.xml ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/log4j-cloud.xml @@ -342,6 +341,7 @@ install -D agent/target/transformed/cloud-setup-agent ${RPM_BUILD_ROOT}%{_bindir install -D agent/target/transformed/cloudstack-agent-upgrade ${RPM_BUILD_ROOT}%{_bindir}/%{name}-agent-upgrade install -D agent/target/transformed/cloud-guest-tool ${RPM_BUILD_ROOT}%{_bindir}/%{name}-guest-tool install -D agent/target/transformed/libvirtqemuhook ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib/libvirtqemuhook +install -D agent/target/transformed/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/rolling-maintenance ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib/rolling-maintenance install -D agent/target/transformed/cloud-ssh ${RPM_BUILD_ROOT}%{_bindir}/%{name}-ssh install -D agent/target/transformed/cloudstack-agent-profile.sh ${RPM_BUILD_ROOT}%{_sysconfdir}/profile.d/%{name}-agent-profile.sh @@ -494,6 +494,7 @@ mkdir -m 0755 -p /usr/share/cloudstack-agent/tmp /sbin/service libvirtd restart /sbin/systemctl enable cloudstack-agent > /dev/null 2>&1 || true /sbin/systemctl enable cloudstack-rolling-maintenance@p > /dev/null 2>&1 || true +/sbin/systemctl enable cloudstack-agent-ha-helper > /dev/null 2>&1 || true # if saved configs from upgrade exist, copy them over if [ -f "%{_sysconfdir}/cloud.rpmsave/agent/agent.properties" ]; then From 25d8a0bd623acb31ddd299a5f7c423fbdd147d9e Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Fri, 2 Jul 2021 03:07:22 -0300 Subject: [PATCH 24/40] Changes on centos7 packaging for agent-ha-helper --- packaging/centos7/cloud.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/centos7/cloud.spec b/packaging/centos7/cloud.spec index b50ed09dce8f..840b9fe6d410 100644 --- a/packaging/centos7/cloud.spec +++ b/packaging/centos7/cloud.spec @@ -334,6 +334,7 @@ install -D packaging/systemd/cloudstack-agent.service 
${RPM_BUILD_ROOT}%{_unitdi install -D packaging/systemd/cloudstack-agent-ha-helper.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-agent-ha-helper.service install -D packaging/systemd/cloudstack-rolling-maintenance@.service ${RPM_BUILD_ROOT}%{_unitdir}/%{name}-rolling-maintenance@.service install -D packaging/systemd/cloudstack-agent.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent +install -D packaging/systemd/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/agent.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/agent.properties install -D agent/target/transformed/environment.properties ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/environment.properties install -D agent/target/transformed/log4j-cloud.xml ${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/agent/log4j-cloud.xml @@ -341,7 +342,6 @@ install -D agent/target/transformed/cloud-setup-agent ${RPM_BUILD_ROOT}%{_bindir install -D agent/target/transformed/cloudstack-agent-upgrade ${RPM_BUILD_ROOT}%{_bindir}/%{name}-agent-upgrade install -D agent/target/transformed/cloud-guest-tool ${RPM_BUILD_ROOT}%{_bindir}/%{name}-guest-tool install -D agent/target/transformed/libvirtqemuhook ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib/libvirtqemuhook -install -D agent/target/transformed/cloudstack-agent-ha-helper.default ${RPM_BUILD_ROOT}%{_sysconfdir}/default/%{name}-agent-ha-helper install -D agent/target/transformed/rolling-maintenance ${RPM_BUILD_ROOT}%{_datadir}/%{name}-agent/lib/rolling-maintenance install -D agent/target/transformed/cloud-ssh ${RPM_BUILD_ROOT}%{_bindir}/%{name}-ssh install -D agent/target/transformed/cloudstack-agent-profile.sh ${RPM_BUILD_ROOT}%{_sysconfdir}/profile.d/%{name}-agent-profile.sh From 4ec91b535089be45972f0e5d87bcf2c60d42d52b Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 7 Jul 2021 20:04:01 -0300 Subject: [PATCH 25/40] Remove unnecessary line after rebasing and fixing conflicts 
with main --- .../kvm/src/main/java/com/cloud/ha/KVMInvestigator.java | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 2c26d4b34237..9ae627038b3f 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -79,7 +79,6 @@ public Status isAgentAlive(Host agent) { Status agentStatus = Status.Disconnected; boolean hasNfs = isHostServedByNfsPool(agent); - List clusterPools = _storagePoolDao.listPoolsByCluster(agent.getClusterId()); if (hasNfs) { agentStatus = checkAgentStatusViaNfs(agent); s_logger.debug(String.format("Agent investigation was requested on host %s. Agent status via NFS heartbeat is %s.", agent, agentStatus)); From 25ffd18ee62822881bc6836ccedead367676bbf6 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 7 Jul 2021 20:10:13 -0300 Subject: [PATCH 26/40] Remove nested IF in KvmHaHelper --- .../java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index e48636853810..4a64eb5f34fd 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -129,13 +129,9 @@ protected boolean isHostAgentReachableByNeighbour(Host host) { */ public boolean isKvmHealthyCheckViaLibvirt(Host host) { boolean isKvmHaAgentHealthy = isKvmHaAgentHealthy(host); - - if (!isKvmHaAgentHealthy) { - if (isClusteProblematic(host) || isHostAgentReachableByNeighbour(host)) { - return true; - } + if (!isKvmHaAgentHealthy && (isClusteProblematic(host) || 
isHostAgentReachableByNeighbour(host))) { + return true; } - return isKvmHaAgentHealthy; } From 9b519e292d46b9ba74b6bbfa4cb2ce203432f0ed Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 7 Jul 2021 20:18:13 -0300 Subject: [PATCH 27/40] Address review removing nested IF and enhancing KvmHaAgentClient.prepareHttpRequestForUrl(String) --- .../cloudstack/kvm/ha/KVMHostActivityChecker.java | 12 ++++++------ .../apache/cloudstack/kvm/ha/KvmHaAgentClient.java | 4 +--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 119a7294341c..4688bb601a83 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -117,12 +117,12 @@ public boolean isHealthy(Host host) { } private boolean isHealthViaNfs(Host r) { - boolean isHealthy = true; - if (isHostServedByNfsPool(r)) { - isHealthy = isAgentActive(r); - if (!isHealthy) { - LOG.warn(String.format("NFS storage health check failed for %s. It seems that a storage does not have activity.", r.toString())); - } + if (!isHostServedByNfsPool(r)) { + return true; + } + boolean isHealthy = isAgentActive(r); + if (!isHealthy) { + LOG.warn(String.format("NFS storage health check failed for %s. 
It seems that a storage does not have activity.", r)); } return isHealthy; } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 95167ed46e23..4284a046d8d8 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -179,15 +179,13 @@ protected HttpResponse executeHttpRequest(String url) { @Nullable private HttpGet prepareHttpRequestForUrl(String url) { - HttpGet httpReq = null; try { URIBuilder builder = new URIBuilder(url); - httpReq = new HttpGet(builder.build()); + return new HttpGet(builder.build()); } catch (URISyntaxException e) { LOGGER.error(String.format("Failed to create URI for GET request [URL: %s] due to exception.", url), e); return null; } - return httpReq; } /** From 178a09c8f90395639891a3473bc395905739a435 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 7 Jul 2021 23:14:10 -0300 Subject: [PATCH 28/40] change header --- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 4284a046d8d8..368a05f5d241 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -1,16 +1,21 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// package org.apache.cloudstack.kvm.ha; import com.cloud.host.Host; From ddd8aba160386f323b0db745e5e40d1d2c8933bc Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Fri, 16 Jul 2021 12:21:58 -0300 Subject: [PATCH 29/40] Fix cloud.spec with permissions for cloudstack-agent-ha-helper --- packaging/centos7/cloud.spec | 2 ++ packaging/centos8/cloud.spec | 3 +++ packaging/systemd/cloudstack-agent-ha-helper.service | 4 ---- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/packaging/centos7/cloud.spec b/packaging/centos7/cloud.spec index 840b9fe6d410..a9f1f0d9c6ff 100644 --- a/packaging/centos7/cloud.spec +++ b/packaging/centos7/cloud.spec @@ -605,8 +605,10 @@ pip3 install --upgrade urllib3 %attr(0755,root,root) %{_bindir}/%{name}-guest-tool %attr(0755,root,root) %{_bindir}/%{name}-ssh %attr(0644,root,root) %{_unitdir}/%{name}-agent.service +%attr(0644,root,root) %{_unitdir}/%{name}-agent-ha-helper.service %attr(0644,root,root) %{_unitdir}/%{name}-rolling-maintenance@.service %config(noreplace) %{_sysconfdir}/default/%{name}-agent +%config(noreplace) %{_sysconfdir}/default/%{name}-agent-ha-helper %attr(0644,root,root) %{_sysconfdir}/profile.d/%{name}-agent-profile.sh %config(noreplace) %attr(0644,root,root) %{_sysconfdir}/logrotate.d/%{name}-agent %attr(0755,root,root) %{_datadir}/%{name}-common/scripts/network/cisco diff --git a/packaging/centos8/cloud.spec b/packaging/centos8/cloud.spec index b7aabc2fdb98..77d8cebf76f2 100644 --- a/packaging/centos8/cloud.spec +++ b/packaging/centos8/cloud.spec @@ -485,6 +485,7 @@ mkdir -m 0755 -p /usr/share/cloudstack-agent/tmp /sbin/service libvirtd restart /sbin/systemctl enable cloudstack-agent > /dev/null 2>&1 || true /sbin/systemctl enable cloudstack-rolling-maintenance@p > /dev/null 2>&1 || true +/sbin/systemctl enable cloudstack-agent-ha-helper > /dev/null 2>&1 || true # if saved configs from upgrade exist, copy them over if [ -f "%{_sysconfdir}/cloud.rpmsave/agent/agent.properties" ]; then @@ -592,8 +593,10 @@ 
pip install --upgrade /usr/share/cloudstack-marvin/Marvin-*.tar.gz %attr(0755,root,root) %{_bindir}/%{name}-guest-tool %attr(0755,root,root) %{_bindir}/%{name}-ssh %attr(0644,root,root) %{_unitdir}/%{name}-agent.service +%attr(0644,root,root) %{_unitdir}/%{name}-agent-ha-helper.service %attr(0644,root,root) %{_unitdir}/%{name}-rolling-maintenance@.service %config(noreplace) %{_sysconfdir}/default/%{name}-agent +%config(noreplace) %{_sysconfdir}/default/%{name}-agent-ha-helper %attr(0644,root,root) %{_sysconfdir}/profile.d/%{name}-agent-profile.sh %config(noreplace) %attr(0644,root,root) %{_sysconfdir}/logrotate.d/%{name}-agent %attr(0755,root,root) %{_datadir}/%{name}-common/scripts/network/cisco diff --git a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service index 696379ea1fc5..2e4a52695903 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.service +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -15,10 +15,6 @@ # specific language governing permissions and limitations # under the License. -# Do not modify this file as your changes will be lost in the next CSM update. -# If you need to add specific dependencies to this service unit do it in the -# /etc/systemd/system/cloudstack-management.service.d/ directory - [Unit] Description=CloudStack Agent HA Helper Documentation=http://www.cloudstack.org/ From d760b5540545ad703b68d83aac0ff573907ced34 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Fri, 16 Jul 2021 16:42:57 -0300 Subject: [PATCH 30/40] Use Stream instead of For loop. 
--- .../main/java/com/cloud/ha/KVMInvestigator.java | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index 9ae627038b3f..c011d500c0a4 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -104,22 +104,12 @@ private boolean isHostServedByNfsPool(Host agent) { private boolean hasNfsPoolZoneWideForHost(Host agent) { List zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType()); - for (StoragePoolVO pool : zonePools) { - if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { - return true; - } - } - return false; + return zonePools.stream().anyMatch(pool -> pool.getPoolType() == StoragePoolType.NetworkFilesystem); } private boolean hasNfsPoolClusterWideForHost(Host agent) { List clusterPools = _storagePoolDao.listPoolsByCluster(agent.getClusterId()); - for (StoragePoolVO pool : clusterPools) { - if (pool.getPoolType() == StoragePoolType.NetworkFilesystem) { - return true; - } - } - return false; + return clusterPools.stream().anyMatch(pool -> pool.getPoolType() == StoragePoolType.NetworkFilesystem); } /** From 19b1884b83079a2a42ca1520f97b9e0febc2bc46 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Wed, 11 Aug 2021 17:32:50 -0300 Subject: [PATCH 31/40] Use Cluster scope for KvmHaAcceptedProblematicHostsRatio --- .../main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 4a64eb5f34fd..228e9698138e 100644 --- 
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -92,7 +92,8 @@ protected boolean isClusteProblematic(Host host) { List hostsInCluster = resourceManager.listAllHostsInCluster(host.getClusterId()); List problematicNeighbors = listProblematicHosts(hostsInCluster); int problematicHosts = problematicNeighbors.size(); - int problematicHostsRatioAccepted = (int) (hostsInCluster.size() * KVMHAConfig.KvmHaAcceptedProblematicHostsRatio.value()); + double acceptedProblematicHostsRatio = KVMHAConfig.KvmHaAcceptedProblematicHostsRatio.valueIn(host.getClusterId()); + int problematicHostsRatioAccepted = (int) (hostsInCluster.size() * acceptedProblematicHostsRatio); if (problematicHosts > problematicHostsRatioAccepted) { ClusterVO cluster = clusterDao.findById(host.getClusterId()); From a5d139cb790c340893f9e068821a5d1a8e318354 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Thu, 12 Aug 2021 14:56:20 -0300 Subject: [PATCH 32/40] Remove unused variable --- .../src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 228e9698138e..0683d37a3560 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -47,7 +47,6 @@ public class KvmHaHelper { protected ClusterDao clusterDao; private static final Logger LOGGER = Logger.getLogger(KvmHaHelper.class); - private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; private static final Set PROBLEMATIC_HOST_STATUS = new HashSet<>(Arrays.asList(Status.Alert, Status.Disconnected, 
Status.Down, Status.Error)); From 955c65eb146ae289ca0665bda70c3635cbb64fce Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 16 Aug 2021 11:57:38 -0300 Subject: [PATCH 33/40] Enhance KVM HA checks via neighbour hosts. And, avoid recursive issues. --- .../apache/cloudstack/kvm/ha/KvmHaHelper.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 0683d37a3560..ae4e5cb12f94 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -47,6 +47,7 @@ public class KvmHaHelper { protected ClusterDao clusterDao; private static final Logger LOGGER = Logger.getLogger(KvmHaHelper.class); + private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; private static final Set PROBLEMATIC_HOST_STATUS = new HashSet<>(Arrays.asList(Status.Alert, Status.Disconnected, Status.Down, Status.Error)); @@ -60,6 +61,8 @@ public Status checkAgentStatusViaKvmHaAgent(Host host, Status agentStatus) { if (isVmsCountOnKvmMatchingWithDatabase) { agentStatus = Status.Up; LOGGER.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agentStatus)); + } else if (isHostAgentReachableByNeighbour(host)) { + LOGGER.warn(String.format("Checking agent %s status; CloudStack manager failed to reach KVM HA Agent but it was detected as Running by its neighbour hosts.", agentStatus)); } else { LOGGER.warn(String.format("Checking agent %s status. 
Failed to check host status via KVM HA Agent", agentStatus)); } @@ -110,14 +113,11 @@ protected boolean isClusteProblematic(Host host) { protected boolean isHostAgentReachableByNeighbour(Host host) { List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { - boolean isVmActivtyOnNeighborHost = isKvmHaAgentHealthy(neighbor); - if (isVmActivtyOnNeighborHost) { - boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); - if (isReachable) { - String.format("%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " - + "and CloudStack management server.", host, neighbor); - return true; - } + boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); + if (isReachable) { + String.format("%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " + + "and CloudStack management server.", host, neighbor); + return true; } } return false; From 89ff24016def8da321e63be51950dacebaecda0f Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 16 Aug 2021 18:23:42 -0300 Subject: [PATCH 34/40] Enhance isKvmHaWebserviceEnabled --- .../main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index ae4e5cb12f94..5d1129c40b28 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -142,9 +142,8 @@ public boolean isKvmHaWebserviceEnabled(Host host) { boolean isKvmHaWebserviceEnabled = KVMHAConfig.IsKvmHaWebserviceEnabled.value(); if 
(!isKvmHaWebserviceEnabled) { LOGGER.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled.", host)); - return false; } - return true; + return isKvmHaWebserviceEnabled; } /** From 0e94c1d868f758056b334c6583ab09db602b48a1 Mon Sep 17 00:00:00 2001 From: Gabriel Brascher Date: Mon, 16 Aug 2021 18:32:04 -0300 Subject: [PATCH 35/40] Avoid issues in case of big cluster, and thus limit number of retries of isHostAgentReachableByNeighbour. --- .../org/apache/cloudstack/kvm/ha/KvmHaHelper.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java index 5d1129c40b28..9d17c0075a38 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaHelper.java @@ -31,6 +31,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Random; import java.util.Set; import java.util.stream.Collectors; @@ -47,8 +48,8 @@ public class KvmHaHelper { protected ClusterDao clusterDao; private static final Logger LOGGER = Logger.getLogger(KvmHaHelper.class); - private static final double PROBLEMATIC_HOSTS_RATIO_ACCEPTED = 0.3; private static final int CAUTIOUS_MARGIN_OF_VMS_ON_HOST = 1; + private static final int MAXIMUM_IS_HOST_REACHABLE_RETRIES = 3; private static final Set PROBLEMATIC_HOST_STATUS = new HashSet<>(Arrays.asList(Status.Alert, Status.Disconnected, Status.Down, Status.Error)); @@ -112,11 +113,15 @@ protected boolean isClusteProblematic(Host host) { */ protected boolean isHostAgentReachableByNeighbour(Host host) { List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); - for (HostVO neighbor : neighbors) { - boolean isReachable = 
kvmHaAgentClient.isHostReachableByNeighbour(neighbor, host); + neighbors.remove(host); + Random random = new Random(); + + for (int i = 1; i < MAXIMUM_IS_HOST_REACHABLE_RETRIES; i++) { + Host neighbour = neighbors.get(random.nextInt(neighbors.size())); + boolean isReachable = kvmHaAgentClient.isHostReachableByNeighbour(neighbour, host); if (isReachable) { String.format("%s is reachable by neighbour %s. If CloudStack is failing to reach the respective host then it is probably a network issue between the host " - + "and CloudStack management server.", host, neighbor); + + "and CloudStack management server.", host, neighbour); return true; } } From 8b5faa79975a3cdf10d8dc415f4d7bf88715b846 Mon Sep 17 00:00:00 2001 From: gabriel Date: Fri, 19 Nov 2021 10:36:55 +0100 Subject: [PATCH 36/40] Port python HTTP client to HTTPS --- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 29 +++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index eec5a00a6303..0f4fecbae560 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -18,18 +18,21 @@ import logging import libvirt -import socket import json +import socket import requests -from http.server import BaseHTTPRequestHandler, HTTPServer +import ssl +import os.path +from http.server import HTTPServer, SimpleHTTPRequestHandler -log_folder = "/var/log/cloudstack/agent/" -log_path = "/var/log/cloudstack/agent/agent-ha-helper.log" root_path = "/" check_path = "/check-neighbour/" +cloud_key = '/etc/cloudstack/agent/cloud.key' +cloud_cert = '/etc/cloudstack/agent/cloud.crt' http_ok = 200 http_multiple_choices = 300 http_not_found = 404 +server_side = True class Libvirt(): def __init__(self): @@ -54,7 +57,7 @@ def running_vms(self): class HTTPServerV6(HTTPServer): address_family = socket.AF_INET6 -class CloudStackAgentHAHelper(BaseHTTPRequestHandler): +class 
CloudStackAgentHAHelper(SimpleHTTPRequestHandler): def do_GET(self): if self.path == root_path: libvirt = Libvirt() @@ -73,8 +76,8 @@ def do_GET(self): elif check_path in self.path: host_and_port = self.path.partition(check_path)[2] - request_url = 'http://{}/'.format(host_and_port) - logging.debug('Check if Host {} is reachable via HTTP GET request to agent-ha-helper.'.format(request_url)) + request_url = 'https://{}/'.format(host_and_port) + logging.debug('Check if Host {} is reachable via HTTPs GET request to agent-ha-helper.'.format(request_url)) logging.debug('GET request: {}'.format(request_url)) try: response = requests.get(url = request_url) @@ -108,10 +111,16 @@ def do_GET(self): self.end_headers() return -def run(port=8080): +def run(port=443): server_address = ('', port) - httpd = HTTPServerV6((server_address), CloudStackAgentHAHelper) - httpd.serve_forever() + if os.path.isfile(cloud_key) & os.path.isfile(cloud_cert): + httpd = HTTPServerV6(server_address, CloudStackAgentHAHelper) + httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=cloud_key, certfile=cloud_cert, server_side=True) + httpd.serve_forever() + else: + logging.error('Failed to run HTTPS server, cannot open file {}.'.format(filepath)) + + if __name__ == "__main__": From 95c58d9ce289ea55a706f832f35e06beb2f54971 Mon Sep 17 00:00:00 2001 From: gabriel Date: Tue, 4 Jan 2022 18:32:53 +0100 Subject: [PATCH 37/40] Rebase against main branch, fix test errors after updating default port for KvmHaClient. 
--- .../cloudstack-agent-ha-helper.default | 12 +++- .../cloudstack-agent-ha-helper.service | 2 +- .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 4 +- .../cloudstack/kvm/ha/KvmHaAgentClient.java | 6 +- .../kvm/ha/KvmHaAgentClientTest.java | 11 ++-- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 58 ++++++++++++++----- 6 files changed, 63 insertions(+), 30 deletions(-) diff --git a/packaging/systemd/cloudstack-agent-ha-helper.default b/packaging/systemd/cloudstack-agent-ha-helper.default index e98e162a23ea..f96fed3bf0c0 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.default +++ b/packaging/systemd/cloudstack-agent-ha-helper.default @@ -15,4 +15,14 @@ # specific language governing permissions and limitations # under the License. -PORT=8080 +# The agent-ha-helper.py provides a HTTP server which handles API requests to identify if the host (or a neighbour host) is healthy. +# optional arguments: + +# PORT Port to be used by the agent-ha-helper server +# Default is 8443. However, if the "insecure" mode is activated, then it uses 8080 by default. +# PORT=8443 + +# If uncommented, it passes flag that allows to deploy the HTTP server without SSL (default port would be 8080 in such case) +# To enable HTTP requests then make it +# SECURITY_MODE="--insecure" +# If no mode is provided, agent-ha-helper creates HTTP server with support to SSL via cloud.crt and cloud.key. 
diff --git a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service index 2e4a52695903..14b95dc3790b 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.service +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -24,7 +24,7 @@ After=libvirtd.service [Service] Type=simple EnvironmentFile=/etc/default/cloudstack-agent-ha-helper -ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $PORT +ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $PORT $SECURITY_MODE Restart=always RestartSec=10s diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index e2640a2eb994..e1c873d93362 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -53,8 +53,8 @@ public class KVMHAConfig { public static final ConfigKey KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60", "The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHaWebservicePort = new ConfigKey("Advanced", Integer.class, "kvm.ha.webservice.port", "8080", - "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. Default value is 8080.", + public static final ConfigKey KvmHaWebservicePort = new ConfigKey("Advanced", Integer.class, "kvm.ha.webservice.port", "8443", + "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. 
Default value is 8443.", true, ConfigKey.Scope.Cluster); public static final ConfigKey IsKvmHaWebserviceEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "false", diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index 368a05f5d241..e3b26d7a0909 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -75,7 +75,7 @@ public class KvmHaAgentClient { * Returns the number of VMs running on the KVM host according to Libvirt. */ public int countRunningVmsOnAgent(Host host) { - String url = String.format("http://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); + String url = String.format("https://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); HttpResponse response = executeHttpRequest(url); if (response == null) @@ -109,7 +109,6 @@ protected int getKvmHaMicroservicePortValue(Host host) { */ public List listVmsOnHost(Host host) { List listByHostAndStates = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Running, VirtualMachine.State.Stopping, VirtualMachine.State.Migrating); - if (LOGGER.isTraceEnabled()) { List listByHostAndStateStarting = vmInstanceDao.listByHostAndState(host.getId(), VirtualMachine.State.Starting); int startingVMs = listByHostAndStateStarting.size(); @@ -121,7 +120,6 @@ public List listVmsOnHost(Host host) { String.format("%s has (%d Starting) %d Running, %d Stopping, %d Migrating. 
Total listed via DB %d / %d (via libvirt)", host.getName(), startingVMs, runningVMs, stoppingVms, migratingVms, listByHostAndStates.size(), countRunningVmsOnAgent)); } - return listByHostAndStates; } @@ -133,7 +131,7 @@ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { String neighbourHostAddress = neighbour.getPrivateIpAddress(); String targetHostAddress = target.getPrivateIpAddress(); int port = getKvmHaMicroservicePortValue(neighbour); - String url = String.format("http://%s:%d/%s/%s:%d", neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); + String url = String.format("https://%s:%d/%s/%s:%d", neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); HttpResponse response = executeHttpRequest(url); if (response == null) diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index 85a2e3cbd052..e97d3fd7925c 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -48,7 +48,7 @@ public class KvmHaAgentClientTest { private HostVO host = Mockito.mock(HostVO.class); private static final String CHECK_NEIGHBOUR = "check-neighbour"; - private static final int DEFAULT_PORT = 8080; + private static final int DEFAULT_PORT = 8443; private static final String PRIVATE_IP_ADDRESS = "1.2.3.4"; private static final String JSON_STRING_EXAMPLE_3VMs = "{\"count\":3,\"virtualmachines\":[\"r-123-VM\",\"v-134-VM\",\"s-111-VM\"]}"; private static final int EXPECTED_RUNNING_VMS_EXAMPLE_3VMs = 3; @@ -56,14 +56,13 @@ public class KvmHaAgentClientTest { private static final String JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_UP = "{\"status\": \"Up\"}"; private static final String JSON_STRING_EXAMPLE_CHECK_NEIGHBOUR_DOWN = "{\"status\": \"Down\"}"; 
private static final int EXPECTED_RUNNING_VMS_EXAMPLE_0VMs = 0; - private static final String EXPECTED_URL = String.format("http://%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT); + private static final String EXPECTED_URL = String.format("https://%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT); private static final String EXPECTED_URL_CHECK_NEIGHBOUR = String - .format("http://%s:%d/%s/%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT, CHECK_NEIGHBOUR, PRIVATE_IP_ADDRESS, DEFAULT_PORT); + .format("https://%s:%d/%s/%s:%d", PRIVATE_IP_ADDRESS, DEFAULT_PORT, CHECK_NEIGHBOUR, PRIVATE_IP_ADDRESS, DEFAULT_PORT); private static final HttpRequestBase HTTP_REQUEST_BASE = new HttpGet(EXPECTED_URL); private static final String VMS_COUNT = "count"; private static final String VIRTUAL_MACHINES = "virtualmachines"; private static final int MAX_REQUEST_RETRIES = 2; - private static final int KVM_HA_WEBSERVICE_PORT = 8080; @Spy @InjectMocks @@ -214,7 +213,7 @@ public void retryUntilGetsHttpResponseTestTwoIOException() throws IOException { @Test public void getKvmHaMicroservicePortValueTestDefault() { - Assert.assertEquals(KVM_HA_WEBSERVICE_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue(host)); + Assert.assertEquals(DEFAULT_PORT, kvmHaAgentClient.getKvmHaMicroservicePortValue(host)); } @Test @@ -239,7 +238,7 @@ public void isTargetHostReachableTestNullResponse() throws IOException { private void prepareAndRunisTargetHostReachableTest(CloseableHttpResponse response, boolean expected) throws IOException { Mockito.when(host.getPrivateIpAddress()).thenReturn(PRIVATE_IP_ADDRESS); - Mockito.when(kvmHaAgentClient.getKvmHaMicroservicePortValue(Mockito.any())).thenReturn(8080); + Mockito.when(kvmHaAgentClient.getKvmHaMicroservicePortValue(Mockito.any())).thenReturn(DEFAULT_PORT); Mockito.doReturn(response).when(kvmHaAgentClient).executeHttpRequest(EXPECTED_URL_CHECK_NEIGHBOUR); boolean result = kvmHaAgentClient.isHostReachableByNeighbour(host, host); diff --git 
a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index 0f4fecbae560..ceafb9980229 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -29,16 +29,22 @@ check_path = "/check-neighbour/" cloud_key = '/etc/cloudstack/agent/cloud.key' cloud_cert = '/etc/cloudstack/agent/cloud.crt' +log_path = '/var/log/cloudstack/agent/agent-ha-helper.log' http_ok = 200 http_multiple_choices = 300 http_not_found = 404 server_side = True +bind = '' +insecure = False +port=8443 class Libvirt(): def __init__(self): self.conn = libvirt.openReadOnly("qemu:///system") if not self.conn: - raise Exception('Failed to open connection to libvirt') + libvirt_error = 'Failed to open connection to libvirt' + logging.error(libvirt_error) + raise Exception(libvirt_error) def running_vms(self): alldomains = [domain for domain in map(self.conn.lookupByID, self.conn.listDomainsID())] @@ -111,25 +117,45 @@ def do_GET(self): self.end_headers() return -def run(port=443): - server_address = ('', port) - if os.path.isfile(cloud_key) & os.path.isfile(cloud_cert): - httpd = HTTPServerV6(server_address, CloudStackAgentHAHelper) - httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=cloud_key, certfile=cloud_cert, server_side=True) - httpd.serve_forever() - else: - logging.error('Failed to run HTTPS server, cannot open file {}.'.format(filepath)) - - +def run(): + server_address_and_port = (bind, port) + httpd = HTTPServerV6((server_address_and_port), CloudStackAgentHAHelper) + if insecure: + logging.warning('Creating HTTP Server on insecure mode (HTTP, no SSL) exposed at port {}.'.format(port)) + elif not os.path.isfile(cloud_key) or not os.path.isfile(cloud_cert): + error_message = 'Failed to run HTTPS server, cannot find certificate or key files: "{}" or "{}".'.format(cloud_key, cloud_cert) + logging.error(error_message) + raise Exception(error_message) + else: + logging.debug('Creating HTTPs Server at port 
{}.'.format(port)) + httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=cloud_key, certfile=cloud_cert) + httpd.serve_forever() if __name__ == "__main__": - from sys import argv - logging.basicConfig(filename='/var/log/cloudstack/agent/agent-ha-helper.log', format='%(asctime)s - %(message)s', level=logging.DEBUG) + import argparse, sys + logging.basicConfig(filename=log_path, format='%(asctime)s - %(message)s', level=logging.DEBUG) try: - if len(argv) == 2: - run(port=int(argv[1])) + parser = argparse.ArgumentParser(prog='agent-ha-helper', + usage='%(prog)s [-h] [-i] -p ', + description='The agent-ha-helper.py provides a HTTP server ' + 'which handles API requests to identify ' + 'if the host (or a neighbour host) is healthy.') + parser.add_argument('-i', '--insecure', help='Allows to run the HTTP server without SSL', action='store_true') + parser.add_argument('-p', '--port', help='Port to be used by the agent-ha-helper server', type=int) + args = parser.parse_args() + + if not len(sys.argv) > 1: + parser.print_help(sys.stderr) + logging.warning('Note: no arguments have been passed. 
Using default values [bind: "::", port: {}, insecure: {}].'.format(port, insecure)) else: - run() + if args.insecure: + insecure = True + port = 8080 + print("insecure turned on") + + if args.port is not None: + port = args.port + run() except KeyboardInterrupt: pass From a1271d2c812bdff14f7475639c6fe49d214f19ca Mon Sep 17 00:00:00 2001 From: gabriel Date: Tue, 18 Jan 2022 16:26:32 +0100 Subject: [PATCH 38/40] Add SSL & Authentication support into the agent-ha-helper.py --- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 204 +++++++++++++------ 1 file changed, 144 insertions(+), 60 deletions(-) diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index ceafb9980229..912209ed1d2e 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -24,23 +24,52 @@ import ssl import os.path from http.server import HTTPServer, SimpleHTTPRequestHandler - -root_path = "/" -check_path = "/check-neighbour/" -cloud_key = '/etc/cloudstack/agent/cloud.key' -cloud_cert = '/etc/cloudstack/agent/cloud.crt' -log_path = '/var/log/cloudstack/agent/agent-ha-helper.log' -http_ok = 200 -http_multiple_choices = 300 -http_not_found = 404 +from base64 import b64encode + +# Constants +APPLICATION_JSON = 'application/json' +BASIC_AUTH = "Basic " +CONTENT_TYPE = 'Content-type' +ROOT_PATH = '/' +CHECK_NEIGHBOUR_PATH = '/check-neighbour/' +CLOUD_KEY_PATH = '/etc/cloudstack/agent/cloud.key' +CLOUD_CERT_PATH = '/etc/cloudstack/agent/cloud.crt' +LOG_PATH = '/var/log/cloudstack/agent/agent-ha-helper.log' +HTTP_OK = 200 +HTTP_MULTIPLE_CHOICES = 300 +HTTP_UNAUTHORIZED = 401 +HTTP_NOT_FOUND = 404 +HTTP_PROTOCOL = 'http' +HTTPS_PROTOCOL = 'https' +QEMU_SYSTEM = 'qemu:///system' + +# Variables server_side = True bind = '' +key = '' insecure = False -port=8443 +port = 8443 +http_protocol = HTTPS_PROTOCOL +username = 'kvmHaHelperDefaultUsername' +password = 'kvmhahelperDefaultPassword' + +""" + This web-server 
exposes a simple JSON API that returns a list of Virtual Machines running according to Libvirt. + This helps on the CloudStack KVM HA as it van verify VMs status with HTTP-call to this simple webserver + and determine if the host is actually down or if it is just the Java Agent which has crashed. +""" + class Libvirt(): + """ + Provides an interface to the libvirt, allowing to run commands in the libvirt. + However, the scope of this webservice is restricted to LISTING VMs. + Noneless, it would not just be out of scope from the KVM High Availability Client "needs" as well as it could + expose security issues in case this webservice is accessed by malicious threats. + """ + def __init__(self): - self.conn = libvirt.openReadOnly("qemu:///system") + self.conn = libvirt.openReadOnly(QEMU_SYSTEM) if not self.conn: libvirt_error = 'Failed to open connection to libvirt' logging.error(libvirt_error) @@ -60,81 +89,123 @@ def running_vms(self): return domains + class HTTPServerV6(HTTPServer): address_family = socket.AF_INET6 -class CloudStackAgentHAHelper(SimpleHTTPRequestHandler): - def do_GET(self): - if self.path == root_path: - libvirt = Libvirt() - running_vms = libvirt.running_vms() +class CloudStackAgentHAHelper(SimpleHTTPRequestHandler): + """ + Provides an HTTP client that can either be HTTP or HTTPS. + Its execution allows either to list VMs on the current KVM node, or to call the same API from another HOST. 
+ """ - output = { - 'count': len(running_vms), - 'virtualmachines': running_vms - } + def do_GET(self): + expected_header = BASIC_AUTH + key + request_header = self.headers.get("Authorization") + if request_header == expected_header: + self.send_response(HTTP_OK) + self.send_header(CONTENT_TYPE, APPLICATION_JSON) + self.end_headers() + self.process_get_request() + else: + logging.error('Failed to authenticate: wrong authentication method or credentials.') + self.send_response(HTTP_UNAUTHORIZED) + self.send_header(CONTENT_TYPE, APPLICATION_JSON) + self.end_headers() - self.send_response(http_ok) - self.send_header('Content-type', 'application/json') + def process_get_request(self): + if self.path == ROOT_PATH: + self.do_libvirt_vms_list() + elif CHECK_NEIGHBOUR_PATH in self.path: + self.do_libvirt_neighbour_check() + else: + self.send_response(HTTP_NOT_FOUND) self.end_headers() - self.wfile.write(json.dumps(output).encode()) + return - elif check_path in self.path: - host_and_port = self.path.partition(check_path)[2] - request_url = 'https://{}/'.format(host_and_port) - logging.debug('Check if Host {} is reachable via HTTPs GET request to agent-ha-helper.'.format(request_url)) - logging.debug('GET request: {}'.format(request_url)) - try: - response = requests.get(url = request_url) - if http_ok <= response.status_code < http_multiple_choices: - request_response = 'Up' - else: - request_response = 'Down' - except: - logging.error('GET Request {} failed.'.format(request_url)) - output = { - 'status': 'Down' - } - logging.debug('Neighbour host status: {}'.format(output)) - self.send_response(http_not_found) - self.send_header('Content-type', 'application/json') - self.end_headers() - self.wfile.write(json.dumps(output).encode()) - return - - logging.debug('Neighbour host status: {}'.format(request_response)) + def do_libvirt_vms_list(self): + """ List Running VMs """ + libvirt = Libvirt() + + running_vms = libvirt.running_vms() + + output = { + 'count': 
len(running_vms), + 'virtualmachines': running_vms + } + + self.send_response(HTTP_OK) + self.send_header(CONTENT_TYPE, APPLICATION_JSON) + self.end_headers() + self.wfile.write(json.dumps(output).encode()) + + def do_libvirt_neighbour_check(self): + """ + Sends a request to the neighbour host, validating if it is healthy. + Healthy Hosts respond by listing the expected number of VMs Running on the KVM. + Unhealthy hosts fit in one of the following cases: + (i) do not respond; + (ii) respond with error code '-1'; + (iii) list '0' (zero) VMs when it was supposed to have multiple VMs running. + """ + host_and_port = self.path.partition(CHECK_NEIGHBOUR_PATH)[2] + request_url = '{}://{}/'.format(http_protocol, host_and_port) + logging.debug('Check if Host {} is reachable via HTTPs GET request to agent-ha-helper.'.format(request_url)) + logging.debug('GET request: {}'.format(request_url)) + try: + response = requests.get(url=request_url) + if HTTP_OK <= response.status_code < HTTP_MULTIPLE_CHOICES: + request_response = 'Up' + else: + request_response = 'Down' + except: + logging.error('GET Request {} failed.'.format(request_url)) output = { - 'status': request_response, + 'status': 'Down' } - self.send_response(http_ok) - self.send_header('Content-type', 'application/json') + logging.debug('Neighbour host status: {}'.format(output)) + self.send_response(HTTP_NOT_FOUND) + self.send_header(CONTENT_TYPE, 'application/json') self.end_headers() self.wfile.write(json.dumps(output).encode()) - - else: - self.send_response(http_not_found) - self.end_headers() return + logging.debug('Neighbour host status: {}'.format(request_response)) + output = { + 'status': request_response, + } + self.send_response(HTTP_OK) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(output).encode()) + def run(): + """ Configure, create and serve the 'HTTP server' """ server_address_and_port = (bind, port) httpd = 
HTTPServerV6((server_address_and_port), CloudStackAgentHAHelper) if insecure: logging.warning('Creating HTTP Server on insecure mode (HTTP, no SSL) exposed at port {}.'.format(port)) - elif not os.path.isfile(cloud_key) or not os.path.isfile(cloud_cert): - error_message = 'Failed to run HTTPS server, cannot find certificate or key files: "{}" or "{}".'.format(cloud_key, cloud_cert) + elif not os.path.isfile(CLOUD_KEY_PATH) or not os.path.isfile(CLOUD_CERT_PATH): + error_message = 'Failed to run HTTPS server, cannot find certificate or key files: "{}" or "{}".'.format( + CLOUD_KEY_PATH, CLOUD_CERT_PATH) logging.error(error_message) raise Exception(error_message) else: logging.debug('Creating HTTPs Server at port {}.'.format(port)) - httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=cloud_key, certfile=cloud_cert) + httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=CLOUD_KEY_PATH, certfile=CLOUD_CERT_PATH) httpd.serve_forever() +def set_auth_key(username, password): + """ Prepares Authentication key encoding the string ':' in base 64 """ + username_and_password = '{}:{}'.format(username, password) + return b64encode(bytes(username_and_password, 'utf-8')).decode('utf-8') + if __name__ == "__main__": import argparse, sys - logging.basicConfig(filename=log_path, format='%(asctime)s - %(message)s', level=logging.DEBUG) + + logging.basicConfig(filename=LOG_PATH, format='%(asctime)s - %(message)s', level=logging.DEBUG) try: parser = argparse.ArgumentParser(prog='agent-ha-helper', usage='%(prog)s [-h] [-i] -p ', @@ -143,19 +214,32 @@ def run(): 'if the host (or a neighbour host) is healthy.') parser.add_argument('-i', '--insecure', help='Allows to run the HTTP server without SSL', action='store_true') parser.add_argument('-p', '--port', help='Port to be used by the agent-ha-helper server', type=int) + parser.add_argument('-u', '--username', help='Sets the user for server authentication', type=str) + parser.add_argument('-k', '--password', help='Keyword/password for 
server authentication', type=str) args = parser.parse_args() if not len(sys.argv) > 1: parser.print_help(sys.stderr) - logging.warning('Note: no arguments have been passed. Using default values [bind: "::", port: {}, insecure: {}].'.format(port, insecure)) + logging.warning( + 'Note: no arguments have been passed. Running with default configuration ' + '[bind:"::", port:{}, insecure:{}, username:{}, keyword:{}].'.format(port, insecure, username, + password)) else: if args.insecure: insecure = True port = 8080 - print("insecure turned on") + http_protocol = HTTP_PROTOCOL + logging.warning("WARNING: Insecure Mode turned ON!") if args.port is not None: port = args.port + if args.username is not None: + username = args.username + if args.password is not None: + password = args.password + + key = set_auth_key(username, password) run() + except KeyboardInterrupt: pass From 5918befc19be3cfe273c12022189bf6b85e39e40 Mon Sep 17 00:00:00 2001 From: gabriel Date: Fri, 21 Jan 2022 14:40:37 +0100 Subject: [PATCH 39/40] Update KVM HA Client to mach with KVM HA Helper HTTPS server. 
--- .../upgrade/SystemVmTemplateRegistration.java | 2 +- .../cloudstack-agent-ha-helper.default | 15 ++- .../cloudstack-agent-ha-helper.service | 2 +- .../apache/cloudstack/kvm/ha/KVMHAConfig.java | 15 +++ .../cloudstack/kvm/ha/KVMHAProvider.java | 3 + .../cloudstack/kvm/ha/KvmHaAgentClient.java | 103 ++++++++++++++++-- .../kvm/ha/KvmHaAgentClientTest.java | 22 ++-- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 36 ++++-- 8 files changed, 164 insertions(+), 34 deletions(-) diff --git a/engine/schema/src/main/java/com/cloud/upgrade/SystemVmTemplateRegistration.java b/engine/schema/src/main/java/com/cloud/upgrade/SystemVmTemplateRegistration.java index 1eb3fdd20bbc..bacfc0f2e7e3 100644 --- a/engine/schema/src/main/java/com/cloud/upgrade/SystemVmTemplateRegistration.java +++ b/engine/schema/src/main/java/com/cloud/upgrade/SystemVmTemplateRegistration.java @@ -700,7 +700,7 @@ public static String parseMetadataFile() { Ini.Section section = ini.get("default"); return section.get("version"); } catch (Exception e) { - String errMsg = String.format("Failed to parse systemVM template metadata file: %s", METADATA_FILE); + String errMsg = String.format("Failed to parse systemVM template metadata file: %s", METADATA_FILE); LOGGER.error(errMsg, e); throw new CloudRuntimeException(errMsg, e); } diff --git a/packaging/systemd/cloudstack-agent-ha-helper.default b/packaging/systemd/cloudstack-agent-ha-helper.default index f96fed3bf0c0..b6bcb13fcb5b 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.default +++ b/packaging/systemd/cloudstack-agent-ha-helper.default @@ -18,11 +18,20 @@ # The agent-ha-helper.py provides a HTTP server which handles API requests to identify if the host (or a neighbour host) is healthy. # optional arguments: -# PORT Port to be used by the agent-ha-helper server +# PORT Sets the port to be used by the agent-ha-helper server (must contains the flag '-p' for Script to add the value) # Default is 8443. 
However, if the "insecure" mode is activated, then it uses 8080 by default. -# PORT=8443 +# PORT="-p 8443" # If uncommented, it passes flag that allows to deploy the HTTP server without SSL (default port would be 8080 in such case) +# If no mode is provided, agent-ha-helper creates HTTP server with support to SSL via cloud.crt and cloud.key. # To enable HTTP requests then make it # SECURITY_MODE="--insecure" -# If no mode is provided, agent-ha-helper creates HTTP server with support to SSL via cloud.crt and cloud.key. + + +# USERNAME Sets username for server authentication (must contain the flag '-u' for Script to add the value) +# Default is kvmHaHelperDefaultUsername. If this is changed, the CloudStack global settings 'kvm.ha.webservice.username' should also be updated to match this configuration. +# USERNAME="-u kvmHaHelperDefaultUsername" + +# KEYWORD Sets the keyword / password for server authentication (must contain the flag '-k' for Script to add the value) +# Default is kvmHaHelperDefaultPassword. If this is changed, the CloudStack global settings 'kvm.ha.webservice.password' should also be updated to match this configuration. 
+# KEYWORD="-k kvmHaHelperDefaultPassword" diff --git a/packaging/systemd/cloudstack-agent-ha-helper.service b/packaging/systemd/cloudstack-agent-ha-helper.service index 14b95dc3790b..bdb172c29d6b 100644 --- a/packaging/systemd/cloudstack-agent-ha-helper.service +++ b/packaging/systemd/cloudstack-agent-ha-helper.service @@ -24,7 +24,7 @@ After=libvirtd.service [Service] Type=simple EnvironmentFile=/etc/default/cloudstack-agent-ha-helper -ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $PORT $SECURITY_MODE +ExecStart=/usr/share/cloudstack-common/scripts/vm/hypervisor/kvm/agent-ha-helper.py $SECURITY_MODE $PORT $USERNAME $KEYWORD Restart=always RestartSec=10s diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index e1c873d93362..b0af0d7e90cf 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -62,8 +62,23 @@ public class KVMHAConfig { + "One can enable (set to 'true') or disable it ('false'). If disabled then CloudStack ignores HA validation via this agent.", true, ConfigKey.Scope.Cluster); + public static final ConfigKey KvmHaWebserviceSslEnabled = new ConfigKey("Advanced", Boolean.class, "kvm.ha.webservice.ssl.enabled", "true", + "Enable SSL for KVM HA Helper Agent. Note that HA Helper SSL will work only on KVM nodes where there has been provided Security Keys or a custom certificate configured." + + "Default value is true, if you configured your KVM HA Helper servers (on KVM nodes) with SSL disabled then you must change this to False." 
+ + "Note that HA Helper SSL will work only on KVM nodes where there has been provided Security Keys or a custom certificate configured.", + true, ConfigKey.Scope.Cluster); + + public static final ConfigKey KvmHaWebserviceUsername = new ConfigKey("Advanced", String.class, "kvm.ha.webservice.username", "kvmHaHelperDefaultUsername", + "Sets the username for KVM HA webserver Basic Authentication. The username set here must match with the webserver's configured username.", + true, ConfigKey.Scope.Cluster); + + public static final ConfigKey KvmHaWebservicePassword = new ConfigKey("Advanced", String.class, "kvm.ha.webservice.password", "kvmHaHelperDefaultPassword", + "Sets the password for KVM HA webserver Basic Authentication. The password set here must match with the webserver's configured password.", + true, ConfigKey.Scope.Cluster); + public static final ConfigKey KvmHaAcceptedProblematicHostsRatio = new ConfigKey("Advanced", Double.class, "kvm.ha.accepted.problematic.hosts.ratio", "0.3", "The ratio of problematic Hosts accepted on a Cluster. If a cluster has more than the accepted ratio, HA will not Fence/Recover Hosts; instead, it will notify Admins to check the cluster healthy. " + "A Host is considered problematic if in one of the following states: Error, Alert, Down, Disconnected. 
Default value is '0.3' (30%).", true, ConfigKey.Scope.Cluster); + } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index 69f3f2aa07bd..4594f081441d 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -154,6 +154,9 @@ public ConfigKey[] getConfigKeys() { KVMHAConfig.KvmHARecoverAttemptThreshold, KVMHAConfig.KvmHaWebservicePort, KVMHAConfig.IsKvmHaWebserviceEnabled, + KVMHAConfig.KvmHaWebserviceSslEnabled, + KVMHAConfig.KvmHaWebserviceUsername, + KVMHAConfig.KvmHaWebservicePassword, KVMHAConfig.KvmHaAcceptedProblematicHostsRatio }; } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java index e3b26d7a0909..df6fb422a293 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClient.java @@ -32,17 +32,26 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.client.utils.URIBuilder; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.ssl.SSLContexts; import org.apache.log4j.Logger; import org.jetbrains.annotations.Nullable; import javax.inject.Inject; +import javax.net.ssl.SSLContext; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URISyntaxException; import 
java.nio.charset.StandardCharsets; +import java.security.KeyManagementException; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.util.Base64; import java.util.List; import java.util.concurrent.TimeUnit; @@ -67,6 +76,10 @@ public class KvmHaAgentClient { private static final int WAIT_FOR_REQUEST_RETRY = 2; private static final int MAX_REQUEST_RETRIES = 2; private static final JsonParser JSON_PARSER = new JsonParser(); + static final String HTTP_PROTOCOL = "http"; + static final String HTTPS_PROTOCOL = "https"; + private final static String APPLICATION_JSON = "application/json"; + private final static String ACCEPT = "accept"; @Inject private VMInstanceDao vmInstanceDao; @@ -75,7 +88,8 @@ public class KvmHaAgentClient { * Returns the number of VMs running on the KVM host according to Libvirt. */ public int countRunningVmsOnAgent(Host host) { - String url = String.format("https://%s:%d", host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); + String protocol = getProtocolString(); + String url = String.format("%s://%s:%d", protocol, host.getPrivateIpAddress(), getKvmHaMicroservicePortValue(host)); HttpResponse response = executeHttpRequest(url); if (response == null) @@ -89,6 +103,21 @@ public int countRunningVmsOnAgent(Host host) { return responseInJson.get(VM_COUNT).getAsInt(); } + /** + * Returns the HTTP protocol. 
It can be 'HTTP' or 'HTTPS' depending on configuration 'kvm.ha.webservice.ssl.enabled' + */ + protected String getProtocolString() { + boolean KvmHaWebserviceSslEnabled = KVMHAConfig.KvmHaWebserviceSslEnabled.value(); + String protocol = HTTP_PROTOCOL; + if (KvmHaWebserviceSslEnabled) { + protocol = HTTPS_PROTOCOL; + } + return protocol; + } + + /** + * Returns the port from the KVM HA Helper according to the configuration 'kvm.ha.webservice.port' + */ protected int getKvmHaMicroservicePortValue(Host host) { Integer haAgentPort = KVMHAConfig.KvmHaWebservicePort.value(); if (haAgentPort == null) { @@ -131,7 +160,8 @@ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { String neighbourHostAddress = neighbour.getPrivateIpAddress(); String targetHostAddress = target.getPrivateIpAddress(); int port = getKvmHaMicroservicePortValue(neighbour); - String url = String.format("https://%s:%d/%s/%s:%d", neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); + String protocol = getProtocolString(); + String url = String.format("%s://%s:%d/%s/%s:%d", protocol, neighbourHostAddress, port, CHECK_NEIGHBOUR, targetHostAddress, port); HttpResponse response = executeHttpRequest(url); if (response == null) @@ -143,8 +173,8 @@ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { int statusCode = response.getStatusLine().getStatusCode(); if (isHttpStatusCodNotOk(statusCode)) { - LOGGER.error( - String.format("Failed HTTP %s Request %s; the expected HTTP status code is '%s' but it got '%s'.", HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); + LOGGER.error(String.format("Failed HTTP %s Request %s; the expected HTTP status code is '%s' but it got '%s'.", + HttpGet.METHOD_NAME, url, EXPECTED_HTTP_STATUS, statusCode)); return false; } @@ -152,6 +182,9 @@ public boolean isHostReachableByNeighbour(Host neighbour, Host target) { return Status.Up.toString().equals(hostStatusFromJson); } + /** + * Return 'True' in case the Status 
Code is NOT in the [200,299] range. + */ protected boolean isHttpStatusCodNotOk(int statusCode) { return statusCode < HttpStatus.SC_OK || statusCode >= HttpStatus.SC_MULTIPLE_CHOICES; } @@ -166,20 +199,56 @@ protected HttpResponse executeHttpRequest(String url) { return null; } - HttpClient client = HttpClientBuilder.create().build(); + HttpClient httpClient = prepareHttpClient(httpReq); + if (httpClient == null) + return null; + HttpResponse response = null; try { - response = client.execute(httpReq); + response = httpClient.execute(httpReq); } catch (IOException e) { if (MAX_REQUEST_RETRIES == 0) { LOGGER.warn(String.format("Failed to execute HTTP %s request [URL: %s] due to exception %s.", httpReq.getMethod(), url, e), e); return null; } - response = retryHttpRequest(url, httpReq, client); + response = retryHttpRequest(url, httpReq, httpClient); } return response; } + /** + * Creates an httpClient that can be either prepared for HTTP requests or HTTP*S* (accepting sign certificates). + * In case of exceptions, it returns null. + */ + @Nullable + private HttpClient prepareHttpClient(HttpRequestBase httpReq) { + HttpClient httpClient; + boolean isKvmHaWebserviceSslEnabled = KVMHAConfig.KvmHaWebserviceSslEnabled.value(); + if (isKvmHaWebserviceSslEnabled) { + try { + setsHttpHeaderForBasicAuthentication(httpReq); + httpClient = createSslHttpClient(); + } catch (NoSuchAlgorithmException | KeyStoreException | KeyManagementException e) { + LOGGER.error(String.format("Failed to create HTTPS Client due to exception %s.", e), e); + return null; + } + } else { + httpClient = HttpClientBuilder.create().build(); + } + return httpClient; + } + + /** + * Creates an HttpClient that implements SSL. + * Note that it accepts self-signed certificates as CloudStack agents only have this kind of certs on Agents. 
+ */ + private HttpClient createSslHttpClient() throws NoSuchAlgorithmException, KeyStoreException, KeyManagementException { + SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new TrustSelfSignedStrategy()).build(); + SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext); + HttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build(); + return httpClient; + } + @Nullable private HttpGet prepareHttpRequestForUrl(String url) { try { @@ -254,4 +323,24 @@ protected JsonObject processHttpResponseIntoJson(HttpResponse response) { throw new CloudRuntimeException("Failed to process response", e); } } + + /** + * Adds to the given HttpRequest a Basic Authentication header with the respective Username:Password for the KVM HA Helper. + */ + protected void setsHttpHeaderForBasicAuthentication(HttpRequestBase httpReq) { + String username = KVMHAConfig.KvmHaWebserviceUsername.value(); + String password = KVMHAConfig.KvmHaWebservicePassword.value(); + httpReq.addHeader(ACCEPT, APPLICATION_JSON); + String encoding = basicAuth(username, password); + httpReq.addHeader("Authorization", encoding); + } + + /** + * Encodes 'username:password' into 64-base encoded String + */ + private static String basicAuth(String username, String password) { + return "Basic " + Base64.getEncoder().encodeToString((username + ":" + password).getBytes(StandardCharsets.UTF_8)); + } + + } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java index e97d3fd7925c..e935d91ec2fb 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaAgentClientTest.java @@ -13,10 +13,11 @@ */ package org.apache.cloudstack.kvm.ha; -import 
java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; - +import com.cloud.host.HostVO; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; @@ -37,11 +38,9 @@ import org.mockito.Spy; import org.mockito.junit.MockitoJUnitRunner; -import com.cloud.host.HostVO; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; @RunWith(MockitoJUnitRunner.class) public class KvmHaAgentClientTest { @@ -63,6 +62,8 @@ public class KvmHaAgentClientTest { private static final String VMS_COUNT = "count"; private static final String VIRTUAL_MACHINES = "virtualmachines"; private static final int MAX_REQUEST_RETRIES = 2; + static final String HTTP = "http"; + static final String HTTPS = "https"; @Spy @InjectMocks @@ -101,7 +102,7 @@ private void prepareAndTestProcessHttpResponseIntoJson(String jsonString, long e } private CloseableHttpResponse mockResponse(int httpStatusCode, String jsonString) throws IOException { - BasicStatusLine basicStatusLine = new BasicStatusLine(new ProtocolVersion("HTTP", 1000, 123), httpStatusCode, "Status"); + BasicStatusLine basicStatusLine = new BasicStatusLine(new ProtocolVersion(HTTP, 1000, 123), httpStatusCode, "Status"); CloseableHttpResponse response = Mockito.mock(CloseableHttpResponse.class); InputStream in = IOUtils.toInputStream(jsonString, StandardCharsets.UTF_8); Mockito.when(response.getStatusLine()).thenReturn(basicStatusLine); @@ -308,4 +309,5 @@ public void isHttpStatusCodNotOkTestHttp404() { boolean result = kvmHaAgentClient.isHttpStatusCodNotOk(HttpStatus.SC_NOT_FOUND); Assert.assertTrue(result); } + } diff --git 
a/scripts/vm/hypervisor/kvm/agent-ha-helper.py b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index 912209ed1d2e..10c76b7cee5d 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -42,6 +42,9 @@ HTTP_PROTOCOL = 'http' HTTPS_PROTOCOL = 'https' QEMU_SYSTEM = 'qemu:///system' +STATUS = 'status' +UP = 'Up' +DOWN = 'Down' # Variables server_side = True @@ -51,7 +54,7 @@ port = 8443 http_protocol = HTTPS_PROTOCOL username = 'kvmHaHelperDefaultUsername' -password = 'kvmhahelperDefaultPassword' +password = 'kvmHaHelperDefaultPassword' """ This web-server exposes a simple JSON API that returns a list of Virtual Machines running according to Libvirt. @@ -101,13 +104,22 @@ class CloudStackAgentHAHelper(SimpleHTTPRequestHandler): """ def do_GET(self): + if not insecure: + self.do_http_basic_auth() + else: + self.do_normal_http_get() + + def do_normal_http_get(self): + self.send_response(HTTP_OK) + self.send_header(CONTENT_TYPE, APPLICATION_JSON) + self.end_headers() + self.process_get_request() + + def do_http_basic_auth(self): expected_header = BASIC_AUTH + key request_header = self.headers.get("Authorization") if request_header == expected_header: - self.send_response(HTTP_OK) - self.send_header(CONTENT_TYPE, APPLICATION_JSON) - self.end_headers() - self.process_get_request() + self.do_normal_http_get() else: logging.error('Failed to authenticate: wrong authentication method or credentials.') self.send_response(HTTP_UNAUTHORIZED) @@ -156,27 +168,27 @@ def do_libvirt_neighbour_check(self): try: response = requests.get(url=request_url) if HTTP_OK <= response.status_code < HTTP_MULTIPLE_CHOICES: - request_response = 'Up' + request_response = UP else: - request_response = 'Down' + request_response = DOWN except: logging.error('GET Request {} failed.'.format(request_url)) output = { - 'status': 'Down' + STATUS : DOWN } logging.debug('Neighbour host status: {}'.format(output)) self.send_response(HTTP_NOT_FOUND) - 
self.send_header(CONTENT_TYPE, 'application/json') + self.send_header(CONTENT_TYPE, APPLICATION_JSON) self.end_headers() self.wfile.write(json.dumps(output).encode()) return logging.debug('Neighbour host status: {}'.format(request_response)) output = { - 'status': request_response, + STATUS : request_response, } self.send_response(HTTP_OK) - self.send_header('Content-type', 'application/json') + self.send_header(CONTENT_TYPE, APPLICATION_JSON) self.end_headers() self.wfile.write(json.dumps(output).encode()) @@ -208,7 +220,7 @@ def set_auth_key(username, password): logging.basicConfig(filename=LOG_PATH, format='%(asctime)s - %(message)s', level=logging.DEBUG) try: parser = argparse.ArgumentParser(prog='agent-ha-helper', - usage='%(prog)s [-h] [-i] -p ', + usage='%(prog)s [-h] [-i] -p -u -k ', description='The agent-ha-helper.py provides a HTTP server ' 'which handles API requests to identify ' 'if the host (or a neighbour host) is healthy.') From 10c86365ae6358b221f405d1ef591c7c5a98a42e Mon Sep 17 00:00:00 2001 From: gabriel Date: Wed, 9 Mar 2022 11:30:42 +0100 Subject: [PATCH 40/40] Update "check-neighbour" and client/server HTTPs with Auth --- .../cloudstack/kvm/ha/KvmHaHelperTest.java | 2 +- scripts/vm/hypervisor/kvm/agent-ha-helper.py | 73 +++++++++---------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java index 9a62e9f0c1c2..6d6c6d29d89e 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KvmHaHelperTest.java @@ -43,7 +43,7 @@ public class KvmHaHelperTest { @Spy @InjectMocks private KvmHaHelper kvmHaHelper; - @Mock + @Spy private KvmHaAgentClient kvmHaAgentClient; @Mock private HostVO host; diff --git a/scripts/vm/hypervisor/kvm/agent-ha-helper.py 
b/scripts/vm/hypervisor/kvm/agent-ha-helper.py index 10c76b7cee5d..68ac55fcef19 100755 --- a/scripts/vm/hypervisor/kvm/agent-ha-helper.py +++ b/scripts/vm/hypervisor/kvm/agent-ha-helper.py @@ -27,6 +27,7 @@ from base64 import b64encode # Constants +AGENT_HA_HELPER_LOG_MARKER = '[AGENT-HA-HELPER]' APPLICATION_JSON = 'application/json' BASIC_AUTH = "Basic " CONTENT_TYPE = 'Content-type' @@ -34,17 +35,20 @@ CHECK_NEIGHBOUR_PATH = '/check-neighbour/' CLOUD_KEY_PATH = '/etc/cloudstack/agent/cloud.key' CLOUD_CERT_PATH = '/etc/cloudstack/agent/cloud.crt' -LOG_PATH = '/var/log/cloudstack/agent/agent-ha-helper.log' +LOG_PATH = '/var/log/cloudstack/agent/agent.log' HTTP_OK = 200 HTTP_MULTIPLE_CHOICES = 300 HTTP_UNAUTHORIZED = 401 HTTP_NOT_FOUND = 404 HTTP_PROTOCOL = 'http' HTTPS_PROTOCOL = 'https' +UTF8 = 'utf-8' QEMU_SYSTEM = 'qemu:///system' STATUS = 'status' UP = 'Up' DOWN = 'Down' +COUNT = 'count' +VIRTUAL_MACHINES = 'virtualmachines' # Variables server_side = True @@ -60,6 +64,13 @@ This web-server exposes a simple JSON API that returns a list of Virtual Machines running according to Libvirt. This helps on the CloudStack KVM HA as it van verify VMs status with HTTP-call to this simple webserver and determine if the host is actually down or if it is just the Java Agent which has crashed. 
+ + Optional arguments: + -h, --help Show this help message and exit + -i, --insecure Allows to run the HTTP server without SSL + -p, --port PORT Port to be used by the agent-ha-helper server + -u, --username USERNAME Sets the user for server authentication + -k, --password PASSWORD Keyword/password for server authentication """ @@ -74,7 +85,7 @@ class Libvirt(): def __init__(self): self.conn = libvirt.openReadOnly(QEMU_SYSTEM) if not self.conn: - libvirt_error = 'Failed to open connection to libvirt' + libvirt_error = '{} Failed to open connection to libvirt'.format(AGENT_HA_HELPER_LOG_MARKER) logging.error(libvirt_error) raise Exception(libvirt_error) @@ -121,7 +132,7 @@ def do_http_basic_auth(self): if request_header == expected_header: self.do_normal_http_get() else: - logging.error('Failed to authenticate: wrong authentication method or credentials.') + logging.error('{} Failed to authenticate: wrong authentication method or credentials.'.format(AGENT_HA_HELPER_LOG_MARKER)) self.send_response(HTTP_UNAUTHORIZED) self.send_header(CONTENT_TYPE, APPLICATION_JSON) self.end_headers() @@ -134,7 +145,7 @@ def process_get_request(self): else: self.send_response(HTTP_NOT_FOUND) self.end_headers() - return + return def do_libvirt_vms_list(self): """ List Running VMs """ @@ -143,13 +154,10 @@ def do_libvirt_vms_list(self): running_vms = libvirt.running_vms() output = { - 'count': len(running_vms), - 'virtualmachines': running_vms + COUNT: len(running_vms), + VIRTUAL_MACHINES: running_vms } - self.send_response(HTTP_OK) - self.send_header(CONTENT_TYPE, APPLICATION_JSON) - self.end_headers() self.wfile.write(json.dumps(output).encode()) def do_libvirt_neighbour_check(self): @@ -163,33 +171,23 @@ def do_libvirt_neighbour_check(self): """ host_and_port = self.path.partition(CHECK_NEIGHBOUR_PATH)[2] request_url = '{}://{}/'.format(http_protocol, host_and_port) - logging.debug('Check if Host {} is reachable via HTTPs GET request to agent-ha-helper.'.format(request_url)) - 
logging.debug('GET request: {}'.format(request_url)) + logging.debug('{} Check if neighbour Host is reachable via HTTPs GET request [{}] to agent-ha-helper.'.format(AGENT_HA_HELPER_LOG_MARKER, request_url)) + request_response = DOWN try: - response = requests.get(url=request_url) + if insecure: + response = requests.get(url=request_url) + else: + basic_auth_header = 'Basic {}'.format(key) + response = requests.get(url=request_url, headers={'Authorization': basic_auth_header}, verify=False) + if HTTP_OK <= response.status_code < HTTP_MULTIPLE_CHOICES: request_response = UP - else: - request_response = DOWN - except: - logging.error('GET Request {} failed.'.format(request_url)) - output = { - STATUS : DOWN - } - logging.debug('Neighbour host status: {}'.format(output)) - self.send_response(HTTP_NOT_FOUND) - self.send_header(CONTENT_TYPE, APPLICATION_JSON) - self.end_headers() - self.wfile.write(json.dumps(output).encode()) - return + except Exception as e: + logging.error('{} GET Request {} failed due to {}.'.format(AGENT_HA_HELPER_LOG_MARKER, request_url, e)) - logging.debug('Neighbour host status: {}'.format(request_response)) output = { STATUS : request_response, } - self.send_response(HTTP_OK) - self.send_header(CONTENT_TYPE, APPLICATION_JSON) - self.end_headers() self.wfile.write(json.dumps(output).encode()) def run(): @@ -198,24 +196,25 @@ def run(): httpd = HTTPServerV6((server_address_and_port), CloudStackAgentHAHelper) if insecure: - logging.warning('Creating HTTP Server on insecure mode (HTTP, no SSL) exposed at port {}.'.format(port)) + logging.warning('{} Creating HTTP Server on insecure mode (HTTP, no SSL) exposed at port {}.'.format(AGENT_HA_HELPER_LOG_MARKER, port)) elif not os.path.isfile(CLOUD_KEY_PATH) or not os.path.isfile(CLOUD_CERT_PATH): - error_message = 'Failed to run HTTPS server, cannot find certificate or key files: "{}" or "{}".'.format( - CLOUD_KEY_PATH, CLOUD_CERT_PATH) + error_message = '{} Failed to run HTTPS server, cannot find 
certificate or key files: "{}" or "{}".'.format( + AGENT_HA_HELPER_LOG_MARKER, CLOUD_KEY_PATH, CLOUD_CERT_PATH) logging.error(error_message) raise Exception(error_message) else: - logging.debug('Creating HTTPs Server at port {}.'.format(port)) + logging.debug('{} Creating HTTPs Server at port {}.'.format(AGENT_HA_HELPER_LOG_MARKER, port)) httpd.socket = ssl.wrap_socket(httpd.socket, keyfile=CLOUD_KEY_PATH, certfile=CLOUD_CERT_PATH) httpd.serve_forever() def set_auth_key(username, password): """ Prepares Authentication key encoding the string ':' in base 64 """ username_and_password = '{}:{}'.format(username, password) - return b64encode(bytes(username_and_password, 'utf-8')).decode('utf-8') + return b64encode(bytes(username_and_password, UTF8)).decode(UTF8) if __name__ == "__main__": - import argparse, sys + import argparse, sys, urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) logging.basicConfig(filename=LOG_PATH, format='%(asctime)s - %(message)s', level=logging.DEBUG) try: @@ -234,8 +233,8 @@ def set_auth_key(username, password): parser.print_help(sys.stderr) logging.warning( 'Note: no arguments have been passed. Running with default configuration ' - '[bind:"::", port:{}, insecure:{}, username:{}, keyword:{}].'.format(port, insecure, username, - password)) + '[bind:"::", port:{}, insecure:{}, username:{}, keyword:{}].'.format( + port, insecure, username, password)) else: if args.insecure: insecure = True