From a2e0490399e265fee0e58f1e76fe2c6c13beaa6e Mon Sep 17 00:00:00 2001 From: Justin Pierce Date: Tue, 30 Aug 2022 16:44:48 -0400 Subject: [PATCH] Prevent frequent m6a capacity overruns Checks search.ci.openshift.org for any capacity issues for m6as of the selected size in the slice's region. If capacity issues are detected in the last 30 minutes, falls back to the less cost effective m6i family. --- .../ipi/conf/aws/ipi-conf-aws-commands.sh | 105 ++++++++++++------ 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/ci-operator/step-registry/ipi/conf/aws/ipi-conf-aws-commands.sh b/ci-operator/step-registry/ipi/conf/aws/ipi-conf-aws-commands.sh index bea728d0459b3..e6a6f8d0c73af 100755 --- a/ci-operator/step-registry/ipi/conf/aws/ipi-conf-aws-commands.sh +++ b/ci-operator/step-registry/ipi/conf/aws/ipi-conf-aws-commands.sh @@ -18,37 +18,86 @@ REGION="${LEASED_RESOURCE}" # for general purpose work. Use by default, when supported in the # region. IS_M6A_REGION="no" -if aws ec2 describe-instance-type-offerings --region "${REGION}" | grep m6a ; then +if aws ec2 describe-instance-type-offerings --region "${REGION}" | grep -q m6a ; then IS_M6A_REGION="yes" fi +function eval_instance_capacity() { + local DESIRED_TYPE="$1" + local FALLBACK_TYPE="$2" + # During our initial adoption of m6a, AWS has reported insufficient capacity at peak hours. For cost effectiveness + # and to ensure AWS eventually adds m6a capacity due to these errors, we want to continue to use them. However, + # if left unchecked, these peak hour errors can derail a statistically significant number of jobs. + # To mitigate the capacity issues, search.ci.openshift.org can tell us if previous jobs have failed to provision + # the desired instance type - in this region - in the last x minutes. + # If we find such an error, use the fallback instance type. 
+ + # Example error + # error creating EC2 instance: InsufficientInstanceCapacity: We currently do not have sufficient m6a.xlarge capacity + # in the Availability Zone you requested (us-east-1c). Our system will be working on provisioning additional capacity. + # You can currently get m6a.xlarge capacity by not specifying an Availability Zone in your request or choosing + # us-east-1a, us-east-1b, us-east-1d, us-east-1f.\n status code: 500, request id: ... + + set +o errexit + local LOOK_BACK_PERIOD="30m" + local TARGET_TYPE="${DESIRED_TYPE}" + for retry in {1..30}; do + if err_count=$(curl -L -s "https://search.ci.openshift.org/search?search=InsufficientInstanceCapacity.*${DESIRED_TYPE}.*${REGION}&maxAge=${LOOK_BACK_PERIOD}&context=0&type=build-log" | jq length); then + if [[ "${err_count}" == "0" ]]; then + break # Use DESIRED_TYPE + else + >&2 echo "Recent instance AWS availability issue for ${DESIRED_TYPE} in ${REGION}; falling back to ${FALLBACK_TYPE}" + TARGET_TYPE="${FALLBACK_TYPE}" + break + fi + fi + sleep 2 + >&2 echo "Error querying search.ci.openshift.org for AWS instance availability information (retry ${retry} of 30)." + done + + echo "${TARGET_TYPE}" + set -o errexit +} + # Do not change auto-types unless it is coordinated with the cloud # financial operations team. Savings plans may be in place to # decrease the cost of certain instance families. 
if [[ "${COMPUTE_NODE_TYPE}" == "" ]]; then if [[ "${IS_M6A_REGION}" == "yes" ]]; then - COMPUTE_NODE_TYPE="m6a.xlarge" + COMPUTE_NODE_TYPE=$(eval_instance_capacity "m6a.xlarge" "m6i.xlarge") else COMPUTE_NODE_TYPE="m6i.xlarge" fi fi +CONTROL_PLANE_INSTANCE_SIZE="xlarge" +if [[ "${SIZE_VARIANT}" == "xlarge" ]]; then + CONTROL_PLANE_INSTANCE_SIZE="8xlarge" +elif [[ "${SIZE_VARIANT}" == "large" ]]; then + CONTROL_PLANE_INSTANCE_SIZE="4xlarge" +elif [[ "${SIZE_VARIANT}" == "compact" ]]; then + CONTROL_PLANE_INSTANCE_SIZE="2xlarge" +fi + # BootstrapInstanceType gets its value from pkg/types/aws/defaults/platform.go architecture=${OCP_ARCH:-"amd64"} -if [[ "${IS_M6A_REGION}" == "yes" ]]; then - arch_instance_type=m6a -else - arch_instance_type=m6i -fi - if [[ "${CLUSTER_TYPE}" == "aws-arm64" ]]; then architecture="arm64" fi if [[ x"${architecture}" == x"arm64" ]]; then arch_instance_type=m6g + CONTROL_PLANE_INSTANCE_TYPE="${arch_instance_type}.${CONTROL_PLANE_INSTANCE_SIZE}" +else + if [[ "${IS_M6A_REGION}" == "yes" ]]; then + CONTROL_PLANE_INSTANCE_TYPE=$(eval_instance_capacity "m6a.${CONTROL_PLANE_INSTANCE_SIZE}" "m6i.${CONTROL_PLANE_INSTANCE_SIZE}") + else + CONTROL_PLANE_INSTANCE_TYPE="m6i.${CONTROL_PLANE_INSTANCE_SIZE}" + fi + arch_instance_type=$(echo -n "${CONTROL_PLANE_INSTANCE_TYPE}" | cut -d . 
-f 1) fi + BOOTSTRAP_NODE_TYPE=${arch_instance_type}.large workers=3 @@ -56,35 +105,26 @@ if [[ "${SIZE_VARIANT}" == "compact" ]]; then workers=0 fi -master_type=${arch_instance_type}.xlarge -if [[ "${SIZE_VARIANT}" == "xlarge" ]]; then - master_type=${arch_instance_type}.8xlarge -elif [[ "${SIZE_VARIANT}" == "large" ]]; then - master_type=${arch_instance_type}.4xlarge -elif [[ "${SIZE_VARIANT}" == "compact" ]]; then - master_type=${arch_instance_type}.2xlarge -fi - # Generate working availability zones from the region mapfile -t AVAILABILITY_ZONES < <(aws --region "${REGION}" ec2 describe-availability-zones | jq -r '.AvailabilityZones[] | select(.State == "available") | .ZoneName' | sort -u) # Generate availability zones with OpenShift Installer required instance types -if [[ "${COMPUTE_NODE_TYPE}" == "${BOOTSTRAP_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" == "${master_type}" ]]; then ## all regions are the same - mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') -elif [[ "${master_type}" == null && "${COMPUTE_NODE_TYPE}" == null ]]; then ## two null regions - mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') -elif [[ "${master_type}" == null || "${COMPUTE_NODE_TYPE}" == null ]]; then ## one null region - if [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${master_type}" || "${master_type}" == "${COMPUTE_NODE_TYPE}" ]]; then ## "one null region and duplicates" - mapfile -t INSTANCE_ZONES < <(aws 
--region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') +if [[ "${COMPUTE_NODE_TYPE}" == "${BOOTSTRAP_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" ]]; then ## all regions are the same + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') +elif [[ "${CONTROL_PLANE_INSTANCE_TYPE}" == null && "${COMPUTE_NODE_TYPE}" == null ]]; then ## two null regions + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') +elif [[ "${CONTROL_PLANE_INSTANCE_TYPE}" == null || "${COMPUTE_NODE_TYPE}" == null ]]; then ## one null region + if [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" || "${CONTROL_PLANE_INSTANCE_TYPE}" == "${COMPUTE_NODE_TYPE}" ]]; then ## "one null region and duplicates" + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}') else ## "one null region and no duplicates" - mapfile -t INSTANCE_ZONES < <(aws 
--region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}') + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}') fi -elif [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${master_type}" || "${master_type}" == "${COMPUTE_NODE_TYPE}" ]]; then ## duplicates regions with no null region - mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}') -elif [[ "${BOOTSTRAP_NODE_TYPE}" != "${COMPUTE_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" != "${master_type}" ]]; then # three different regions - mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 3 ' | awk '{print $2}') +elif [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" || "${CONTROL_PLANE_INSTANCE_TYPE}" == "${COMPUTE_NODE_TYPE}" ]]; then ## duplicates regions with no null region + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters 
Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}') +elif [[ "${BOOTSTRAP_NODE_TYPE}" != "${COMPUTE_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" != "${CONTROL_PLANE_INSTANCE_TYPE}" ]]; then # three different regions + mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 3 ' | awk '{print $2}') fi -# Generate availability zones based on these 2 criterias +# Generate availability zones based on these 2 criteria mapfile -t ZONES < <(echo "${AVAILABILITY_ZONES[@]}" "${INSTANCE_ZONES[@]}" | sed 's/ /\n/g' | sort -R | uniq -d) # Calculate the maximum number of availability zones from the region MAX_ZONES_COUNT="${#ZONES[@]}" @@ -114,6 +154,9 @@ else echo "zones already set in install-config.yaml, skipped" fi +echo "Using control plane instance type: ${CONTROL_PLANE_INSTANCE_TYPE}" +echo "Using compute instance type: ${COMPUTE_NODE_TYPE}" + PATCH="${SHARED_DIR}/install-config-common.yaml.patch" cat > "${PATCH}" << EOF baseDomain: ${BASE_DOMAIN} @@ -127,7 +170,7 @@ controlPlane: name: master platform: aws: - type: ${master_type} + type: ${CONTROL_PLANE_INSTANCE_TYPE} compute: - architecture: ${architecture} name: worker