Merged
47 commits
e9f3309
feat: add status workflows
ignacioboud Oct 15, 2025
b7f9d85
feat: create diagnose k8s
ignacioboud Nov 13, 2025
8755fc5
fix: adjust templates and entrypoints
ignacioboud Nov 13, 2025
d695713
feat: add logs in result
ignacioboud Nov 13, 2025
874f553
feat: add send to service
ignacioboud Nov 13, 2025
e66e296
fix: add logs
ignacioboud Nov 13, 2025
451ee54
fix: add log
ignacioboud Nov 13, 2025
513c769
fix: change json
ignacioboud Nov 13, 2025
add9331
fix: body
ignacioboud Nov 13, 2025
a038f22
feat: add no output
ignacioboud Nov 13, 2025
d119ce8
feat: add new checks
ignacioboud Nov 14, 2025
91810a7
fix: change status script
ignacioboud Nov 14, 2025
2e6509d
feat: update update_check_result
ignacioboud Nov 18, 2025
e8164f0
feat: add application checks
ignacioboud Nov 18, 2025
985e90d
fix: deployment id
ignacioboud Nov 18, 2025
b351803
fix: yml
ignacioboud Nov 18, 2025
0f89c4f
fix: remove exit
ignacioboud Nov 18, 2025
d6668b3
fix: changes
ignacioboud Nov 18, 2025
6493e80
feat: add new checks
ignacioboud Nov 19, 2025
f723299
fix: exit
ignacioboud Nov 19, 2025
040701c
fix: exit
ignacioboud Nov 19, 2025
7a8cd8d
fix: label selector
ignacioboud Nov 19, 2025
3d1a800
fix: remove exit
ignacioboud Nov 19, 2025
238aa93
feat: add networking
ignacioboud Nov 27, 2025
bb66c1c
fix: change not found message
ignacioboud Nov 28, 2025
2e8bf90
fix: add namespace
ignacioboud Nov 28, 2025
10375b7
fix: echo
ignacioboud Nov 28, 2025
93a4969
fix: remove export
ignacioboud Nov 28, 2025
29f8376
fix: add values
ignacioboud Nov 28, 2025
4ded6e3
fix: label
ignacioboud Nov 28, 2025
771eb0c
fix: label selector
ignacioboud Nov 28, 2025
9d48034
fix: remove exit 1
ignacioboud Dec 1, 2025
9c61ec0
Revert "fix: remove exit 1"
ignacioboud Dec 3, 2025
1bfe5b5
feat: remove running step && test exit 1
ignacioboud Dec 1, 2025
d6add76
fix: remove exit 1
ignacioboud Dec 3, 2025
b63e620
fix: reduce kubectl calls
ignacioboud Dec 5, 2025
9fbbdf7
fix: avoid data json
ignacioboud Dec 5, 2025
246a833
fix: service selector match
ignacioboud Dec 5, 2025
d0ed6e0
fix: add require ingress
ignacioboud Dec 9, 2025
2ce3217
feat: add change log
ignacioboud Dec 9, 2025
3396155
feat: add deployment wf
ignacioboud Dec 9, 2025
c36ad79
fix: change exit
ignacioboud Dec 9, 2025
5c8e679
fix: replace exit with return
ignacioboud Dec 9, 2025
d4c413d
feat: add mismatches
ignacioboud Dec 9, 2025
ebe00ec
fix: label
ignacioboud Dec 9, 2025
cee905a
fix: pod selector
ignacioboud Dec 9, 2025
92ce5f3
fix: change log
ignacioboud Dec 10, 2025
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add deployment hints for failed deployments
- Add wait for ingress reconciliation
- Only wait for blue deployment when using rolling deployment strategy
- Add **k8s/diagnose**: New diagnostic workflows and checks for troubleshooting Kubernetes scopes (scope, service, and networking diagnostics)

## [1.8.0] - 2025-11-28
- Add support for multiple override layers
13 changes: 10 additions & 3 deletions README.md
@@ -81,11 +81,18 @@ The script will:

- **`configure`**: The setup script that registers the provider with nullplatform.
- **`entrypoint`**: The main script executed by the agent. It routes events to the appropriate scope implementation.
- **`k8s/`**: Implementation for the Kubernetes scope.
- **`k8s/`**: Kubernetes scope implementation.
- `specs/`: JSON schemas and templates.
- `scope/`: Lifecycle workflows (create, update, delete).
- **`azure/`**: Implementation for the Azure scope.
- **`scheduled_task/`**: Implementation for Scheduled Tasks scope.
- `diagnose/`: Diagnostic workflows and checks for troubleshooting (scope, service, networking diagnostics).
- `deployment/`: Deployment configuration and templates.
- `backup/`: Backup workflows and utilities.
- `instance/`: Instance management workflows.
- `log/`: Log collection and processing workflows.
- `metric/`: Metrics collection workflows.
- `parameters/`: Parameter management workflows.
- **`azure/`**: Azure scope implementation.
- **`scheduled_task/`**: Scheduled Tasks scope implementation.
- **`agent/`**: Agent deployment scripts.

## IDE Support for Workflow YAML
37 changes: 37 additions & 0 deletions k8s/deployment/workflows/diagnose.yaml
@@ -0,0 +1,37 @@
continue_on_error: true
include:
- "$SERVICE_PATH/values.yaml"
steps:
- name: build context
type: script
file: "$SERVICE_PATH/diagnose/build_context"
output:
- name: CONTEXT
type: environment
- name: LABEL_SELECTOR
type: environment
- name: load_functions
type: script
file: "$SERVICE_PATH/diagnose/utils/diagnose_utils"
output:
- name: update_check_result
type: function
parameters:
status: string
evidence: object
- name: notify_results
type: function
- name: diagnose
type: executor
before_each:
name: notify_check_running
type: script
file: "$SERVICE_PATH/diagnose/notify_check_running"
after_each:
name: notify_check_results
type: script
file: "$SERVICE_PATH/diagnose/notify_diagnose_results"
folders:
- "$SERVICE_PATH/diagnose/service"
- "$SERVICE_PATH/diagnose/scope"
- "$SERVICE_PATH/diagnose/networking"
85 changes: 85 additions & 0 deletions k8s/diagnose/build_context
@@ -0,0 +1,85 @@
#!/bin/bash

NAMESPACE=$(echo "$CONTEXT" | jq -r --arg default "$K8S_NAMESPACE" '
.providers["container-orchestration"].cluster.namespace // $default
')

SCOPE_LABEL_SELECTOR="scope_id=$SCOPE_ID"
LABEL_SELECTOR="$SCOPE_LABEL_SELECTOR"

DEPLOYMENT_ID=$(echo "$CONTEXT" | jq -r '.deployment.id // .scope.current_active_deployment // empty')
if [ -n "$DEPLOYMENT_ID" ]; then
LABEL_SELECTOR="$LABEL_SELECTOR,deployment_id=$DEPLOYMENT_ID"
fi

export SCOPE_LABEL_SELECTOR
export LABEL_SELECTOR
export NAMESPACE

# Notify the platform which checks will run
notify_results

# Collect all Kubernetes data once and store in JSON files
# Create data directory for context files (excluded from results)
DATA_DIR="$NP_OUTPUT_DIR/data"
mkdir -p "$DATA_DIR"

# Pods
PODS_FILE="$DATA_DIR/pods.json"
kubectl get pods -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$PODS_FILE" || echo '{"items":[]}' > "$PODS_FILE"
export PODS_FILE

# Services
SERVICES_FILE="$DATA_DIR/services.json"
kubectl get services -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o json 2>/dev/null > "$SERVICES_FILE" || echo '{"items":[]}' > "$SERVICES_FILE"
export SERVICES_FILE

# Endpoints
ENDPOINTS_FILE="$DATA_DIR/endpoints.json"
kubectl get endpoints -n "$NAMESPACE" -o json 2>/dev/null > "$ENDPOINTS_FILE" || echo '{"items":[]}' > "$ENDPOINTS_FILE"
export ENDPOINTS_FILE

# Ingresses
INGRESSES_FILE="$DATA_DIR/ingresses.json"
kubectl get ingress -n "$NAMESPACE" -l "$SCOPE_LABEL_SELECTOR" -o json 2>/dev/null > "$INGRESSES_FILE" || echo '{"items":[]}' > "$INGRESSES_FILE"
export INGRESSES_FILE

# Secrets (metadata only; .data stripped for security)
SECRETS_FILE="$DATA_DIR/secrets.json"
kubectl get secrets -n "$NAMESPACE" -o json 2>/dev/null | jq 'del(.items[].data)' > "$SECRETS_FILE" || true
# jq exits 0 on empty input, so a kubectl failure leaves an empty file; normalize it
[[ -s "$SECRETS_FILE" ]] || echo '{"items":[]}' > "$SECRETS_FILE"
export SECRETS_FILE

# IngressClasses (cluster-wide)
INGRESSCLASSES_FILE="$DATA_DIR/ingressclasses.json"
kubectl get ingressclass -o json 2>/dev/null > "$INGRESSCLASSES_FILE" || echo '{"items":[]}' > "$INGRESSCLASSES_FILE"
export INGRESSCLASSES_FILE

# Events (namespace-scoped)
EVENTS_FILE="$DATA_DIR/events.json"
kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' -o json 2>/dev/null > "$EVENTS_FILE" || echo '{"items":[]}' > "$EVENTS_FILE"
export EVENTS_FILE

# ALB Controller Pods (for networking diagnostics)
ALB_CONTROLLER_NAMESPACE="${ALB_CONTROLLER_NAMESPACE:-kube-system}"
export ALB_CONTROLLER_NAMESPACE

ALB_CONTROLLER_PODS_FILE="$DATA_DIR/alb_controller_pods.json"
# Try new controller name first
kubectl get pods -n "$ALB_CONTROLLER_NAMESPACE" -l app.kubernetes.io/name=aws-load-balancer-controller -o json 2>/dev/null > "$ALB_CONTROLLER_PODS_FILE"

# If no pods found, try legacy controller name
if [[ $(jq '.items | length' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null || echo 0) -eq 0 ]]; then
kubectl get pods -n "$ALB_CONTROLLER_NAMESPACE" -l app=aws-alb-ingress-controller -o json 2>/dev/null > "$ALB_CONTROLLER_PODS_FILE" || echo '{"items":[]}' > "$ALB_CONTROLLER_PODS_FILE"
fi
export ALB_CONTROLLER_PODS_FILE

# Collect ALB Controller logs
ALB_CONTROLLER_LOGS_DIR="$DATA_DIR/alb_controller_logs"
mkdir -p "$ALB_CONTROLLER_LOGS_DIR"

ALB_POD_NAMES=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null)
if [[ -n "$ALB_POD_NAMES" ]]; then
for POD_NAME in $ALB_POD_NAMES; do
kubectl logs "$POD_NAME" -n "$ALB_CONTROLLER_NAMESPACE" --tail=200 2>/dev/null > "$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log" || echo "" > "$ALB_CONTROLLER_LOGS_DIR/${POD_NAME}.log"
done
fi
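The networking checks that follow open with require_ingresses || return 0. That guard is defined in diagnose/utils/diagnose_utils, which is not part of this diff; a plausible sketch under that assumption, keyed off the INGRESSES_FILE collected above (the "skipped" status is likewise an assumption):

require_ingresses() {
    # Hypothetical sketch: skip a check cleanly when the scope has no ingresses
    local count
    count=$(jq '.items | length' "$INGRESSES_FILE" 2>/dev/null || echo 0)
    if [[ "$count" -eq 0 ]]; then
        print_info "No ingresses found for selector: $SCOPE_LABEL_SELECTOR; skipping check"
        update_check_result --status "skipped" --evidence "{}"
        return 1
    fi
    return 0
}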
141 changes: 141 additions & 0 deletions k8s/diagnose/networking/alb_capacity_check
@@ -0,0 +1,141 @@
#!/bin/bash
# Check: ALB Capacity Check
# Checks for common ALB issues (IP exhaustion, certificate problems)

# Validate ingresses exist
require_ingresses || return 0

# Read ingresses from pre-collected data
INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ')

HAS_ISSUES=0

# Get ALB controller pods from pre-collected data
ALB_CONTROLLER_PODS=$(jq -r '.items[].metadata.name' "$ALB_CONTROLLER_PODS_FILE" 2>/dev/null | tr '\n' ' ')

if [[ -n "$ALB_CONTROLLER_PODS" ]]; then
for POD in $ALB_CONTROLLER_PODS; do
# Look for IP exhaustion errors in pre-collected controller logs
if [[ -f "$ALB_CONTROLLER_LOGS_DIR/${POD}.log" ]]; then
IP_ERRORS=$(grep -iE "no available ip|insufficient ip|ip address.*(exhausted|unavailable)" "$ALB_CONTROLLER_LOGS_DIR/${POD}.log" || true)

if [[ -n "$IP_ERRORS" ]]; then
HAS_ISSUES=1
print_error " ALB subnet IP exhaustion detected, Recent logs:"
echo "$IP_ERRORS" | tail -n 3 | sed 's/^/ /'
print_info " Action: Check subnet CIDR ranges and consider expanding or using different subnets"
print_info " Annotation: alb.ingress.kubernetes.io/subnets=<subnet-ids>"
break
fi
fi
done

if [[ -z "$IP_ERRORS" ]]; then
print_success " No IP exhaustion issues detected"
fi
fi

# Consolidated loop: check all ingress-related issues in one pass
for INGRESS_NAME in $INGRESSES; do
# Get ingress info from pre-collected data (single read per ingress)
INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null)

print_info "Checking ingress: $INGRESS_NAME"

# ===== TLS/Certificate Configuration Checks =====
CERT_ARN=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/certificate-arn"] // empty')
TLS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.tls[]?.hosts[]?' 2>/dev/null)
INGRESS_HOSTS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[]?.host' 2>/dev/null)

if [[ -n "$TLS_HOSTS" || -n "$CERT_ARN" ]]; then
print_info " SSL/TLS configured"

if [[ -n "$CERT_ARN" ]]; then
print_info " Certificate ARN: $CERT_ARN"

# Check controller logs for certificate errors
if [[ -n "$ALB_CONTROLLER_PODS" ]]; then
for POD in $ALB_CONTROLLER_PODS; do
if [[ -f "$ALB_CONTROLLER_LOGS_DIR/${POD}.log" ]]; then
CERT_ERRORS=$(grep -i "$INGRESS_NAME" "$ALB_CONTROLLER_LOGS_DIR/${POD}.log" | grep -iE "certificate.*(not found|invalid|failed|error)" || true)

if [[ -n "$CERT_ERRORS" ]]; then
HAS_ISSUES=1
print_error " Certificate validation errors found:"
echo "$CERT_ERRORS" | tail -n 2 | sed 's/^/ /'
print_info " Action: Verify certificate ARN exists in ACM and covers the required domains"
fi
fi
done
fi
fi

# Verify hosts match between rules and TLS
if [[ -n "$TLS_HOSTS" && -n "$INGRESS_HOSTS" ]]; then
for HOST in $INGRESS_HOSTS; do
if ! echo "$TLS_HOSTS" | grep -qw "$HOST"; then
HAS_ISSUES=1
print_error " Host '$HOST' in rules but not in TLS configuration"
print_info " Action: Add host to spec.tls or ensure certificate covers this domain"
fi
done
fi

# Check for missing certificate when TLS is configured
if [[ -n "$TLS_HOSTS" && -z "$CERT_ARN" ]]; then
print_warning " TLS hosts configured but no ACM certificate ARN annotation"
print_info " Add annotation: alb.ingress.kubernetes.io/certificate-arn=<arn>"
fi
else
print_info " No SSL/TLS configured (HTTP only)"
fi

# ===== Events Checks (subnet, security group, target group) =====
EVENTS=$(jq -r --arg name "$INGRESS_NAME" '.items[] | select(.involvedObject.name == $name) | "\(.lastTimestamp) \(.type) \(.reason) \(.message)"' "$EVENTS_FILE" 2>/dev/null | tail -n 20)

if [[ -n "$EVENTS" ]]; then
# Check for subnet errors
SUBNET_ERRORS=$(echo "$EVENTS" | grep -iE "subnet|availability zone" | grep -iE "error|failed" || true)
if [[ -n "$SUBNET_ERRORS" ]]; then
HAS_ISSUES=1
print_error " Subnet configuration issues"
echo "$SUBNET_ERRORS" | tail -n 2 | sed 's/^/ /'
fi

# Check for security group errors
SG_ERRORS=$(echo "$EVENTS" | grep -iE "security.?group" | grep -iE "error|failed" || true)
if [[ -n "$SG_ERRORS" ]]; then
HAS_ISSUES=1
print_error " Security group issues"
echo "$SG_ERRORS" | tail -n 2 | sed 's/^/ /'
fi

# Check for target group errors
TG_ERRORS=$(echo "$EVENTS" | grep -iE "target.?group" | grep -iE "error|failed" || true)
if [[ -n "$TG_ERRORS" ]]; then
HAS_ISSUES=1
print_error " Target group registration issues"
echo "$TG_ERRORS" | tail -n 2 | sed 's/^/ /'
fi
fi

# ===== Annotation Checks (scheme, subnets) =====
SCHEME=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/scheme"] // empty')
if [[ -z "$SCHEME" ]]; then
print_warning " No scheme annotation (defaulting to internal)"
print_info " Add annotation: alb.ingress.kubernetes.io/scheme=internet-facing (or internal)"
fi

SUBNETS=$(echo "$INGRESS_INFO" | jq -r '.metadata.annotations["alb.ingress.kubernetes.io/subnets"] // empty')
if [[ -z "$SUBNETS" ]]; then
print_info " Using auto-discovered subnets"
print_info " Consider explicit subnets: alb.ingress.kubernetes.io/subnets=<subnet-ids>"
fi
done

if [[ $HAS_ISSUES -eq 0 ]]; then
print_success "No critical ALB capacity or configuration issues detected"
update_check_result --status "success" --evidence "{}"
else
update_check_result --status "failed" --evidence "{}"
fi
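Both exits above report an empty evidence object. The workflow declares evidence as an object parameter, so assuming update_check_result forwards --evidence verbatim (its definition sits in diagnose_utils, outside this diff), a check could attach structured findings built safely with jq:

# Hypothetical: record which ingress failed and why as structured evidence
EVIDENCE=$(jq -n --arg ingress "$INGRESS_NAME" --arg reason "ip_exhaustion" \
    '{ingress: $ingress, reason: $reason}')
update_check_result --status "failed" --evidence "$EVIDENCE"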
90 changes: 90 additions & 0 deletions k8s/diagnose/networking/ingress_backend_service
@@ -0,0 +1,90 @@
#!/bin/bash
# Check: Ingress Backend Service
# Checks if ingress backend services exist and are reachable

# Validate ingresses exist
require_ingresses || return 0

# Get ingresses
INGRESSES=$(jq -r '.items[].metadata.name' "$INGRESSES_FILE" 2>/dev/null | tr '\n' ' ')

HAS_ISSUES=0

for INGRESS_NAME in $INGRESSES; do
INGRESS_INFO=$(jq --arg name "$INGRESS_NAME" '.items[] | select(.metadata.name == $name)' "$INGRESSES_FILE" 2>/dev/null)

print_info "Checking backends for ingress: $INGRESS_NAME"

# Get default backend if exists
DEFAULT_BACKEND=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.name // empty')

if [[ -n "$DEFAULT_BACKEND" ]]; then
DEFAULT_PORT=$(echo "$INGRESS_INFO" | jq -r '.spec.defaultBackend.service.port.number // .spec.defaultBackend.service.port.name // empty')

# Check if service exists in pre-collected data
SERVICE_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null)

if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then
# Check if service has endpoints from pre-collected data
ENDPOINT_INFO=$(jq --arg name "$DEFAULT_BACKEND" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null)
ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[].addresses[].ip' 2>/dev/null | tr '\n' ' ')

if [[ -n "$ENDPOINTS" ]]; then
print_success " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (has endpoints)"
else
HAS_ISSUES=1
print_error " Default backend: $DEFAULT_BACKEND:$DEFAULT_PORT (no endpoints)"
fi
else
HAS_ISSUES=1
print_error " Default backend: Service '$DEFAULT_BACKEND' not found"
fi
fi

# Get all rule backends
BACKENDS=$(echo "$INGRESS_INFO" | jq -r '.spec.rules[].http.paths[] | "\(.backend.service.name):\(.backend.service.port.number // .backend.service.port.name)"' 2>/dev/null)

if [[ -z "$BACKENDS" ]]; then
print_warning " No path rules defined"
continue
fi

# Check each unique backend; process substitution keeps the loop in the
# current shell, so HAS_ISSUES updates below are not lost in a pipeline subshell
while IFS=':' read -r SERVICE_NAME SERVICE_PORT; do
# Check if service exists in pre-collected data
SERVICE_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$SERVICES_FILE" 2>/dev/null)

if [[ -n "$SERVICE_INFO" && "$SERVICE_INFO" != "null" ]]; then
# Check if service has endpoints from pre-collected data
ENDPOINT_INFO=$(jq --arg name "$SERVICE_NAME" '.items[] | select(.metadata.name == $name)' "$ENDPOINTS_FILE" 2>/dev/null)
ENDPOINTS=$(echo "$ENDPOINT_INFO" | jq -r '.subsets[].addresses[].ip' 2>/dev/null | tr '\n' ' ')

if [[ -n "$ENDPOINTS" ]]; then
ENDPOINT_COUNT=$(echo "$ENDPOINTS" | wc -w)
print_success " Backend: $SERVICE_NAME:$SERVICE_PORT ($ENDPOINT_COUNT endpoint(s))"
else
HAS_ISSUES=1
print_error " Backend: $SERVICE_NAME:$SERVICE_PORT (no endpoints)"
print_info " Action: Verify pods are running and service selector matches"
fi

# Verify port exists in service from pre-collected data
SERVICE_PORTS=$(echo "$SERVICE_INFO" | jq -r '.spec.ports[].port' 2>/dev/null | tr '\n' ' ')

if ! echo "$SERVICE_PORTS" | grep -qw "$SERVICE_PORT"; then
HAS_ISSUES=1
print_error " Backend: Port $SERVICE_PORT not found in service $SERVICE_NAME"
print_warning " Available ports: $SERVICE_PORTS"
fi
else
HAS_ISSUES=1
print_error " Backend: Service '$SERVICE_NAME' not found in namespace"
fi
done < <(echo "$BACKENDS" | sort -u)
done

if [[ $HAS_ISSUES -eq 0 ]]; then
update_check_result --status "success" --evidence "{}"
else
update_check_result --status "failed" --evidence "{}"
fi
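To reproduce this check by hand outside the workflow, the two lookups map onto plain kubectl calls (<service-name> is a placeholder):

# Does the service have ready endpoints?
kubectl get endpoints <service-name> -n "$NAMESPACE" -o jsonpath='{.subsets[*].addresses[*].ip}'
# Which ports does the service actually expose?
kubectl get service <service-name> -n "$NAMESPACE" -o jsonpath='{.spec.ports[*].port}'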