From 9d7bb64034e84213cf3967d9ec0329f4dd01bb04 Mon Sep 17 00:00:00 2001 From: Alejandro Gullon Date: Mon, 27 Apr 2026 16:56:30 +0200 Subject: [PATCH] NO-JIRA: add Loki readiness check and restart logic to observability tests Add a Loki readiness check (/ready endpoint) to the observability test suite setup to detect when Loki's ingester has entered a shutdown state. Automatically restart the Loki container if unhealthy, polling readiness every 5s for up to 30s per attempt (3 attempts max) to cover the ingester's ~15s warmup window. - Add check_loki_ready() to loki.py with proper error handling - Add restart action to manage_loki.sh script - Fix IPv6 address handling in check_loki_query - Split broad except Exception in check_loki_query into specific handlers Co-Authored-By: Claude Opus 4.6 pre-commit.check-secrets: ENABLED --- test/bin/manage_loki.sh | 13 ++++++++-- test/resources/loki.py | 30 ++++++++++++++++++++---- test/suites/optional/observability.robot | 21 +++++++++++++++++ 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/test/bin/manage_loki.sh b/test/bin/manage_loki.sh index 99557f42e6..ff523d6a97 100755 --- a/test/bin/manage_loki.sh +++ b/test/bin/manage_loki.sh @@ -14,7 +14,7 @@ DEFAULT_HOST_PORT="3100" usage() { cat - < /dev/null } +action_restart() { + local host_port="${1:-${DEFAULT_HOST_PORT}}" + echo "Restarting Loki container" + action_stop + action_start "${host_port}" +} + if [ $# -eq 0 ]; then usage exit 1 @@ -60,7 +69,7 @@ action="${1}" shift case "${action}" in - start|stop) + start|stop|restart) "action_${action}" "$@" ;; -h) diff --git a/test/resources/loki.py b/test/resources/loki.py index 7c21822f01..ce05e5e6f5 100644 --- a/test/resources/loki.py +++ b/test/resources/loki.py @@ -2,6 +2,7 @@ import json import requests from datetime import datetime, timedelta +import libipv6 def query_loki(loki_url: str, query: str, limit: int = 10, start_time: datetime = None, end_time: datetime = None) -> dict: @@ -68,17 +69,38 @@ def 
_print_results(results: dict) -> None:
             _log(f"{log_line}")
 
 
+def check_loki_ready(host: str, port: int) -> None:
+    """Check if Loki is ready by hitting the /ready endpoint.
+
+    Raises Exception if Loki is unreachable, the request times out, or the response status is not 200.
+    """
+    address = libipv6.add_brackets_if_ipv6(host)  # presumably wraps IPv6 literals in []; confirm in libipv6
+    url = f"http://{address}:{port}/ready"
+    _log(f"Checking Loki readiness at {url}")
+    try:
+        response = requests.get(url, timeout=5)  # bounded wait instead of blocking indefinitely
+    except requests.exceptions.ConnectionError as e:
+        raise Exception(f"Loki is unreachable at {url}: {e}") from e  # keep the original error as the cause
+    except requests.exceptions.Timeout as e:
+        raise Exception(f"Loki readiness check timed out at {url}: {e}") from e
+    _log(f"Loki readiness response: {response.status_code} {response.text.strip()}")
+    if response.status_code != 200:  # anything but 200 is reported as not ready
+        raise Exception(f"Loki is not ready: {response.status_code} {response.text.strip()}")
+
+
 def check_loki_query(host: str, port: int, query: str, limit: int = 10) -> None:
     try:
         from robot.libraries.BuiltIn import BuiltIn
-        # Running within RF
         stdout, _, _ = BuiltIn().run_keyword("Command Execution", "hostname")
         if stdout:
             query = f"{query} | host_name=`{stdout}`"
             _log(f"Added hostname to query: {query}")
-    except Exception:
-        None
-    results = query_loki(f"http://{host}:{port}", query, limit)
+    except ImportError:  # robot import failed: not running under Robot Framework
+        _log("Not running within Robot Framework, skipping host filter")
+    except Exception as e:  # best effort: log and continue without the host filter
+        _log(f"WARNING: Could not determine hostname, proceeding without host filter: {e}")
+    address = libipv6.add_brackets_if_ipv6(host)  # same host handling as check_loki_ready
+    results = query_loki(f"http://{address}:{port}", query, limit)
     _print_results(results)
 
 
diff --git a/test/suites/optional/observability.robot b/test/suites/optional/observability.robot
index b60f002891..6c13000cdd 100644
--- a/test/suites/optional/observability.robot
+++ b/test/suites/optional/observability.robot
@@ -74,6 +74,7 @@ Setup Suite And Prepare Test Host
     Command Should Work    sudo firewall-cmd --reload
     # Configure observability settings
     Check Required
Observability Variables
+    Ensure Loki Is Ready
     Set Test OTEL Configuration
     # We need to do something to the cluster to generate new kube events
     Create Hello MicroShift Pod
@@ -89,6 +90,26 @@ Check Required Observability Variables
     Should Not Be Empty    ${LOKI_HOST}    LOKI_HOST variable is required
     ${string_value}    Convert To String    ${LOKI_HOST}
     Should Not Be Empty    ${string_value}    LOKI_HOST variable is required
+    ${string_value}    Convert To String    ${LOKI_PORT}
+    Should Not Be Empty    ${string_value}    LOKI_PORT variable is required
+
+Ensure Loki Is Ready
+    [Documentation]    Check if Loki's ingester is healthy, restart the container if not.
+    ...    Loki's ingester can enter a shutdown state over time, causing it to
+    ...    reject all writes with HTTP 503 while still responding to queries.
+    ...    After a restart, the ingester needs ~15s before it reports ready.
+    ${status}    ${error}    Run Keyword And Ignore Error
+    ...    Check Loki Ready    ${LOKI_HOST}    ${LOKI_PORT}
+    IF    "${status}" == "PASS"    RETURN    # already ready: skip the restart loop
+    Log    Loki is not ready: ${error}    console=True
+    FOR    ${attempt}    IN RANGE    1    4    # attempts 1, 2 and 3
+        Log    Restarting Loki (attempt ${attempt}/3)    console=True
+        Local Command Should Work    ./bin/manage_loki.sh restart ${LOKI_PORT}    # presumably fails the keyword on a non-zero exit - confirm
+        ${poll_status}    ${poll_error}    Run Keyword And Ignore Error
+        ...    Wait Until Keyword Succeeds    30s    5s    Check Loki Ready    ${LOKI_HOST}    ${LOKI_PORT}    # retry every 5s for up to 30s
+        IF    "${poll_status}" == "PASS"    RETURN    # ready after this restart
+    END
+    Fail    Loki did not become ready after 3 restart attempts. Last error: ${poll_error}
 
 Set Test OTEL Configuration
     [Documentation]    Set Test OTEL Configuration