From 83d7d4d24e5ccd2f25a4f91146d09d79c772174d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chojnacki?= Date: Mon, 20 Apr 2026 20:25:00 +0200 Subject: [PATCH 1/3] Fix integration-test hang and stale proxy.conf directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_http_proxy constructs an AioHTTPServer in a non-daemon background thread. When an assertion failed before http_server.stop() ran, the thread kept the interpreter alive after pytest_sessionfinish and the whole process wedged in __futex_wait until the container was killed. Make AioHTTPServer a context manager (the existing `# TODO: support with syntax` asked for this) so cleanup runs on any exit path, set daemon=True on its thread, and bound the stop-time join. conftest's TestAgent had the same shape — same treatment — plus two real teardown bugs: - internal_run called _app.cleanup() instead of runner.cleanup(), leaving the TCP listener open and the event loop unable to exit. - TCPSite had no shutdown_timeout, so aiohttp waited the default 60s on keep-alive sockets left over from SIGKILL'd Apache workers. Server.load_configuration's readiness probe is also switched from an HTTP GET to a bare TCP connect: probes shouldn't emit traces into the per-test agent session. Finally, remove the orphan `<Location>` block from proxy.conf - commit 470db65 ("chore: deprecate sampling delegation") removed DatadogDelegateSampling from the module but missed this config, breaking test_http_proxy (which only uses the /http ProxyPass directly above) with "Invalid command" from apachectl. 
--- test/integration-test/conftest.py | 40 ++++++++++++------- .../scenarios/conf/proxy.conf | 5 --- test/integration-test/scenarios/test_proxy.py | 36 +++++++++++------ 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/test/integration-test/conftest.py b/test/integration-test/conftest.py index 1be935e2..43900310 100644 --- a/test/integration-test/conftest.py +++ b/test/integration-test/conftest.py @@ -94,22 +94,22 @@ def load_configuration(self, conf_path: str) -> bool: # Continue anyway - sometimes apachectl returns non-zero but Apache starts # We'll verify below with HTTP requests - # Wait for Apache to fully start and verify it's running - # Give Apache up to 5 seconds to start + # Probe readiness with a bare TCP connect rather than an HTTP GET — + # mod_datadog's `DatadogTracing Off` inside a is silently + # dropped by the config merge in common_conf.cpp (child Off is ORed + # with parent On), so any HTTP probe would emit a trace and pollute + # the per-test agent session. 
+ import socket max_wait = 5 start_time = time.time() while time.time() - start_time < max_wait: - # Check if Apache is responding by making a simple HTTP request try: - import requests - response = requests.get(self.make_url("/"), timeout=1) - # If we get any response, Apache is running - print(f"[debug] Apache started successfully, responding with status {response.status_code}") - return True - except requests.exceptions.RequestException: - # Apache not ready yet, wait a bit - time.sleep(0.2) + with socket.create_connection((self.host, int(self.port)), timeout=0.5): + print(f"[debug] Apache accepting connections on {self.host}:{self.port}") + return True + except (ConnectionRefusedError, OSError): + time.sleep(0.1) # If we get here, Apache didn't start properly print(f"[error] Apache failed to respond after {max_wait} seconds") @@ -198,19 +198,29 @@ def __init__(self, host: str, port: int) -> None: def internal_run(self) -> None: runner = web.AppRunner(self._app) self.loop.run_until_complete(runner.setup()) - site = web.TCPSite(runner, self.host, self.port) + # shutdown_timeout bounds how long aiohttp will wait for active + # keep-alive connections to close on teardown. The default is 60s; + # Apache workers that got SIGKILL leave sockets lingering and make + # that timeout show up as a 60s hang at end-of-session. + site = web.TCPSite(runner, self.host, self.port, shutdown_timeout=1.0) self.loop.run_until_complete(site.start()) self.loop.run_until_complete(self._stop.wait()) - self.loop.run_until_complete(self._app.cleanup()) + # runner.cleanup() stops sites, closes the TCP listener, and runs the + # app's cleanup handlers. Calling only app.cleanup() leaves the + # listener open and the event loop blocks forever on exit. + self.loop.run_until_complete(runner.cleanup()) self.loop.close() def run(self) -> None: - self._thread = threading.Thread(target=self.internal_run) + # daemon=True so a stuck event loop (e.g. 
aiohttp refusing to close) + # can't block interpreter shutdown after pytest_sessionfinish. The + # explicit stop() path below is still the intended cleanup. + self._thread = threading.Thread(target=self.internal_run, daemon=True) self._thread.start() def stop(self) -> None: self.loop.call_soon_threadsafe(self._stop.set) - self._thread.join() + self._thread.join(timeout=3.0) def new_session(self, token=None) -> AgentSession: if token is None: diff --git a/test/integration-test/scenarios/conf/proxy.conf b/test/integration-test/scenarios/conf/proxy.conf index 661c8eb4..938fa65d 100644 --- a/test/integration-test/scenarios/conf/proxy.conf +++ b/test/integration-test/scenarios/conf/proxy.conf @@ -18,8 +18,3 @@ DatadogAddTag foo bar DatadogPropagationStyle datadog ProxyPass "/http" "${upstream_url}" - - - DatadogDelegateSampling On - ProxyPass "${upstream_url}" - diff --git a/test/integration-test/scenarios/test_proxy.py b/test/integration-test/scenarios/test_proxy.py index 569e7923..d4ef9594 100644 --- a/test/integration-test/scenarios/test_proxy.py +++ b/test/integration-test/scenarios/test_proxy.py @@ -34,11 +34,23 @@ def internal_run(self) -> None: self.loop.close() def run(self) -> None: - self._thread = threading.Thread(target=self.internal_run) + # daemon=True so an early test failure that skips stop() can't keep + # the interpreter alive at pytest shutdown — the __exit__ path below + # is the intended cleanup, this is just a safety net. 
+ self._thread = threading.Thread(target=self.internal_run, daemon=True) self._thread.start() def stop(self) -> None: self.loop.call_soon_threadsafe(self._stop.set) + if self._thread is not None: + self._thread.join(timeout=3.0) + + def __enter__(self) -> "AioHTTPServer": + self.run() + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.stop() def test_http_proxy(server, agent, log_dir, module_path): @@ -46,7 +58,6 @@ def test_http_proxy(server, agent, log_dir, module_path): Verify proxified HTTP requests propagate tracing context """ - # TODO: support `with` syntax def make_temporary_http_server(host: str, port: int): q = Queue() @@ -62,7 +73,6 @@ async def index(request): host = "127.0.0.1" port = 8081 queue, http_server = make_temporary_http_server(host, port) - http_server.run() config = { "path": relpath("conf/proxy.conf"), @@ -72,20 +82,20 @@ async def index(request): conf_path = os.path.join(log_dir, "httpd.conf") save_configuration(make_configuration(config, log_dir, module_path), conf_path) - assert server.check_configuration(conf_path) - assert server.load_configuration(conf_path) + with http_server: + assert server.check_configuration(conf_path) + assert server.load_configuration(conf_path) - r = requests.get(server.make_url("/http"), timeout=2) - assert r.status_code == 200 + r = requests.get(server.make_url("/http"), timeout=2) + assert r.status_code == 200 - http_server.stop() - server.stop(conf_path) + server.stop(conf_path) - assert not queue.empty() + assert not queue.empty() - upstream_headers = queue.get() - assert "x-datadog-trace-id" in upstream_headers - assert "x-datadog-parent-id" in upstream_headers + upstream_headers = queue.get() + assert "x-datadog-trace-id" in upstream_headers + assert "x-datadog-parent-id" in upstream_headers traces = agent.get_traces(timeout=5) assert len(traces) == 1 From f77537169de99b819abcf6499615bea4b00d6048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chojnacki?= Date: Mon, 20 Apr 
2026 20:30:00 +0200 Subject: [PATCH 2/3] Add test-integration CI job running full pytest suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a GitLab `test-integration:$ARCH` job per arch that exercises the same entrypoint as `make test-integration` locally — the `.devcontainer/run-integration-tests.sh` script, with MODULE_PATH pointed at the build:$ARCH artifact so the cmake build is skipped. This gives the stack three benefits: - Local and CI test invocations run identical commands, so "works on my machine" diverging from "fails in CI" now has one fewer cause. - The existing test-rum:$ARCH job only runs tests tagged requires_rum; the new job covers everything else (configuration, proxy, log injection, smoke, etc.) that was previously only exercised manually. - Snapshot uploads gate on full-suite success rather than just the RUM subset, so a broken non-RUM scenario can't ship a green snapshot. Only makes sense on top of the proxy.conf + teardown-hang fix from this branch — on earlier stack branches the suite fails on test_http_proxy and the teardown hang wedges the pipeline. --- .gitlab-ci.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 84d97536..9e18ca19 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -53,6 +53,18 @@ stages: script: - cd $CI_PROJECT_DIR/test/integration-test && uv sync && uv run pytest scenarios --module-path=$CI_PROJECT_DIR/artifacts/$ARCH/mod_datadog.so --bin-path=/httpd/httpd-build/bin/apachectl --log-dir=$CI_PROJECT_DIR/logs -m requires_rum -v +# Runs the full integration-test suite via the same entrypoint local +# developers use (`make test-integration` -> run-integration-tests.sh). +# The script is pointed at the artifact from build:$ARCH via MODULE_PATH so +# the cmake build is skipped and only pytest runs. 
+.test-integration-template: + stage: test + image: $DEVCONTAINER_IMAGE + variables: + MODULE_PATH: $CI_PROJECT_DIR/artifacts/$ARCH/mod_datadog.so + script: + - .devcontainer/run-integration-tests.sh + build:amd64: extends: .build-template needs: @@ -109,6 +121,24 @@ test-rum:arm64: ARCH: arm64 tags: ["arch:arm64"] +test-integration:amd64: + extends: .test-integration-template + needs: + - devcontainer-image + - job: "build:amd64" + variables: + ARCH: amd64 + tags: ["arch:amd64"] + +test-integration:arm64: + extends: .test-integration-template + needs: + - devcontainer-image + - job: "build:arm64" + variables: + ARCH: arm64 + tags: ["arch:arm64"] + .upload-template: stage: aws needs: @@ -118,6 +148,8 @@ test-rum:arm64: artifacts: true - job: "test-rum:amd64" - job: "test-rum:arm64" + - job: "test-integration:amd64" + - job: "test-integration:arm64" tags: ["arch:amd64"] image: registry.ddbuild.io/images/aws-cli:2.33.12 script: From c5d42dafcd27b786b598ac2a38bea7ea51e83d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Chojnacki?= Date: Mon, 20 Apr 2026 20:32:00 +0200 Subject: [PATCH 3/3] Scope test-integration to its arch-specific devcontainer slot Match the narrowing done for build/test-rum in the devcontainer branch: `test-integration:$ARCH` no longer waits for both devcontainer-image matrix slots. It scopes the needs to its own arch slot via `parallel: matrix:`, inheriting the arch-specific DEVCONTAINER_IMAGE without blocking on the other arch's rebuild. 
--- .gitlab-ci.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9e18ca19..d59ce3fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -124,7 +124,12 @@ test-rum:arm64: test-integration:amd64: extends: .test-integration-template needs: - - devcontainer-image + - job: devcontainer-image + parallel: + matrix: + - ARCH: amd64 + TOOLCHAIN_ARCH: x86_64 + artifacts: true - job: "build:amd64" variables: ARCH: amd64 @@ -133,7 +138,12 @@ test-integration:amd64: test-integration:arm64: extends: .test-integration-template needs: - - devcontainer-image + - job: devcontainer-image + parallel: + matrix: + - ARCH: arm64 + TOOLCHAIN_ARCH: aarch64 + artifacts: true - job: "build:arm64" variables: ARCH: arm64