Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ stages:
script:
- cd $CI_PROJECT_DIR/test/integration-test && uv sync && uv run pytest scenarios --module-path=$CI_PROJECT_DIR/artifacts/$ARCH/mod_datadog.so --bin-path=/httpd/httpd-build/bin/apachectl --log-dir=$CI_PROJECT_DIR/logs -m requires_rum -v

# Runs the full integration-test suite via the same entrypoint local
# developers use (`make test-integration` -> run-integration-tests.sh).
# The script is pointed at the artifact from build:$ARCH via MODULE_PATH so
# the cmake build is skipped and only pytest runs.
.test-integration-template:
  stage: test
  image: $DEVCONTAINER_IMAGE
  variables:
    # Absolute path to the prebuilt module produced by build:$ARCH;
    # run-integration-tests.sh uses it to bypass cmake and run pytest only.
    MODULE_PATH: $CI_PROJECT_DIR/artifacts/$ARCH/mod_datadog.so
  script:
    - .devcontainer/run-integration-tests.sh

build:amd64:
extends: .build-template
needs:
Expand Down Expand Up @@ -109,6 +121,34 @@ test-rum:arm64:
ARCH: arm64
tags: ["arch:arm64"]

test-integration:amd64:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be factorized:

test-integration:
  extends: .test-integration-template
  parallel:
    matrix:
      - ARCH: amd64
        TOOLCHAIN_ARCH: x86_64
      - ARCH: arm64
        TOOLCHAIN_ARCH: aarch64
  needs:
    - job: devcontainer-image
      parallel:
        matrix:
          - ARCH: $ARCH
            TOOLCHAIN_ARCH: $TOOLCHAIN_ARCH
      artifacts: true
    - job: "build:$ARCH"
  tags: ["arch:$ARCH"]

Note that this needs to update the needs in .upload-template (to be tested):

  • test-integration:amd64 → test-integration: [amd64]
  • test-integration:arm64 → test-integration: [arm64]

Similarly, we should try to factorize build:amd64 / build:arm64 and test-rum:amd64 / test-rum:arm64.

extends: .test-integration-template
needs:
- job: devcontainer-image
parallel:
matrix:
- ARCH: amd64
TOOLCHAIN_ARCH: x86_64
artifacts: true
- job: "build:amd64"
variables:
ARCH: amd64
tags: ["arch:amd64"]

# arm64 leg of the integration-test suite; mirrors test-integration:amd64.
test-integration:arm64:
  extends: .test-integration-template
  needs:
    # Pin the devcontainer-image matrix leg that matches this arch so only
    # the aarch64 image build gates this job.
    - job: devcontainer-image
      parallel:
        matrix:
          - ARCH: arm64
            TOOLCHAIN_ARCH: aarch64
      artifacts: true
    - job: "build:arm64"
  variables:
    ARCH: arm64
  tags: ["arch:arm64"]

.upload-template:
stage: aws
needs:
Expand All @@ -118,6 +158,8 @@ test-rum:arm64:
artifacts: true
- job: "test-rum:amd64"
- job: "test-rum:arm64"
- job: "test-integration:amd64"
- job: "test-integration:arm64"
tags: ["arch:amd64"]
image: registry.ddbuild.io/images/aws-cli:2.33.12
script:
Expand Down
40 changes: 25 additions & 15 deletions test/integration-test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,22 @@ def load_configuration(self, conf_path: str) -> bool:
# Continue anyway - sometimes apachectl returns non-zero but Apache starts
# We'll verify below with HTTP requests

# Wait for Apache to fully start and verify it's running
# Give Apache up to 5 seconds to start
# Probe readiness with a bare TCP connect rather than an HTTP GET —
# mod_datadog's `DatadogTracing Off` inside a <Location> is silently
# dropped by the config merge in common_conf.cpp (child Off is ORed
# with parent On), so any HTTP probe would emit a trace and pollute
# the per-test agent session.
import socket
max_wait = 5
start_time = time.time()

while time.time() - start_time < max_wait:
# Check if Apache is responding by making a simple HTTP request
try:
import requests
response = requests.get(self.make_url("/"), timeout=1)
# If we get any response, Apache is running
print(f"[debug] Apache started successfully, responding with status {response.status_code}")
return True
except requests.exceptions.RequestException:
# Apache not ready yet, wait a bit
time.sleep(0.2)
with socket.create_connection((self.host, int(self.port)), timeout=0.5):
print(f"[debug] Apache accepting connections on {self.host}:{self.port}")
return True
except (ConnectionRefusedError, OSError):
time.sleep(0.1)

# If we get here, Apache didn't start properly
print(f"[error] Apache failed to respond after {max_wait} seconds")
Expand Down Expand Up @@ -198,19 +198,29 @@ def __init__(self, host: str, port: int) -> None:
def internal_run(self) -> None:
    """Run the aiohttp app on this object's event loop until stop() fires.

    This diff hunk contained both the removed and the added lines of the
    change; this is the resolved, post-change method body.
    """
    runner = web.AppRunner(self._app)
    self.loop.run_until_complete(runner.setup())
    # shutdown_timeout bounds how long aiohttp will wait for active
    # keep-alive connections to close on teardown. The default is 60s;
    # Apache workers that got SIGKILL leave sockets lingering and make
    # that timeout show up as a 60s hang at end-of-session.
    site = web.TCPSite(runner, self.host, self.port, shutdown_timeout=1.0)
    self.loop.run_until_complete(site.start())
    # Block until stop() sets the event from another thread.
    self.loop.run_until_complete(self._stop.wait())
    # runner.cleanup() stops sites, closes the TCP listener, and runs the
    # app's cleanup handlers. Calling only app.cleanup() leaves the
    # listener open and the event loop blocks forever on exit.
    self.loop.run_until_complete(runner.cleanup())
    self.loop.close()

def run(self) -> None:
    """Start internal_run() on a background daemon thread.

    daemon=True so a stuck event loop (e.g. aiohttp refusing to close)
    can't block interpreter shutdown after pytest_sessionfinish. The
    explicit stop() path is still the intended cleanup. (The diff hunk
    showed both the old non-daemon line and the new one; this keeps the
    added daemon=True version.)
    """
    self._thread = threading.Thread(target=self.internal_run, daemon=True)
    self._thread.start()

def stop(self) -> None:
    """Signal the agent's event loop to shut down and join its thread.

    The join is bounded so a wedged aiohttp shutdown can't hang the whole
    test session; per review feedback, a thread that survives the timeout
    is surfaced instead of being silently ignored.
    """
    try:
        self.loop.call_soon_threadsafe(self._stop.set)
    except RuntimeError:
        # The loop may already be closed if internal_run() failed early;
        # fall through to the join, which returns immediately then.
        pass
    self._thread.join(timeout=3.0)
    if self._thread.is_alive():
        # Daemon thread, so this won't block interpreter exit, but make
        # the stuck shutdown visible to avoid masking port flakiness.
        print("[warning] TestAgent thread did not stop within 3s; continuing shutdown")
Comment on lines 221 to +223
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TestAgent.stop() now uses join(timeout=3.0) but doesn’t check whether the thread actually stopped. If the aiohttp loop is stuck (the scenario this change is trying to mitigate), the test run will continue while the agent thread is still running, which can lead to port-binding flakiness or a noisy shutdown. Consider checking self._thread.is_alive() after the timeout and surfacing a clear error/warning, and optionally guard call_soon_threadsafe() in case the loop has already been closed due to an internal_run error.

Copilot uses AI. Check for mistakes.

def new_session(self, token=None) -> AgentSession:
if token is None:
Expand Down
5 changes: 0 additions & 5 deletions test/integration-test/scenarios/conf/proxy.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,3 @@ DatadogAddTag foo bar
DatadogPropagationStyle datadog

ProxyPass "/http" "${upstream_url}"

<Location "/delegate">
DatadogDelegateSampling On
ProxyPass "${upstream_url}"
</Location>
36 changes: 23 additions & 13 deletions test/integration-test/scenarios/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,30 @@ def internal_run(self) -> None:
self.loop.close()

def run(self) -> None:
    """Start the server's internal_run() on a background daemon thread.

    daemon=True so an early test failure that skips stop() can't keep
    the interpreter alive at pytest shutdown — the __exit__ path is the
    intended cleanup, this is just a safety net. (The diff hunk showed
    both the old non-daemon line and the new one; this keeps the added
    daemon=True version.)
    """
    self._thread = threading.Thread(target=self.internal_run, daemon=True)
    self._thread.start()

def stop(self) -> None:
    """Signal the server's loop to shut down and join the worker thread.

    Per review feedback: a thread that survives the bounded join is
    reported rather than silently ignored, so a stuck aiohttp shutdown
    (and the port it keeps bound) doesn't go unnoticed in later tests.
    """
    try:
        self.loop.call_soon_threadsafe(self._stop.set)
    except RuntimeError:
        # Loop already closed (internal_run() exited early); nothing to signal.
        pass
    if self._thread is not None:
        self._thread.join(timeout=3.0)
        if self._thread.is_alive():
            # Daemon thread, so interpreter exit isn't blocked, but make
            # the failed teardown visible.
            print("[warning] AioHTTPServer thread did not stop within 3s")
Comment on lines 43 to +46
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AioHTTPServer.internal_run() still calls app.cleanup() and never calls runner.cleanup() / site.stop(). That can leave the listening socket open and may prevent the thread from exiting cleanly (same teardown issue fixed for TestAgent in conftest.py). Consider switching to runner.cleanup() (and/or explicitly stopping the site) before closing the loop so the context-manager shutdown reliably releases the TCP listener.

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stop() joins the server thread with a timeout but then silently proceeds even if the thread is still alive. That can mask teardown failures and leave a background aiohttp server bound to the port, making later tests flaky. After join(timeout=…), consider checking is_alive() and surfacing a clear failure (or at least logging a warning) so a stuck shutdown doesn’t go unnoticed.

Suggested change
self._thread.join(timeout=3.0)
self._thread.join(timeout=3.0)
if self._thread.is_alive():
raise RuntimeError(
f"AioHTTPServer failed to stop within 3.0 seconds for "
f"{self._host}:{self._port}; the server thread is still running."
)

Copilot uses AI. Check for mistakes.

def __enter__(self) -> "AioHTTPServer":
    # Context-manager entry: start the server's background thread and
    # return this instance for use in the with-block.
    self.run()
    return self

def __exit__(self, exc_type, exc, tb) -> None:
    # Context-manager exit: always stop the server, even when the with-body
    # raised. Exception info is ignored (returns None), so exceptions
    # propagate to the caller.
    self.stop()


def test_http_proxy(server, agent, log_dir, module_path):
"""
Verify proxified HTTP requests propagate tracing context
"""

# TODO: support `with` syntax
def make_temporary_http_server(host: str, port: int):
q = Queue()

Expand All @@ -62,7 +73,6 @@ async def index(request):
host = "127.0.0.1"
port = 8081
queue, http_server = make_temporary_http_server(host, port)
http_server.run()

config = {
"path": relpath("conf/proxy.conf"),
Expand All @@ -72,20 +82,20 @@ async def index(request):
conf_path = os.path.join(log_dir, "httpd.conf")
save_configuration(make_configuration(config, log_dir, module_path), conf_path)

assert server.check_configuration(conf_path)
assert server.load_configuration(conf_path)
with http_server:
assert server.check_configuration(conf_path)
assert server.load_configuration(conf_path)

r = requests.get(server.make_url("/http"), timeout=2)
assert r.status_code == 200
r = requests.get(server.make_url("/http"), timeout=2)
assert r.status_code == 200

http_server.stop()
server.stop(conf_path)
server.stop(conf_path)

assert not queue.empty()
assert not queue.empty()

upstream_headers = queue.get()
assert "x-datadog-trace-id" in upstream_headers
assert "x-datadog-parent-id" in upstream_headers
upstream_headers = queue.get()
assert "x-datadog-trace-id" in upstream_headers
assert "x-datadog-parent-id" in upstream_headers

traces = agent.get_traces(timeout=5)
assert len(traces) == 1
Loading