diff --git a/.github/actions/dind-up-action/action.yml b/.github/actions/dind-up-action/action.yml new file mode 100644 index 000000000000..23cc8613bb67 --- /dev/null +++ b/.github/actions/dind-up-action/action.yml @@ -0,0 +1,275 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: "Start and Prepare DinD" +description: "Launch, verify, and prepare a Docker-in-Docker environment." +inputs: + # --- Core DinD Config --- + container-name: + description: "Name for the DinD container." + default: dind-daemon + bind-address: + default: 127.0.0.1 + port: + default: "2375" + storage-volume: + default: dind-storage + execroot-volume: + default: dind-execroot + ephemeral-volumes: + description: "Generate unique per-run volume names (recommended)." + default: "true" + auto-prune-dangling: + description: "Prune dangling ephemeral DinD volumes from previous runs." + default: "true" + tmpfs-run-size: + default: 1g + tmpfs-varrun-size: + default: 1g + storage-driver: + default: overlay2 + additional-dockerd-args: + default: "" + use-host-network: + description: "Run DinD with --network host instead of publishing a TCP port." + default: "false" + + # --- Health & Wait Config --- + health-interval: + default: 2s + health-retries: + default: "60" + health-start-period: + default: 10s + wait-timeout: + default: "180" + + # --- NEW: Optional Setup & Verification Steps --- + cleanup-dind-on-start: + description: "Run 'docker system prune' inside DinD immediately after it starts." + default: "true" + smoke-test-port-mapping: + description: "Run a quick test to ensure port mapping from DinD is working." + default: "true" + prime-testcontainers: + description: "Start and stop a small container via the testcontainers library to prime Ryuk." + default: "false" + + # --- Output Config --- + export-gh-env: + description: "Also write DOCKER_HOST and DIND_IP to $GITHUB_ENV for the rest of the job." + default: "false" + +outputs: + docker-host: + description: "The TCP address for the DinD daemon (e.g., tcp://127.0.0.1:2375)." + value: ${{ steps.set-output.outputs.docker-host }} + dind-ip: + description: "The discovered bridge IP address of the DinD container." + value: ${{ steps.discover-ip.outputs.dind-ip }} + container-name: + description: "The name of the running DinD container." + value: ${{ inputs.container-name || 'dind-daemon' }} + storage-volume: + value: ${{ steps.set-output.outputs.storage_volume }} + execroot-volume: + value: ${{ steps.set-output.outputs.execroot_volume }} + +runs: + using: "composite" + steps: + - name: Prune old dangling ephemeral DinD volumes + if: ${{ inputs.auto-prune-dangling == 'true' }} + shell: bash + run: | + docker volume ls -q \ + --filter "label=com.github.dind=1" \ + --filter "label=com.github.repo=${GITHUB_REPOSITORY}" \ + --filter "dangling=true" | xargs -r docker volume rm || true + + - name: Start docker:dind + shell: bash + run: | + # (Your original 'Start docker:dind' script is perfect here - no changes needed) + set -euo pipefail + NAME="${{ inputs.container-name || 'dind-daemon' }}" + BIND="${{ inputs.bind-address || '127.0.0.1' }}" + PORT="${{ inputs.port || '2375' }}" + SD="${{ inputs.storage-driver || 'overlay2' }}" + TRS="${{ inputs.tmpfs-run-size || '1g' }}" + TVRS="${{ inputs.tmpfs-varrun-size || '1g' }}" + HI="${{ inputs.health-interval || '2s' }}" + HR="${{ inputs.health-retries || '60' }}" + HSP="${{ inputs.health-start-period || '10s' }}" + EXTRA="${{ inputs.additional-dockerd-args }}" + USE_HOST_NET="${{ inputs.use-host-network || 'false' }}" + + if [[ "${{ inputs.ephemeral-volumes }}" == "true" ]]; then + SUFFIX="${GITHUB_RUN_ID:-0}-${GITHUB_RUN_ATTEMPT:-0}-${GITHUB_JOB:-job}" + STORAGE_VOL="dind-storage-${SUFFIX}" + EXECROOT_VOL="dind-execroot-${SUFFIX}" + else + STORAGE_VOL="${{ inputs.storage-volume || 'dind-storage' }}" + EXECROOT_VOL="${{ inputs.execroot-volume || 'dind-execroot' }}" + fi + + docker volume create --name "${STORAGE_VOL}" --label "com.github.dind=1" --label "com.github.repo=${GITHUB_REPOSITORY}" >/dev/null + docker volume create --name "${EXECROOT_VOL}" --label "com.github.dind=1" --label "com.github.repo=${GITHUB_REPOSITORY}" >/dev/null + docker rm -f -v "$NAME" 2>/dev/null || true + + NET_ARGS="" + PUBLISH_ARGS="-p ${BIND}:${PORT}:${PORT}" + if [[ "${USE_HOST_NET}" == "true" ]]; then + NET_ARGS="--network host" + PUBLISH_ARGS="" + fi + + docker run -d --privileged --name "$NAME" \ + --cgroupns=host \ + -e DOCKER_TLS_CERTDIR= \ + ${NET_ARGS} \ + ${PUBLISH_ARGS} \ + -v "${STORAGE_VOL}:/var/lib/docker" \ + -v "${EXECROOT_VOL}:/execroot" \ + --tmpfs /run:rw,exec,size=${TRS} \ + --tmpfs /var/run:rw,exec,size=${TVRS} \ + --label "com.github.dind=1" \ + --health-cmd='docker info > /dev/null' \ + --health-interval=${HI} \ + --health-retries=${HR} \ + --health-start-period=${HSP} \ + docker:dind \ + --host=tcp://0.0.0.0:${PORT} \ + --host=unix:///var/run/docker.sock \ + --storage-driver=${SD} \ + --exec-root=/execroot ${EXTRA} + + { + echo "STORAGE_VOL=${STORAGE_VOL}" + echo "EXECROOT_VOL=${EXECROOT_VOL}" + } >> "$GITHUB_ENV" + + - name: Wait for DinD daemon + shell: bash + run: | + set -euo pipefail + NAME="${{ inputs.container-name || 'dind-daemon' }}" + HOST="${{ inputs.bind-address || '127.0.0.1' }}" + PORT="${{ inputs.port || '2375' }}" + TIMEOUT="${{ inputs.wait-timeout || '180' }}" + echo "Waiting for Docker-in-Docker to be ready..." + if ! timeout ${TIMEOUT}s bash -c 'until docker -H "tcp://'"${HOST}"':'"${PORT}"'" info >/dev/null 2>&1; do sleep 2; done'; then + echo "::error::DinD failed to start within ${TIMEOUT}s." + docker logs "$NAME" || true + exit 1 + fi + echo "DinD is ready." + docker -H "tcp://${HOST}:${PORT}" info --format 'Daemon OK → OS={{.OperatingSystem}} Version={{.ServerVersion}}' + + - id: set-output + shell: bash + run: | + HOST="${{ inputs.bind-address || '127.0.0.1' }}" + PORT="${{ inputs.port || '2375' }}" + echo "docker-host=tcp://${HOST}:${PORT}" >> "$GITHUB_OUTPUT" + echo "storage_volume=${STORAGE_VOL:-}" >> "$GITHUB_OUTPUT" + echo "execroot_volume=${EXECROOT_VOL:-}" >> "$GITHUB_OUTPUT" + + # --- NEW: Integrated Setup & Verification Steps --- + + - name: Cleanup DinD Environment + if: ${{ inputs.cleanup-dind-on-start == 'true' }} + shell: bash + run: | + echo "Performing initial cleanup of DinD environment..." + DIND_HOST="${{ steps.set-output.outputs.docker-host }}" + docker -H "${DIND_HOST}" system prune -af --volumes + docker -H "${DIND_HOST}" image prune -af + + - id: discover-ip + name: Discover DinD Container IP + shell: bash + run: | + set -euo pipefail + NAME="${{ inputs.container-name || 'dind-daemon' }}" + + # Use host daemon to inspect the DinD container + nm=$(docker inspect -f '{{.HostConfig.NetworkMode}}' "$NAME") + echo "DinD NetworkMode=${nm}" + + # Try to find the bridge network IP + ip=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$NAME" || true) + + # If still empty, likely host networking -> use loopback + if [[ -z "${ip}" || "${nm}" == "host" ]]; then + echo "No bridge IP found or using host network. Falling back to 127.0.0.1." + ip="127.0.0.1" + fi + + echo "Discovered DinD IP: ${ip}" + echo "dind-ip=${ip}" >> "$GITHUB_OUTPUT" + + - name: Smoke Test Port Mapping + if: ${{ inputs.smoke-test-port-mapping == 'true' }} + env: + DOCKER_HOST: ${{ steps.set-output.outputs.docker-host }} + DIND_IP: ${{ steps.discover-ip.outputs.dind-ip }} + shell: bash + run: | + set -euo pipefail + echo "Running port mapping smoke test..." + docker pull redis:7.2-alpine + cid=$(docker run -d -p 0:6379 --name redis-smoke redis:7-alpine) + hostport=$(docker port redis-smoke 6379/tcp | sed 's/.*://') + echo "Redis container started, mapped to host port ${hostport}" + echo "Probing connection to ${DIND_IP}:${hostport} ..." + + timeout 5 bash -c 'exec 3<>/dev/tcp/$DIND_IP/'"$hostport" + if [[ $? -eq 0 ]]; then + echo "TCP connection successful. Port mapping is working." + else + echo "::error::Failed to connect to mapped port on ${DIND_IP}:${hostport}" + docker logs redis-smoke + exit 1 + fi + docker rm -f "$cid" + + - name: Prime Testcontainers (Ryuk) + if: ${{ inputs.prime-testcontainers == 'true' }} + env: + DOCKER_HOST: ${{ steps.set-output.outputs.docker-host }} + TESTCONTAINERS_HOST_OVERRIDE: ${{ steps.discover-ip.outputs.dind-ip }} + shell: bash + run: | + echo "Priming Testcontainers/Ryuk..." + python -m pip install -q --upgrade pip testcontainers + # Use a tiny image for a fast and stable prime + docker pull alpine:3.19 + python - <<'PY' + from testcontainers.core.container import DockerContainer + c = DockerContainer("alpine:3.19").with_command("true") + c.start() + c.stop() + print("Ryuk primed and ready.") + PY + + - name: Export Environment Variables + if: ${{ inputs.export-gh-env == 'true' }} + shell: bash + run: | + echo "DOCKER_HOST=${{ steps.set-output.outputs.docker-host }}" >> "$GITHUB_ENV" + echo "DIND_IP=${{ steps.discover-ip.outputs.dind-ip }}" >> "$GITHUB_ENV" \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Python_Coverage.yml b/.github/workflows/beam_PreCommit_Python_Coverage.yml index 093f7026b13a..b21ad50e9da2 100644 --- a/.github/workflows/beam_PreCommit_Python_Coverage.yml +++ b/.github/workflows/beam_PreCommit_Python_Coverage.yml @@ -59,7 +59,7 @@ env: jobs: beam_PreCommit_Python_Coverage: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - runs-on: [self-hosted, ubuntu-20.04, main] + runs-on: [self-hosted, ubuntu-20.04, highmem] strategy: matrix: job_name: [beam_PreCommit_Python_Coverage] @@ -84,7 +84,29 @@ jobs: with: java-version: default python-version: default + - name: Start DinD + uses: ./.github/actions/dind-up-action + id: dind + with: + # Enable all the new features + cleanup-dind-on-start: "true" + smoke-test-port-mapping: "true" + prime-testcontainers: "true" + tmpfs-run-size: 2g + tmpfs-varrun-size: 4g + export-gh-env: "true" - name: Run preCommitPyCoverage + env: + DOCKER_HOST: ${{ steps.dind.outputs.docker-host }} + TOX_TESTENV_PASSENV: "DOCKER_*,TESTCONTAINERS_*,TC_*,BEAM_*,GRPC_*,OMP_*,OPENBLAS_*,PYTHONHASHSEED,PYTEST_*" + TESTCONTAINERS_HOST_OVERRIDE: ${{ env.DIND_IP }} + TESTCONTAINERS_DOCKER_SOCKET_OVERRIDE: "/var/run/docker.sock" + TESTCONTAINERS_RYUK_DISABLED: "false" + TESTCONTAINERS_RYUK_CONTAINER_PRIVILEGED: "true" + PYTEST_ADDOPTS: "-v --tb=short --maxfail=3 --durations=20 --reruns=2 --reruns-delay=5" + TC_TIMEOUT: "120" + TC_MAX_TRIES: "120" + TC_SLEEP_TIME: "1" uses: ./.github/actions/gradle-command-self-hosted-action with: gradle-command: :sdks:python:test-suites:tox:py39:preCommitPyCoverage @@ -99,6 +121,8 @@ jobs: name: Python Test Results path: '**/pytest*.xml' - name: Publish Python Test Results + env: + DOCKER_HOST: "" # Unset DOCKER_HOST to run on host Docker daemon uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index ebc05722841c..ec9579ca5d18 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -295,7 +295,7 @@ def __init__( class MilvusEnrichmentTestHelper: @staticmethod def start_db_container( - image="milvusdb/milvus:v2.5.10", + image="milvusdb/milvus:v2.3.9", max_vec_fields=5, vector_client_max_retries=3, tc_max_retries=TC_MAX_TRIES) -> Optional[MilvusDBContainerInfo]: @@ -479,7 +479,7 @@ class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" _db: MilvusDBContainerInfo - _version = "milvusdb/milvus:v2.5.10" + _version = "milvusdb/milvus:v2.3.9" @classmethod def setUpClass(cls): diff --git a/sdks/python/conftest.py b/sdks/python/conftest.py index 37c4a0434e75..20251f295ce7 100644 --- a/sdks/python/conftest.py +++ b/sdks/python/conftest.py @@ -17,7 +17,12 @@ """Pytest configuration and custom hooks.""" +import os import sys +from types import SimpleNamespace + +import pytest +from testcontainers.core import waiting_utils from apache_beam.options import pipeline_options from apache_beam.testing.test_pipeline import TestPipeline @@ -39,11 +44,131 @@ def pytest_addoption(parser): ] +@pytest.fixture(scope="session", autouse=True) +def configure_beam_rpc_timeouts(): + """ + Configure gRPC and RPC timeouts for Beam tests + to prevent DEADLINE_EXCEEDED errors. + """ + print("\n--- Applying Beam RPC timeout configuration ---") + + # Set gRPC keepalive and timeout settings + timeout_env_vars = { + 'GRPC_ARG_KEEPALIVE_TIME_MS': '30000', + 'GRPC_ARG_KEEPALIVE_TIMEOUT_MS': '5000', + 'GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA': '0', + 'GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS': '1', + 'GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS': '300000', + 'GRPC_ARG_HTTP2_MIN_SENT_PING_INTERVAL_WITHOUT_DATA_MS': '10000', + + # Additional stability settings for DinD environment + 'GRPC_ARG_MAX_RECONNECT_BACKOFF_MS': '120000', + 'GRPC_ARG_INITIAL_RECONNECT_BACKOFF_MS': '1000', + 'GRPC_ARG_MAX_CONNECTION_IDLE_MS': '300000', + 'GRPC_ARG_MAX_CONNECTION_AGE_MS': '1800000', + + # Beam-specific retry and timeout settings + 'BEAM_RETRY_MAX_ATTEMPTS': '5', + 'BEAM_RETRY_INITIAL_DELAY_MS': '1000', + 'BEAM_RETRY_MAX_DELAY_MS': '60000', + 'BEAM_RUNNER_BUNDLE_TIMEOUT_MS': '300000', + + # Force deterministic execution in DinD environment + 'BEAM_TESTING_FORCE_SINGLE_BUNDLE': 'true', + 'BEAM_TESTING_DETERMINISTIC_ORDER': 'true', + 'BEAM_SDK_WORKER_PARALLELISM': '1', + 'BEAM_WORKER_POOL_SIZE': '1', + 'BEAM_FN_API_CONTROL_PORT': '0', + 'BEAM_FN_API_DATA_PORT': '0', + + # Container-specific stability settings + 'PYTHONHASHSEED': '0', + 'OMP_NUM_THREADS': '1', + 'OPENBLAS_NUM_THREADS': '1', + + # Force sequential pytest execution (CRITICAL for DinD stability) + 'PYTEST_XDIST_WORKER_COUNT': '1', + 'PYTEST_CURRENT_TEST_TIMEOUT': '300', + + # Mock and test isolation improvements + 'PYTEST_MOCK_TIMEOUT': '60', + 'BEAM_TEST_ISOLATION_MODE': 'strict', + } + + for key, value in timeout_env_vars.items(): + os.environ[key] = value + print(f"Set {key}={value}") + + print("Successfully configured Beam RPC timeouts") + + +@pytest.fixture(autouse=True) +def ensure_clean_state(): + """ + Ensure clean state before each test + to prevent cross-test contamination. + """ + import gc + import threading + import time + + # Force garbage collection to clean up any lingering resources + gc.collect() + + # Log active thread count for debugging + thread_count = threading.active_count() + if thread_count > 50: # Increased threshold since we see 104 threads + print(f"Warning: {thread_count} active threads detected before test") + + # Force a brief pause to let threads settle + time.sleep(0.5) + gc.collect() + + yield + + # Enhanced cleanup after test + try: + # Force more aggressive cleanup + gc.collect() + + # Brief pause to let any async operations complete + time.sleep(0.1) + + # Additional garbage collection + gc.collect() + except Exception as e: + print(f"Warning: Cleanup error: {e}") + + +@pytest.fixture(autouse=True) +def enhance_mock_stability(): + """Enhance mock stability in DinD environment.""" + import time + + # Brief pause before test to ensure clean mock state + time.sleep(0.05) + + yield + + # Brief pause after test to let mocks clean up + time.sleep(0.05) + + def pytest_configure(config): """Saves options added in pytest_addoption for later use. - This is necessary since pytest-xdist workers do not have the same sys.argv as - the main pytest invocation. xdist does seem to pickle TestPipeline + This is necessary since pytest-xdist workers do not have the + same sys.argv as the main pytest invocation. + xdist does seem to pickle TestPipeline """ + # for the entire test session. + print("\n--- Applying global testcontainers timeout configuration ---") + waiting_utils.config = SimpleNamespace( + timeout=int(os.getenv("TC_TIMEOUT", "120")), + max_tries=int(os.getenv("TC_MAX_TRIES", "120")), + sleep_time=float(os.getenv("TC_SLEEP_TIME", "1")), + ) + print("Successfully set waiting utils config") + TestPipeline.pytest_test_pipeline_options = config.getoption( 'test_pipeline_options', default='') # Enable optional type checks on all tests. diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index e746232c397e..157dfedecc47 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -31,7 +31,7 @@ select = E3 # https://github.com/apache/beam/issues/25668 pip_pre = True # allow apps that support color to use it. -passenv=TERM,CLOUDSDK_CONFIG +passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_* # Set [] options for pip installation of apache-beam tarball. extras = test,dataframe # Don't warn that these commands aren't installed. @@ -147,11 +147,18 @@ list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze [testenv:py39-cloudcoverage] deps = pytest-cov==3.0.0 + +platform = linux +passenv = GIT_*,BUILD_*,ghprb*,CHANGE_ID,BRANCH_NAME,JENKINS_*,CODECOV_*,GITHUB_*,DOCKER_*,TESTCONTAINERS_*,TC_* + # Don't set TMPDIR to avoid "AF_UNIX path too long" errors in certain tests. setenv = PYTHONPATH = {toxinidir} -platform = linux -passenv = GIT_*,BUILD_*,ghprb*,CHANGE_ID,BRANCH_NAME,JENKINS_*,CODECOV_*,GITHUB_* + DOCKER_HOST = {env:DOCKER_HOST} + TC_TIMEOUT = {env:TC_TIMEOUT:120} + TC_MAX_TRIES = {env:TC_MAX_TRIES:120} + TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1} + # NOTE: we could add ml_test to increase the collected code coverage metrics, but it would make the suite slower. extras = test,gcp,interactive,dataframe,aws commands =