From bc482004d6efaf711011e8c9400f17c3a1c6211b Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:04:50 -0400 Subject: [PATCH 1/9] Split CI and include support bundles --- .github/support-bundle.yaml | 238 ++++++++++++++++++++++++++++++++++ .github/workflows/ci.yaml | 124 +++++++++++------- aqua.yaml | 1 + contexts/local/blueprint.yaml | 1 + 4 files changed, 320 insertions(+), 44 deletions(-) create mode 100644 .github/support-bundle.yaml diff --git a/.github/support-bundle.yaml b/.github/support-bundle.yaml new file mode 100644 index 00000000..06860177 --- /dev/null +++ b/.github/support-bundle.yaml @@ -0,0 +1,238 @@ +apiVersion: troubleshoot.sh/v1beta2 +kind: SupportBundle +metadata: + name: full-diagnostics +spec: + collectors: + # Everything in the cluster + - clusterInfo: {} + - clusterResources: {} + - customResourceDefinition: {} + + # System logs + - logs: + namespace: default + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: demo-bookinfo + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: kube-node-lease + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: kube-public + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: kube-system + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: kubelet-serving-cert-approver + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-csi + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-dns + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-gitops + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-ingress + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-object-store + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-pki + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-pki-trust + limits: + maxAge: 24h + maxLines: 10000 + - logs: + namespace: system-policy + limits: + maxAge: 24h + maxLines: 10000 + + # All events from all namespaces + - events: {} + + # All node info + - nodeInfo: {} + - nodeResources: {} + + # All network resources + - networkPolicy: {} + - service: {} + - ingress: {} + + # All storage resources + - persistentVolumeClaim: {} + - persistentVolume: {} + - storageClass: {} + + # All deployments, statefulsets, daemonsets + - deployment: {} + - statefulSet: {} + - daemonSet: {} + + # All pods and their status + - pod: {} + + # All configmaps from all namespaces + - configMap: + namespace: system-gitops + name: blueprint + + # All service accounts and RBAC + - serviceAccount: {} + - clusterRole: {} + - clusterRoleBinding: {} + - role: {} + - roleBinding: {} + + analyzers: + # Cluster health + - clusterVersion: + outcomes: + - fail: + when: "< 1.21.0" + message: "Kubernetes version is too old for supported blueprints." + - pass: + message: "Kubernetes version is supported." + + # Pod and container health + - clusterPodStatuses: + outcomes: + - fail: + when: "count > 0" + message: "There are pods not running or pending." + - pass: + message: "All pods are running." + + - clusterContainerStatuses: + outcomes: + - fail: + when: "count > 0" + message: "Some containers are not ready or are restarting." + - pass: + message: "All containers are healthy." + + # All CRDs + - customResourceDefinition: {} + + # Deployment, StatefulSet, DaemonSet health + - deploymentStatus: + outcomes: + - fail: + when: "status.availableReplicas < 1" + message: "Some deployments are not available." + - pass: + message: "All deployments are available." + - statefulSetStatus: + outcomes: + - fail: + when: "status.readyReplicas < 1" + message: "Some statefulsets are not ready." + - pass: + message: "All statefulsets are ready." + - daemonSetStatus: + outcomes: + - fail: + when: "status.numberAvailable < 1" + message: "Some daemonsets are not available." + - pass: + message: "All daemonsets are available." + + # Storage class presence + - storageClass: + outcomes: + - fail: + message: "No storage class found." + - pass: + message: "Storage class is present." + + # Ingress presence + - ingress: + outcomes: + - fail: + message: "No ingress resources found." + - pass: + message: "Ingress resources are present." + + # Event warnings + - event: + outcomes: + - fail: + when: "count > 0" + message: "There are warning events in the cluster." + - pass: + message: "No warning events found." + + # Events analysis + - event: + name: "Warning Events" + when: "count > 0" + outcomes: + - fail: + message: "Found warning events in the cluster" + uri: "events" + - pass: + message: "No warning events found" + + # Network analysis + - service: + outcomes: + - fail: + when: "status.loadBalancer.ingress == null" + message: "LoadBalancer service has no ingress IP" + - pass: + message: "LoadBalancer service is properly configured" + + # Storage analysis + - persistentVolumeClaim: + outcomes: + - fail: + when: "status.phase != Bound" + message: "PVC is not bound" + - pass: + message: "PVC is bound" + + # Node analysis + - nodeResources: + outcomes: + - fail: + when: "status.allocatable.cpu < 2" + message: "Node has insufficient CPU resources" + - fail: + when: "status.allocatable.memory < 4Gi" + message: "Node has insufficient memory resources" + - pass: + message: "Node resources are sufficient" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8e69a70b..bb95cd76 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,45 +18,22 @@ env: DOCKER_HOST: unix:///var/run/docker.sock # renovate: datasource=github-releases depName=docker-compose package=docker/compose DOCKER_COMPOSE_VERSION: v2.36.0 + # renovate: datasource=github-releases depName=troubleshoot package=replicatedhq/troubleshoot + SUPPORT_BUNDLE_VERSION: v0.119.0 jobs: - ci: + static-checks: + name: Static Analysis runs-on: ubuntu-latest - steps: - name: Checkout code uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install Docker Compose - run: | - sudo curl -L "https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose - sudo chmod +x /usr/local/bin/docker-compose - docker-compose --version - continue-on-error: false - - name: Set up Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: '3.x' - - name: Setup Terraform - uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 - with: - # renovate: datasource=github-releases depName=terraform package=hashicorp/terraform - terraform_version: 1.12.0 - - - name: Setup kubectl - uses: azure/setup-kubectl@3e0aec4d80787158d308d7b364cb1b702e7feb7f # v4.0.0 - with: - # renovate: datasource=github-releases depName=kubectl package=kubernetes/kubectl - version: v1.33.0 - - - name: Install Windsor CLI - uses: windsorcli/action@main - with: - ref: main - context: local - - name: Run yamllint run: | pip install yamllint @@ -67,6 +44,12 @@ jobs: sudo apt-get install -y shellcheck find . -name "*.sh" -print0 | xargs -0 shellcheck + - name: Setup Terraform + uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 + with: + # renovate: datasource=github-releases depName=terraform package=hashicorp/terraform + terraform_version: 1.12.0 + - name: Run terraform fmt run: terraform fmt -check -recursive @@ -77,6 +60,63 @@ jobs: (cd "$testdir" && terraform init -input=false && terraform test) done + - name: Checkov GitHub Action + uses: bridgecrewio/checkov-action@360818f2ad44468d3294cfddae854a8c9036dfee # v12.3014.0 + with: + directory: ./terraform + output_format: cli,sarif + output_file_path: console,results.sarif + + - name: Upload SARIF file + uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18 + with: + sarif_file: results.sarif + + integration-tests: + name: Windsor Integration Tests + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install Docker Compose + run: | + sudo curl -L "https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + docker-compose --version + continue-on-error: false + + - name: Setup Terraform + uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 + with: + # renovate: datasource=github-releases depName=terraform package=hashicorp/terraform + terraform_version: 1.12.0 + + - name: Setup kubectl + uses: azure/setup-kubectl@3e0aec4d80787158d308d7b364cb1b702e7feb7f # v4.0.0 + with: + # renovate: datasource=github-releases depName=kubectl package=kubernetes/kubectl + version: v1.33.0 + + - name: Install support-bundle CLI + run: | + cd "$(mktemp -d)" && + curl -fsSLO "https://github.com/replicatedhq/troubleshoot/releases/download/${SUPPORT_BUNDLE_VERSION}/support-bundle_linux_amd64.tar.gz" && + tar xzf support-bundle_linux_amd64.tar.gz && + sudo install -m 0755 -o root -g root support-bundle /usr/local/bin/ && + cd - && + rm -rf "$OLDPWD" && + support-bundle version + + - name: Create bundle directory + run: mkdir -p support-bundles + + - name: Install Windsor CLI + uses: windsorcli/action@main + with: + ref: main + context: local + - name: Create .docker-cache directory run: mkdir -p .windsor/.docker-cache @@ -92,24 +132,20 @@ jobs: windsor init local --set dns.enabled=false windsor up --install --verbose - - name: Windsor Down + - name: Collect support bundle + if: success() || failure() run: | - windsor down + support-bundle --interactive=false --format=json --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}.json .github/support-bundle.yaml - checkov: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Checkov GitHub Action - uses: bridgecrewio/checkov-action@360818f2ad44468d3294cfddae854a8c9036dfee # v12.3014.0 + - name: Upload support bundle + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - directory: ./terraform - output_format: cli,sarif - output_file_path: console,results.sarif + name: support-bundle-local + path: support-bundles/ + retention-days: 30 - - name: Upload SARIF file - uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18 - with: - sarif_file: results.sarif + - name: Windsor Down + if: always() + run: | + windsor down diff --git a/aqua.yaml b/aqua.yaml index a0ee5712..92eea59c 100644 --- a/aqua.yaml +++ b/aqua.yaml @@ -31,3 +31,4 @@ packages: - name: 1password/cli@v2.30.3 - name: evilmartians/lefthook@v1.6.7 - name: bridgecrewio/checkov@3.2.424 + - name: kubernetes-sigs/krew@v0.4.5 diff --git a/contexts/local/blueprint.yaml b/contexts/local/blueprint.yaml index e9c4f618..0c42407d 100644 --- a/contexts/local/blueprint.yaml +++ b/contexts/local/blueprint.yaml @@ -18,6 +18,7 @@ terraform: path: cluster/talos - source: core path: gitops/flux + destroy: false kustomize: - name: policy-base path: policy/base From d8975d20015fa4adcf9996461f9ebc85be8e4985 Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:12:16 -0400 Subject: [PATCH 2/9] add contents: write --- .github/workflows/ci.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bb95cd76..85bafb57 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,7 +9,7 @@ on: - main permissions: - contents: read + contents: write security-events: write checks: write @@ -22,8 +22,8 @@ env: SUPPORT_BUNDLE_VERSION: v0.119.0 jobs: - static-checks: - name: Static Analysis + code-checks: + name: Code Quality runs-on: ubuntu-latest steps: - name: Checkout code @@ -72,8 +72,8 @@ jobs: with: sarif_file: results.sarif - integration-tests: - name: Windsor Integration Tests + integration: + name: Windsor Integration runs-on: ubuntu-latest steps: - name: Checkout code From cfbc2af2ff1b6c67d0ba91dec1dcf1bba65cbf88 Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:14:05 -0400 Subject: [PATCH 3/9] always collect support bundles --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 85bafb57..642e9884 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -133,7 +133,7 @@ jobs: windsor up --install --verbose - name: Collect support bundle - if: success() || failure() + if: always() run: | support-bundle --interactive=false --format=json --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}.json .github/support-bundle.yaml From 155a3b31dec15190e808066374b487941fbdf970 Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:25:43 -0400 Subject: [PATCH 4/9] Remove --format=json --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 642e9884..f01ae801 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -135,7 +135,7 @@ jobs: - name: Collect support bundle if: always() run: | - support-bundle --interactive=false --format=json --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}.json .github/support-bundle.yaml + support-bundle --interactive=false --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}.json .github/support-bundle.yaml - name: Upload support bundle if: always() From d935c9a265d2bfc708fc703515479c5e097b1895 Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:28:08 -0400 Subject: [PATCH 5/9] Rename --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f01ae801..49458dd5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -73,7 +73,7 @@ jobs: sarif_file: results.sarif integration: - name: Windsor Integration + name: Integration Tests runs-on: ubuntu-latest steps: - name: Checkout code From bbfbe6cd26e6b7d8275f5b3a1f743d5f24a6a5aa Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 17:52:19 -0400 Subject: [PATCH 6/9] collect windsor state --- .github/workflows/ci.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 49458dd5..d4d429ca 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -132,10 +132,15 @@ jobs: windsor init local --set dns.enabled=false windsor up --install --verbose + - name: Collect Windsor State + if: always() + run: | + tar --exclude='.docker-cache' -czf support-bundles/windsor-state.tar.gz contexts/local .windsor + - name: Collect support bundle if: always() run: | - support-bundle --interactive=false --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}.json .github/support-bundle.yaml + support-bundle --interactive=false --output=support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }} .github/support-bundle.yaml - name: Upload support bundle if: always() From dbb95f07876f24b6dd4207fd7a6640eee6a2e552 Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 18:16:38 -0400 Subject: [PATCH 7/9] disable kubelet-cert-rotation --- contexts/local/blueprint.yaml | 37 ++++----- contexts/local/terraform/cluster/talos.tfvars | 82 +++++++++++++++---- 2 files changed, 81 insertions(+), 38 deletions(-) diff --git a/contexts/local/blueprint.yaml b/contexts/local/blueprint.yaml index 0c42407d..cb898cd3 100644 --- a/contexts/local/blueprint.yaml +++ b/contexts/local/blueprint.yaml @@ -20,6 +20,20 @@ terraform: path: gitops/flux destroy: false kustomize: +- name: telemetry-base + path: telemetry/base + source: core + components: + - prometheus + - prometheus/flux +- name: telemetry-resources + path: telemetry/resources + source: core + dependsOn: + - telemetry-base + components: + - prometheus + - prometheus/flux - name: policy-base path: policy/base source: core @@ -73,7 +87,6 @@ kustomize: path: dns source: core dependsOn: - - ingress-base - pki-base force: true components: @@ -91,25 +104,3 @@ kustomize: force: true components: - webhook -- name: demo - path: demo/bookinfo - source: core - dependsOn: - - ingress-base - force: true - components: - - ingress -- name: object-store-base - path: object-store/base - dependsOn: - - pki-base - force: true - components: - - minio -- name: object-store-resources - path: object-store/resources - dependsOn: - - object-store-base - force: true - components: - - common diff --git a/contexts/local/terraform/cluster/talos.tfvars b/contexts/local/terraform/cluster/talos.tfvars index 0864addb..f6801d68 100644 --- a/contexts/local/terraform/cluster/talos.tfvars +++ b/contexts/local/terraform/cluster/talos.tfvars @@ -1,29 +1,81 @@ -// Managed by Windsor CLI: This file is partially managed by the windsor CLI. Your changes will not be overwritten. -// Module source: github.com/windsorcli/core//terraform/cluster/talos?ref=main +# Managed by Windsor CLI: This file is partially managed by the windsor CLI. Your changes will not be overwritten. +# Module source: github.com/windsorcli/core//terraform/cluster/talos?ref=main -// The external controlplane API endpoint of the kubernetes API -cluster_endpoint = "https://127.0.0.1:6443" +# The kubernetes version to deploy. +# kubernetes_version = "1.33.1" -// The name of the cluster -cluster_name = "talos" +# The talos version to deploy. +# talos_version = "1.10.1" -// A YAML string of common config patches to apply -common_config_patches = "\"cluster\":\n \"apiServer\":\n \"certSANs\":\n - \"localhost\"\n - \"127.0.0.1\"\n \"extraManifests\":\n - \"https://raw.githubusercontent.com/alex1989hu/kubelet-serving-cert-approver/v0.8.7/deploy/standalone-install.yaml\"\n\"machine\":\n \"certSANs\":\n - \"localhost\"\n - \"127.0.0.1\"\n \"kubelet\":\n \"extraArgs\":\n \"rotate-server-certificates\": \"true\"\n \"network\":\n \"interfaces\":\n - \"ignore\": true\n \"interface\": \"eth0\"\n \"registries\":\n \"mirrors\":\n \"docker.io\":\n \"endpoints\":\n - \"http://registry-1.docker.test:5000\"\n \"gcr.io\":\n \"endpoints\":\n - \"http://gcr.test:5000\"\n \"ghcr.io\":\n \"endpoints\":\n - \"http://ghcr.test:5000\"\n \"quay.io\":\n \"endpoints\":\n - \"http://quay.test:5000\"\n \"registry.k8s.io\":\n \"endpoints\":\n - \"http://registry.k8s.test:5000\"\n \"registry.test\":\n \"endpoints\":\n - \"http://registry.test:5000\"" +# The name of the cluster. +cluster_name = "talos" -// A YAML string of controlplane config patches to apply -controlplane_config_patches = "" +# The external controlplane API endpoint of the kubernetes API. +cluster_endpoint = "https://127.0.0.1:6443" -// Machine config details for control planes +# A list of machine configuration details for control planes. controlplanes = [{ endpoint = "127.0.0.1:50000" node = "controlplane-1" }] -// A YAML string of worker config patches to apply -worker_config_patches = "\"machine\":\n \"kubelet\":\n \"extraMounts\":\n - \"destination\": \"/var/local\"\n \"options\":\n - \"rbind\"\n - \"rw\"\n \"source\": \"/var/local\"\n \"type\": \"bind\"" - -// Machine config details for workers +# A list of machine configuration details workers = [{ endpoint = "127.0.0.1:50001" node = "worker-1" }] + +# A YAML string of common config patches to apply. Can be an empty string or valid YAML. +common_config_patches = < Date: Fri, 16 May 2025 18:46:57 -0400 Subject: [PATCH 8/9] Exclude .terraform from support bundle --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d4d429ca..dd4052ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -135,7 +135,7 @@ jobs: - name: Collect Windsor State if: always() run: | - tar --exclude='.docker-cache' -czf support-bundles/windsor-state.tar.gz contexts/local .windsor + tar --exclude='.docker-cache' --exclude='.terraform' -czf support-bundles/windsor-state.tar.gz contexts/local .windsor - name: Collect support bundle if: always() From bb400384ef54db5f57626988fd6f984e7208087b Mon Sep 17 00:00:00 2001 From: Ryan VanGundy Date: Fri, 16 May 2025 19:14:32 -0400 Subject: [PATCH 9/9] try install br_netfilter --- .github/workflows/ci.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index dd4052ed..e156482c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -86,6 +86,12 @@ jobs: docker-compose --version continue-on-error: false + - name: Load br_netfilter kernel module + run: | + sudo modprobe br_netfilter + echo "1" | sudo tee /proc/sys/net/bridge/bridge-nf-call-iptables + echo "1" | sudo tee /proc/sys/net/bridge/bridge-nf-call-ip6tables + - name: Setup Terraform uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2 with: