diff --git a/.github/support-bundle.yaml b/.github/support-bundle.yaml
new file mode 100644
index 00000000..06860177
--- /dev/null
+++ b/.github/support-bundle.yaml
@@ -0,0 +1,228 @@
+# Troubleshoot support-bundle spec; CI runs it to collect cluster diagnostics.
+apiVersion: troubleshoot.sh/v1beta2
+kind: SupportBundle
+metadata:
+  name: full-diagnostics
+spec:
+  collectors:
+    # Everything in the cluster
+    - clusterInfo: {}
+    - clusterResources: {}
+    - customResourceDefinition: {}
+
+    # System logs
+    - logs:
+        namespace: default
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: demo-bookinfo
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: kube-node-lease
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: kube-public
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: kube-system
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: kubelet-serving-cert-approver
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-csi
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-dns
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-gitops
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-ingress
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-object-store
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-pki
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-pki-trust
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+    - logs:
+        namespace: system-policy
+        limits:
+          maxAge: 24h
+          maxLines: 10000
+
+    # All events from all namespaces
+    - events: {}
+
+    # All node info
+    - nodeInfo: {}
+    - nodeResources: {}
+
+    # All network resources
+    - networkPolicy: {}
+    - service: {}
+    - ingress: {}
+
+    # All storage resources
+    - persistentVolumeClaim: {}
+    - persistentVolume: {}
+    - storageClass: {}
+
+    # All deployments, statefulsets, daemonsets
+    - deployment: {}
+    - statefulSet: {}
+    - daemonSet: {}
+
+    # All pods and their status
+    - pod: {}
+
+    # The blueprint configmap from the system-gitops namespace
+    - configMap:
+        namespace: system-gitops
+        name: blueprint
+
+    # All service accounts and RBAC
+    - serviceAccount: {}
+    - clusterRole: {}
+    - clusterRoleBinding: {}
+    - role: {}
+    - roleBinding: {}
+
+  analyzers:
+    # Cluster health
+    - clusterVersion:
+        outcomes:
+          - fail:
+              when: "< 1.21.0"
+              message: "Kubernetes version is too old for supported blueprints."
+          - pass:
+              message: "Kubernetes version is supported."
+
+    # Pod and container health
+    - clusterPodStatuses:
+        outcomes:
+          - fail:
+              when: "count > 0"
+              message: "There are pods not running or pending."
+          - pass:
+              message: "All pods are running."
+
+    - clusterContainerStatuses:
+        outcomes:
+          - fail:
+              when: "count > 0"
+              message: "Some containers are not ready or are restarting."
+          - pass:
+              message: "All containers are healthy."
+
+    # All CRDs
+    - customResourceDefinition: {}
+
+    # Deployment, StatefulSet, DaemonSet health
+    - deploymentStatus:
+        outcomes:
+          - fail:
+              when: "status.availableReplicas < 1"
+              message: "Some deployments are not available."
+          - pass:
+              message: "All deployments are available."
+    - statefulSetStatus:
+        outcomes:
+          - fail:
+              when: "status.readyReplicas < 1"
+              message: "Some statefulsets are not ready."
+          - pass:
+              message: "All statefulsets are ready."
+    - daemonSetStatus:
+        outcomes:
+          - fail:
+              when: "status.numberAvailable < 1"
+              message: "Some daemonsets are not available."
+          - pass:
+              message: "All daemonsets are available."
+
+    # Storage class presence
+    - storageClass:
+        outcomes:
+          - fail:
+              message: "No storage class found."
+          - pass:
+              message: "Storage class is present."
+
+    # Ingress presence
+    - ingress:
+        outcomes:
+          - fail:
+              message: "No ingress resources found."
+          - pass:
+              message: "Ingress resources are present."
+
+    # Event warnings
+    - event:
+        outcomes:
+          - fail:
+              when: "count > 0"
+              message: "There are warning events in the cluster."
+          - pass:
+              message: "No warning events found."
+
+    # Network analysis
+    - service:
+        outcomes:
+          - fail:
+              when: "status.loadBalancer.ingress == null"
+              message: "LoadBalancer service has no ingress IP"
+          - pass:
+              message: "LoadBalancer service is properly configured"
+
+    # Storage analysis
+    - persistentVolumeClaim:
+        outcomes:
+          - fail:
+              when: "status.phase != Bound"
+              message: "PVC is not bound"
+          - pass:
+              message: "PVC is bound"
+
+    # Node analysis
+    - nodeResources:
+        outcomes:
+          - fail:
+              when: "min(cpuAllocatable) < 2"
+              message: "Node has insufficient CPU resources"
+          - fail:
+              when: "min(memoryAllocatable) < 4Gi"
+              message: "Node has insufficient memory resources"
+          - pass:
+              message: "Node resources are sufficient"
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8e69a70b..e156482c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -18,11 +18,63 @@ env:
   DOCKER_HOST: unix:///var/run/docker.sock
   # renovate: datasource=github-releases depName=docker-compose package=docker/compose
   DOCKER_COMPOSE_VERSION: v2.36.0
+  # renovate: datasource=github-releases depName=troubleshoot package=replicatedhq/troubleshoot
+  SUPPORT_BUNDLE_VERSION: v0.119.0
 
 jobs:
 
-  ci:
+  code-checks:
+    name: Code Quality
     runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: '3.x'
+
+      - name: Run yamllint
+        run: |
+          pip install yamllint
+          yamllint .
+
+      - name: Run shellcheck
+        run: |
+          sudo apt-get install -y shellcheck
+          find . -name "*.sh" -print0 | xargs -0 shellcheck
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2
+        with:
+          # renovate: datasource=github-releases depName=terraform package=hashicorp/terraform
+          terraform_version: 1.12.0
+      - name: Run terraform fmt
+        run: terraform fmt -check -recursive
+
+      - name: Run Terraform Tests
+        run: |
+          find terraform -type f -name '*.tftest.hcl' | while IFS= read -r testfile; do
+            testdir=$(dirname "$testfile")
+            (cd "$testdir" && terraform init -input=false && terraform test)
+          done
+
+      - name: Checkov GitHub Action
+        uses: bridgecrewio/checkov-action@360818f2ad44468d3294cfddae854a8c9036dfee # v12.3014.0
+        with:
+          directory: ./terraform
+          output_format: cli,sarif
+          output_file_path: console,results.sarif
+
+      - name: Upload SARIF file
+        uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18
+        with:
+          sarif_file: results.sarif
+
+  integration:
+    name: Integration Tests
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -34,49 +86,43 @@ jobs:
           docker-compose --version
         continue-on-error: false
 
-      - name: Set up Python
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
-        with:
-          python-version: '3.x'
+      - name: Load br_netfilter kernel module
+        run: |
+          sudo modprobe br_netfilter
+          echo "1" | sudo tee /proc/sys/net/bridge/bridge-nf-call-iptables
+          echo "1" | sudo tee /proc/sys/net/bridge/bridge-nf-call-ip6tables
 
       - name: Setup Terraform
         uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # v3.1.2
         with:
           # renovate: datasource=github-releases depName=terraform package=hashicorp/terraform
           terraform_version: 1.12.0
-      
+
       - name: Setup kubectl
         uses: azure/setup-kubectl@3e0aec4d80787158d308d7b364cb1b702e7feb7f # v4.0.0
         with:
           # renovate: datasource=github-releases depName=kubectl package=kubernetes/kubectl
           version: v1.33.0
 
+      - name: Install support-bundle CLI
+        run: |
+          cd "$(mktemp -d)" &&
+          curl -fsSLO "https://github.com/replicatedhq/troubleshoot/releases/download/${SUPPORT_BUNDLE_VERSION}/support-bundle_linux_amd64.tar.gz" &&
+          tar xzf support-bundle_linux_amd64.tar.gz &&
+          sudo install -m 0755 -o root -g root support-bundle /usr/local/bin/ &&
+          cd - &&
+          rm -rf "$OLDPWD" &&
+          support-bundle version
+
+      - name: Create bundle directory
+        run: mkdir -p support-bundles
+
       - name: Install Windsor CLI
         uses: windsorcli/action@main
         with:
           ref: main
           context: local
 
-      - name: Run yamllint
-        run: |
-          pip install yamllint
-          yamllint .
-
-      - name: Run shellcheck
-        run: |
-          sudo apt-get install -y shellcheck
-          find . -name "*.sh" -print0 | xargs -0 shellcheck
-
-      - name: Run terraform fmt
-        run: terraform fmt -check -recursive
-
-      - name: Run Terraform Tests
-        run: |
-          find terraform -type f -name '*.tftest.hcl' | while read testfile; do
-            testdir=$(dirname "$testfile")
-            (cd "$testdir" && terraform init -input=false && terraform test)
-          done
-
       - name: Create .docker-cache directory
         run: mkdir -p .windsor/.docker-cache
 
@@ -92,24 +138,25 @@ jobs:
           windsor init local --set dns.enabled=false
           windsor up --install --verbose
 
-      - name: Windsor Down
+      - name: Collect Windsor State
+        if: always()
         run: |
-          windsor down
+          tar --exclude='.docker-cache' --exclude='.terraform' -czf support-bundles/windsor-state.tar.gz contexts/local .windsor
 
-  checkov:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Collect support bundle
+        if: always()
+        run: |
+          support-bundle --interactive=false --output="support-bundles/bundle-${{ github.workflow }}-${{ github.run_id }}-${{ github.run_number }}" .github/support-bundle.yaml
 
-      - name: Checkov GitHub Action
-        uses: bridgecrewio/checkov-action@360818f2ad44468d3294cfddae854a8c9036dfee # v12.3014.0
+      - name: Upload support bundle
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
-          directory: ./terraform
-          output_format: cli,sarif
-          output_file_path: console,results.sarif
+          name: support-bundle-local
+          path: support-bundles/
+          retention-days: 30
 
-      - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18
-        with:
-          sarif_file: results.sarif
+      - name: Windsor Down
+        if: always()
+        run: |
+          windsor down
diff --git a/aqua.yaml b/aqua.yaml
index a0ee5712..92eea59c 100644
--- a/aqua.yaml
+++ b/aqua.yaml
@@ -31,3 +31,4 @@ packages:
   - name: 1password/cli@v2.30.3
   - name: evilmartians/lefthook@v1.6.7
   - name: bridgecrewio/checkov@3.2.424
+  - name: kubernetes-sigs/krew@v0.4.5
diff --git a/contexts/local/blueprint.yaml b/contexts/local/blueprint.yaml
index e9c4f618..cb898cd3 100644
--- a/contexts/local/blueprint.yaml
+++ b/contexts/local/blueprint.yaml
@@ -18,7 +18,22 @@ terraform:
   path: cluster/talos
 - source: core
   path: gitops/flux
+  destroy: false
 kustomize:
+- name: telemetry-base
+  path: telemetry/base
+  source: core
+  components:
+  - prometheus
+  - prometheus/flux
+- name: telemetry-resources
+  path: telemetry/resources
+  source: core
+  dependsOn:
+  - telemetry-base
+  components:
+  - prometheus
+  - prometheus/flux
 - name: policy-base
   path: policy/base
   source: core
@@ -72,7 +87,6 @@ kustomize:
   path: dns
   source: core
   dependsOn:
-  - ingress-base
   - pki-base
   force: true
   components:
@@ -90,25 +104,3 @@ kustomize:
   force: true
   components:
   - webhook
-- name: demo
-  path: demo/bookinfo
-  source: core
-  dependsOn:
-  - ingress-base
-  force: true
-  components:
-  - ingress
-- name: object-store-base
-  path: object-store/base
-  dependsOn:
-  - pki-base
-  force: true
-  components:
-  - minio
-- name: object-store-resources
-  path: object-store/resources
-  dependsOn:
-  - object-store-base
-  force: true
-  components:
-  - common
diff --git a/contexts/local/terraform/cluster/talos.tfvars 
b/contexts/local/terraform/cluster/talos.tfvars index 0864addb..f6801d68 100644 --- a/contexts/local/terraform/cluster/talos.tfvars +++ b/contexts/local/terraform/cluster/talos.tfvars @@ -1,29 +1,81 @@ -// Managed by Windsor CLI: This file is partially managed by the windsor CLI. Your changes will not be overwritten. -// Module source: github.com/windsorcli/core//terraform/cluster/talos?ref=main +# Managed by Windsor CLI: This file is partially managed by the windsor CLI. Your changes will not be overwritten. +# Module source: github.com/windsorcli/core//terraform/cluster/talos?ref=main -// The external controlplane API endpoint of the kubernetes API -cluster_endpoint = "https://127.0.0.1:6443" +# The kubernetes version to deploy. +# kubernetes_version = "1.33.1" -// The name of the cluster -cluster_name = "talos" +# The talos version to deploy. +# talos_version = "1.10.1" -// A YAML string of common config patches to apply -common_config_patches = "\"cluster\":\n \"apiServer\":\n \"certSANs\":\n - \"localhost\"\n - \"127.0.0.1\"\n \"extraManifests\":\n - \"https://raw.githubusercontent.com/alex1989hu/kubelet-serving-cert-approver/v0.8.7/deploy/standalone-install.yaml\"\n\"machine\":\n \"certSANs\":\n - \"localhost\"\n - \"127.0.0.1\"\n \"kubelet\":\n \"extraArgs\":\n \"rotate-server-certificates\": \"true\"\n \"network\":\n \"interfaces\":\n - \"ignore\": true\n \"interface\": \"eth0\"\n \"registries\":\n \"mirrors\":\n \"docker.io\":\n \"endpoints\":\n - \"http://registry-1.docker.test:5000\"\n \"gcr.io\":\n \"endpoints\":\n - \"http://gcr.test:5000\"\n \"ghcr.io\":\n \"endpoints\":\n - \"http://ghcr.test:5000\"\n \"quay.io\":\n \"endpoints\":\n - \"http://quay.test:5000\"\n \"registry.k8s.io\":\n \"endpoints\":\n - \"http://registry.k8s.test:5000\"\n \"registry.test\":\n \"endpoints\":\n - \"http://registry.test:5000\"" +# The name of the cluster. 
+cluster_name = "talos" -// A YAML string of controlplane config patches to apply -controlplane_config_patches = "" +# The external controlplane API endpoint of the kubernetes API. +cluster_endpoint = "https://127.0.0.1:6443" -// Machine config details for control planes +# A list of machine configuration details for control planes. controlplanes = [{ endpoint = "127.0.0.1:50000" node = "controlplane-1" }] -// A YAML string of worker config patches to apply -worker_config_patches = "\"machine\":\n \"kubelet\":\n \"extraMounts\":\n - \"destination\": \"/var/local\"\n \"options\":\n - \"rbind\"\n - \"rw\"\n \"source\": \"/var/local\"\n \"type\": \"bind\"" - -// Machine config details for workers +# A list of machine configuration details workers = [{ endpoint = "127.0.0.1:50001" node = "worker-1" }] + +# A YAML string of common config patches to apply. Can be an empty string or valid YAML. +common_config_patches = <