diff --git a/.github/workflows/k8s-compatibility-test.yml b/.github/workflows/k8s-compatibility-test.yml index fbeab9f5..f89709c7 100644 --- a/.github/workflows/k8s-compatibility-test.yml +++ b/.github/workflows/k8s-compatibility-test.yml @@ -12,6 +12,7 @@ jobs: image: ${{ steps.set-image.outputs.image }} testserver_img: ${{ steps.set-image.outputs.testserver_img }} stress_img: ${{ steps.set-image.outputs.stress_img }} + nodemon_img: ${{ steps.set-image.outputs.nodemon_img }} steps: - name: Set Docker image names @@ -20,18 +21,21 @@ jobs: ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" TESTSERVER_IMG="ttl.sh/$(uuidgen):2h" STRESS_IMG="ttl.sh/$(uuidgen):2h" + NODEMON_IMG="ttl.sh/$(uuidgen):2h" MAJOR="0" MINOR="0" PATCH="1-compatibility-test" echo "ZXPORTER_IMG=$ZXPORTER_IMG" >> $GITHUB_ENV echo "TESTSERVER_IMG=$TESTSERVER_IMG" >> $GITHUB_ENV echo "STRESS_IMG=$STRESS_IMG" >> $GITHUB_ENV + echo "NODEMON_IMG=$NODEMON_IMG" >> $GITHUB_ENV echo "MAJOR=$MAJOR" >> $GITHUB_ENV echo "MINOR=$MINOR" >> $GITHUB_ENV echo "PATCH=$PATCH" >> $GITHUB_ENV echo "image=$ZXPORTER_IMG" >> $GITHUB_OUTPUT echo "testserver_img=$TESTSERVER_IMG" >> $GITHUB_OUTPUT echo "stress_img=$STRESS_IMG" >> $GITHUB_OUTPUT + echo "nodemon_img=$NODEMON_IMG" >> $GITHUB_OUTPUT - name: Checkout code uses: actions/checkout@v4 @@ -53,6 +57,9 @@ jobs: echo "Building and pushing stress image: ${{ env.STRESS_IMG }}" make stress-docker-build stress-docker-push STRESS_IMG=${{ env.STRESS_IMG }} + echo "Building and pushing nodemon image: ${{ env.NODEMON_IMG }}" + make docker-build-nodemon docker-push-nodemon IMG_NODEMON=${{ env.NODEMON_IMG }} + test: name: Test on K8s ${{ matrix.k8s-version }} (${{ matrix.deployment-method }}) needs: build @@ -80,6 +87,7 @@ jobs: echo "ZXPORTER_IMG=${{ needs.build.outputs.image }}" >> $GITHUB_ENV echo "TESTSERVER_IMG=${{ needs.build.outputs.testserver_img }}" >> $GITHUB_ENV echo "STRESS_IMG=${{ needs.build.outputs.stress_img }}" >> $GITHUB_ENV + echo "NODEMON_IMG=${{ needs.build.outputs.nodemon_img }}" >> $GITHUB_ENV - name: Checkout code uses: actions/checkout@v4 @@ -104,18 +112,6 @@ jobs: cluster_name: kind-${{ matrix.k8s-version }} wait: 120s - - name: Install Metrics Server - run: | - helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ - helm repo update - helm upgrade --install --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system - - - name: Install Node Exporter - run: | - helm repo add prometheus-community https://prometheus-community.github.io/helm-charts - helm repo update - helm install node-exporter prometheus-community/prometheus-node-exporter - - name: Deploy testserver to Kubernetes run: | # Create namespace if it doesn't exist @@ -203,13 +199,18 @@ jobs: yq eval '.zxporter.kubeContextName = "test-kind-cluster"' -i helm-chart/zxporter/values.yaml yq eval '.zxporter.k8sProvider = "other"' -i helm-chart/zxporter/values.yaml yq eval '.zxporter.logLevel = "info"' -i helm-chart/zxporter/values.yaml + yq eval '.nodemonMetrics.enabled = true' -i helm-chart/zxporter/values.yaml + yq eval '.zxporter-nodemon.gpuMetricsExporter.image.repository = "'"${NODEMON_IMG%:*}"'"' -i helm-chart/zxporter/values.yaml + yq eval '.zxporter-nodemon.gpuMetricsExporter.image.tag = "'"${NODEMON_IMG##*:}"'"' -i helm-chart/zxporter/values.yaml + yq eval '.zxporter-nodemon.provider = "other"' -i helm-chart/zxporter/values.yaml echo "Updated values.yaml:" cat helm-chart/zxporter/values.yaml - + + helm dependency update helm-chart/zxporter/ make 
helm-chart-install YQ=/usr/local/bin/yq else echo "Deploying ZXporter using make deploy..." - make deploy IMG=${{ env.ZXPORTER_IMG }} DAKR_URL=http://testserver.devzero-system.svc.cluster.local:50051 TARGET_NAMESPACES=dztest CLUSTER_TOKEN=test-token-for-ci + make deploy IMG=${{ env.ZXPORTER_IMG }} IMG_NODEMON=${{ env.NODEMON_IMG }} DAKR_URL=http://testserver.devzero-system.svc.cluster.local:50051 TARGET_NAMESPACES=dztest CLUSTER_TOKEN=test-token-for-ci fi - name: Wait for deployment to be ready @@ -256,43 +257,7 @@ jobs: echo "Getting pod logs (if any)..." POD_NAME=$(kubectl get pods -n devzero-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}') kubectl logs $POD_NAME -n devzero-system --tail=100 || echo "No logs available" - kubectl logs -n devzero-system -l app.kubernetes.io/name=dz-prometheus --all-containers - - - - name: Debug Prometheus status and logs - run: | - PROM_POD=$(kubectl get pod -n devzero-system -l app.kubernetes.io/name=dz-prometheus,app.kubernetes.io/component=server -o jsonpath='{.items[0].metadata.name}') - echo "Prometheus Pod: $PROM_POD" - - echo "Describing Prometheus pod..." - kubectl describe pod $PROM_POD -n devzero-system || echo "Describe failed" - - echo "Getting logs from Prometheus containers..." - kubectl logs $PROM_POD -n devzero-system -c dz-prometheus-server || echo "No prometheus-server logs" - kubectl logs $PROM_POD -n devzero-system -c dz-prometheus-server-configmap-reload || echo "No configmap-reload logs" - - echo "Spawning debug pod to test Prometheus readiness endpoint..." - - # Create a temporary curl pod - kubectl run curlbox \ - -n devzero-system \ - --image=curlimages/curl:latest \ - --restart=Never \ - --command -- sleep 120 - - # Wait for it to be ready - kubectl wait --for=condition=Ready pod/curlbox -n devzero-system --timeout=30s - - # Run curl against the Prometheus /-/ready endpoint - kubectl exec -n devzero-system curlbox -- \ - curl -v http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80/-/ready || echo "Prometheus not responding to /-/ready" - - # Clean up - kubectl delete pod curlbox -n devzero-system --ignore-not-found - echo "Checking Prometheus config mounted in pod..." - kubectl exec -n devzero-system $PROM_POD -c dz-prometheus-server -- cat /etc/config/prometheus.yml || echo "Could not read prometheus.yml" - - name: Check testserver stats and validate resource usage run: | echo "Checking testserver stats..." 
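
Editor's note on the workflow hunk above: the nodemon image is built once and pushed as a single ephemeral ttl.sh reference, and the Helm deployment path then splits that one `repo:tag` string into the separate repository and tag fields the chart expects. A minimal sketch of that split, assuming an image reference of the usual `repository:tag` shape (the uuidgen-based ttl.sh name is only an example):

```sh
# Illustrative image reference of the same shape the workflow generates
NODEMON_IMG="ttl.sh/$(uuidgen):2h"

# Shell parameter expansion: drop the last ':<tag>' for the repository,
# keep only the part after the last ':' for the tag.
REPO="${NODEMON_IMG%:*}"    # -> ttl.sh/<uuid>
TAG="${NODEMON_IMG##*:}"    # -> 2h

# Mirror the yq calls in the workflow step above
yq eval ".zxporter-nodemon.gpuMetricsExporter.image.repository = \"$REPO\"" -i helm-chart/zxporter/values.yaml
yq eval ".zxporter-nodemon.gpuMetricsExporter.image.tag = \"$TAG\"" -i helm-chart/zxporter/values.yaml
```

This split assumes the tag is the final colon-separated segment; a registry host with a port but no explicit tag would need a more careful parse. The Makefile's build-installer target (later in this diff) performs the equivalent split with `$(word 1,$(subst :, ,$(IMG_NODEMON)))` and `$(word 2,$(subst :, ,$(IMG_NODEMON)))`.
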
diff --git a/.github/workflows/metrics-server-lifecycle-test.yml b/.github/workflows/metrics-server-lifecycle-test.yml deleted file mode 100644 index 3f3485e1..00000000 --- a/.github/workflows/metrics-server-lifecycle-test.yml +++ /dev/null @@ -1,331 +0,0 @@ -permissions: - contents: read -name: Metrics Server Lifecycle Test - -on: - workflow_dispatch: - pull_request: - -jobs: - build: - name: Build Docker Image - runs-on: ubuntu-latest - outputs: - image: ${{ steps.set-image.outputs.image }} - - steps: - - name: Set Docker image names - id: set-image - run: | - ZXPORTER_IMG="ttl.sh/$(uuidgen):2h" - MAJOR="0" - MINOR="0" - PATCH="1-metrics-lifecycle-test" - echo "ZXPORTER_IMG=$ZXPORTER_IMG" >> $GITHUB_ENV - echo "MAJOR=$MAJOR" >> $GITHUB_ENV - echo "MINOR=$MINOR" >> $GITHUB_ENV - echo "PATCH=$PATCH" >> $GITHUB_ENV - echo "image=$ZXPORTER_IMG" >> $GITHUB_OUTPUT - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: "go.mod" - cache: true - - - name: Build and push Docker image - run: | - echo "Building and pushing zxporter image: ${{ env.ZXPORTER_IMG }}" - make docker-build docker-push IMG=${{ env.ZXPORTER_IMG }} - - test: - name: Test Metrics Server Lifecycle on K8s v1.32.3 - needs: build - runs-on: ubuntu-xl - - steps: - - name: Set Docker image names - run: | - echo "ZXPORTER_IMG=${{ needs.build.outputs.image }}" >> $GITHUB_ENV - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: "go.mod" - cache: true - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1 - with: - version: v0.27.0 - node_image: kindest/node:v1.32.3 - cluster_name: kind-metrics-test - wait: 120s - - - name: Verify cluster is ready - run: | - echo "Verifying cluster is ready..." - kubectl cluster-info - kubectl get nodes -o wide - kubectl get pods -A - - - name: Install Metrics Server - run: | - echo "Installing metrics-server..." - helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ - helm repo update - helm upgrade --install --set args={--kubelet-insecure-tls} metrics-server metrics-server/metrics-server --namespace kube-system - - echo "Waiting for metrics-server to be ready..." - kubectl wait --for=condition=available --timeout=300s deployment/metrics-server -n kube-system - - echo "Waiting for metrics API to be available..." - ATTEMPTS=0 - MAX_ATTEMPTS=30 - while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do - if kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" &>/dev/null; then - echo "✅ Metrics API is available" - break - fi - ATTEMPTS=$((ATTEMPTS+1)) - echo "Waiting for metrics API (attempt $ATTEMPTS/$MAX_ATTEMPTS)..." - sleep 10 - done - - if [ $ATTEMPTS -eq $MAX_ATTEMPTS ]; then - echo "❌ Metrics API did not become available in time" - exit 1 - fi - - - name: Verify metrics server is working - run: | - echo "Testing kubectl top nodes command..." - if kubectl top nodes; then - echo "✅ kubectl top nodes is working - metrics server is functional" - else - echo "❌ kubectl top nodes failed - metrics server is not working properly" - exit 1 - fi - - echo "Testing kubectl top pods command..." - kubectl top pods -A || echo "No pods consuming resources yet" - - - name: Delete metrics server deployment - run: | - echo "Deleting metrics-server deployment (keeping apiservice)..." - kubectl delete deployment metrics-server -n kube-system - - echo "Waiting for metrics-server pods to be terminated..." 
- kubectl wait --for=delete pod -l k8s-app=metrics-server -n kube-system --timeout=120s || true - - echo "Verifying metrics-server deployment is removed but apiservice remains..." - kubectl get deployment metrics-server -n kube-system || echo "Deployment removed (expected)" - kubectl get pods -n kube-system -l k8s-app=metrics-server || echo "No metrics-server pods found (expected)" - - echo "Checking that apiservice still exists..." - kubectl get apiservice v1beta1.metrics.k8s.io && echo "✅ APIService still exists (expected)" - - - name: Validate metrics server is not working - run: | - echo "Waiting for metrics API to become unavailable..." - sleep 30 - - echo "Testing kubectl top nodes command (should fail)..." - if kubectl top nodes 2>&1; then - echo "❌ kubectl top nodes is still working - this should not happen" - exit 1 - else - echo "✅ kubectl top nodes failed as expected - metrics server is properly removed" - fi - - echo "Testing metrics API endpoint directly..." - if kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" 2>&1; then - echo "❌ Metrics API is still available - this should not happen" - exit 1 - else - echo "✅ Metrics API is unavailable as expected" - fi - - - name: Deploy ZXporter - run: | - echo "Deploying ZXporter using manifest deployment..." - make deploy IMG=${{ env.ZXPORTER_IMG }} - - - name: Wait for ZXporter deployment to be ready - run: | - echo "Waiting for ZXporter deployment to be ready..." - kubectl wait --for=condition=available --timeout=300s deployment/devzero-zxporter-controller-manager -n devzero-system - - echo "Getting ZXporter pod status..." - kubectl get pods -n devzero-system -o wide - - echo "Getting ZXporter pod name..." - ZXPORTER_POD=$(kubectl get pods -n devzero-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}') - echo "ZXporter pod: $ZXPORTER_POD" - - - name: Monitor ZXporter logs for metrics-server installation - run: | - echo "Monitoring ZXporter logs for metrics-server installation..." - ZXPORTER_POD=$(kubectl get pods -n devzero-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}') - - echo "Getting initial ZXporter logs..." - kubectl logs $ZXPORTER_POD -n devzero-system | head -n 100 - - echo "Waiting and monitoring for apiservice configuration and 'metrics-server is now ready' messages..." - ATTEMPTS=0 - MAX_ATTEMPTS=60 - FOUND_APISERVICE_MESSAGE=false - FOUND_READY_MESSAGE=false - - while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do - LOGS=$(kubectl logs $ZXPORTER_POD -n devzero-system) - - if echo "$LOGS" | grep -qE "apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io (configured|unchanged)"; then - if [ "$FOUND_APISERVICE_MESSAGE" = false ]; then - echo "✅ Found apiservice v1beta1.metrics.k8s.io applied message in ZXporter logs!" - FOUND_APISERVICE_MESSAGE=true - fi - fi - - if echo "$LOGS" | grep -q "metrics-server is now ready"; then - echo "✅ Found 'metrics-server is now ready' message in ZXporter logs!" - FOUND_READY_MESSAGE=true - break - fi - - if echo "$LOGS" | grep -q "metrics-server installed successfully"; then - echo "📝 Found 'metrics-server installed successfully' message" - fi - - if echo "$LOGS" | grep -q "metrics-server not found, installing it now"; then - echo "📝 Found 'metrics-server not found, installing it now' message" - fi - - ATTEMPTS=$((ATTEMPTS+1)) - echo "Waiting for 'metrics-server is now ready' message (attempt $ATTEMPTS/$MAX_ATTEMPTS)..." 
- sleep 10 - done - - if [ "$FOUND_APISERVICE_MESSAGE" = false ]; then - echo "❌ Did not find apiservice v1beta1.metrics.k8s.io configured/unchanged message in ZXporter logs" - echo "Final ZXporter logs:" - kubectl logs $ZXPORTER_POD -n devzero-system --tail=200 - exit 1 - fi - - if [ "$FOUND_READY_MESSAGE" = false ]; then - echo "❌ Did not find 'metrics-server is now ready' message in ZXporter logs within timeout" - echo "Final ZXporter logs:" - kubectl logs $ZXPORTER_POD -n devzero-system --tail=200 - exit 1 - fi - - - name: Verify metrics server is working again - run: | - echo "Verifying that metrics server is working again..." - - echo "Checking if metrics-server pods are running..." - kubectl get pods -n kube-system -l k8s-app=metrics-server -o wide || echo "No metrics-server pods found yet" - - echo "Waiting for metrics API to be available again..." - ATTEMPTS=0 - MAX_ATTEMPTS=30 - while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do - if kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" &>/dev/null; then - echo "✅ Metrics API is available again" - break - fi - ATTEMPTS=$((ATTEMPTS+1)) - echo "Waiting for metrics API to be available (attempt $ATTEMPTS/$MAX_ATTEMPTS)..." - sleep 10 - done - - if [ $ATTEMPTS -eq $MAX_ATTEMPTS ]; then - echo "❌ Metrics API did not become available again" - kubectl get pods -n kube-system -l k8s-app=metrics-server - kubectl describe pods -n kube-system -l k8s-app=metrics-server - exit 1 - fi - - echo "Testing kubectl top nodes command again..." - if kubectl top nodes; then - echo "✅ kubectl top nodes is working again - metrics server has been restored by ZXporter" - else - echo "❌ kubectl top nodes is still failing" - exit 1 - fi - - - name: Verify ZXporter metrics server deployment - run: | - echo "Verifying ZXporter installed metrics-server correctly..." - - echo "Checking metrics-server deployment..." - kubectl get deployment -n kube-system -l k8s-app=metrics-server -o wide - - echo "Checking metrics-server pods..." - kubectl get pods -n kube-system -l k8s-app=metrics-server -o wide - - echo "Checking metrics-server service..." - kubectl get service -n kube-system -l k8s-app=metrics-server -o wide - - echo "Describing metrics-server deployment..." - kubectl describe deployment -n kube-system -l k8s-app=metrics-server - - - name: Final validation and cleanup - run: | - echo "Running final validation..." - - echo "Final ZXporter logs:" - ZXPORTER_POD=$(kubectl get pods -n devzero-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}') - kubectl logs $ZXPORTER_POD -n devzero-system --tail=100 - - echo "Final kubectl top nodes test:" - kubectl top nodes - - echo "Final kubectl top pods test:" - kubectl top pods -A - - echo "✅ Metrics Server Lifecycle Test completed successfully!" 
- echo "✅ ZXporter successfully detected missing metrics-server and reinstalled it" - echo "✅ Found 'metrics-server is now ready' log message" - echo "✅ Metrics server is functional after ZXporter installation" - - - name: Debug on failure - if: failure() - run: | - echo "=== DEBUG INFORMATION ===" - - echo "Cluster info:" - kubectl cluster-info - - echo "All pods in kube-system:" - kubectl get pods -n kube-system -o wide - - echo "All pods in devzero-system:" - kubectl get pods -n devzero-system -o wide - - echo "ZXporter deployment status:" - kubectl describe deployment devzero-zxporter-controller-manager -n devzero-system - - echo "ZXporter pod logs:" - ZXPORTER_POD=$(kubectl get pods -n devzero-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") - if [ ! -z "$ZXPORTER_POD" ]; then - kubectl logs $ZXPORTER_POD -n devzero-system --tail=200 - else - echo "No ZXporter pod found" - fi - - echo "Metrics server pods (if any):" - kubectl get pods -n kube-system -l k8s-app=metrics-server -o wide || echo "No metrics-server pods found" - - echo "Events in devzero-system namespace:" - kubectl get events -n devzero-system --sort-by='.lastTimestamp' - - echo "Events in kube-system namespace (last 20):" - kubectl get events -n kube-system --sort-by='.lastTimestamp' | tail -20 diff --git a/Dockerfile b/Dockerfile index d0920e11..1edd9f49 100644 --- a/Dockerfile +++ b/Dockerfile @@ -60,7 +60,6 @@ WORKDIR / COPY --from=builder /workspace/manager . USER 65532:65532 -COPY ./dist/metrics-server.yaml /metrics-server.yaml COPY ./entrypoint.sh /entrypoint.sh ENTRYPOINT ["/entrypoint.sh", "/manager"] diff --git a/Makefile b/Makefile index c5482150..85796a25 100644 --- a/Makefile +++ b/Makefile @@ -57,8 +57,6 @@ TESTSERVER_IMG ?= ttl.sh/zxporter-testserver:latest STRESS_IMG ?= ttl.sh/zxporter-stress:latest # DAKR URL to use for deployment DAKR_URL ?= https://dakr.devzero.io -# PROMETHEUS URL for metrics collection -PROMETHEUS_URL ?= http://prometheus-dz-prometheus-server.$(DEVZERO_MONITORING_NAMESPACE).svc.cluster.local:80 # TARGET_NAMESPACES for limiting collection to specific namespaces (comma-separated) TARGET_NAMESPACES ?= # COLLECTION_FILE is used to control the collectionpolicies. @@ -70,19 +68,13 @@ ENV_CONFIGMAP_FILE ?= config/manager/env_configmap.yaml CLUSTER_TOKEN ?= # Monitoring resources -PROMETHEUS_CHART_VERSION ?= 27.20.0 DEVZERO_MONITORING_NAMESPACE ?= devzero-system -NODE_EXPORTER_CHART_VERSION ?= 4.47.0 -METRICS_SERVER_CHART_VERSION ?= 3.12.2 # DIST_INSTALL_BUNDLE is the final complete manifest DIST_DIR ?= dist DIST_INSTALL_BUNDLE ?= $(DIST_DIR)/install.yaml DIST_BACKEND_INSTALL_BUNDLE ?= $(DIST_DIR)/backend-install.yaml DIST_ZXPORTER_BUNDLE ?= $(DIST_DIR)/zxporter.yaml -DIST_PROMETHEUS_BUNDLE ?= $(DIST_DIR)/prometheus.yaml -DIST_NODE_EXPORTER_BUNDLE ?= $(DIST_DIR)/node-exporter.yaml -METRICS_SERVER ?= $(DIST_DIR)/metrics-server.yaml # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 @@ -208,20 +200,6 @@ run: manifests generate fmt vet ## Run a controller from your host. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: helm ## Build docker image with the manager. 
- @echo "[INFO] Adding Metrics Server repo" - @$(HELM) repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ >> /dev/null || true - @echo "[INFO] Fetching Metrics Server repo data" - @$(HELM) repo update metrics-server >> /dev/null - - @echo "[INFO] Generate Metrics Server manifest" - @$(HELM) template metrics-server metrics-server/metrics-server \ - --version $(METRICS_SERVER_CHART_VERSION) \ - --namespace devzero-system \ - --set args="{--kubelet-insecure-tls}" \ - --set nameOverride="dz-metrics-server" \ - --set fullnameOverride="dz-metrics-server" \ - > $(METRICS_SERVER) - @echo "[INFO] For debug -> $(GO_VERSION), major $(MAJOR), minor $(MINOR), patch $(PATCH)" $(CONTAINER_TOOL) build --load \ --build-arg MAJOR=$(MAJOR) \ @@ -292,37 +270,19 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform - $(CONTAINER_TOOL) buildx rm zxporter-builder rm Dockerfile.cross -.PHONY: generate-monitoring-manifests -generate-monitoring-manifests: helm ## Generate monitoring manifests for Prometheus and Node Exporter. - @echo "[INFO] Adding Prometheus repo" - @$(HELM) repo add prometheus-community https://prometheus-community.github.io/helm-charts >> /dev/null || true - @echo "[INFO] Fetching prometheus repo data" - @$(HELM) repo update prometheus-community >> /dev/null - - @echo "[INFO] Generate prometheus manifest" - @$(HELM) template prometheus prometheus-community/prometheus \ - --version $(PROMETHEUS_CHART_VERSION) \ - --namespace $(DEVZERO_MONITORING_NAMESPACE) \ - --create-namespace \ - --values config/prometheus/hack.prometheus.values.yaml \ - > $(DIST_PROMETHEUS_BUNDLE) - - @echo "[INFO] Generate Node Exporter manifest" - @$(HELM) template node-exporter prometheus-community/prometheus-node-exporter \ - --version $(NODE_EXPORTER_CHART_VERSION) \ - --namespace $(DEVZERO_MONITORING_NAMESPACE) \ - --create-namespace \ - --values config/prometheus/hack.node-exporter.values.yaml \ - > $(DIST_NODE_EXPORTER_BUNDLE) - .PHONY: final-installer final-installer: @cp dist/install.yaml $(DIST_BACKEND_INSTALL_BUNDLE) @$(YQ) -i '(select(.kind == "ConfigMap" and .metadata.name == "devzero-zxporter-env-config") | .data.DAKR_URL) = "{{ .api_url }}/dakr"' $(DIST_BACKEND_INSTALL_BUNDLE) @$(YQ) -i '(select(.kind == "Deployment") | .spec.template.spec.containers[]? 
| select(.image == "ttl.sh/zxporter:latest")).image = "docker.io/devzeroinc/zxporter:latest"' $(DIST_BACKEND_INSTALL_BUNDLE) @$(YQ) -i '(select(.kind == "Secret" and .metadata.name == "devzero-zxporter-token") | .stringData.CLUSTER_TOKEN) = "{{ .cluster_token }}"' $(DIST_BACKEND_INSTALL_BUNDLE) - @$(YQ) -i '(select(.kind == "Namespace" and .metadata.labels."app.kubernetes.io/managed-by" == "kustomize") | .metadata.name) = "{{.zxporter_namespace}}"' $(DIST_BACKEND_INSTALL_BUNDLE) @$(MAKE) installer-without-configmap + @echo "[INFO] Templating namespace in backend-install.yaml for DAKR backend" + @sed -i'' -e 's|namespace: $(DEVZERO_MONITORING_NAMESPACE)|namespace: {{.zxporter_namespace}}|g' $(DIST_BACKEND_INSTALL_BUNDLE) + @sed -i'' -e 's|name: $(DEVZERO_MONITORING_NAMESPACE)|name: {{.zxporter_namespace}}|g' $(DIST_BACKEND_INSTALL_BUNDLE) + @echo "[INFO] Templating namespace in installer_updater.yaml for DAKR backend" + @sed -i'' -e 's|namespace: $(DEVZERO_MONITORING_NAMESPACE)|namespace: {{.zxporter_namespace}}|g' $(DIST_DIR)/installer_updater.yaml + @sed -i'' -e 's|name: $(DEVZERO_MONITORING_NAMESPACE)|name: {{.zxporter_namespace}}|g' $(DIST_DIR)/installer_updater.yaml @if [ -d "$(DAKR_DIR)/services/dakr_installers" ]; then \ cp $(DIST_BACKEND_INSTALL_BUNDLE) $(DAKR_DIR)/services/dakr_installers/install.yaml; \ cp $(DIST_DIR)/installer_updater.yaml $(DAKR_DIR)/services/dakr_installers/installer_updater.yaml; \ @@ -332,24 +292,14 @@ final-installer: .PHONY: installer-without-configmap installer-without-configmap: @cp $(DIST_BACKEND_INSTALL_BUNDLE) $(DIST_DIR)/installer_updater.yaml - @$(YQ) -i 'select((.kind != "ConfigMap" or .metadata.name != "devzero-zxporter-env-config") and (.kind != "Secret" or .metadata.name != "devzero-zxporter-token"))' $(DIST_DIR)/installer_updater.yaml + @$(YQ) -i 'select(.kind != "ConfigMap" or .metadata.name != "devzero-zxporter-env-config") | select(.kind != "Secret" or .metadata.name != "devzero-zxporter-token")' $(DIST_DIR)/installer_updater.yaml .PHONY: build-installer -build-installer: manifests generate kustomize yq ## Generate a consolidated YAML with deployment. +build-installer: manifests generate kustomize yq helm ## Generate a consolidated YAML with deployment. @mkdir -p $(DIST_DIR) - @echo "[INFO] Generating manifests for monitoring components..." - @$(MAKE) generate-monitoring-manifests - @echo "[INFO] Monitoring manifests generated." - @echo "[INFO] Generating installer bundle..." - @echo "## ATTN KUBERNETES ADMINS! Read this..." > $(DIST_INSTALL_BUNDLE) - @echo "# If prometheus-server is already installed, and you want to use that version," >> $(DIST_INSTALL_BUNDLE) - @echo "# comment out the section from \"START PROM SERVER\" to \"END PROM SERVER\" and update the \"prometheusURL\" variable." 
>> $(DIST_INSTALL_BUNDLE) - @echo -e "#" >> $(DIST_INSTALL_BUNDLE) - @echo "# If prometheus-node-exporter is already installed, and you want to use that version," >> $(DIST_INSTALL_BUNDLE) - @echo "# comment out the section from \"START PROM NODE EXPORTER\" to \"END PROM NODE EXPORTER\"" >> $(DIST_INSTALL_BUNDLE) - @echo -e "# \n" >> $(DIST_INSTALL_BUNDLE) + @echo "# ZXPorter installer bundle" > $(DIST_INSTALL_BUNDLE) @echo "[INFO] Adding namespace to the main installer" @echo "apiVersion: v1" >> $(DIST_INSTALL_BUNDLE) @@ -360,37 +310,32 @@ build-installer: manifests generate kustomize yq ## Generate a consolidated YAML @echo " app.kubernetes.io/name: $(DEVZERO_MONITORING_NAMESPACE)" >> $(DIST_INSTALL_BUNDLE) @echo " name: $(DEVZERO_MONITORING_NAMESPACE)" >> $(DIST_INSTALL_BUNDLE) - @echo "[INFO] Append prometheus-server to the main installer" - @echo "# ----- START PROM SERVER -----" >> $(DIST_INSTALL_BUNDLE) - @cat $(DIST_PROMETHEUS_BUNDLE) >> $(DIST_INSTALL_BUNDLE) - @echo "# ----- END PROM SERVER -----" >> $(DIST_INSTALL_BUNDLE) - - @echo "[INFO] Append prometheus-node-exporter to the main installer" - @echo "# ----- START PROM NODE EXPORTER -----" >> $(DIST_INSTALL_BUNDLE) - @cat $(DIST_NODE_EXPORTER_BUNDLE) >> $(DIST_INSTALL_BUNDLE) - @echo "# ----- END PROM NODE EXPORTER -----" >> $(DIST_INSTALL_BUNDLE) - - # @echo "[INFO] Append Metrics Server to the main installer" - # @echo "# ----- START METRICS SERVER -----" >> $(DIST_INSTALL_BUNDLE) - # @cat $(METRICS_SERVER) >> $(DIST_INSTALL_BUNDLE) - # @echo "# ----- END METRICS SERVER -----" >> $(DIST_INSTALL_BUNDLE) @echo "---" >> $(DIST_INSTALL_BUNDLE) - + @echo "[INFO] Append zxporter-manager to the installer bundle" @cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} @echo "[INFO] Replacing env variables in configmap" @$(YQ) e '.data.DAKR_URL = "$(DAKR_URL)"' -i $(ENV_CONFIGMAP_FILE) - @$(YQ) e '.data.PROMETHEUS_URL = "$(PROMETHEUS_URL)"' -i $(ENV_CONFIGMAP_FILE) @$(YQ) e '.data.TARGET_NAMESPACES = "$(TARGET_NAMESPACES)"' -i $(ENV_CONFIGMAP_FILE) @$(KUSTOMIZE) build config/default > $(DIST_ZXPORTER_BUNDLE) @echo "[INFO] Patching cluster token into generated bundle" - @if [ -n "$(CLUSTER_TOKEN)" ]; then \ - sed "s|CLUSTER_TOKEN: '{{ .cluster_token }}'|CLUSTER_TOKEN: \"$(CLUSTER_TOKEN)\"|g" $(DIST_ZXPORTER_BUNDLE) > $(DIST_ZXPORTER_BUNDLE).tmp && mv $(DIST_ZXPORTER_BUNDLE).tmp $(DIST_ZXPORTER_BUNDLE); \ - fi + @sed "s|CLUSTER_TOKEN: '{{ .cluster_token }}'|CLUSTER_TOKEN: \"$(CLUSTER_TOKEN)\"|g" $(DIST_ZXPORTER_BUNDLE) > $(DIST_ZXPORTER_BUNDLE).tmp && mv $(DIST_ZXPORTER_BUNDLE).tmp $(DIST_ZXPORTER_BUNDLE) @cat $(DIST_ZXPORTER_BUNDLE) >> $(DIST_INSTALL_BUNDLE) + @echo "[INFO] Generate and append nodemon DaemonSet to installer" + @$(HELM) template zxporter-nodemon ./helm-chart/zxporter-nodemon \ + --namespace $(DEVZERO_MONITORING_NAMESPACE) \ + --set provider=other \ + --set gpuMetricsExporter.image.repository=$(word 1,$(subst :, ,$(IMG_NODEMON))) \ + --set gpuMetricsExporter.image.tag=$(word 2,$(subst :, ,$(IMG_NODEMON))) \ + > $(DIST_DIR)/nodemon.yaml + @cat $(DIST_DIR)/nodemon.yaml >> $(DIST_INSTALL_BUNDLE) + + @echo "[INFO] Append Prometheus cleanup migration job" + @cat config/migration/prometheus-cleanup-job.yaml >> $(DIST_INSTALL_BUNDLE) + @echo "[INFO] Building backend installer" @$(MAKE) final-installer @@ -401,7 +346,6 @@ build-env-configmap: echo "" > $(DIST_INSTALL_BUNDLE) # Copy and patch environment config sed "s|\$$(DAKR_URL)|$(DAKR_URL)|g" $(ENV_CONFIGMAP_FILE) > temp.yaml && mv temp.yaml $(ENV_CONFIGMAP_FILE) 
- sed "s|\$$(PROMETHEUS_URL)|$(PROMETHEUS_URL)|g" $(ENV_CONFIGMAP_FILE) > temp.yaml && mv temp.yaml $(ENV_CONFIGMAP_FILE) sed "s|\$$(TARGET_NAMESPACES)|$(TARGET_NAMESPACES)|g" $(ENV_CONFIGMAP_FILE) > temp.yaml && mv temp.yaml $(ENV_CONFIGMAP_FILE) $(KUSTOMIZE) build config/default | \ yq eval 'select(.kind == "ConfigMap" and .metadata.name == "devzero-zxporter-env-config")' - >> $(DIST_INSTALL_BUNDLE) @@ -457,7 +401,6 @@ helm-chart-install-minimal: helm-chart-build ## Install only zxporter without mo --namespace devzero-system \ --create-namespace \ --set monitoring.enabled=false \ - --set zxporter.prometheusUrl="$(PROMETHEUS_URL)" \ --wait .PHONY: helm-chart-uninstall @@ -508,13 +451,8 @@ deploy-env-configmap: DIST_INSTALL_BUNDLE=$(DIST_DIR)/env_configmap.yaml deploy-env-configmap: build-env-configmap cat $(DIST_INSTALL_BUNDLE) | $(KUBECTL) apply -f - -.PHONY: undeploy-monitoring -undeploy-monitoring: ## Undeploy monitoring components. - $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f $(DIST_NODE_EXPORTER_BUNDLE) || true - $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f $(DIST_PROMETHEUS_BUNDLE) || true - .PHONY: undeploy -undeploy: kustomize undeploy-monitoring ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. +undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. $(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - ##@ Dependencies diff --git a/api/v1/collectionpolicy_types.go b/api/v1/collectionpolicy_types.go index a8d96d68..16f3f606 100644 --- a/api/v1/collectionpolicy_types.go +++ b/api/v1/collectionpolicy_types.go @@ -449,18 +449,7 @@ type Policies struct { // If ClusterToken is not provided but PATToken is, the system will exchange it for a cluster token PATToken string `json:"patToken,omitempty"` - // PrometheusURL is the URL of the Prometheus server to query for metrics - // If not provided, defaults to in-cluster Prometheus at "http://prometheus-service.monitoring.svc.cluster.local:8080" - // +optional - PrometheusURL string `json:"prometheusURL,omitempty"` - - // DisableNetworkIOMetrics disables collection of container network and I/O metrics from Prometheus - // These metrics include network throughput, packet rates, and disk I/O operations - // Default is false, meaning metrics are collected by default - // +optional - DisableNetworkIOMetrics bool `json:"disableNetworkIOMetrics,omitempty"` - - // DisableGpuMetrics disables collection of GPU metrics from Prometheus + // DisableGpuMetrics disables collection of GPU metrics // These metrics include GPU utilization, memory usage, and temperature // Default is false, meaning metrics are collected by default // +optional diff --git a/cmd/zxporter-nodemon/main.go b/cmd/zxporter-nodemon/main.go index 40f63ec6..ce65e429 100644 --- a/cmd/zxporter-nodemon/main.go +++ b/cmd/zxporter-nodemon/main.go @@ -76,13 +76,39 @@ func main() { ) mapper := nodemon.NewMapper(cfg.NodeName, workloadResolver, logger) - // Create exporter + // Create GPU exporter exporter := nodemon.NewExporter(cfg, dynClient, scraper, mapper, logger) - // Create HTTP handler and server - containerMetricsHandler := nodemon.NewContainerMetricsHandler(exporter, logger) + // Create a K8s-authenticated HTTP client for kubelet API proxy access + k8sTransport, err := 
rest.TransportFor(kubeConfig) + if err != nil { + logger.Error(err, "Failed to create K8s transport") + os.Exit(1) + } + k8sHTTPClient := &http.Client{Transport: k8sTransport, Timeout: 15 * time.Second} + + // Use the K8s API server proxy for kubelet access (same as Cortex pattern) + apiProxyBase := kubeConfig.Host + "/api/v1/nodes/" + cfg.NodeName + "/proxy" + statsPoller := nodemon.NewStatsPoller(apiProxyBase, k8sHTTPClient, logger) + cadvisorScraper := nodemon.NewCAdvisorScraper(apiProxyBase, k8sHTTPClient, logger) + + // Create unified exporter that combines all data sources + unifiedExporter := nodemon.NewUnifiedExporter(statsPoller, cadvisorScraper, exporter, cfg.NodeName, logger) + + // Start unified collection loop (every 30 seconds) + collectionCtx, collectionCancel := context.WithCancel(context.Background()) + defer collectionCancel() + go unifiedExporter.StartCollectionLoop(collectionCtx, 30*time.Second) + + // Create HTTP handlers + containerMetricsHandler := nodemon.NewContainerMetricsHandler(exporter, logger) // GPU-only (backward compat) mux := nodemon.NewServerMux(containerMetricsHandler) + // Register unified endpoints + mux.Handle("/v2/container/metrics", nodemon.NewUnifiedContainerHandler(unifiedExporter, logger)) + mux.Handle("/node/metrics", nodemon.NewNodeMetricsHandler(unifiedExporter, logger)) + mux.Handle("/pvc/metrics", nodemon.NewPVCMetricsHandler(unifiedExporter, logger)) + server := &http.Server{ Addr: fmt.Sprintf(":%d", cfg.HTTPListenPort), Handler: mux, diff --git a/config/crd/bases/devzero.io_collectionpolicies.yaml b/config/crd/bases/devzero.io_collectionpolicies.yaml index 54ffacb3..0aa85418 100644 --- a/config/crd/bases/devzero.io_collectionpolicies.yaml +++ b/config/crd/bases/devzero.io_collectionpolicies.yaml @@ -658,16 +658,10 @@ spec: type: string disableGpuMetrics: description: |- - DisableGpuMetrics disables collection of GPU metrics from Prometheus + DisableGpuMetrics disables collection of GPU metrics These metrics include GPU utilization, memory usage, and temperature Default is false, meaning metrics are collected by default type: boolean - disableNetworkIOMetrics: - description: |- - DisableNetworkIOMetrics disables collection of container network and I/O metrics from Prometheus - These metrics include network throughput, packet rates, and disk I/O operations - Default is false, meaning metrics are collected by default - type: boolean disabledCollectors: description: |- DisabledCollectors is a list of collector types to completely disable @@ -701,11 +695,6 @@ spec: PATToken is the Personal Access Token used for automatic cluster token exchange If ClusterToken is not provided but PATToken is, the system will exchange it for a cluster token type: string - prometheusURL: - description: |- - PrometheusURL is the URL of the Prometheus server to query for metrics - If not provided, defaults to in-cluster Prometheus at "http://prometheus-service.monitoring.svc.cluster.local:8080" - type: string watchedCRDs: description: WatchedCRDs is a list of custom resource definitions to explicitly watch diff --git a/config/manager/env_configmap.yaml b/config/manager/env_configmap.yaml index 6755e50a..3d81cc91 100644 --- a/config/manager/env_configmap.yaml +++ b/config/manager/env_configmap.yaml @@ -9,14 +9,12 @@ data: # PAT_TOKEN: "{{ .pat_token }}" # Uncomment to use PAT token (recommended to use Secret instead) KUBE_CONTEXT_NAME: '{{ .kube_context_name }}' DAKR_URL: "https://dakr.devzero.io" - PROMETHEUS_URL: 
"http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80" K8S_PROVIDER: "{{ .k8s_provider }}" COLLECTION_FREQUENCY: "" BUFFER_SIZE: "" EXCLUDED_NAMESPACES: "" EXCLUDED_NODES: "" TARGET_NAMESPACES: "" - DISABLE_NETWORK_IO_METRICS: "" MASK_SECRET_DATA: "" NODE_METRICS_INTERVAL: "" WATCHED_CRDS: "" diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index fa1be446..b5e598bd 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -14,5 +14,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: ttl.sh/zxporter - newTag: latest + newName: docker.io/parthiba007/zxporter + newTag: prom14 diff --git a/config/migration/prometheus-cleanup-job.yaml b/config/migration/prometheus-cleanup-job.yaml new file mode 100644 index 00000000..320da2aa --- /dev/null +++ b/config/migration/prometheus-cleanup-job.yaml @@ -0,0 +1,176 @@ +--- +# Dedicated ServiceAccount for the one-time migration cleanup job. +# Scoped to only delete specific named resources left by previous zxporter installs. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +--- +# Namespaced Role: can only delete specific named Prometheus resources in the zxporter namespace. +# NOTE: Does NOT delete standalone nodemon — the kubectl install path reuses the same +# resource names (zxporter-nodemon), so kubectl apply updates them in-place. +# Nodemon cleanup is only in the Helm hook where names differ (zxporter-zxporter-nodemon). +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: +- apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +- apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: + - dz-prometheus-node-exporter + verbs: ["delete"] +- apiGroups: [""] + resources: ["services"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - prometheus-dz-prometheus-server + verbs: ["delete"] +- apiGroups: [""] + resources: ["serviceaccounts"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: devzero-system +--- +# ClusterRole: can only delete the exact named ClusterRoles/ClusterRoleBindings +# left by the old zxporter Prometheus install. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: devzero-system +--- +# One-time migration job: cleans up legacy Prometheus resources. +# - Idempotent: --ignore-not-found on every delete +# - Safe for fresh installs: all deletes succeed with "not found" (exit 0) +# - Scoped: only deletes Prometheus resources by exact name +# - Self-cleaning: Job auto-deletes after 5 minutes via ttlSecondsAfterFinished +# +# NOTE: Does NOT delete standalone nodemon — in the kubectl path, the new installer +# reuses the same names (zxporter-nodemon), so kubectl apply updates them in-place. +apiVersion: batch/v1 +kind: Job +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 1 + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + spec: + restartPolicy: Never + serviceAccountName: zxporter-prometheus-cleanup + containers: + - name: cleanup + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + NS=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + echo "Cleaning up legacy zxporter Prometheus resources in namespace: $NS" + echo "This only deletes Prometheus resources by exact name — other installs are unaffected." 
+ + # Deployments + kubectl delete deployment prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete deployment prometheus-kube-state-metrics -n $NS --ignore-not-found + + # DaemonSets + kubectl delete daemonset dz-prometheus-node-exporter -n $NS --ignore-not-found + + # Services + kubectl delete service prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete service prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete service dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ConfigMaps + kubectl delete configmap prometheus-dz-prometheus-server -n $NS --ignore-not-found + + # ServiceAccounts + kubectl delete serviceaccount prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete serviceaccount dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ClusterRoles and ClusterRoleBindings + kubectl delete clusterrole prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrole prometheus-kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrolebinding prometheus-kube-state-metrics --ignore-not-found + + echo "Cleanup complete" diff --git a/config/prometheus/hack.node-exporter.values.yaml b/config/prometheus/hack.node-exporter.values.yaml deleted file mode 100644 index 1b1b7491..00000000 --- a/config/prometheus/hack.node-exporter.values.yaml +++ /dev/null @@ -1,24 +0,0 @@ -nameOverride: dz-prometheus-node-exporter -fullnameOverride: dz-prometheus-node-exporter - -resources: - requests: - memory: "500Mi" - cpu: "250m" - limits: - memory: "500Mi" - -rbac: - pspEnabled: false - -containerSecurityContext: - allowPrivilegeEscalation: false - -service: - port: 9101 - targetPort: 9101 - -podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "9101" - prometheus.io/path: "/metrics" diff --git a/config/prometheus/hack.prometheus.values.yaml b/config/prometheus/hack.prometheus.values.yaml deleted file mode 100644 index 00a01da0..00000000 --- a/config/prometheus/hack.prometheus.values.yaml +++ /dev/null @@ -1,587 +0,0 @@ -nameOverride: dz-prometheus -fullnameOverride: dz-prometheus - -configmapReload: - prometheus: - resources: - requests: - memory: "500Mi" - cpu: "250m" - limits: - memory: "500Mi" - -server: - persistentVolume: - enabled: false - retention: "2d" - resources: - requests: - memory: "2Gi" - cpu: "100m" - limits: - memory: "2Gi" - -alertmanager: - enabled: false - -prometheus-pushgateway: - enabled: false - -prometheus-node-exporter: - enabled: false - -kube-state-metrics: - enabled: true - resources: - requests: - memory: "500Mi" - cpu: "250m" - limits: - memory: "500Mi" - collectors: - # - certificatesigningrequests - # - configmaps - # - cronjobs - # - daemonsets - # - deployments - # - endpoints - # - horizontalpodautoscalers - # - ingresses - # - jobs - # - leases - # - limitranges - # - mutatingwebhookconfigurations - # - namespaces - # - networkpolicies - - nodes - # - persistentvolumeclaims - # - persistentvolumes - # - poddisruptionbudgets - # - pods - # - replicasets - # - replicationcontrollers - # - resourcequotas - # - secrets - # - services - # - statefulsets - # - storageclasses - # - validatingwebhookconfigurations - # - volumeattachments - # - ingressclasses - # - clusterrolebindings - # - clusterroles - # - roles - -serverFiles: - prometheus.yml: - scrape_configs: - - 
job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - - ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING - # # Scrape config for API servers. - # # - # # Kubernetes exposes API servers as endpoints to the default/kubernetes - # # service so this uses `endpoints` role and uses relabelling to only keep - # # the endpoints associated with the default/kubernetes service using the - # # default named port `https`. This works for single API server deployments as - # # well as HA API server deployments. - # - job_name: 'kubernetes-apiservers' - # - # kubernetes_sd_configs: - # - role: endpoints - # - # # Default to scraping over https. If required, just disable this or change to - # # `http`. - # scheme: https - # - # # This TLS & bearer token file config is used to connect to the actual scrape - # # endpoints for cluster components. This is separate to discovery auth - # # configuration because discovery & scraping are two separate concerns in - # # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # # the cluster. Otherwise, more config options have to be provided within the - # # . - # tls_config: - # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # # If your node certificates are self-signed or use a different CA to the - # # master CA, then disable certificate verification below. Note that - # # certificate verification is an integral part of a secure infrastructure - # # so this should only be disabled in a controlled environment. You can - # # disable certificate verification by uncommenting the line below. - # # - # # insecure_skip_verify: true - # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - # - # # Keep only the default/kubernetes service endpoints for the https port. This - # # will add targets for each API server which Kubernetes adds an endpoint to - # # the default/kubernetes service. - # relabel_configs: - # - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] - # action: keep - # regex: default;kubernetes;https - - - job_name: "kubernetes-nodes" - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. 
- # - # insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/$1/proxy/metrics - - # Metric relabel configs for node-exporter metrics - # ONLY keep node-level network and disk I/O metrics that zxporter queries - metric_relabel_configs: - # Keep node network and disk metrics - - source_labels: [__name__] - regex: "node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes" - action: keep - - - job_name: "kubernetes-nodes-cadvisor" - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - # insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/$1:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - - # Metric relabel configs to apply to samples before ingestion. - # ONLY keep the metrics that zxporter actually uses for container resource collection - # This dramatically reduces memory usage by dropping hundreds of unused cAdvisor metrics - metric_relabel_configs: - # Keep only the specific metrics we need for network I/O monitoring - - source_labels: [__name__] - regex: "container_network_(receive|transmit)_(bytes|packets|errors|packets_dropped)_total|container_fs_(reads|writes)_(bytes_)?total|kubelet_volume_stats_(used|capacity|available)_bytes|container_cpu_cfs_(throttled_)?periods_total" - action: keep - # Drop all other container metrics (saves ~90% of cAdvisor metrics) - # Drop high-cardinality labels we don't use - - action: labeldrop - regex: "(id|name|image)" - - ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING - # # Scrape config for service endpoints. 
- # # - # # The relabeling allows the actual service scrape endpoint to be configured - # # via the following annotations: - # # - # # * `prometheus.io/scrape`: Only scrape services that have a value of - # # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. - # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # # to set this to `https` & most likely set the `tls_config` of the scrape config. - # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # # service then set this appropriately. - # # * `prometheus.io/param_`: If the metrics endpoint uses parameters - # # then you can set any parameter - # - job_name: 'kubernetes-service-endpoints' - # honor_labels: true - # - # kubernetes_sd_configs: - # - role: endpoints - # - # relabel_configs: - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape ] - # action: keep - # regex: true - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] - # action: drop - # regex: true - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] - # action: replace - # target_label: __scheme__ - # regex: (https?) - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] - # action: replace - # target_label: __metrics_path__ - # regex: (.+) - # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] - # action: replace - # target_label: __address__ - # regex: (.+?)(?::\d+)?;(\d+) - # replacement: $1:$2 - # - action: labelmap - # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - # replacement: __param_$1 - # - action: labelmap - # regex: __meta_kubernetes_service_label_(.+) - # - source_labels: [ __meta_kubernetes_namespace ] - # action: replace - # target_label: namespace - # - source_labels: [ __meta_kubernetes_service_name ] - # action: replace - # target_label: service - # - source_labels: [ __meta_kubernetes_pod_node_name ] - # action: replace - # target_label: node - - ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING - # # Scrape config for slow service endpoints; same as above, but with a larger - # # timeout and a larger interval - # # - # # The relabeling allows the actual service scrape endpoint to be configured - # # via the following annotations: - # # - # # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` - # # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # # to set this to `https` & most likely set the `tls_config` of the scrape config. - # # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # # service then set this appropriately. - # # * `prometheus.io/param_`: If the metrics endpoint uses parameters - # # then you can set any parameter - # - job_name: 'kubernetes-service-endpoints-slow' - # honor_labels: true - # - # scrape_interval: 5m - # scrape_timeout: 30s - # - # kubernetes_sd_configs: - # - role: endpoints - # - # relabel_configs: - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scrape_slow ] - # action: keep - # regex: true - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_scheme ] - # action: replace - # target_label: __scheme__ - # regex: (https?) 
- # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_path ] - # action: replace - # target_label: __metrics_path__ - # regex: (.+) - # - source_labels: [ __address__, __meta_kubernetes_service_annotation_prometheus_io_port ] - # action: replace - # target_label: __address__ - # regex: (.+?)(?::\d+)?;(\d+) - # replacement: $1:$2 - # - action: labelmap - # regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - # replacement: __param_$1 - # - action: labelmap - # regex: __meta_kubernetes_service_label_(.+) - # - source_labels: [ __meta_kubernetes_namespace ] - # action: replace - # target_label: namespace - # - source_labels: [ __meta_kubernetes_service_name ] - # action: replace - # target_label: service - # - source_labels: [ __meta_kubernetes_pod_node_name ] - # action: replace - # target_label: node - # - # - job_name: 'prometheus-pushgateway' - # honor_labels: true - # - # kubernetes_sd_configs: - # - role: service - # - # relabel_configs: - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] - # action: keep - # regex: pushgateway - - ## DEVZERO COMMENTED OUT TO PREVENT SCRAPING - # # Example scrape config for probing services via the Blackbox Exporter. - # # - # # The relabeling allows the actual service scrape endpoint to be configured - # # via the following annotations: - # # - # # * `prometheus.io/probe`: Only probe services that have a value of `true` - # - job_name: 'kubernetes-services' - # honor_labels: true - # - # metrics_path: /probe - # params: - # module: [ http_2xx ] - # - # kubernetes_sd_configs: - # - role: service - # - # relabel_configs: - # - source_labels: [ __meta_kubernetes_service_annotation_prometheus_io_probe ] - # action: keep - # regex: true - # - source_labels: [ __address__ ] - # target_label: __param_target - # - target_label: __address__ - # replacement: blackbox - # - source_labels: [ __param_target ] - # target_label: instance - # - action: labelmap - # regex: __meta_kubernetes_service_label_(.+) - # - source_labels: [ __meta_kubernetes_namespace ] - # target_label: namespace - # - source_labels: [ __meta_kubernetes_service_name ] - # target_label: service - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, - # except if `prometheus.io/scrape-slow` is set to `true` as well. - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
- - job_name: "kubernetes-pods" - honor_labels: true - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: - [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - # ONLY scrape DCGM exporter pods to reduce load on Prometheus - # This prevents scraping random pods that have prometheus.io/scrape=true - - action: keep - regex: ".*dcgm-exporter.*" - source_labels: - [ - __meta_kubernetes_pod_label_app, - __meta_kubernetes_pod_label_app_kubernetes_io_name, - ] - - source_labels: - [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] - action: drop - regex: true - - source_labels: - [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - regex: (https?) - target_label: __scheme__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: - [ - __meta_kubernetes_pod_annotation_prometheus_io_port, - __meta_kubernetes_pod_ip, - ] - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: "[$2]:$1" - target_label: __address__ - - source_labels: - [ - __meta_kubernetes_pod_annotation_prometheus_io_port, - __meta_kubernetes_pod_ip, - ] - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - - source_labels: [__meta_kubernetes_pod_phase] - regex: Pending|Succeeded|Failed|Completed - action: drop - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node - - # Metric relabel configs to filter metrics after scraping - # ONLY keep GPU metrics (DCGM) that zxporter uses - metric_relabel_configs: - # Keep only DCGM GPU metrics we actually query - - source_labels: [__name__] - regex: "DCGM_FI_DEV_(GPU_UTIL|FB_USED|FB_FREE|POWER_USAGE|GPU_TEMP|SM_CLOCK|MEM_CLOCK)" - action: keep - # Drop all other metrics from annotated pods to save memory - - - job_name: kubernetes-pods-node-exporter - honor_labels: true - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: dz-prometheus-node-exporter - source_labels: - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: "[$2]:$1" - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - metric_relabel_configs: - - action: keep - regex: "node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes" - source_labels: - - __name__ - -## DEVZERO COMMENTED OUT TO PREVENT SCRAPING -# # Example Scrape config for pods which should be scraped slower. An useful example -# # would be stackriver-exporter which queries an API on every scrape of the pod -# # -# # The relabeling allows the actual pod scrape endpoint to be configured via the -# # following annotations: -# # -# # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` -# # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# # to set this to `https` & most likely set the `tls_config` of the scrape config. -# # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. -# - job_name: 'kubernetes-pods-slow' -# honor_labels: true -# -# scrape_interval: 5m -# scrape_timeout: 30s -# -# kubernetes_sd_configs: -# - role: pod -# -# relabel_configs: -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] -# action: keep -# regex: true -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] -# action: replace -# regex: (https?) 
-# target_label: __scheme__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] -# action: replace -# target_label: __metrics_path__ -# regex: (.+) -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) -# replacement: '[$2]:$1' -# target_label: __address__ -# - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip ] -# action: replace -# regex: (\d+);((([0-9]+?)(\.|$)){4}) -# replacement: $2:$1 -# target_label: __address__ -# - action: labelmap -# regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) -# replacement: __param_$1 -# - action: labelmap -# regex: __meta_kubernetes_pod_label_(.+) -# - source_labels: [ __meta_kubernetes_namespace ] -# action: replace -# target_label: namespace -# - source_labels: [ __meta_kubernetes_pod_name ] -# action: replace -# target_label: pod -# - source_labels: [ __meta_kubernetes_pod_phase ] -# regex: Pending|Succeeded|Failed|Completed -# action: drop -# - source_labels: [ __meta_kubernetes_pod_node_name ] -# action: replace -# target_label: node diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml deleted file mode 100644 index ed137168..00000000 --- a/config/prometheus/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- monitor.yaml diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml deleted file mode 100644 index 06e5bc0c..00000000 --- a/config/prometheus/monitor.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: devzero-zxporter - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-monitor - namespace: system -spec: - endpoints: - - path: /metrics - port: https # Ensure this is the name of the port that exposes HTTPS metrics - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables - # certificate verification. This poses a significant security risk by making the system vulnerable to - # man-in-the-middle attacks, where an attacker could intercept and manipulate the communication between - # Prometheus and the monitored services. This could lead to unauthorized access to sensitive metrics data, - # compromising the integrity and confidentiality of the information. - # Please use the following options for secure configurations: - # caFile: /etc/metrics-certs/ca.crt - # certFile: /etc/metrics-certs/tls.crt - # keyFile: /etc/metrics-certs/tls.key - insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager diff --git a/dist/backend-install.yaml b/dist/backend-install.yaml index d283fe2b..baf0763d 100644 --- a/dist/backend-install.yaml +++ b/dist/backend-install.yaml @@ -1,818 +1,11 @@ -## ATTN KUBERNETES ADMINS! Read this... -# If prometheus-server is already installed, and you want to use that version, -# comment out the section from "START PROM SERVER" to "END PROM SERVER" and update the "prometheusURL" variable. 
-# -# If prometheus-node-exporter is already installed, and you want to use that version, -# comment out the section from "START PROM NODE EXPORTER" to "END PROM NODE EXPORTER" -# - +# ZXPorter installer bundle apiVersion: v1 kind: Namespace metadata: labels: control-plane: controller-manager - app.kubernetes.io/name: devzero-system - name: devzero-system -# ----- START PROM SERVER ----- ---- -# Source: prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system - annotations: {} ---- -# Source: prometheus/templates/cm.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -data: - allow-snippet-annotations: "false" - alerting_rules.yml: | - {} - alerts: | - {} - prometheus.yml: | - global: - evaluation_interval: 1m - scrape_interval: 1m - scrape_timeout: 10s - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: container_network_(receive|transmit)_(bytes|packets|errors|packets_dropped)_total|container_fs_(reads|writes)_(bytes_)?total|kubelet_volume_stats_(used|capacity|available)_bytes|container_cpu_cfs_(throttled_)?periods_total - source_labels: - - __name__ - - action: labeldrop - regex: (id|name|image) - relabel_configs: - - action: labelmap - regex: 
__meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: DCGM_FI_DEV_(GPU_UTIL|FB_USED|FB_FREE|POWER_USAGE|GPU_TEMP|SM_CLOCK|MEM_CLOCK) - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: .*dcgm-exporter.* - source_labels: - - __meta_kubernetes_pod_label_app - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-node-exporter - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: dz-prometheus-node-exporter - source_labels: - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - recording_rules.yml: | - {} - rules: | - {} ---- -# Source: prometheus/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -rules: - - apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] ---- -# Source: prometheus/templates/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -rules: - - apiGroups: - - "" - resources: - - nodes - - nodes/proxy - - nodes/metrics - - services - - endpoints - - pods - - ingresses - - configmaps - verbs: - - get - - list - - watch - - apiGroups: - - "networking.k8s.io" - resources: - - ingresses/status - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - "discovery.k8s.io" - resources: - - endpointslices - verbs: - - get - - list - - watch - - nonResourceURLs: - - "/metrics" - verbs: - - get ---- -# Source: prometheus/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-kube-state-metrics -subjects: - - kind: ServiceAccount - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -subjects: - - kind: ServiceAccount - name: prometheus-dz-prometheus-server - 
namespace: devzero-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-dz-prometheus-server ---- -# Source: prometheus/charts/kube-state-metrics/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - annotations: - prometheus.io/scrape: 'true' -spec: - type: "ClusterIP" - ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus ---- -# Source: prometheus/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - ports: - - name: http - port: 80 - protocol: TCP - targetPort: 9090 - selector: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - sessionAffinity: None - type: "ClusterIP" ---- -# Source: prometheus/charts/kube-state-metrics/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" -spec: - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 - template: - metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: prometheus-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - dnsPolicy: ClusterFirst - containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=nodes - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - securityContext: - 
allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true ---- -# Source: prometheus/templates/deploy.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - selector: - matchLabels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - replicas: 1 - revisionHistoryLimit: 10 - strategy: - type: Recreate - rollingUpdate: null - template: - metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - spec: - enableServiceLinks: true - serviceAccountName: prometheus-dz-prometheus-server - containers: - - name: dz-prometheus-server-configmap-reload - image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" - imagePullPolicy: "IfNotPresent" - args: - - --watched-dir=/etc/config - - --listen-address=0.0.0.0:8080 - - --reload-url=http://127.0.0.1:9090/-/reload - ports: - - containerPort: 8080 - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 2 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - periodSeconds: 10 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: config-volume - mountPath: /etc/config - readOnly: true - - name: dz-prometheus-server - image: "quay.io/prometheus/prometheus:v3.4.1" - imagePullPolicy: "IfNotPresent" - args: - - --storage.tsdb.retention.time=2d - - --config.file=/etc/config/prometheus.yml - - --storage.tsdb.path=/data - - --web.console.libraries=/etc/prometheus/console_libraries - - --web.console.templates=/etc/prometheus/consoles - - --web.enable-lifecycle - ports: - - containerPort: 9090 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 5 - timeoutSeconds: 4 - failureThreshold: 3 - successThreshold: 1 - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 10 - failureThreshold: 3 - successThreshold: 1 - resources: - limits: - memory: 2Gi - requests: - cpu: 100m - memory: 2Gi - volumeMounts: - - name: config-volume - mountPath: /etc/config - - name: storage-volume - mountPath: /data - subPath: "" - dnsPolicy: ClusterFirst - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - terminationGracePeriodSeconds: 300 - volumes: - - name: config-volume - configMap: - name: prometheus-dz-prometheus-server - - name: storage-volume - emptyDir: {} -# ----- END PROM SERVER ----- -# ----- START PROM NODE EXPORTER ----- ---- -# Source: prometheus-node-exporter/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - 
app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -automountServiceAccountToken: false ---- -# Source: prometheus-node-exporter/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - annotations: - prometheus.io/scrape: "true" -spec: - type: ClusterIP - ports: - - port: 9101 - targetPort: 9101 - protocol: TCP - name: metrics - selector: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter ---- -# Source: prometheus-node-exporter/templates/daemonset.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -spec: - selector: - matchLabels: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - revisionHistoryLimit: 10 - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - prometheus.io/path: /metrics - prometheus.io/port: "9101" - prometheus.io/scrape: "true" - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - spec: - automountServiceAccountToken: false - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: dz-prometheus-node-exporter - containers: - - name: node-exporter - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9101 - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - env: - - name: HOST_IP - value: 0.0.0.0 - ports: - - name: metrics - containerPort: 9101 - protocol: TCP - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - 
hostNetwork: true - hostPID: true - hostIPC: false - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/compute-type - operator: NotIn - values: - - fargate - - key: type - operator: NotIn - values: - - virtual-kubelet - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - operator: Exists - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / -# ----- END PROM NODE EXPORTER ----- + app.kubernetes.io/name: {{.zxporter_namespace}} + name: {{.zxporter_namespace}} --- apiVersion: v1 kind: Namespace @@ -821,7 +14,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager - name: '{{.zxporter_namespace}}' + name: {{.zxporter_namespace}} --- apiVersion: v1 kind: ServiceAccount @@ -830,7 +23,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -839,7 +32,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-leader-election-role - namespace: devzero-system + namespace: {{.zxporter_namespace}} rules: - apiGroups: - "" @@ -1329,7 +522,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-leader-election-rolebinding - namespace: devzero-system + namespace: {{.zxporter_namespace}} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role @@ -1337,7 +530,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1353,7 +546,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1366,7 +559,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: v1 data: @@ -1374,7 +567,6 @@ data: CLUSTER_TOKEN: "" COLLECTION_FREQUENCY: "" DAKR_URL: '{{ .api_url }}/dakr' - DISABLE_NETWORK_IO_METRICS: "" DISABLED_COLLECTORS: "" EXCLUDED_CLUSTERROLEBINDINGS: "" EXCLUDED_CLUSTERROLES: "" @@ -1411,7 +603,6 @@ data: KUBE_CONTEXT_NAME: '{{ .kube_context_name }}' MASK_SECRET_DATA: "" NODE_METRICS_INTERVAL: "" - PROMETHEUS_URL: http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80 TARGET_NAMESPACES: "" TOKEN_CONFIGMAP_NAME: devzero-zxporter-env-config TOKEN_CREDENTIALS_SECRET_NAME: devzero-zxporter-credentials @@ -1422,15 +613,15 @@ data: kind: ConfigMap metadata: name: devzero-zxporter-env-config - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: v1 kind: Secret metadata: name: devzero-zxporter-token - namespace: devzero-system + namespace: {{.zxporter_namespace}} stringData: - CLUSTER_TOKEN: '{{ .cluster_token }}' + CLUSTER_TOKEN: "{{ .cluster_token }}" type: Opaque --- apiVersion: v1 @@ -1441,7 +632,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: 
devzero-zxporter-controller-manager-metrics-service - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: ports: - name: https @@ -1458,7 +649,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: devzero-zxporter-controller-manager-mpa - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: ports: - name: mpa-grpc @@ -1486,7 +677,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: replicas: 2 selector: @@ -1563,9 +754,446 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: name: devzero-zxporter-devzero-zxporter-pdb - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: minAvailable: 1 selector: matchLabels: control-plane: controller-manager +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-nodemon + namespace: {{.zxporter_namespace}} + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +--- +# Source: zxporter-nodemon/templates/dcgm-exporter-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-dcgm-metrics + namespace: {{.zxporter_namespace}} +data: + counters.csv: | + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device. + DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. + + # Utilization,, + # DCGM_FI_DEV_GPU_UTIL provides overall GPU utilization which is useful for scenarios + # like fractional GPU sharing (e.g., EKS time-slicing, MIG) where profiling metrics may not be available. + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor + # DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + # DCGM_FI_PROF_DRAM_ACTIVE, gauge, The ratio of cycles the device memory interface is active sending or receiving data. + + # Memory usage,, + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB. + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Utilization of the memory copy engine. + + # PCIE,, + # DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX + # DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total number of bytes received through PCIe RX + DCGM_FI_DEV_PCIE_LINK_GEN, gauge, PCIe Current Link Generation. + DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, PCIe Current Link Width. + + # Pipelines,, + # DCGM_FI_PROF_PIPE_INT_ACTIVE, gauge, Ratio of cycles the integer pipe is active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipe is active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipe is active. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipe is active. 
+ # DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) + + # Health,, + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS is not supported by DCGM 3.3.7 + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS, gauge, Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) + DCGM_FI_DEV_XID_ERRORS, gauge, The value is the specific XID error + DCGM_FI_DEV_POWER_VIOLATION, gauge, Power Violation time in ns. + DCGM_FI_DEV_THERMAL_VIOLATION, gauge, Thermal Violation time in ns. + + # NVLink,, + # DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. + # DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. + + # Clocks,, + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +--- +# Source: zxporter-nodemon/templates/nodemon-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-zxporter-nodemon + namespace: {{.zxporter_namespace}} +data: +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - apiGroups: + - apps + resources: + - replicasets + - deployments + - statefulsets + - daemonsets + verbs: + - get + - apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - apiGroups: + - argoproj.io + resources: + - rollouts + verbs: + - get + - apiGroups: + - "" + resources: + - nodes/proxy + - nodes/metrics + - nodes/stats + verbs: + - get +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-nodemon +subjects: + - kind: ServiceAccount + name: zxporter-nodemon + namespace: {{.zxporter_namespace}} +--- +# Source: zxporter-nodemon/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: zxporter-nodemon + namespace: {{.zxporter_namespace}} + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm + annotations: + ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics." + ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics." + ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter." + ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem." 
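+# The DaemonSet below runs two containers on every schedulable node (descriptive
+# summary of the spec that follows):
+#   * zxporter-nodemon  - the metrics agent (port 6061, /healthz liveness/readiness),
+#     which reads GPU data from the co-located DCGM stack over localhost
+#     (DCGM_HOST=localhost).
+#   * dcgm-exporter     - NVIDIA's DCGM exporter (port 9400), driven by the counters.csv
+#     ConfigMap mounted at /etc/dcgm-exporter; it runs as root with SYS_ADMIN
+#     (see the kube-linter ignore annotations above) so it can reach the GPUs.
+# Tolerations for NoExecute/NoSchedule/PreferNoSchedule keep it scheduled on tainted GPU nodes.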
+spec: + selector: + matchLabels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + spec: + serviceAccountName: zxporter-nodemon + volumes: + - name: "pod-gpu-resources" + hostPath: + path: /var/lib/kubelet/pod-resources + - name: zxporter-nodemon-dcgm-metrics + configMap: + name: zxporter-nodemon-dcgm-metrics + tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + operator: Exists + - effect: PreferNoSchedule + operator: Exists + containers: + - name: zxporter-nodemon + securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + image: "ttl.sh/zxporter-nodemon:latest" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 6061 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /healthz + port: http + envFrom: + - configMapRef: + name: zxporter-nodemon-zxporter-nodemon + env: + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: "DCGM_HOST" + value: "localhost" + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + - name: dcgm-exporter + securityContext: + capabilities: + add: + - SYS_ADMIN + drop: + - NET_RAW + runAsNonRoot: false + runAsUser: 0 + image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + ports: + - name: "metrics" + containerPort: 9400 + env: + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_INTERVAL" + value: "5000" + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + - name: zxporter-nodemon-dcgm-metrics + mountPath: "/etc/dcgm-exporter" +--- +# Dedicated ServiceAccount for the one-time migration cleanup job. +# Scoped to only delete specific named resources left by previous zxporter installs. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +--- +# Namespaced Role: can only delete specific named Prometheus resources in the zxporter namespace. +# NOTE: Does NOT delete standalone nodemon — the kubectl install path reuses the same +# resource names (zxporter-nodemon), so kubectl apply updates them in-place. +# Nodemon cleanup is only in the Helm hook where names differ (zxporter-zxporter-nodemon). 
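+# Illustrative check of this scoping (assumes kubectl access with impersonation
+# rights; these commands are examples and are not applied by this bundle):
+#   kubectl auth can-i delete deployments/prometheus-dz-prometheus-server \
+#     -n <zxporter-namespace> \
+#     --as=system:serviceaccount:<zxporter-namespace>:zxporter-prometheus-cleanup
+# should answer "yes", while the same check against any other deployment name answers "no".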
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] + - apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: + - dz-prometheus-node-exporter + verbs: ["delete"] + - apiGroups: [""] + resources: ["services"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - prometheus-dz-prometheus-server + verbs: ["delete"] + - apiGroups: [""] + resources: ["serviceaccounts"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: zxporter-prometheus-cleanup +subjects: + - kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} +--- +# ClusterRole: can only delete the exact named ClusterRoles/ClusterRoleBindings +# left by the old zxporter Prometheus install. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-prometheus-cleanup +subjects: + - kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} +--- +# One-time migration job: cleans up legacy Prometheus resources. +# - Idempotent: --ignore-not-found on every delete +# - Safe for fresh installs: all deletes succeed with "not found" (exit 0) +# - Scoped: only deletes Prometheus resources by exact name +# - Self-cleaning: Job auto-deletes after 5 minutes via ttlSecondsAfterFinished +# +# NOTE: Does NOT delete standalone nodemon — in the kubectl path, the new installer +# reuses the same names (zxporter-nodemon), so kubectl apply updates them in-place. 
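+# For example, before the TTL expires the Job's outcome can be inspected with
+# (illustrative commands; <zxporter-namespace> stands in for {{.zxporter_namespace}}):
+#   kubectl get job zxporter-prometheus-cleanup -n <zxporter-namespace>
+#   kubectl logs job/zxporter-prometheus-cleanup -n <zxporter-namespace>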
+apiVersion: batch/v1 +kind: Job +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 1 + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + spec: + restartPolicy: Never + serviceAccountName: zxporter-prometheus-cleanup + containers: + - name: cleanup + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + NS=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + echo "Cleaning up legacy zxporter Prometheus resources in namespace: $NS" + echo "This only deletes Prometheus resources by exact name — other installs are unaffected." + + # Deployments + kubectl delete deployment prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete deployment prometheus-kube-state-metrics -n $NS --ignore-not-found + + # DaemonSets + kubectl delete daemonset dz-prometheus-node-exporter -n $NS --ignore-not-found + + # Services + kubectl delete service prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete service prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete service dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ConfigMaps + kubectl delete configmap prometheus-dz-prometheus-server -n $NS --ignore-not-found + + # ServiceAccounts + kubectl delete serviceaccount prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete serviceaccount dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ClusterRoles and ClusterRoleBindings + kubectl delete clusterrole prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrole prometheus-kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrolebinding prometheus-kube-state-metrics --ignore-not-found + + echo "Cleanup complete" diff --git a/dist/install.yaml b/dist/install.yaml index 6af3b5e7..598281bd 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -1,11 +1,4 @@ -## ATTN KUBERNETES ADMINS! Read this... -# If prometheus-server is already installed, and you want to use that version, -# comment out the section from "START PROM SERVER" to "END PROM SERVER" and update the "prometheusURL" variable. 
-# -# If prometheus-node-exporter is already installed, and you want to use that version, -# comment out the section from "START PROM NODE EXPORTER" to "END PROM NODE EXPORTER" -# - +# ZXPorter installer bundle apiVersion: v1 kind: Namespace metadata: @@ -13,811 +6,6 @@ metadata: control-plane: controller-manager app.kubernetes.io/name: devzero-system name: devzero-system -# ----- START PROM SERVER ----- ---- -# Source: prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system - annotations: - {} ---- -# Source: prometheus/templates/cm.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -data: - allow-snippet-annotations: "false" - alerting_rules.yml: | - {} - alerts: | - {} - prometheus.yml: | - global: - evaluation_interval: 1m - scrape_interval: 1m - scrape_timeout: 10s - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: container_network_(receive|transmit)_(bytes|packets|errors|packets_dropped)_total|container_fs_(reads|writes)_(bytes_)?total|kubelet_volume_stats_(used|capacity|available)_bytes|container_cpu_cfs_(throttled_)?periods_total - source_labels: - - __name__ - - action: labeldrop - regex: (id|name|image) - relabel_configs: - - action: labelmap - 
regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: DCGM_FI_DEV_(GPU_UTIL|FB_USED|FB_FREE|POWER_USAGE|GPU_TEMP|SM_CLOCK|MEM_CLOCK) - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: .*dcgm-exporter.* - source_labels: - - __meta_kubernetes_pod_label_app - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-node-exporter - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: dz-prometheus-node-exporter - source_labels: - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - recording_rules.yml: | - {} - rules: | - {} ---- -# Source: prometheus/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -rules: - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] ---- -# Source: prometheus/templates/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -rules: - - apiGroups: - - "" - resources: - - nodes - - nodes/proxy - - nodes/metrics - - services - - endpoints - - pods - - ingresses - - configmaps - verbs: - - get - - list - - watch - - apiGroups: - - "networking.k8s.io" - resources: - - ingresses/status - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - "discovery.k8s.io" - resources: - - endpointslices - verbs: - - get - - list - - watch - - nonResourceURLs: - - "/metrics" - verbs: - - get ---- -# Source: prometheus/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-kube-state-metrics -subjects: -- kind: ServiceAccount - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -subjects: - - kind: ServiceAccount - name: prometheus-dz-prometheus-server - 
namespace: devzero-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-dz-prometheus-server ---- -# Source: prometheus/charts/kube-state-metrics/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - annotations: - prometheus.io/scrape: 'true' -spec: - type: "ClusterIP" - ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus ---- -# Source: prometheus/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - ports: - - name: http - port: 80 - protocol: TCP - targetPort: 9090 - selector: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - sessionAffinity: None - type: "ClusterIP" ---- -# Source: prometheus/charts/kube-state-metrics/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" -spec: - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 - template: - metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: prometheus-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - dnsPolicy: ClusterFirst - containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=nodes - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - securityContext: - 
allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true ---- -# Source: prometheus/templates/deploy.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - selector: - matchLabels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - replicas: 1 - revisionHistoryLimit: 10 - strategy: - type: Recreate - rollingUpdate: null - template: - metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - spec: - enableServiceLinks: true - serviceAccountName: prometheus-dz-prometheus-server - containers: - - name: dz-prometheus-server-configmap-reload - image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" - imagePullPolicy: "IfNotPresent" - args: - - --watched-dir=/etc/config - - --listen-address=0.0.0.0:8080 - - --reload-url=http://127.0.0.1:9090/-/reload - ports: - - containerPort: 8080 - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 2 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - periodSeconds: 10 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: config-volume - mountPath: /etc/config - readOnly: true - - - name: dz-prometheus-server - image: "quay.io/prometheus/prometheus:v3.4.1" - imagePullPolicy: "IfNotPresent" - args: - - --storage.tsdb.retention.time=2d - - --config.file=/etc/config/prometheus.yml - - --storage.tsdb.path=/data - - --web.console.libraries=/etc/prometheus/console_libraries - - --web.console.templates=/etc/prometheus/consoles - - --web.enable-lifecycle - ports: - - containerPort: 9090 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 5 - timeoutSeconds: 4 - failureThreshold: 3 - successThreshold: 1 - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 10 - failureThreshold: 3 - successThreshold: 1 - resources: - limits: - memory: 2Gi - requests: - cpu: 100m - memory: 2Gi - volumeMounts: - - name: config-volume - mountPath: /etc/config - - name: storage-volume - mountPath: /data - subPath: "" - dnsPolicy: ClusterFirst - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - terminationGracePeriodSeconds: 300 - volumes: - - name: config-volume - configMap: - name: prometheus-dz-prometheus-server - - name: storage-volume - emptyDir: - {} -# ----- END PROM SERVER ----- -# ----- START PROM NODE EXPORTER ----- ---- -# Source: prometheus-node-exporter/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - 
app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -automountServiceAccountToken: false ---- -# Source: prometheus-node-exporter/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - annotations: - prometheus.io/scrape: "true" -spec: - type: ClusterIP - ports: - - port: 9101 - targetPort: 9101 - protocol: TCP - name: metrics - selector: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter ---- -# Source: prometheus-node-exporter/templates/daemonset.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -spec: - selector: - matchLabels: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - revisionHistoryLimit: 10 - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - prometheus.io/path: /metrics - prometheus.io/port: "9101" - prometheus.io/scrape: "true" - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - spec: - automountServiceAccountToken: false - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: dz-prometheus-node-exporter - containers: - - name: node-exporter - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9101 - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - env: - - name: HOST_IP - value: 0.0.0.0 - ports: - - name: metrics - containerPort: 9101 - protocol: TCP - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - 
hostNetwork: true - hostPID: true - hostIPC: false - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/compute-type - operator: NotIn - values: - - fargate - - key: type - operator: NotIn - values: - - virtual-kubelet - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - operator: Exists - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / -# ----- END PROM NODE EXPORTER ----- --- apiVersion: v1 kind: Namespace @@ -1379,7 +567,6 @@ data: CLUSTER_TOKEN: "" COLLECTION_FREQUENCY: "" DAKR_URL: https://dakr.devzero.io - DISABLE_NETWORK_IO_METRICS: "" DISABLED_COLLECTORS: "" EXCLUDED_CLUSTERROLEBINDINGS: "" EXCLUDED_CLUSTERROLES: "" @@ -1416,7 +603,6 @@ data: KUBE_CONTEXT_NAME: '{{ .kube_context_name }}' MASK_SECRET_DATA: "" NODE_METRICS_INTERVAL: "" - PROMETHEUS_URL: http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80 TARGET_NAMESPACES: "" TOKEN_CONFIGMAP_NAME: devzero-zxporter-env-config TOKEN_CREDENTIALS_SECRET_NAME: devzero-zxporter-credentials @@ -1435,7 +621,7 @@ metadata: name: devzero-zxporter-token namespace: devzero-system stringData: - CLUSTER_TOKEN: '{{ .cluster_token }}' + CLUSTER_TOKEN: "" type: Opaque --- apiVersion: v1 @@ -1574,3 +760,440 @@ spec: selector: matchLabels: control-plane: controller-manager +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-nodemon + namespace: devzero-system + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +--- +# Source: zxporter-nodemon/templates/dcgm-exporter-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-dcgm-metrics + namespace: devzero-system +data: + counters.csv: | + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device. + DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. + + # Utilization,, + # DCGM_FI_DEV_GPU_UTIL provides overall GPU utilization which is useful for scenarios + # like fractional GPU sharing (e.g., EKS time-slicing, MIG) where profiling metrics may not be available. + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor + # DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + # DCGM_FI_PROF_DRAM_ACTIVE, gauge, The ratio of cycles the device memory interface is active sending or receiving data. + + # Memory usage,, + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB. + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Utilization of the memory copy engine. + + # PCIE,, + # DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX + # DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total number of bytes received through PCIe RX + DCGM_FI_DEV_PCIE_LINK_GEN, gauge, PCIe Current Link Generation. 
+ DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, PCIe Current Link Width. + + # Pipelines,, + # DCGM_FI_PROF_PIPE_INT_ACTIVE, gauge, Ratio of cycles the integer pipe is active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipe is active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipe is active. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipe is active. + # DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) + + # Health,, + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS is not supported by DCGM 3.3.7 + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS, gauge, Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) + DCGM_FI_DEV_XID_ERRORS, gauge, The value is the specific XID error + DCGM_FI_DEV_POWER_VIOLATION, gauge, Power Violation time in ns. + DCGM_FI_DEV_THERMAL_VIOLATION, gauge, Thermal Violation time in ns. + + # NVLink,, + # DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. + # DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. + + # Clocks,, + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +--- +# Source: zxporter-nodemon/templates/nodemon-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-zxporter-nodemon + namespace: devzero-system +data: +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list +- apiGroups: + - apps + resources: + - replicasets + - deployments + - statefulsets + - daemonsets + verbs: + - get +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get +- apiGroups: + - argoproj.io + resources: + - rollouts + verbs: + - get +- apiGroups: + - "" + resources: + - nodes/proxy + - nodes/metrics + - nodes/stats + verbs: + - get +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-nodemon +subjects: +- kind: ServiceAccount + name: zxporter-nodemon + namespace: devzero-system +--- +# Source: zxporter-nodemon/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: zxporter-nodemon + namespace: devzero-system + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm + annotations: + ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics." 
+ ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics." + ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter." + ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem." +spec: + selector: + matchLabels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + spec: + serviceAccountName: zxporter-nodemon + volumes: + - name: "pod-gpu-resources" + hostPath: + path: /var/lib/kubelet/pod-resources + - name: zxporter-nodemon-dcgm-metrics + configMap: + name: zxporter-nodemon-dcgm-metrics + tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + operator: Exists + - effect: PreferNoSchedule + operator: Exists + containers: + - name: zxporter-nodemon + securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + image: "ttl.sh/zxporter-nodemon:latest" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 6061 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /healthz + port: http + envFrom: + - configMapRef: + name: zxporter-nodemon-zxporter-nodemon + env: + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: "DCGM_HOST" + value: "localhost" + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + - name: dcgm-exporter + securityContext: + capabilities: + add: + - SYS_ADMIN + drop: + - NET_RAW + runAsNonRoot: false + runAsUser: 0 + image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c" ] + args: + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + ports: + - name: "metrics" + containerPort: 9400 + env: + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_INTERVAL" + value: "5000" + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + - name: zxporter-nodemon-dcgm-metrics + mountPath: "/etc/dcgm-exporter" +--- +# Dedicated ServiceAccount for the one-time migration cleanup job. +# Scoped to only delete specific named resources left by previous zxporter installs. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +--- +# Namespaced Role: can only delete specific named Prometheus resources in the zxporter namespace. +# NOTE: Does NOT delete standalone nodemon — the kubectl install path reuses the same +# resource names (zxporter-nodemon), so kubectl apply updates them in-place. +# Nodemon cleanup is only in the Helm hook where names differ (zxporter-zxporter-nodemon). 
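The Role that follows grants delete only on the exact resourceNames listed, so the cleanup Job cannot remove anything else in the namespace. A quick impersonation check can confirm that scoping once the manifests are applied; this is only a sketch and assumes the default devzero-system namespace (the name "some-other-deployment" is just a placeholder for any workload not in the allow-list):

  # expected "yes": the cleanup ServiceAccount may delete the named legacy Deployment
  kubectl auth can-i delete deployments/prometheus-dz-prometheus-server \
    -n devzero-system \
    --as=system:serviceaccount:devzero-system:zxporter-prometheus-cleanup

  # expected "no": names outside the Role's resourceNames are not covered
  kubectl auth can-i delete deployments/some-other-deployment \
    -n devzero-system \
    --as=system:serviceaccount:devzero-system:zxporter-prometheus-cleanup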
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: +- apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +- apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: + - dz-prometheus-node-exporter + verbs: ["delete"] +- apiGroups: [""] + resources: ["services"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - prometheus-dz-prometheus-server + verbs: ["delete"] +- apiGroups: [""] + resources: ["serviceaccounts"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: devzero-system +--- +# ClusterRole: can only delete the exact named ClusterRoles/ClusterRoleBindings +# left by the old zxporter Prometheus install. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: devzero-system +--- +# One-time migration job: cleans up legacy Prometheus resources. +# - Idempotent: --ignore-not-found on every delete +# - Safe for fresh installs: all deletes succeed with "not found" (exit 0) +# - Scoped: only deletes Prometheus resources by exact name +# - Self-cleaning: Job auto-deletes after 5 minutes via ttlSecondsAfterFinished +# +# NOTE: Does NOT delete standalone nodemon — in the kubectl path, the new installer +# reuses the same names (zxporter-nodemon), so kubectl apply updates them in-place. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: zxporter-prometheus-cleanup + namespace: devzero-system + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 1 + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + spec: + restartPolicy: Never + serviceAccountName: zxporter-prometheus-cleanup + containers: + - name: cleanup + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + NS=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + echo "Cleaning up legacy zxporter Prometheus resources in namespace: $NS" + echo "This only deletes Prometheus resources by exact name — other installs are unaffected." + + # Deployments + kubectl delete deployment prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete deployment prometheus-kube-state-metrics -n $NS --ignore-not-found + + # DaemonSets + kubectl delete daemonset dz-prometheus-node-exporter -n $NS --ignore-not-found + + # Services + kubectl delete service prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete service prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete service dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ConfigMaps + kubectl delete configmap prometheus-dz-prometheus-server -n $NS --ignore-not-found + + # ServiceAccounts + kubectl delete serviceaccount prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete serviceaccount dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ClusterRoles and ClusterRoleBindings + kubectl delete clusterrole prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrole prometheus-kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrolebinding prometheus-kube-state-metrics --ignore-not-found + + echo "Cleanup complete" diff --git a/dist/installer_updater.yaml b/dist/installer_updater.yaml index b77c3ada..82afad08 100644 --- a/dist/installer_updater.yaml +++ b/dist/installer_updater.yaml @@ -1,818 +1,11 @@ -## ATTN KUBERNETES ADMINS! Read this... -# If prometheus-server is already installed, and you want to use that version, -# comment out the section from "START PROM SERVER" to "END PROM SERVER" and update the "prometheusURL" variable. 
-# -# If prometheus-node-exporter is already installed, and you want to use that version, -# comment out the section from "START PROM NODE EXPORTER" to "END PROM NODE EXPORTER" -# - +# ZXPorter installer bundle apiVersion: v1 kind: Namespace metadata: labels: control-plane: controller-manager - app.kubernetes.io/name: devzero-system - name: devzero-system -# ----- START PROM SERVER ----- ---- -# Source: prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system - annotations: {} ---- -# Source: prometheus/templates/cm.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -data: - allow-snippet-annotations: "false" - alerting_rules.yml: | - {} - alerts: | - {} - prometheus.yml: | - global: - evaluation_interval: 1m - scrape_interval: 1m - scrape_timeout: 10s - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: container_network_(receive|transmit)_(bytes|packets|errors|packets_dropped)_total|container_fs_(reads|writes)_(bytes_)?total|kubelet_volume_stats_(used|capacity|available)_bytes|container_cpu_cfs_(throttled_)?periods_total - source_labels: - - __name__ - - action: labeldrop - regex: (id|name|image) - relabel_configs: - - action: labelmap - regex: 
__meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: DCGM_FI_DEV_(GPU_UTIL|FB_USED|FB_FREE|POWER_USAGE|GPU_TEMP|SM_CLOCK|MEM_CLOCK) - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: .*dcgm-exporter.* - source_labels: - - __meta_kubernetes_pod_label_app - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-node-exporter - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: dz-prometheus-node-exporter - source_labels: - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - recording_rules.yml: | - {} - rules: | - {} ---- -# Source: prometheus/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -rules: - - apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] ---- -# Source: prometheus/templates/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -rules: - - apiGroups: - - "" - resources: - - nodes - - nodes/proxy - - nodes/metrics - - services - - endpoints - - pods - - ingresses - - configmaps - verbs: - - get - - list - - watch - - apiGroups: - - "networking.k8s.io" - resources: - - ingresses/status - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - "discovery.k8s.io" - resources: - - endpointslices - verbs: - - get - - list - - watch - - nonResourceURLs: - - "/metrics" - verbs: - - get ---- -# Source: prometheus/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-kube-state-metrics -subjects: - - kind: ServiceAccount - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -subjects: - - kind: ServiceAccount - name: prometheus-dz-prometheus-server - 
namespace: devzero-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-dz-prometheus-server ---- -# Source: prometheus/charts/kube-state-metrics/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - annotations: - prometheus.io/scrape: 'true' -spec: - type: "ClusterIP" - ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus ---- -# Source: prometheus/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - ports: - - name: http - port: 80 - protocol: TCP - targetPort: 9090 - selector: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - sessionAffinity: None - type: "ClusterIP" ---- -# Source: prometheus/charts/kube-state-metrics/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" -spec: - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 - template: - metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: prometheus-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - dnsPolicy: ClusterFirst - containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=nodes - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - securityContext: - 
allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true ---- -# Source: prometheus/templates/deploy.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - selector: - matchLabels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - replicas: 1 - revisionHistoryLimit: 10 - strategy: - type: Recreate - rollingUpdate: null - template: - metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - spec: - enableServiceLinks: true - serviceAccountName: prometheus-dz-prometheus-server - containers: - - name: dz-prometheus-server-configmap-reload - image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" - imagePullPolicy: "IfNotPresent" - args: - - --watched-dir=/etc/config - - --listen-address=0.0.0.0:8080 - - --reload-url=http://127.0.0.1:9090/-/reload - ports: - - containerPort: 8080 - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 2 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - periodSeconds: 10 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: config-volume - mountPath: /etc/config - readOnly: true - - name: dz-prometheus-server - image: "quay.io/prometheus/prometheus:v3.4.1" - imagePullPolicy: "IfNotPresent" - args: - - --storage.tsdb.retention.time=2d - - --config.file=/etc/config/prometheus.yml - - --storage.tsdb.path=/data - - --web.console.libraries=/etc/prometheus/console_libraries - - --web.console.templates=/etc/prometheus/consoles - - --web.enable-lifecycle - ports: - - containerPort: 9090 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 5 - timeoutSeconds: 4 - failureThreshold: 3 - successThreshold: 1 - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 10 - failureThreshold: 3 - successThreshold: 1 - resources: - limits: - memory: 2Gi - requests: - cpu: 100m - memory: 2Gi - volumeMounts: - - name: config-volume - mountPath: /etc/config - - name: storage-volume - mountPath: /data - subPath: "" - dnsPolicy: ClusterFirst - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - terminationGracePeriodSeconds: 300 - volumes: - - name: config-volume - configMap: - name: prometheus-dz-prometheus-server - - name: storage-volume - emptyDir: {} -# ----- END PROM SERVER ----- -# ----- START PROM NODE EXPORTER ----- ---- -# Source: prometheus-node-exporter/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - 
app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -automountServiceAccountToken: false ---- -# Source: prometheus-node-exporter/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - annotations: - prometheus.io/scrape: "true" -spec: - type: ClusterIP - ports: - - port: 9101 - targetPort: 9101 - protocol: TCP - name: metrics - selector: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter ---- -# Source: prometheus-node-exporter/templates/daemonset.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -spec: - selector: - matchLabels: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - revisionHistoryLimit: 10 - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - prometheus.io/path: /metrics - prometheus.io/port: "9101" - prometheus.io/scrape: "true" - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - spec: - automountServiceAccountToken: false - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: dz-prometheus-node-exporter - containers: - - name: node-exporter - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9101 - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - env: - - name: HOST_IP - value: 0.0.0.0 - ports: - - name: metrics - containerPort: 9101 - protocol: TCP - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - 
hostNetwork: true - hostPID: true - hostIPC: false - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/compute-type - operator: NotIn - values: - - fargate - - key: type - operator: NotIn - values: - - virtual-kubelet - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - operator: Exists - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / -# ----- END PROM NODE EXPORTER ----- + app.kubernetes.io/name: {{.zxporter_namespace}} + name: {{.zxporter_namespace}} --- apiVersion: v1 kind: Namespace @@ -821,7 +14,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager - name: '{{.zxporter_namespace}}' + name: {{.zxporter_namespace}} --- apiVersion: v1 kind: ServiceAccount @@ -830,7 +23,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -839,7 +32,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-leader-election-role - namespace: devzero-system + namespace: {{.zxporter_namespace}} rules: - apiGroups: - "" @@ -1329,7 +522,7 @@ metadata: app.kubernetes.io/managed-by: kustomize app.kubernetes.io/name: devzero-zxporter name: devzero-zxporter-leader-election-rolebinding - namespace: devzero-system + namespace: {{.zxporter_namespace}} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role @@ -1337,7 +530,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1353,7 +546,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1366,7 +559,7 @@ roleRef: subjects: - kind: ServiceAccount name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} --- apiVersion: v1 kind: Service @@ -1376,7 +569,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: devzero-zxporter-controller-manager-metrics-service - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: ports: - name: https @@ -1393,7 +586,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: devzero-zxporter-controller-manager-mpa - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: ports: - name: mpa-grpc @@ -1421,7 +614,7 @@ metadata: app.kubernetes.io/name: devzero-zxporter control-plane: controller-manager name: devzero-zxporter-controller-manager - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: replicas: 2 selector: @@ -1498,9 +691,446 @@ apiVersion: policy/v1 kind: PodDisruptionBudget metadata: name: devzero-zxporter-devzero-zxporter-pdb - namespace: devzero-system + namespace: {{.zxporter_namespace}} spec: minAvailable: 1 selector: matchLabels: control-plane: controller-manager +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: 
zxporter-nodemon + namespace: {{.zxporter_namespace}} + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +--- +# Source: zxporter-nodemon/templates/dcgm-exporter-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-dcgm-metrics + namespace: {{.zxporter_namespace}} +data: + counters.csv: | + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device. + DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. + + # Utilization,, + # DCGM_FI_DEV_GPU_UTIL provides overall GPU utilization which is useful for scenarios + # like fractional GPU sharing (e.g., EKS time-slicing, MIG) where profiling metrics may not be available. + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor + # DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + # DCGM_FI_PROF_DRAM_ACTIVE, gauge, The ratio of cycles the device memory interface is active sending or receiving data. + + # Memory usage,, + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB. + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Utilization of the memory copy engine. + + # PCIE,, + # DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX + # DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total number of bytes received through PCIe RX + DCGM_FI_DEV_PCIE_LINK_GEN, gauge, PCIe Current Link Generation. + DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, PCIe Current Link Width. + + # Pipelines,, + # DCGM_FI_PROF_PIPE_INT_ACTIVE, gauge, Ratio of cycles the integer pipe is active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipe is active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipe is active. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipe is active. + # DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) + + # Health,, + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS is not supported by DCGM 3.3.7 + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS, gauge, Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) + DCGM_FI_DEV_XID_ERRORS, gauge, The value is the specific XID error + DCGM_FI_DEV_POWER_VIOLATION, gauge, Power Violation time in ns. + DCGM_FI_DEV_THERMAL_VIOLATION, gauge, Thermal Violation time in ns. + + # NVLink,, + # DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. + # DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. + + # Clocks,, + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
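The counters.csv above controls which DCGM fields the dcgm-exporter sidecar publishes on its metrics port (9400). A minimal sanity check once the DaemonSet is running — a sketch that assumes the default devzero-system namespace and a node with NVIDIA GPUs and the DCGM stack available; on CPU-only nodes the exporter loop simply keeps retrying — is to port-forward one pod and grep for a few of the enabled series:

  # pick one nodemon pod and expose the dcgm-exporter metrics port locally
  POD=$(kubectl -n devzero-system get pods -l app.kubernetes.io/name=zxporter-nodemon \
    -o jsonpath='{.items[0].metadata.name}')
  kubectl -n devzero-system port-forward "$POD" 9400:9400 &
  # the enabled DCGM series should show up here on GPU nodes
  curl -s http://localhost:9400/metrics | grep -E 'DCGM_FI_DEV_(GPU_UTIL|FB_USED|GPU_TEMP)'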
+--- +# Source: zxporter-nodemon/templates/nodemon-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-zxporter-nodemon + namespace: {{.zxporter_namespace}} +data: +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - apiGroups: + - apps + resources: + - replicasets + - deployments + - statefulsets + - daemonsets + verbs: + - get + - apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - apiGroups: + - argoproj.io + resources: + - rollouts + verbs: + - get + - apiGroups: + - "" + resources: + - nodes/proxy + - nodes/metrics + - nodes/stats + verbs: + - get +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-nodemon +subjects: + - kind: ServiceAccount + name: zxporter-nodemon + namespace: {{.zxporter_namespace}} +--- +# Source: zxporter-nodemon/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: zxporter-nodemon + namespace: {{.zxporter_namespace}} + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm + annotations: + ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics." + ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics." + ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter." + ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem." 
+spec: + selector: + matchLabels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + spec: + serviceAccountName: zxporter-nodemon + volumes: + - name: "pod-gpu-resources" + hostPath: + path: /var/lib/kubelet/pod-resources + - name: zxporter-nodemon-dcgm-metrics + configMap: + name: zxporter-nodemon-dcgm-metrics + tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + operator: Exists + - effect: PreferNoSchedule + operator: Exists + containers: + - name: zxporter-nodemon + securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + image: "ttl.sh/zxporter-nodemon:latest" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 6061 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /healthz + port: http + envFrom: + - configMapRef: + name: zxporter-nodemon-zxporter-nodemon + env: + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: "DCGM_HOST" + value: "localhost" + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + - name: dcgm-exporter + securityContext: + capabilities: + add: + - SYS_ADMIN + drop: + - NET_RAW + runAsNonRoot: false + runAsUser: 0 + image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + ports: + - name: "metrics" + containerPort: 9400 + env: + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_INTERVAL" + value: "5000" + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + - name: zxporter-nodemon-dcgm-metrics + mountPath: "/etc/dcgm-exporter" +--- +# Dedicated ServiceAccount for the one-time migration cleanup job. +# Scoped to only delete specific named resources left by previous zxporter installs. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +--- +# Namespaced Role: can only delete specific named Prometheus resources in the zxporter namespace. +# NOTE: Does NOT delete standalone nodemon — the kubectl install path reuses the same +# resource names (zxporter-nodemon), so kubectl apply updates them in-place. +# Nodemon cleanup is only in the Helm hook where names differ (zxporter-zxporter-nodemon). 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] + - apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: + - dz-prometheus-node-exporter + verbs: ["delete"] + - apiGroups: [""] + resources: ["services"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - prometheus-dz-prometheus-server + verbs: ["delete"] + - apiGroups: [""] + resources: ["serviceaccounts"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: zxporter-prometheus-cleanup +subjects: + - kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} +--- +# ClusterRole: can only delete the exact named ClusterRoles/ClusterRoleBindings +# left by the old zxporter Prometheus install. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +rules: + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-prometheus-cleanup +subjects: + - kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} +--- +# One-time migration job: cleans up legacy Prometheus resources. +# - Idempotent: --ignore-not-found on every delete +# - Safe for fresh installs: all deletes succeed with "not found" (exit 0) +# - Scoped: only deletes Prometheus resources by exact name +# - Self-cleaning: Job auto-deletes after 5 minutes via ttlSecondsAfterFinished +# +# NOTE: Does NOT delete standalone nodemon — in the kubectl path, the new installer +# reuses the same names (zxporter-nodemon), so kubectl apply updates them in-place. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: zxporter-prometheus-cleanup + namespace: {{.zxporter_namespace}} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 1 + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + spec: + restartPolicy: Never + serviceAccountName: zxporter-prometheus-cleanup + containers: + - name: cleanup + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + NS=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + echo "Cleaning up legacy zxporter Prometheus resources in namespace: $NS" + echo "This only deletes Prometheus resources by exact name — other installs are unaffected." + + # Deployments + kubectl delete deployment prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete deployment prometheus-kube-state-metrics -n $NS --ignore-not-found + + # DaemonSets + kubectl delete daemonset dz-prometheus-node-exporter -n $NS --ignore-not-found + + # Services + kubectl delete service prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete service prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete service dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ConfigMaps + kubectl delete configmap prometheus-dz-prometheus-server -n $NS --ignore-not-found + + # ServiceAccounts + kubectl delete serviceaccount prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete serviceaccount dz-prometheus-node-exporter -n $NS --ignore-not-found + + # ClusterRoles and ClusterRoleBindings + kubectl delete clusterrole prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrole prometheus-kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrolebinding prometheus-kube-state-metrics --ignore-not-found + + echo "Cleanup complete" diff --git a/dist/metrics-server.yaml b/dist/metrics-server.yaml deleted file mode 100644 index b7bb521a..00000000 --- a/dist/metrics-server.yaml +++ /dev/null @@ -1,253 +0,0 @@ ---- -# Source: metrics-server/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: dz-metrics-server - namespace: devzero-system - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm ---- -# Source: metrics-server/templates/clusterrole-aggregated-reader.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: system:dz-metrics-server-aggregated-reader - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm - rbac.authorization.k8s.io/aggregate-to-admin: "true" - rbac.authorization.k8s.io/aggregate-to-edit: "true" - rbac.authorization.k8s.io/aggregate-to-view: "true" -rules: - - apiGroups: - - metrics.k8s.io - resources: - - pods - - nodes - verbs: - - get - - list - - watch ---- -# Source: metrics-server/templates/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: system:dz-metrics-server - labels: - helm.sh/chart: 
metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - "" - resources: - - nodes/metrics - verbs: - - get - - apiGroups: - - "" - resources: - - pods - - nodes - - namespaces - - configmaps - verbs: - - get - - list - - watch ---- -# Source: metrics-server/templates/clusterrolebinding-auth-delegator.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: dz-metrics-server:system:auth-delegator - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator -subjects: - - kind: ServiceAccount - name: dz-metrics-server - namespace: devzero-system ---- -# Source: metrics-server/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: system:dz-metrics-server - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:dz-metrics-server -subjects: - - kind: ServiceAccount - name: dz-metrics-server - namespace: devzero-system ---- -# Source: metrics-server/templates/rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: dz-metrics-server-auth-reader - namespace: kube-system - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: extension-apiserver-authentication-reader -subjects: - - kind: ServiceAccount - name: dz-metrics-server - namespace: devzero-system ---- -# Source: metrics-server/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: dz-metrics-server - namespace: devzero-system - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: https - port: 443 - protocol: TCP - targetPort: https - appProtocol: https - selector: - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server ---- -# Source: metrics-server/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: dz-metrics-server - namespace: devzero-system - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - template: - metadata: - labels: - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - spec: - serviceAccountName: dz-metrics-server - priorityClassName: "system-cluster-critical" - containers: - - name: metrics-server - securityContext: - 
allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: registry.k8s.io/metrics-server/metrics-server:v0.7.2 - imagePullPolicy: IfNotPresent - args: - - --secure-port=10250 - - --cert-dir=/tmp - - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname - - --kubelet-use-node-status-port - - --metric-resolution=15s - - --kubelet-insecure-tls - ports: - - name: https - protocol: TCP - containerPort: 10250 - livenessProbe: - failureThreshold: 3 - httpGet: - path: /livez - port: https - scheme: HTTPS - initialDelaySeconds: 0 - periodSeconds: 10 - readinessProbe: - failureThreshold: 3 - httpGet: - path: /readyz - port: https - scheme: HTTPS - initialDelaySeconds: 20 - periodSeconds: 10 - volumeMounts: - - name: tmp - mountPath: /tmp - resources: - requests: - cpu: 100m - memory: 200Mi - volumes: - - name: tmp - emptyDir: {} ---- -# Source: metrics-server/templates/apiservice.yaml -apiVersion: apiregistration.k8s.io/v1 -kind: APIService -metadata: - name: v1beta1.metrics.k8s.io - labels: - helm.sh/chart: metrics-server-3.12.2 - app.kubernetes.io/name: dz-metrics-server - app.kubernetes.io/instance: metrics-server - app.kubernetes.io/version: "0.7.2" - app.kubernetes.io/managed-by: Helm -spec: - group: metrics.k8s.io - groupPriorityMinimum: 100 - insecureSkipTLSVerify: true - service: - name: dz-metrics-server - namespace: devzero-system - port: 443 - version: v1beta1 - versionPriority: 100 diff --git a/dist/node-exporter.yaml b/dist/node-exporter.yaml deleted file mode 100644 index 2be597f3..00000000 --- a/dist/node-exporter.yaml +++ /dev/null @@ -1,181 +0,0 @@ ---- -# Source: prometheus-node-exporter/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -automountServiceAccountToken: false ---- -# Source: prometheus-node-exporter/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - annotations: - prometheus.io/scrape: "true" -spec: - type: ClusterIP - ports: - - port: 9101 - targetPort: 9101 - protocol: TCP - name: metrics - selector: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter ---- -# Source: prometheus-node-exporter/templates/daemonset.yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: dz-prometheus-node-exporter - namespace: devzero-system - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" -spec: - selector: - 
matchLabels: - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - revisionHistoryLimit: 10 - updateStrategy: - rollingUpdate: - maxUnavailable: 1 - type: RollingUpdate - template: - metadata: - annotations: - cluster-autoscaler.kubernetes.io/safe-to-evict: "true" - prometheus.io/path: /metrics - prometheus.io/port: "9101" - prometheus.io/scrape: "true" - labels: - helm.sh/chart: prometheus-node-exporter-4.47.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: dz-prometheus-node-exporter - app.kubernetes.io/name: dz-prometheus-node-exporter - app.kubernetes.io/instance: node-exporter - app.kubernetes.io/version: "1.9.1" - spec: - automountServiceAccountToken: false - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - serviceAccountName: dz-prometheus-node-exporter - containers: - - name: node-exporter - image: quay.io/prometheus/node-exporter:v1.9.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --path.udev.data=/host/root/run/udev/data - - --web.listen-address=[$(HOST_IP)]:9101 - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - env: - - name: HOST_IP - value: 0.0.0.0 - ports: - - name: metrics - containerPort: 9101 - protocol: TCP - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: / - port: 9101 - scheme: HTTP - initialDelaySeconds: 0 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: sys - mountPath: /host/sys - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - hostNetwork: true - hostPID: true - hostIPC: false - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/compute-type - operator: NotIn - values: - - fargate - - key: type - operator: NotIn - values: - - virtual-kubelet - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - operator: Exists - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / diff --git a/dist/nodemon.yaml b/dist/nodemon.yaml new file mode 100644 index 00000000..fa6f04be --- /dev/null +++ b/dist/nodemon.yaml @@ -0,0 +1,261 @@ +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-nodemon + namespace: devzero-system + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +--- +# Source: zxporter-nodemon/templates/dcgm-exporter-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-dcgm-metrics + namespace: devzero-system +data: + counters.csv: | + # Temperature and power usage,, + DCGM_FI_DEV_GPU_TEMP, gauge, Current temperature readings for the device in degrees C. + DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature for the device. 
+ DCGM_FI_DEV_POWER_USAGE, gauge, Power usage for the device in Watts. + + # Utilization,, + # DCGM_FI_DEV_GPU_UTIL provides overall GPU utilization which is useful for scenarios + # like fractional GPU sharing (e.g., EKS time-slicing, MIG) where profiling metrics may not be available. + DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). + # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned + # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The fraction of resident warps on a multiprocessor + # DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). + # DCGM_FI_PROF_DRAM_ACTIVE, gauge, The ratio of cycles the device memory interface is active sending or receiving data. + + # Memory usage,, + DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). + DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + DCGM_FI_DEV_FB_TOTAL, gauge, Total Frame Buffer of the GPU in MB. + DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Utilization of the memory copy engine. + + # PCIE,, + # DCGM_FI_PROF_PCIE_TX_BYTES, gauge, Total number of bytes transmitted through PCIe TX + # DCGM_FI_PROF_PCIE_RX_BYTES, gauge, Total number of bytes received through PCIe RX + DCGM_FI_DEV_PCIE_LINK_GEN, gauge, PCIe Current Link Generation. + DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, PCIe Current Link Width. + + # Pipelines,, + # DCGM_FI_PROF_PIPE_INT_ACTIVE, gauge, Ratio of cycles the integer pipe is active. + # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipe is active. + # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipe is active. + # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipe is active. + # DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) + + # Health,, + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS is not supported by DCGM 3.3.7 + # DCGM_FI_DEV_CLOCKS_EVENT_REASONS, gauge, Current clock event reasons (bitmask of DCGM_CLOCKS_EVENT_REASON_*) + DCGM_FI_DEV_XID_ERRORS, gauge, The value is the specific XID error + DCGM_FI_DEV_POWER_VIOLATION, gauge, Power Violation time in ns. + DCGM_FI_DEV_THERMAL_VIOLATION, gauge, Thermal Violation time in ns. + + # NVLink,, + # DCGM_FI_PROF_NVLINK_TX_BYTES, gauge, The number of bytes of active NvLink tx (transmit) data including both header and payload. + # DCGM_FI_PROF_NVLINK_RX_BYTES, gauge, The number of bytes of active NvLink rx (read) data including both header and payload. + + # Clocks,, + DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
+--- +# Source: zxporter-nodemon/templates/nodemon-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: zxporter-nodemon-zxporter-nodemon + namespace: devzero-system +data: +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list +- apiGroups: + - apps + resources: + - replicasets + - deployments + - statefulsets + - daemonsets + verbs: + - get +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get +- apiGroups: + - argoproj.io + resources: + - rollouts + verbs: + - get +- apiGroups: + - "" + resources: + - nodes/proxy + - nodes/metrics + - nodes/stats + verbs: + - get +--- +# Source: zxporter-nodemon/templates/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-nodemon + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-nodemon +subjects: +- kind: ServiceAccount + name: zxporter-nodemon + namespace: devzero-system +--- +# Source: zxporter-nodemon/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: zxporter-nodemon + namespace: devzero-system + labels: + helm.sh/chart: zxporter-nodemon-0.0.2 + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + app.kubernetes.io/version: "0.0.2" + app.kubernetes.io/managed-by: Helm + annotations: + ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics." + ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics." + ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter." + ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem." 
+spec: + selector: + matchLabels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-nodemon + app.kubernetes.io/instance: zxporter-nodemon + spec: + serviceAccountName: zxporter-nodemon + volumes: + - name: "pod-gpu-resources" + hostPath: + path: /var/lib/kubelet/pod-resources + - name: zxporter-nodemon-dcgm-metrics + configMap: + name: zxporter-nodemon-dcgm-metrics + tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + operator: Exists + - effect: PreferNoSchedule + operator: Exists + containers: + - name: zxporter-nodemon + securityContext: + readOnlyRootFilesystem: true + runAsNonRoot: true + image: "ttl.sh/zxporter-nodemon:latest" + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 6061 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /healthz + port: http + envFrom: + - configMapRef: + name: zxporter-nodemon-zxporter-nodemon + env: + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: "DCGM_HOST" + value: "localhost" + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + - name: dcgm-exporter + securityContext: + capabilities: + add: + - SYS_ADMIN + drop: + - NET_RAW + runAsNonRoot: false + runAsUser: 0 + image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c" ] + args: + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + ports: + - name: "metrics" + containerPort: 9400 + env: + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_INTERVAL" + value: "5000" + - name: "NODE_NAME" + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + - name: zxporter-nodemon-dcgm-metrics + mountPath: "/etc/dcgm-exporter" diff --git a/dist/prometheus.yaml b/dist/prometheus.yaml deleted file mode 100644 index 0f2ce776..00000000 --- a/dist/prometheus.yaml +++ /dev/null @@ -1,620 +0,0 @@ ---- -# Source: prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -automountServiceAccountToken: true -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system - annotations: - {} ---- -# Source: prometheus/templates/cm.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: 
v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -data: - allow-snippet-annotations: "false" - alerting_rules.yml: | - {} - alerts: | - {} - prometheus.yml: | - global: - evaluation_interval: 1m - scrape_interval: 1m - scrape_timeout: 10s - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - metric_relabel_configs: - - action: keep - regex: container_network_(receive|transmit)_(bytes|packets|errors|packets_dropped)_total|container_fs_(reads|writes)_(bytes_)?total|kubelet_volume_stats_(used|capacity|available)_bytes|container_cpu_cfs_(throttled_)?periods_total - source_labels: - - __name__ - - action: labeldrop - regex: (id|name|image) - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: DCGM_FI_DEV_(GPU_UTIL|FB_USED|FB_FREE|POWER_USAGE|GPU_TEMP|SM_CLOCK|MEM_CLOCK) - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: .*dcgm-exporter.* - source_labels: - - __meta_kubernetes_pod_label_app - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-node-exporter - kubernetes_sd_configs: - - role: pod - metric_relabel_configs: - - action: keep - regex: node_network_(receive|transmit)_(bytes|packets|errs|drop)_total|node_disk_(read_bytes|written_bytes|reads_completed|writes_completed)_total|kubelet_volume_stats_(used|capacity|available)_bytes - source_labels: - - __name__ - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: keep - regex: dz-prometheus-node-exporter - source_labels: - - __meta_kubernetes_pod_label_app_kubernetes_io_name - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - recording_rules.yml: | - {} - rules: | - {} ---- -# Source: prometheus/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -rules: - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] ---- -# Source: prometheus/templates/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -rules: - - apiGroups: - - "" - resources: - - nodes - - nodes/proxy - - nodes/metrics - - services - - endpoints - - pods - - ingresses - - configmaps - verbs: - - get - - list - - watch - - apiGroups: - - "networking.k8s.io" - resources: - - ingresses/status - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - "discovery.k8s.io" - resources: - - endpointslices - verbs: - - get - - list - - watch - - nonResourceURLs: - - "/metrics" - verbs: - - get ---- -# Source: prometheus/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - name: prometheus-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-kube-state-metrics -subjects: -- kind: ServiceAccount - name: prometheus-kube-state-metrics - namespace: devzero-system ---- -# Source: prometheus/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server -subjects: - - kind: ServiceAccount - name: prometheus-dz-prometheus-server - 
namespace: devzero-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-dz-prometheus-server ---- -# Source: prometheus/charts/kube-state-metrics/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - annotations: - prometheus.io/scrape: 'true' -spec: - type: "ClusterIP" - ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus ---- -# Source: prometheus/templates/service.yaml -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - ports: - - name: http - port: 80 - protocol: TCP - targetPort: 9090 - selector: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - sessionAffinity: None - type: "ClusterIP" ---- -# Source: prometheus/charts/kube-state-metrics/templates/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus-kube-state-metrics - namespace: devzero-system - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" -spec: - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 - template: - metadata: - labels: - helm.sh/chart: kube-state-metrics-5.33.2 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: "2.15.0" - spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: prometheus-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - dnsPolicy: ClusterFirst - containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=nodes - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - securityContext: - 
allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true ---- -# Source: prometheus/templates/deploy.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - name: prometheus-dz-prometheus-server - namespace: devzero-system -spec: - selector: - matchLabels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - replicas: 1 - revisionHistoryLimit: 10 - strategy: - type: Recreate - rollingUpdate: null - template: - metadata: - labels: - app.kubernetes.io/component: server - app.kubernetes.io/name: dz-prometheus - app.kubernetes.io/instance: prometheus - app.kubernetes.io/version: v3.4.1 - helm.sh/chart: prometheus-27.20.0 - app.kubernetes.io/part-of: dz-prometheus - spec: - enableServiceLinks: true - serviceAccountName: prometheus-dz-prometheus-server - containers: - - name: dz-prometheus-server-configmap-reload - image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" - imagePullPolicy: "IfNotPresent" - args: - - --watched-dir=/etc/config - - --listen-address=0.0.0.0:8080 - - --reload-url=http://127.0.0.1:9090/-/reload - ports: - - containerPort: 8080 - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 2 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /healthz - port: metrics - scheme: HTTP - periodSeconds: 10 - resources: - limits: - memory: 500Mi - requests: - cpu: 250m - memory: 500Mi - volumeMounts: - - name: config-volume - mountPath: /etc/config - readOnly: true - - - name: dz-prometheus-server - image: "quay.io/prometheus/prometheus:v3.4.1" - imagePullPolicy: "IfNotPresent" - args: - - --storage.tsdb.retention.time=2d - - --config.file=/etc/config/prometheus.yml - - --storage.tsdb.path=/data - - --web.console.libraries=/etc/prometheus/console_libraries - - --web.console.templates=/etc/prometheus/consoles - - --web.enable-lifecycle - ports: - - containerPort: 9090 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 5 - timeoutSeconds: 4 - failureThreshold: 3 - successThreshold: 1 - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - scheme: HTTP - initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 10 - failureThreshold: 3 - successThreshold: 1 - resources: - limits: - memory: 2Gi - requests: - cpu: 100m - memory: 2Gi - volumeMounts: - - name: config-volume - mountPath: /etc/config - - name: storage-volume - mountPath: /data - subPath: "" - dnsPolicy: ClusterFirst - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - terminationGracePeriodSeconds: 300 - volumes: - - name: config-volume - configMap: - name: prometheus-dz-prometheus-server - - name: storage-volume - emptyDir: - {} diff --git a/dist/zxporter.yaml b/dist/zxporter.yaml index b97b1a48..4cde8ee2 100644 --- a/dist/zxporter.yaml +++ b/dist/zxporter.yaml @@ -558,7 +558,6 @@ data: CLUSTER_TOKEN: "" COLLECTION_FREQUENCY: "" DAKR_URL: https://dakr.devzero.io - DISABLE_NETWORK_IO_METRICS: "" DISABLED_COLLECTORS: "" EXCLUDED_CLUSTERROLEBINDINGS: "" EXCLUDED_CLUSTERROLES: "" @@ -595,7 +594,6 @@ data: KUBE_CONTEXT_NAME: '{{ .kube_context_name }}' MASK_SECRET_DATA: "" 
NODE_METRICS_INTERVAL: "" - PROMETHEUS_URL: http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80 TARGET_NAMESPACES: "" TOKEN_CONFIGMAP_NAME: devzero-zxporter-env-config TOKEN_CREDENTIALS_SECRET_NAME: devzero-zxporter-credentials @@ -614,7 +612,7 @@ metadata: name: devzero-zxporter-token namespace: devzero-system stringData: - CLUSTER_TOKEN: '{{ .cluster_token }}' + CLUSTER_TOKEN: "" type: Opaque --- apiVersion: v1 diff --git a/docs/superpowers/specs/2026-04-28-remove-prometheus-dependency-design.md b/docs/superpowers/specs/2026-04-28-remove-prometheus-dependency-design.md new file mode 100644 index 00000000..71019f16 --- /dev/null +++ b/docs/superpowers/specs/2026-04-28-remove-prometheus-dependency-design.md @@ -0,0 +1,314 @@ +# Remove Prometheus Dependency from ZXporter + +## Goal + +Eliminate zxporter's runtime dependency on Prometheus server, node-exporter, and kube-state-metrics. All metrics currently sourced from Prometheus will be collected directly from kubelet endpoints via the nodemon DaemonSet, following the pattern established by Cortex (`/api/v1/nodes/{node}/proxy/stats/summary`) and the existing GPU metrics exporter branch (`ph/gpu-metrics-exporter`). + +## Current State + +ZXporter deploys and queries a Prometheus stack (Prometheus server, node-exporter DaemonSet, kube-state-metrics) to collect container, node, PVC, and GPU metrics. Five collectors depend on Prometheus: + +| Collector | What it queries | PromQL examples | +|---|---|---| +| ContainerResourceCollector | Container CPU, memory, network I/O, disk I/O, CPU throttle | `rate(container_cpu_usage_seconds_total[5m])`, `container_memory_working_set_bytes`, `rate(container_network_*[5m])`, `rate(container_fs_*[5m])`, `rate(container_cpu_cfs_throttled_periods_total[5m]) / rate(container_cpu_cfs_periods_total[5m])` | +| NodeCollector | Node network, disk, GPU metrics | `rate(node_network_*[5m])`, `rate(node_disk_*[5m])`, `DCGM_FI_DEV_*` | +| PVCMetricsCollector | PVC storage usage | `kubelet_volume_stats_used_bytes`, `kubelet_volume_stats_capacity_bytes`, `kubelet_volume_stats_available_bytes` | +| HistoricalMetricsCollector | 24h percentile aggregations (P50-P99, Pmax) for CPU and memory | `quantile_over_time(0.90, rate(container_cpu_usage_seconds_total[5m])[24h:1m])` | +| GPU metrics (NodeCollector + ContainerResourceCollector) | DCGM GPU utilization, memory, power, temp | `DCGM_FI_DEV_GPU_UTIL`, `DCGM_FI_DEV_FB_USED`, etc. 
| + +## Data Source Replacement Map + +### Container-level metrics (ContainerResourceCollector) + +| Current PromQL | New Source | Endpoint | +|---|---|---| +| `rate(container_cpu_usage_seconds_total[5m])` | kubelet `stats/summary` -> `usageNanoCores` (instant rate, no computation needed) | `/api/v1/nodes/{node}/proxy/stats/summary` | +| `container_memory_working_set_bytes` | kubelet `stats/summary` -> `workingSetBytes` | same | +| `rate(container_network_receive_bytes_total[5m])` | kubelet `stats/summary` -> `rxBytes` | same | +| `rate(container_network_transmit_bytes_total[5m])` | kubelet `stats/summary` -> `txBytes` | same | +| `rate(container_network_receive_packets_total[5m])` | cAdvisor counter -> nodemon computes rate | `/metrics/cadvisor` | +| `rate(container_network_receive_errors_total[5m])` | cAdvisor counter -> nodemon computes rate | same | +| `rate(container_network_*_dropped_total[5m])` | cAdvisor counter -> nodemon computes rate | same | +| `rate(container_fs_reads_bytes_total[5m])` | cAdvisor counter -> nodemon computes rate | same | +| `rate(container_fs_writes_bytes_total[5m])` | cAdvisor counter -> nodemon computes rate | same | +| `rate(container_cpu_cfs_throttled_periods_total[5m]) / rate(container_cpu_cfs_periods_total[5m])` | cAdvisor counters -> nodemon computes ratio | same | + +### Node-level metrics (NodeCollector) + +| Current PromQL | New Source | +|---|---| +| `rate(node_network_*[5m])` | kubelet `stats/summary` for bytes; cAdvisor for errors/drops/packets | +| `rate(node_disk_*[5m])` | cAdvisor `/metrics/cadvisor` counters | + +### PVC metrics (PVCMetricsCollector) + +| Current PromQL | New Source | +|---|---| +| `kubelet_volume_stats_used_bytes` | kubelet `stats/summary` -> `volume[].usedBytes` | +| `kubelet_volume_stats_capacity_bytes` | kubelet `stats/summary` -> `volume[].capacityBytes` | +| `kubelet_volume_stats_available_bytes` | kubelet `stats/summary` -> `volume[].availableBytes` | + +### GPU metrics + +Already handled by nodemon on `ph/gpu-metrics-exporter` branch. No change needed. + +### Historical 24h percentiles (HistoricalMetricsCollector) + +| Current PromQL | New Source | +|---|---| +| `quantile_over_time(P, rate(cpu[5m])[24h:1m])` | DAKR ClickHouse `hourly_workload_summary_metrics_ch` via `quantilesGKMerge()` | +| `quantile_over_time(P, memory[24h])` | same | +| `max_over_time(rate(cpu[5m])[24h:1m])` | Derivable from P999 or `max()` on hourly table | +| `count_over_time(memory[24h])` | `count` column in hourly summary table | + +## Architecture + +### Nodemon DaemonSet (extended) + +The existing nodemon DaemonSet (`ph/gpu-metrics-exporter` branch) is extended from GPU-only to a unified node-local metric collector: + +``` +nodemon pod (DaemonSet, one per node) +|-- stats/summary poller <-- kubelet JSON API (Cortex pattern) +| CPU, memory, network bytes, PVC storage +| +|-- cAdvisor scraper <-- /metrics/cadvisor (Prometheus text format) +| CPU throttle, disk I/O, network errors/drops/packets +| +|-- DCGM scraper <-- already exists +| GPU metrics +| +|-- HTTP API server (:6061) <-- already exists + GET /container/metrics <-- extended: all container metrics + GET /node/metrics <-- new: node-level aggregates + GET /pvc/metrics <-- new: PVC storage stats + GET /healthz <-- already exists +``` + +### Rate computation for counters + +Nodemon computes rates locally for all counter-based metrics. 
This covers: +- cAdvisor counters (CPU throttle periods, disk I/O ops, network packets/errors/drops) scraped from the local kubelet's `/metrics/cadvisor` endpoint (not via API proxy — nodemon runs on the same node) +- `stats/summary` cumulative values (network `rxBytes`/`txBytes` are cumulative totals, not instant rates) + +Rate computation approach: +- Scrapes every 30 seconds (configurable) +- Stores previous counter values in memory per container/node +- Computes `(current - previous) / elapsed_seconds` +- First sample after startup returns 0 (no previous value) +- Handles counter resets (if current < previous, treat as reset, skip one sample) + +This is the same approach Prometheus uses internally. + +### Historical percentile cache (replaces HistoricalMetricsCollector) + +New component `HistoricalPercentileCache` in zxporter replaces `HistoricalMetricsCollector`: + +``` +DAKR ClickHouse (source of truth, minute-bucketed data + hourly GK sketches) + | (every 15 min, background fetch via existing DakrClient) +zxporter HistoricalPercentileCache (in-memory map) + | (gRPC stream, sub-ms, in-cluster) +DAKR operator (HPA engine, 30s evaluation) +``` + +Design rationale for keeping historical data in-cluster (zxporter -> operator): +- Operator makes autoscaling decisions every 30 seconds; this is a hot path +- In-cluster latency is sub-millisecond vs 10-200ms+ to control plane +- Autoscaling continues working during control plane outages +- zxporter caches pre-computed percentiles, serves them to operator via existing fast gRPC stream + +The `HistoricalPercentileCache` implements the same interface as `HistoricalMetricsCollector`: + +```go +FetchPercentilesForAll(ctx, queries []WorkloadQuery) []PercentileResult +``` + +MPA Server and DAKR operator see zero change. + +### Failure modes + +| Scenario | Behavior | +|---|---| +| Control plane unreachable on startup | MPA Server starts without historical data (same as today when Prometheus is starting up) | +| Control plane goes down after running | Stale cache continues serving last-fetched percentiles. Log warning. | +| Control plane slow (>5s) | Background fetch with timeout. Never blocks the MPA stream. | +| zxporter pod restarts | First fetch on startup (~1 API call). No 24h blind spot (unlike Prometheus which needs to re-scrape). | +| Nodemon pod restarts | First cAdvisor scrape returns 0 for rate metrics (no previous sample). Normal after one interval. 
| + +## Components Removed + +| Component | Action | +|---|---| +| Prometheus server Deployment | Remove from Helm chart | +| Node-exporter DaemonSet | Remove (nodemon replaces it) | +| Kube-state-metrics Deployment | Remove (zxporter already watches nodes via K8s API) | +| Prometheus ConfigMap (scrape config) | Remove | +| Prometheus ServiceAccount + ClusterRole + ClusterRoleBinding | Remove | +| `prometheus-dz-prometheus-server` Service | Remove | + +## Components Unchanged + +| Component | Why unchanged | +|---|---| +| MPA Server gRPC stream | Same interface, same proto messages | +| DAKR operator | Receives identical data, unaware of source change | +| OOM detection (pod_collector.go) | Already from pod status, not Prometheus | +| TelemetrySender | Collects from local Prometheus *registry* (self-instrumentation), not the Prometheus *server* | +| Collection pipeline (batcher, DirectSender, DakrClient) | Data source is upstream of these | +| Nodemon GPU scraping | Already exists, no change | + +## RBAC Changes + +| Permission | ServiceAccount | Change | +|---|---|---| +| `nodes/proxy` | nodemon | Add (needed for `stats/summary` and cAdvisor) | +| `nodes/proxy` | Prometheus (`prometheus-dz-prometheus-server`) | Remove (Prometheus gone) | +| `nodes/metrics` | zxporter (`devzero-zxporter-manager-role`) | Keep (already has it) | +| All Prometheus RBAC | `prometheus-dz-prometheus-server`, `prometheus-kube-state-metrics` | Remove | + +## Migration Strategy + +### Phase 1: Test current behavior + +Add tests that pin down the transformation logic of each Prometheus-dependent collector. These tests mock the Prometheus API client, feed known PromQL responses, and assert exact output. + +| Collector | What to test | +|---|---| +| ContainerResourceCollector | Given known Prometheus responses -> assert exact `ContainerMetricsSnapshot` (CPU millis, memory bytes, throttle fraction, network, disk) | +| NodeCollector | Given known Prometheus responses -> assert node metric output | +| PVCMetricsCollector | Given known Prometheus responses -> assert PVC used/capacity/available bytes | +| HistoricalMetricsCollector | Given known Prometheus responses -> assert percentile values (P50-P99, Pmax, sample count) | +| MPA Server Broadcast | Given a `ContainerMetricsSnapshot` -> assert exact `ContainerMetricItem` proto message | + +These tests validate the transformation, not the data source. When we swap the source, these tests must still pass. + +### Phase 2: Extend nodemon (behind feature flag) + +Add `stats/summary` + cAdvisor scraping to nodemon. Deploy alongside existing Prometheus. + +Feature flag: `ENABLE_NODEMON_METRICS=true` (default: false) + +When enabled, ContainerResourceCollector, NodeCollector, and PVCMetricsCollector query nodemon instead of Prometheus. Existing Prometheus path stays as fallback. + +Use the `CompareGPUMetrics()` validation pattern (already exists on GPU branch) to compare nodemon output vs Prometheus output in production. + +### Phase 3: Replace HistoricalMetricsCollector + +Swap to `HistoricalPercentileCache` (DAKR control plane fetch). Also behind the same feature flag. 
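+Phases 2 and 3 both hinge on a single switch point inside the collectors. The sketch below is a rough illustration of that switch, not the real implementation: the `metricsSource` interface and the `nodemonSource`/`prometheusSource` types are invented names, per-node discovery of nodemon pods is omitted, and only the `ENABLE_NODEMON_METRICS` and `PROMETHEUS_URL` settings correspond to actual configuration.
+
+```go
+// Hypothetical sketch only: the type and function names below are
+// illustrative, not the real zxporter implementation.
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+)
+
+// ContainerMetricsSnapshot stands in for the snapshot type that
+// ContainerResourceCollector already produces (fields omitted).
+type ContainerMetricsSnapshot struct{}
+
+// metricsSource abstracts where raw container metrics come from, so the
+// transformation logic pinned by the Phase 1 tests stays untouched by the swap.
+type metricsSource interface {
+	ContainerMetrics(ctx context.Context, node string) ([]ContainerMetricsSnapshot, error)
+}
+
+// nodemonSource would call the per-node DaemonSet pod, e.g.
+// GET http://<nodemon-pod>:6061/container/metrics?node=<node>.
+type nodemonSource struct{}
+
+func (nodemonSource) ContainerMetrics(ctx context.Context, node string) ([]ContainerMetricsSnapshot, error) {
+	return nil, nil // HTTP call elided
+}
+
+// prometheusSource keeps the existing PromQL path as the fallback.
+type prometheusSource struct{ serverURL string }
+
+func (prometheusSource) ContainerMetrics(ctx context.Context, node string) ([]ContainerMetricsSnapshot, error) {
+	return nil, nil // PromQL queries elided
+}
+
+// newMetricsSource picks the backend once at startup from the feature flag
+// added to the zxporter ConfigMap.
+func newMetricsSource() metricsSource {
+	if os.Getenv("ENABLE_NODEMON_METRICS") == "true" {
+		return nodemonSource{}
+	}
+	return prometheusSource{serverURL: os.Getenv("PROMETHEUS_URL")}
+}
+
+func main() {
+	fmt.Printf("metrics source: %T\n", newMetricsSource())
+}
+```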
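+The counter-to-rate logic described under "Rate computation for counters" above is where most of the subtle behavior lives (first sample, counter resets). A minimal sketch of that logic, assuming nodemon keeps an in-memory map keyed per container and metric; the `rateTracker` name and key format are invented for illustration:
+
+```go
+// Illustrative sketch of per-counter rate computation with reset handling;
+// not the actual nodemon code.
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+type sample struct {
+	value float64
+	at    time.Time
+}
+
+// rateTracker remembers the previous value of each counter so a per-second
+// rate can be derived from two consecutive scrapes.
+type rateTracker struct {
+	prev map[string]sample // key: node/namespace/pod/container/metric
+}
+
+func newRateTracker() *rateTracker {
+	return &rateTracker{prev: map[string]sample{}}
+}
+
+// Rate returns (current - previous) / elapsed for a cumulative counter.
+// The first observation and counter resets (current < previous) return 0,
+// matching the behavior described above.
+func (t *rateTracker) Rate(key string, current float64, now time.Time) float64 {
+	p, ok := t.prev[key]
+	t.prev[key] = sample{value: current, at: now}
+	if !ok || current < p.value {
+		return 0 // no previous sample, or the counter was reset
+	}
+	elapsed := now.Sub(p.at).Seconds()
+	if elapsed <= 0 {
+		return 0
+	}
+	return (current - p.value) / elapsed
+}
+
+func main() {
+	t := newRateTracker()
+	now := time.Now()
+	key := "node-1/default/web/nginx/fs_reads_bytes"
+	fmt.Println(t.Rate(key, 1000, now))                      // 0: first sample
+	fmt.Println(t.Rate(key, 4000, now.Add(30*time.Second))) // 100 bytes/s
+}
+```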
+ +### Phase 4: Remove Prometheus + +Once validated in production: +- Remove Prometheus, node-exporter, kube-state-metrics from Helm chart +- Deprecate `PROMETHEUS_URL` config with warning log +- Remove Prometheus client imports from collectors +- Clean up RBAC +- Feature flag becomes default-on, eventually removed + +## Nodemon HTTP API (extended) + +### GET /container/metrics + +Query params: `?node=` (required) + +Optional filters: `?namespace=&pod=&container=` + +Response: +```json +[ + { + "node_name": "node-1", + "namespace": "default", + "pod": "web-abc123", + "container": "nginx", + "timestamp": "2026-04-28T12:00:00Z", + + "cpu_usage_nanocores": 50000000, + "memory_working_set_bytes": 104857600, + "memory_usage_bytes": 134217728, + "memory_rss_bytes": 94371840, + + "network_rx_bytes": 1024000, + "network_tx_bytes": 512000, + "network_rx_packets_per_sec": 150.5, + "network_tx_packets_per_sec": 120.3, + "network_rx_errors_per_sec": 0.0, + "network_tx_errors_per_sec": 0.0, + "network_rx_drops_per_sec": 0.0, + "network_tx_drops_per_sec": 0.0, + + "disk_read_bytes_per_sec": 4096.0, + "disk_write_bytes_per_sec": 8192.0, + "disk_read_ops_per_sec": 10.0, + "disk_write_ops_per_sec": 20.0, + + "cpu_throttle_fraction": 0.05, + + "gpu_utilization": 85.5, + "gpu_memory_used_mib": 8192.0, + "gpu_memory_free_mib": 8192.0, + "gpu_power_watts": 125.0, + "gpu_temperature_celsius": 72.0 + } +] +``` + +### GET /node/metrics + +Query params: `?node=` (required) + +Response: +```json +{ + "node_name": "node-1", + "timestamp": "2026-04-28T12:00:00Z", + + "network_rx_bytes_per_sec": 52428800.0, + "network_tx_bytes_per_sec": 26214400.0, + "network_rx_packets_per_sec": 50000.0, + "network_tx_packets_per_sec": 40000.0, + "network_rx_errors_per_sec": 0.0, + "network_tx_errors_per_sec": 0.0, + "network_rx_drops_per_sec": 0.0, + "network_tx_drops_per_sec": 0.0, + + "disk_read_bytes_per_sec": 10485760.0, + "disk_write_bytes_per_sec": 20971520.0, + "disk_read_ops_per_sec": 500.0, + "disk_write_ops_per_sec": 1000.0, + + "gpu_count": 4, + "gpu_utilization_avg": 72.5, + "gpu_utilization_max": 95.0, + "gpu_memory_used_total_mib": 32768.0, + "gpu_memory_free_total_mib": 32768.0, + "gpu_power_usage_total_watts": 500.0, + "gpu_temperature_avg_celsius": 68.0, + "gpu_temperature_max_celsius": 78.0 +} +``` + +### GET /pvc/metrics + +Query params: `?node=` (required) + +Response: +```json +[ + { + "namespace": "default", + "pod": "postgres-0", + "pvc_name": "data-postgres-0", + "used_bytes": 5368709120, + "capacity_bytes": 10737418240, + "available_bytes": 5368709120 + } +] +``` + +## Dependencies + +- `ph/gpu-metrics-exporter` branch must be merged to `main` first (nodemon base) +- DAKR control plane must expose a percentile fetch API accessible to zxporter (may already exist via existing DakrClient connection; needs verification) +- Nodemon Helm chart needs `nodes/proxy` RBAC permission added diff --git a/entrypoint.sh b/entrypoint.sh index c573d1b4..ec354664 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,63 +1,6 @@ #!/bin/bash - -# Exit on error and pipefail to catch errors in pipes set -eo pipefail - -# Function to log messages -log() { - echo "$(date +"%Y-%m-%d %H:%M:%S") - $1" -} - -# Function to handle errors -handle_error() { - log "ERROR: $1" - # Continue execution (don't exit) since we need to run the main app regardless -} - +log() { echo "$(date +"%Y-%m-%d %H:%M:%S") - $1"; } log "Starting entrypoint script" - -# Check if metrics-server is installed -log "Checking if metrics-server is installed..." -if ! 
kubectl get apiservice v1beta1.metrics.k8s.io &>/dev/null || ! kubectl top nodes &>/dev/null || ! kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" &>/dev/null; then - log "metrics-server not found, installing it now..." - - # Check if metrics-server.yaml exists - if [ ! -f /metrics-server.yaml ]; then - handle_error "metrics-server.yaml not found at /metrics-server.yaml" - else - # Apply the metrics-server yaml - if kubectl apply -f /metrics-server.yaml; then - log "metrics-server installed successfully" - else - handle_error "Failed to install metrics-server" - fi - - # Wait for metrics-server to be ready - log "Waiting for metrics-server to be ready..." - ATTEMPTS=0 - MAX_ATTEMPTS=30 - - while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do - if kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" &>/dev/null; then - log "metrics-server is now ready" - break - fi - - ATTEMPTS=$((ATTEMPTS+1)) - log "Waiting for metrics-server to be ready (attempt $ATTEMPTS/$MAX_ATTEMPTS)..." - sleep 2 - done - - if [ $ATTEMPTS -eq $MAX_ATTEMPTS ]; then - handle_error "metrics-server did not become ready in time" - fi - fi -else - log "metrics-server is already installed" -fi - -# Run the main application log "Starting main application..." - -# Execute the main command (assumes it's passed as arguments to this script) exec "$@" diff --git a/helm-chart/zxporter-nodemon/templates/dcgm-exporter-configmap.yaml b/helm-chart/zxporter-nodemon/templates/dcgm-exporter-configmap.yaml index d2fa0e46..4f74bec7 100644 --- a/helm-chart/zxporter-nodemon/templates/dcgm-exporter-configmap.yaml +++ b/helm-chart/zxporter-nodemon/templates/dcgm-exporter-configmap.yaml @@ -3,6 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{- include "dcgm-exporter.config-map" . | indent 1 }} + namespace: {{ .Release.Namespace }} data: counters.csv: | # Temperature and power usage,, diff --git a/helm-chart/zxporter-nodemon/templates/nodemon-configmap.yaml b/helm-chart/zxporter-nodemon/templates/nodemon-configmap.yaml index 3a254c03..284abd8d 100644 --- a/helm-chart/zxporter-nodemon/templates/nodemon-configmap.yaml +++ b/helm-chart/zxporter-nodemon/templates/nodemon-configmap.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{- include "zxporter-nodemon.config-map" . 
| indent 1}} + namespace: {{ .Release.Namespace }} data: {{- $config := .Values.gpuMetricsExporter.config | default dict }} {{- if $config }} diff --git a/helm-chart/zxporter-nodemon/templates/rbac.yaml b/helm-chart/zxporter-nodemon/templates/rbac.yaml index ebdedcb9..63956fd4 100644 --- a/helm-chart/zxporter-nodemon/templates/rbac.yaml +++ b/helm-chart/zxporter-nodemon/templates/rbac.yaml @@ -51,6 +51,14 @@ rules: - rollouts verbs: - get +- apiGroups: + - "" + resources: + - nodes/proxy + - nodes/metrics + - nodes/stats + verbs: + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: {{ if .Values.gpuMetricsExporter.rbac.clusterWide }}ClusterRoleBinding{{ else }}RoleBinding{{ end }} diff --git a/helm-chart/zxporter/Chart.lock b/helm-chart/zxporter/Chart.lock new file mode 100644 index 00000000..24b2dca7 --- /dev/null +++ b/helm-chart/zxporter/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: zxporter-nodemon + repository: file://../zxporter-nodemon + version: 0.0.2 +digest: sha256:e95869581c38969f227cf52a74935ea84986bc79cd06bd2b8ffe39543b53acad +generated: "2026-05-01T21:18:14.726252+05:30" diff --git a/helm-chart/zxporter/Chart.yaml b/helm-chart/zxporter/Chart.yaml index 48c83e10..0ee7a469 100644 --- a/helm-chart/zxporter/Chart.yaml +++ b/helm-chart/zxporter/Chart.yaml @@ -15,3 +15,8 @@ keywords: - kubernetes - resource-optimization - devzero +dependencies: + - name: zxporter-nodemon + version: "0.0.2" + repository: "file://../zxporter-nodemon" + condition: nodemonMetrics.enabled diff --git a/helm-chart/zxporter/charts/zxporter-nodemon-0.0.2.tgz b/helm-chart/zxporter/charts/zxporter-nodemon-0.0.2.tgz new file mode 100644 index 00000000..714c953e Binary files /dev/null and b/helm-chart/zxporter/charts/zxporter-nodemon-0.0.2.tgz differ diff --git a/helm-chart/zxporter/templates/configmap.yaml b/helm-chart/zxporter/templates/configmap.yaml index 20798397..6c364693 100644 --- a/helm-chart/zxporter/templates/configmap.yaml +++ b/helm-chart/zxporter/templates/configmap.yaml @@ -6,7 +6,7 @@ data: PAT_TOKEN: "{{ .Values.zxporter.patToken }}" {{- end }} COLLECTION_FREQUENCY: "" - DISABLE_NETWORK_IO_METRICS: "" + DISABLE_GPU_METRICS: "{{ .Values.zxporter.disableGPUMetrics }}" DISABLED_COLLECTORS: "" EXCLUDED_CLUSTERROLEBINDINGS: "" EXCLUDED_CLUSTERROLES: "" @@ -44,7 +44,7 @@ data: LOG_LEVEL: "{{ .Values.zxporter.logLevel }}" MASK_SECRET_DATA: "" NODE_METRICS_INTERVAL: "" - PROMETHEUS_URL: "{{ .Values.zxporter.prometheusUrl }}" + ENABLE_NODEMON_METRICS: "{{ .Values.nodemonMetrics.enabled }}" DAKR_URL: "{{ .Values.zxporter.dakrUrl }}" TARGET_NAMESPACES: "{{ .Values.zxporter.targetNamespaces }}" TOKEN_CONFIGMAP_NAME: "{{ .Values.zxporter.tokenConfigMapName }}" diff --git a/helm-chart/zxporter/templates/node-exporter-daemonset.yaml b/helm-chart/zxporter/templates/node-exporter-daemonset.yaml index c512ad02..57880513 100644 --- a/helm-chart/zxporter/templates/node-exporter-daemonset.yaml +++ b/helm-chart/zxporter/templates/node-exporter-daemonset.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.nodeExporter.enabled }} # Source: prometheus-node-exporter/templates/daemonset.yaml @@ -135,3 +136,4 @@ spec: path: / {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/templates/prometheus-cleanup-job.yaml b/helm-chart/zxporter/templates/prometheus-cleanup-job.yaml new file mode 100644 index 00000000..91d506b2 --- /dev/null +++ b/helm-chart/zxporter/templates/prometheus-cleanup-job.yaml @@ -0,0 +1,198 @@ +{{- /* 
+One-time migration hook: cleans up legacy Prometheus/node-exporter/kube-state-metrics +and standalone nodemon resources from previous zxporter installations. +- Runs as post-install and post-upgrade hook +- Dedicated ServiceAccount with least-privilege RBAC (resourceNames scoped) +- Idempotent: --ignore-not-found on all deletes +- Self-cleaning: hook-delete-policy deletes resources after completion +*/ -}} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: +- apiGroups: ["apps"] + resources: ["deployments"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + verbs: ["delete"] +- apiGroups: ["apps"] + resources: ["daemonsets"] + resourceNames: + - dz-prometheus-node-exporter + - zxporter-nodemon + verbs: ["delete"] +- apiGroups: [""] + resources: ["services"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + verbs: ["delete"] +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: + - prometheus-dz-prometheus-server + - zxporter-nodemon-dcgm-metrics + - zxporter-nodemon-zxporter-nodemon + verbs: ["delete"] +- apiGroups: [""] + resources: ["serviceaccounts"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - dz-prometheus-node-exporter + - zxporter-nodemon + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterroles"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - zxporter-nodemon + verbs: ["delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["clusterrolebindings"] + resourceNames: + - prometheus-dz-prometheus-server + - prometheus-kube-state-metrics + - zxporter-nodemon + verbs: ["delete"] +--- +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: zxporter-prometheus-cleanup + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: zxporter-prometheus-cleanup +subjects: +- kind: ServiceAccount + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: zxporter-prometheus-cleanup + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + app.kubernetes.io/part-of: devzero-zxporter + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 1 + template: + metadata: + labels: + app.kubernetes.io/name: zxporter-prometheus-cleanup + spec: + restartPolicy: Never + serviceAccountName: zxporter-prometheus-cleanup + containers: + - name: cleanup + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + NS={{ .Release.Namespace }} + echo "Cleaning up legacy zxporter Prometheus and standalone nodemon resources in namespace: $NS" + echo "This only deletes resources by exact name — other installs are unaffected." + + # === Legacy Prometheus resources === + kubectl delete deployment prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete deployment prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete daemonset dz-prometheus-node-exporter -n $NS --ignore-not-found + kubectl delete service prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete service prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete service dz-prometheus-node-exporter -n $NS --ignore-not-found + kubectl delete configmap prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-dz-prometheus-server -n $NS --ignore-not-found + kubectl delete serviceaccount prometheus-kube-state-metrics -n $NS --ignore-not-found + kubectl delete serviceaccount dz-prometheus-node-exporter -n $NS --ignore-not-found + kubectl delete clusterrole prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrole prometheus-kube-state-metrics --ignore-not-found + kubectl delete clusterrolebinding prometheus-dz-prometheus-server --ignore-not-found + kubectl delete clusterrolebinding prometheus-kube-state-metrics --ignore-not-found + + # === Legacy standalone nodemon (GPU-only, installed separately via helm) === + # New zxporter bundles nodemon as subchart (named zxporter-zxporter-nodemon). + # The old standalone install used "zxporter-nodemon" — delete it to avoid duplicates. 
+ kubectl delete daemonset zxporter-nodemon -n $NS --ignore-not-found + kubectl delete configmap zxporter-nodemon-dcgm-metrics -n $NS --ignore-not-found + kubectl delete configmap zxporter-nodemon-zxporter-nodemon -n $NS --ignore-not-found + kubectl delete serviceaccount zxporter-nodemon -n $NS --ignore-not-found + kubectl delete clusterrole zxporter-nodemon --ignore-not-found + kubectl delete clusterrolebinding zxporter-nodemon --ignore-not-found + + echo "Cleanup complete" diff --git a/helm-chart/zxporter/templates/prometheus-configmap.yaml b/helm-chart/zxporter/templates/prometheus-configmap.yaml index cd0dcb5c..d10f740e 100644 --- a/helm-chart/zxporter/templates/prometheus-configmap.yaml +++ b/helm-chart/zxporter/templates/prometheus-configmap.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.prometheus.enabled }} # Source: prometheus/templates/cm.yaml @@ -202,3 +203,4 @@ data: {} {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/templates/prometheus-deployments.yaml b/helm-chart/zxporter/templates/prometheus-deployments.yaml index 7593102c..6b3ee9fe 100644 --- a/helm-chart/zxporter/templates/prometheus-deployments.yaml +++ b/helm-chart/zxporter/templates/prometheus-deployments.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.prometheus.enabled }} # Source: prometheus/charts/kube-state-metrics/templates/deployment.yaml @@ -237,3 +238,4 @@ spec: emptyDir: {} {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/templates/prometheus-rbac.yaml b/helm-chart/zxporter/templates/prometheus-rbac.yaml index 333c82c0..87b7dec7 100644 --- a/helm-chart/zxporter/templates/prometheus-rbac.yaml +++ b/helm-chart/zxporter/templates/prometheus-rbac.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.prometheus.enabled }} # Source: prometheus/charts/kube-state-metrics/templates/role.yaml @@ -113,3 +114,4 @@ roleRef: name: prometheus-dz-prometheus-server {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/templates/prometheus-serviceaccounts.yaml b/helm-chart/zxporter/templates/prometheus-serviceaccounts.yaml index f54b90d4..e5b9070c 100644 --- a/helm-chart/zxporter/templates/prometheus-serviceaccounts.yaml +++ b/helm-chart/zxporter/templates/prometheus-serviceaccounts.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.prometheus.enabled }} # Source: prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml @@ -48,3 +49,4 @@ metadata: automountServiceAccountToken: false {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/templates/prometheus-services.yaml b/helm-chart/zxporter/templates/prometheus-services.yaml index 51f1e8f9..9aaec19b 100644 --- a/helm-chart/zxporter/templates/prometheus-services.yaml +++ b/helm-chart/zxporter/templates/prometheus-services.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.nodemonMetrics.enabled }} {{- if .Values.monitoring.enabled }} {{- if .Values.monitoring.prometheus.enabled }} # Source: prometheus/charts/kube-state-metrics/templates/service.yaml @@ -81,3 +82,4 @@ spec: app.kubernetes.io/instance: node-exporter {{- end }} {{- end }} +{{- end }} diff --git a/helm-chart/zxporter/values.yaml b/helm-chart/zxporter/values.yaml index b698226b..fbb6145c 100644 --- a/helm-chart/zxporter/values.yaml +++ 
b/helm-chart/zxporter/values.yaml @@ -16,7 +16,6 @@ image: # ZXPorter configuration zxporter: dakrUrl: "https://dakr.devzero.io" - prometheusUrl: "http://prometheus-dz-prometheus-server.devzero-system.svc.cluster.local:80" targetNamespaces: "" # Cluster token for authentication with the DevZero platform # Required: Either clusterToken or patToken must be provided @@ -52,6 +51,9 @@ zxporter: # Default: error logLevel: "error" + # Disable GPU metrics collection (set to "true" for clusters without GPUs) + disableGPUMetrics: "" + # Resource names for token storage (configurable for different environments) # Input credentials Secret: Contains initial PAT/CLUSTER tokens (user-managed) tokenCredentialsSecretName: "devzero-zxporter-credentials" @@ -126,6 +128,14 @@ prometheus: evaluationInterval: 1m scrapeTimeout: 10s +# Nodemon metrics configuration +# When enabled, the nodemon DaemonSet collects metrics directly from kubelet, +# making Prometheus server, node-exporter, and kube-state-metrics unnecessary. +nodemonMetrics: + enabled: false # When true, skips Prometheus/node-exporter/kube-state-metrics deployment + +# DEPRECATED: When nodemonMetrics.enabled=true, Prometheus is not deployed. +# The nodemon DaemonSet collects metrics directly from kubelet. # Monitoring components (set to false to use your own Prometheus/Node Exporter) monitoring: enabled: true diff --git a/internal/collector/container_resource_collector.go b/internal/collector/container_resource_collector.go index 04c2ebb0..a7982416 100644 --- a/internal/collector/container_resource_collector.go +++ b/internal/collector/container_resource_collector.go @@ -11,14 +11,12 @@ import ( "sync" "time" - gpuconst "github.com/NVIDIA/KAI-scheduler/pkg/common/constants" telemetry_logger "github.com/devzero-inc/zxporter/internal/logger" + "github.com/devzero-inc/zxporter/internal/nodemon" "github.com/devzero-inc/zxporter/internal/version" "github.com/go-logr/logr" - "github.com/prometheus/client_golang/api" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/informers" @@ -36,26 +34,11 @@ type ContainerResourceCollectorConfig struct { // UpdateInterval specifies how often to collect metrics UpdateInterval time.Duration - // PrometheusURL specifies the URL of the Prometheus instance to query - // If empty, defaults to in-cluster Prometheus at http://prometheus.monitoring:9090 - PrometheusURL string - - // QueryTimeout specifies the timeout for Prometheus queries - QueryTimeout time.Duration - - // DisableNetworkIOMetrics determines whether to disable network and I/O metrics collection - // Default is false, so metrics are collected by default - DisableNetworkIOMetrics bool - // DisableGPUMetrics determines whether to disable GPU metrics collection // Default is false, so metrics are collected by default DisableGPUMetrics bool } -type gpuQueryState struct { - lastFailed bool -} - // throttleTracker tracks last emission time for CPU throttle events per container to avoid duplicates. 
type throttleTracker struct { lastEmitted map[string]time.Time // key: "ns/pod/container" → last emit time @@ -64,30 +47,32 @@ type throttleTracker struct { // ContainerResourceCollector collects container resource usage metrics type ContainerResourceCollector struct { - k8sClient kubernetes.Interface - metricsClient *metricsv1.Clientset - prometheusAPI v1.API - nodemonClient *NodemonClient - informerFactory informers.SharedInformerFactory - podInformer cache.SharedIndexInformer - batchChan chan CollectedResource // Channel for individual resources -> input to batcher - resourceChan chan []CollectedResource // Channel for batched resources -> output from batcher - batcher *ResourcesBatcher - stopCh chan struct{} - ticker *time.Ticker - config ContainerResourceCollectorConfig - namespaces []string - excludedPods map[types.NamespacedName]bool - logger logr.Logger - metrics *TelemetryMetrics - telemetryLogger telemetry_logger.Logger - mu sync.RWMutex - gpuQueryErrorState map[string]*gpuQueryState // most of the case we are not deploying zxporter to GPU nodes which cause GPU query to fail infinitely, and we dont want to get that GPU query fails error every minute for every container - gpuQueryMu sync.Mutex - throttle throttleTracker - // Removed manual cache - rsLister appslisters.ReplicaSetLister - rsInformer cache.SharedIndexInformer + k8sClient kubernetes.Interface + metricsClient *metricsv1.Clientset + nodemonClient *NodemonClient + informerFactory informers.SharedInformerFactory + podInformer cache.SharedIndexInformer + batchChan chan CollectedResource // Channel for individual resources -> input to batcher + resourceChan chan []CollectedResource // Channel for batched resources -> output from batcher + batcher *ResourcesBatcher + stopCh chan struct{} + ticker *time.Ticker + config ContainerResourceCollectorConfig + namespaces []string + excludedPods map[types.NamespacedName]bool + logger logr.Logger + metrics *TelemetryMetrics + telemetryLogger telemetry_logger.Logger + mu sync.RWMutex + throttle throttleTracker + // nodemonContainerMetricsCache holds pre-fetched container metrics from nodemon, + // indexed by "namespace/podName", refreshed once per collection cycle. + nodemonContainerMetricsCache map[string][]UnifiedContainerMetric + // networkByteRates computes per-second rates from cumulative network byte counters + // returned by kubelet stats/summary (which are totals, not rates). + networkByteRates *nodemon.RateCalculator + rsLister appslisters.ReplicaSetLister + rsInformer cache.SharedIndexInformer } // NewContainerResourceCollector creates a new collector for container resource metrics @@ -117,16 +102,6 @@ func NewContainerResourceCollector( config.UpdateInterval = 10 * time.Second } - // Default Prometheus URL if not specified - if config.PrometheusURL == "" { - config.PrometheusURL = "http://prometheus-service.monitoring.svc.cluster.local:8080" - } - - // Default query timeout if not specified - if config.QueryTimeout <= 0 { - config.QueryTimeout = 10 * time.Second - } - // Create channels batchChan := make( chan CollectedResource, @@ -143,9 +118,18 @@ func NewContainerResourceCollector( logger, ) + // Initialize nodemon client for auto-discovery in constructor + // so IsAvailable() can check it before Start() is called. 
+ ns := os.Getenv("POD_NAMESPACE") + if ns == "" { + ns = "devzero-system" + } + nodemonClient := NewNodemonClient(k8sClient, ns, logger) + return &ContainerResourceCollector{ k8sClient: k8sClient, metricsClient: metricsClient, + nodemonClient: nodemonClient, batchChan: batchChan, resourceChan: resourceChan, batcher: batcher, @@ -156,8 +140,8 @@ func NewContainerResourceCollector( logger: logger.WithName("container-resource-collector"), metrics: metrics, telemetryLogger: telemetryLogger, - gpuQueryErrorState: make(map[string]*gpuQueryState), throttle: throttleTracker{lastEmitted: make(map[string]time.Time)}, + networkByteRates: nodemon.NewRateCalculator(), } } @@ -166,7 +150,6 @@ func (c *ContainerResourceCollector) Start(ctx context.Context) error { c.logger.Info("Starting container resource collector", "namespaces", c.namespaces, "updateInterval", c.config.UpdateInterval, - "disableNetworkIOMetrics", c.config.DisableNetworkIOMetrics, "disableGPUMetrics", c.config.DisableGPUMetrics) // Check if metrics client is available @@ -174,54 +157,6 @@ func (c *ContainerResourceCollector) Start(ctx context.Context) error { return fmt.Errorf("metrics client is not available, cannot collect container resources") } - // Initialize nodemon client for auto-discovery - // It discovers DaemonSet pods by well-known label — no config needed. - if !c.config.DisableGPUMetrics { - ns := os.Getenv("POD_NAMESPACE") - if ns == "" { - ns = "devzero-system" - } - c.nodemonClient = NewNodemonClient(c.k8sClient, ns, c.logger) - c.logger.Info("Initialized nodemon client (auto-discovery)", "namespace", ns) - } - - // Initialize Prometheus client if network/IO metrics are not disabled - // Always init Prometheus when nodemon is set — we need it for comparison mode - needPrometheus := !c.config.DisableNetworkIOMetrics || !c.config.DisableGPUMetrics - if needPrometheus { - c.logger.Info("Initializing Prometheus client for network/IO or GPU metrics", - "prometheusURL", c.config.PrometheusURL) - - // Create a custom HTTP client with metrics - httpClient := NewPrometheusClient(c.metrics) - - client, err := api.NewClient(api.Config{ - Address: c.config.PrometheusURL, - Client: httpClient, - }) - if err != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Failed to create Prometheus client", - err, - map[string]string{ - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - - c.logger.Error( - err, - "Failed to create Prometheus client, network/IO and GPU metrics will be disabled", - ) - } else { - c.prometheusAPI = v1.NewAPI(client) - } - } else if c.config.DisableNetworkIOMetrics && (c.config.DisableGPUMetrics || c.nodemonClient != nil) { - c.logger.Info("Network and I/O metrics collection is disabled; GPU metrics via exporter") - } - // Create informer factory based on namespace configuration if len(c.namespaces) == 1 && c.namespaces[0] != "" { // Watch a specific namespace @@ -294,35 +229,25 @@ func (c *ContainerResourceCollector) collectResourcesLoop(ctx context.Context) { // collectAllContainerResources collects resource metrics for all containers func (c *ContainerResourceCollector) collectAllContainerResources(ctx context.Context) { - // Fetch pod metrics from the metrics server - var podMetricsList *metricsv1beta1.PodMetricsList - var err error - - if len(c.namespaces) == 1 && c.namespaces[0] != "" { - // Fetch metrics for a specific namespace - podMetricsList, err = c.metricsClient.MetricsV1beta1(). 
- PodMetricses(c.namespaces[0]). - List(ctx, metav1.ListOptions{}) - } else { - // Fetch metrics for all namespaces - podMetricsList, err = c.metricsClient.MetricsV1beta1().PodMetricses("").List(ctx, metav1.ListOptions{}) - } + // Build pod metrics from nodemon data + podMetricsList, err := c.buildPodMetricsFromNodemon(ctx) if err != nil { if c.telemetryLogger != nil { c.telemetryLogger.Report( gen.LogLevel_LOG_LEVEL_ERROR, "ContainerResourceCollector", - "Failed to get pod metrics from metrics server", + "Failed to get pod metrics", err, map[string]string{ "namespaces": fmt.Sprintf("%v", c.namespaces), - "error_type": "metrics_server_query_failed", + "error_type": "metrics_query_failed", + "source": "nodemon", "zxporter_version": version.Get().String(), }, ) } - c.logger.Error(err, "Failed to get pod metrics from metrics server") + c.logger.Error(err, "Failed to get pod metrics", "source", "nodemon") return } @@ -330,25 +255,26 @@ func (c *ContainerResourceCollector) collectAllContainerResources(ctx context.Co c.telemetryLogger.Report( gen.LogLevel_LOG_LEVEL_INFO, "ContainerResourceCollector", - "Successfully fetched pod metrics from metrics server", + "Successfully fetched pod metrics", nil, map[string]string{ "pod_count": fmt.Sprintf("%d", len(podMetricsList.Items)), "namespaces": fmt.Sprintf("%v", c.namespaces), - "event_type": "metrics_server_query_success", + "source": "nodemon", + "event_type": "metrics_query_success", "zxporter_version": version.Get().String(), }, ) } - // Pre-fetch GPU metrics from the nodemon (one HTTP call for the entire cycle) - var gpuIndex map[gpuContainerKey][]NodemonMetric - if c.nodemonClient != nil && !c.config.DisableGPUMetrics { - allGPUMetrics, err := c.nodemonClient.FetchAllMetrics(ctx) + // Pre-fetch container metrics from nodemon for network/IO/throttle (one call per cycle) + c.nodemonContainerMetricsCache = nil + if c.nodemonClient != nil { + allContainerMetrics, err := c.nodemonClient.FetchAllContainerMetrics(ctx) if err != nil { - c.logger.Error(err, "Failed to fetch GPU metrics from nodemon, falling back") + c.logger.Error(err, "Failed to fetch container metrics from nodemon") } else { - gpuIndex = IndexByContainer(allGPUMetrics) + c.nodemonContainerMetricsCache = indexContainerMetricsByPod(allContainerMetrics) } } @@ -368,18 +294,10 @@ func (c *ContainerResourceCollector) collectAllContainerResources(ctx context.Co continue } - // Create a context with timeout for Prometheus queries if needed - var queryCtx context.Context - var cancel context.CancelFunc - if c.prometheusAPI != nil { - queryCtx, cancel = context.WithTimeout(ctx, c.config.QueryTimeout) - defer cancel() - } - - // Fetch network metrics + // Fetch network metrics from nodemon var networkMetrics map[string]float64 - if !c.config.DisableNetworkIOMetrics && c.prometheusAPI != nil && queryCtx != nil { - networkMetrics = c.collectPodNetworkMetrics(queryCtx, pod) + if c.nodemonContainerMetricsCache != nil { + networkMetrics = c.collectPodNetworkMetrics(ctx, pod) } // Process each container's metrics @@ -388,111 +306,60 @@ func (c *ContainerResourceCollector) collectAllContainerResources(ctx context.Co var ioMetrics map[string]float64 var gpuMetrics map[string]interface{} var throttleFraction float64 - if c.prometheusAPI != nil && queryCtx != nil { - - // Fetch CPU throttle metrics for this container - throttleFraction, err = c.collectContainerCPUThrottleMetrics( - queryCtx, - pod, - containerMetrics.Name, - ) - if err != nil { - c.logger.Error(err, "Failed to collect CPU throttle 
metrics", - "namespace", podMetrics.Namespace, - "pod", podMetrics.Name, - "container", containerMetrics.Name) - // Continue with 0 throttle fraction - throttleFraction = 0 - } - // Emit CPU throttle event if fraction exceeds threshold - if throttleFraction > 0.1 { - c.emitCPUThrottleEvent(pod, containerMetrics, throttleFraction) - } + // Collect CPU throttle metrics from nodemon + if c.nodemonContainerMetricsCache != nil { + throttleFraction = c.collectContainerCPUThrottleMetrics(ctx, pod, containerMetrics.Name) + } - // Fetch I/O metrics for this container - if !c.config.DisableNetworkIOMetrics { - ioMetrics = c.collectContainerIOMetrics( - queryCtx, - pod, - containerMetrics.Name, - ) - } + // Emit CPU throttle event if fraction exceeds threshold + if throttleFraction > 0.1 { + c.emitCPUThrottleEvent(pod, containerMetrics, throttleFraction) } - // GPU metrics collection + // Collect I/O metrics for this container from nodemon + if c.nodemonContainerMetricsCache != nil { + ioMetrics = c.collectContainerIOMetrics(ctx, pod, containerMetrics.Name) + } + + // GPU metrics: requests/limits from pod spec + usage from nodemon + gpuMetrics = make(map[string]interface{}) if !c.config.DisableGPUMetrics { - // Check if container has GPU resources - hasGPU := false - var gpuRequestCount, gpuLimitCount int64 + // GPU requests/limits from the pod spec (nvidia.com/gpu resource) for i := range pod.Spec.Containers { if pod.Spec.Containers[i].Name == containerMetrics.Name { - requests := pod.Spec.Containers[i].Resources.Requests - limits := pod.Spec.Containers[i].Resources.Limits - - if requests != nil { - if gpuReq, ok := requests[gpuconst.GpuResource]; ok && - gpuReq.Value() > 0 { - hasGPU = true - gpuRequestCount = gpuReq.Value() + if pod.Spec.Containers[i].Resources.Requests != nil { + if gpuReq, ok := pod.Spec.Containers[i].Resources.Requests[corev1.ResourceName("nvidia.com/gpu")]; ok { + gpuMetrics["GPURequestCount"] = gpuReq.Value() } } - - if limits != nil { - if gpuLim, ok := limits[gpuconst.GpuResource]; ok && - gpuLim.Value() > 0 { - hasGPU = true - gpuLimitCount = gpuLim.Value() + if pod.Spec.Containers[i].Resources.Limits != nil { + if gpuLim, ok := pod.Spec.Containers[i].Resources.Limits[corev1.ResourceName("nvidia.com/gpu")]; ok { + gpuMetrics["GPULimitCount"] = gpuLim.Value() } } break } } - if hasGPU { - if gpuIndex != nil { - // Primary: nodemon - key := gpuContainerKey{ - Pod: podMetrics.Name, - Container: containerMetrics.Name, - Namespace: podMetrics.Namespace, - } - if containerGPUs, ok := gpuIndex[key]; ok { - gpuMetrics = ContainerGPUMetricsFromNodemon( - containerGPUs, - gpuRequestCount, - gpuLimitCount, - ) - } else { - // Exporter running but no GPU data for this container — fallback to Prometheus - if c.prometheusAPI != nil && queryCtx != nil { - gpuMetrics, err = c.collectContainerGPUMetrics(queryCtx, pod, containerMetrics.Name) - if err != nil { - c.logger.Error(err, "Prometheus fallback failed for container GPU metrics", - "namespace", podMetrics.Namespace, - "pod", podMetrics.Name, - "container", containerMetrics.Name) - gpuMetrics = make(map[string]interface{}) - } - } else { - gpuMetrics = make(map[string]interface{}) + // GPU usage metrics from nodemon unified endpoint + if c.nodemonContainerMetricsCache != nil { + key := pod.Namespace + "/" + pod.Name + if containers, ok := c.nodemonContainerMetricsCache[key]; ok { + for _, m := range containers { + if m.Container == containerMetrics.Name && m.GPUUtilization > 0 { + gpuMetrics["GPUUsage"] = m.GPUUtilization + 
gpuMetrics["GPUUtilizationPercentage"] = m.GPUUtilization + gpuMetrics["GPUMemoryUsedMb"] = m.GPUMemoryUsedMiB + gpuMetrics["GPUMemoryFreeMb"] = m.GPUMemoryFreeMiB + gpuMetrics["GPUPowerUsageWatts"] = m.GPUPowerWatts + gpuMetrics["GPUTemperatureCelsius"] = m.GPUTemperature + gpuMetrics["GPUTotalMemoryMb"] = m.GPUMemoryUsedMiB + m.GPUMemoryFreeMiB + gpuMetrics["GPUMetricsCount"] = int64(1) + break } } - } else if c.prometheusAPI != nil && queryCtx != nil { - // No nodemon available — use Prometheus - gpuMetrics, err = c.collectContainerGPUMetrics(queryCtx, pod, containerMetrics.Name) - if err != nil { - c.logger.Error(err, "Failed to collect container GPU metrics", - "namespace", podMetrics.Namespace, - "pod", podMetrics.Name, - "container", containerMetrics.Name) - gpuMetrics = make(map[string]interface{}) - } - } else { - gpuMetrics = make(map[string]interface{}) } - } else { - gpuMetrics = make(map[string]interface{}) } } @@ -695,203 +562,174 @@ func (c *ContainerResourceCollector) processContainerMetrics( } } -// collectPodNetworkMetrics collects network metrics for a pod using Prometheus queries -func (c *ContainerResourceCollector) collectPodNetworkMetrics( - ctx context.Context, - pod *corev1.Pod, -) map[string]float64 { - metrics := make(map[string]float64) - - // Define queries for network metrics - queries := map[string]string{ - "NetworkReceiveBytes": fmt.Sprintf( - `sum(rate(container_network_receive_bytes_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkTransmitBytes": fmt.Sprintf( - `sum(rate(container_network_transmit_bytes_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkReceivePackets": fmt.Sprintf( - `sum(rate(container_network_receive_packets_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkTransmitPackets": fmt.Sprintf( - `sum(rate(container_network_transmit_packets_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkReceiveErrors": fmt.Sprintf( - `sum(rate(container_network_receive_errors_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkTransmitErrors": fmt.Sprintf( - `sum(rate(container_network_transmit_errors_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkReceiveDropped": fmt.Sprintf( - `sum(rate(container_network_receive_packets_dropped_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), - "NetworkTransmitDropped": fmt.Sprintf( - `sum(rate(container_network_transmit_packets_dropped_total{namespace="%s", pod="%s"}[5m]))`, - pod.Namespace, - pod.Name, - ), + +// buildPodMetricsFromNodemon fetches container metrics from nodemon and converts them +// into a PodMetricsList compatible with the metrics-server format. This allows the rest +// of collectAllContainerResources to work unchanged — CPU/memory come from nodemon's +// stats/summary data (usageNanoCores, workingSetBytes) instead of the metrics-server API. 
+func (c *ContainerResourceCollector) buildPodMetricsFromNodemon(ctx context.Context) (*metricsv1beta1.PodMetricsList, error) { + allMetrics, err := c.nodemonClient.FetchAllContainerMetrics(ctx) + if err != nil { + return nil, fmt.Errorf("failed to fetch container metrics from nodemon: %w", err) + } + + // Group by pod + podMap := make(map[string]*metricsv1beta1.PodMetrics) + for _, m := range allMetrics { + key := m.Namespace + "/" + m.Pod + pm, exists := podMap[key] + if !exists { + pm = &metricsv1beta1.PodMetrics{ + ObjectMeta: metav1.ObjectMeta{ + Name: m.Pod, + Namespace: m.Namespace, + }, + } + podMap[key] = pm + } + + // Convert nanocores to resource.Quantity (millicores) + cpuMillis := int64(m.CPUUsageNanoCores / 1_000_000) + cpuQuantity := *resource.NewMilliQuantity(cpuMillis, resource.DecimalSI) + + // Memory in bytes + memQuantity := *resource.NewQuantity(int64(m.MemoryWorkingSet), resource.BinarySI) + + pm.Containers = append(pm.Containers, metricsv1beta1.ContainerMetrics{ + Name: m.Container, + Usage: corev1.ResourceList{ + corev1.ResourceCPU: cpuQuantity, + corev1.ResourceMemory: memQuantity, + }, + }) } - // Execute each query and store the result - for metricName, query := range queries { - metrics[metricName] = 0 // Default to 0 for all metrics + // Build the list + result := &metricsv1beta1.PodMetricsList{} + for _, pm := range podMap { + result.Items = append(result.Items, *pm) + } - result, _, err := c.prometheusAPI.Query(ctx, query, time.Now()) - if err != nil { - c.logger.Error(err, "Error querying Prometheus", - "metric", metricName, - "query", query) - continue - } + c.logger.V(1).Info("Built pod metrics from nodemon", "pods", len(result.Items), "containers", len(allMetrics)) + return result, nil +} - // Extract value from result (if any) - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - metrics[metricName] = float64(vector[0].Value) - } - } +// indexContainerMetricsByPod builds a lookup map keyed by "namespace/podName" from a +// flat slice of UnifiedContainerMetric returned by nodemon. +func indexContainerMetricsByPod(metrics []UnifiedContainerMetric) map[string][]UnifiedContainerMetric { + idx := make(map[string][]UnifiedContainerMetric, len(metrics)) + for _, m := range metrics { + key := m.Namespace + "/" + m.Pod + idx[key] = append(idx[key], m) } + return idx +} + +// collectPodNetworkMetrics returns pod-level network metrics from the +// pre-fetched nodemon container metrics cache. +func (c *ContainerResourceCollector) collectPodNetworkMetrics( + _ context.Context, + pod *corev1.Pod, +) map[string]float64 { + metrics := map[string]float64{ + "NetworkReceiveBytes": 0, + "NetworkTransmitBytes": 0, + "NetworkReceivePackets": 0, + "NetworkTransmitPackets": 0, + "NetworkReceiveErrors": 0, + "NetworkTransmitErrors": 0, + "NetworkReceiveDropped": 0, + "NetworkTransmitDropped": 0, + } + + key := pod.Namespace + "/" + pod.Name + containerMetrics, ok := c.nodemonContainerMetricsCache[key] + if !ok || len(containerMetrics) == 0 { + return metrics + } + + // Network metrics are pod-level (shared network namespace). Pick the first + // container's values since nodemon reports identical pod-level counters for each. + // + // NetworkRxBytes/TxBytes from stats/summary are CUMULATIVE totals. + // The MPA proto expects bytes/sec (NetworkReceiveBytesPerSec), so we compute + // per-second rates from successive samples using RateCalculator. + // Packet/error/drop rates are already per-second from the cAdvisor scraper. 
+ m := containerMetrics[0] + now := time.Now() + podKey := pod.Namespace + "/" + pod.Name + metrics["NetworkReceiveBytes"] = c.networkByteRates.Rate(podKey, "rx_bytes", float64(m.NetworkRxBytes), now) + metrics["NetworkTransmitBytes"] = c.networkByteRates.Rate(podKey, "tx_bytes", float64(m.NetworkTxBytes), now) + metrics["NetworkReceivePackets"] = m.NetworkRxPacketsPerSec + metrics["NetworkTransmitPackets"] = m.NetworkTxPacketsPerSec + metrics["NetworkReceiveErrors"] = m.NetworkRxErrorsPerSec + metrics["NetworkTransmitErrors"] = m.NetworkTxErrorsPerSec + metrics["NetworkReceiveDropped"] = m.NetworkRxDropsPerSec + metrics["NetworkTransmitDropped"] = m.NetworkTxDropsPerSec return metrics } -// collectContainerIOMetrics collects I/O metrics for a container using Prometheus queries +// collectContainerIOMetrics returns container-level disk I/O metrics from +// the pre-fetched nodemon cache. func (c *ContainerResourceCollector) collectContainerIOMetrics( - ctx context.Context, + _ context.Context, pod *corev1.Pod, containerName string, ) map[string]float64 { - metrics := make(map[string]float64) - - // Define queries for I/O metrics - queries := map[string]string{ - "FSReadBytes": fmt.Sprintf( - `sum(rate(container_fs_reads_bytes_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, - pod.Name, - containerName, - ), - "FSWriteBytes": fmt.Sprintf( - `sum(rate(container_fs_writes_bytes_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, - pod.Name, - containerName, - ), - "FSReads": fmt.Sprintf( - `sum(rate(container_fs_reads_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, - pod.Name, - containerName, - ), - "FSWrites": fmt.Sprintf( - `sum(rate(container_fs_writes_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, - pod.Name, - containerName, - ), + metrics := map[string]float64{ + "FSReadBytes": 0, + "FSWriteBytes": 0, + "FSReads": 0, + "FSWrites": 0, } - // Execute each query and store the result - for metricName, query := range queries { - metrics[metricName] = 0 // Default to 0 for all metrics - - result, _, err := c.prometheusAPI.Query(ctx, query, time.Now()) - if err != nil { - c.logger.Error(err, "Error querying Prometheus", - "metric", metricName, - "query", query, - "container", containerName) - continue - } + key := pod.Namespace + "/" + pod.Name + containerMetrics, ok := c.nodemonContainerMetricsCache[key] + if !ok { + return metrics + } - // Extract value from result (if any) - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - metrics[metricName] = float64(vector[0].Value) - } + for _, m := range containerMetrics { + if m.Container == containerName { + metrics["FSReadBytes"] = m.DiskReadBytesPerSec + metrics["FSWriteBytes"] = m.DiskWriteBytesPerSec + metrics["FSReads"] = m.DiskReadOpsPerSec + metrics["FSWrites"] = m.DiskWriteOpsPerSec + return metrics } } return metrics } -// collectContainerCPUThrottleMetrics collects CFS bandwidth throttle metrics for a container. -// Returns the throttle fraction (0.0-1.0): rate(throttled_periods) / rate(total_periods). +// collectContainerCPUThrottleMetrics returns the CPU throttle fraction from +// the pre-fetched nodemon cache for a specific container. 
func (c *ContainerResourceCollector) collectContainerCPUThrottleMetrics( - ctx context.Context, + _ context.Context, pod *corev1.Pod, containerName string, -) (float64, error) { - throttledQuery := fmt.Sprintf( - `sum(rate(container_cpu_cfs_throttled_periods_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, - pod.Name, - containerName, - ) - totalQuery := fmt.Sprintf( - `sum(rate(container_cpu_cfs_periods_total{namespace="%s", pod="%s", container="%s"}[5m]))`, - pod.Namespace, pod.Name, containerName, - ) - - queryTime := time.Now() - - // Query throttled periods rate - throttledResult, _, err := c.prometheusAPI.Query(ctx, throttledQuery, queryTime) - if err != nil { - return 0, fmt.Errorf("error querying throttled periods: %w", err) - } - - // Query total periods rate - totalResult, _, err := c.prometheusAPI.Query(ctx, totalQuery, queryTime) - if err != nil { - return 0, fmt.Errorf("error querying total periods: %w", err) - } - - // Extract values - var throttledRate, totalRate float64 - if throttledResult.Type() == model.ValVector { - vector := throttledResult.(model.Vector) - if len(vector) > 0 { - throttledRate = float64(vector[0].Value) - } - } - if totalResult.Type() == model.ValVector { - vector := totalResult.(model.Vector) - if len(vector) > 0 { - totalRate = float64(vector[0].Value) +) float64 { + key := pod.Namespace + "/" + pod.Name + containerMetrics, ok := c.nodemonContainerMetricsCache[key] + if !ok { + return 0 + } + + for _, m := range containerMetrics { + if m.Container == containerName { + fraction := m.CPUThrottleFraction + if fraction < 0 || math.IsNaN(fraction) { + return 0 + } + if fraction > 1 { + return 1 + } + return fraction } } - // Compute fraction; return 0 if no CFS data or NaN/Inf values from Prometheus - if totalRate <= 0 || math.IsNaN(totalRate) || math.IsInf(totalRate, 0) || - math.IsNaN(throttledRate) || math.IsInf(throttledRate, 0) { - return 0, nil - } - - fraction := throttledRate / totalRate - if fraction < 0 || math.IsNaN(fraction) { - fraction = 0 - } - if fraction > 1 { - fraction = 1 - } - return fraction, nil + return 0 } // emitCPUThrottleEvent sends a CPU throttle event through the batch channel with 5-minute deduplication. 
@@ -960,263 +798,6 @@ func (c *ContainerResourceCollector) emitCPUThrottleEvent( } } -// collectContainerGPUMetrics collects GPU metrics for a container using Prometheus queries -// -//nolint:gocyclo -func (c *ContainerResourceCollector) collectContainerGPUMetrics( - ctx context.Context, - pod *corev1.Pod, - containerName string, -) (map[string]interface{}, error) { - metrics := make(map[string]interface{}) - - namespace := pod.Namespace - podName := pod.Name - baseLabels := fmt.Sprintf( - `namespace="%s", pod="%s", container="%s"`, - namespace, - podName, - containerName, - ) - queryTime := time.Now() - - stateKey := fmt.Sprintf("%s/%s/%s", pod.Namespace, pod.Name, containerName) - gpuCountQuery := fmt.Sprintf(`count(DCGM_FI_DEV_GPU_UTIL{%s})`, baseLabels) - - c.gpuQueryMu.Lock() - state, exists := c.gpuQueryErrorState[stateKey] - if !exists { - state = &gpuQueryState{lastFailed: false} - c.gpuQueryErrorState[stateKey] = state - } - c.gpuQueryMu.Unlock() - - result, _, err := c.prometheusAPI.Query(ctx, gpuCountQuery, queryTime) - if err != nil { - // BEGIN: latching - c.gpuQueryMu.Lock() - defer c.gpuQueryMu.Unlock() - if !state.lastFailed { - // Report/log the FIRST failure - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - fmt.Sprintf("Failed querying GPU metric for %s", stateKey), - err, - map[string]string{ - "namespace": pod.Namespace, - "pod": pod.Name, - "container": containerName, - "error_type": "prometheus_gpu_query_failed", - "prometheus_url": c.config.PrometheusURL, - "query": gpuCountQuery, - "zxporter_version": version.Get().String(), - }, - ) - state.lastFailed = true - } - return nil, fmt.Errorf("error querying GPU availability: %w", err) - } - - // On success, if previously had error then only log a success transition - c.gpuQueryMu.Lock() - if state.lastFailed { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_INFO, - "ContainerResourceCollector", - fmt.Sprintf("GPU metric query succeeded again for %s", stateKey), - nil, - map[string]string{ - "namespace": pod.Namespace, - "pod": pod.Name, - "container": containerName, - "event_type": "prometheus_gpu_query_succeed", - "prometheus_url": c.config.PrometheusURL, - "query": gpuCountQuery, - "zxporter_version": version.Get().String(), - }, - ) - state.lastFailed = false - } - c.gpuQueryMu.Unlock() - - gpuCount := 0.0 - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - gpuCount = float64(vector[0].Value) - } - } - - if gpuCount == 0 { - return metrics, nil - } - - metrics["GPUMetricsCount"] = gpuCount - - metricDefinitions := []struct { - name string // Human-readable name for logs - promMetric string // Prometheus metric name - resultKey string // Field name in individual GPUs - aggregateOp string // Aggregation operation (sum, avg) - metricKey string // Field name in aggregate metrics - }{ - { - "GPU Utilization", - "DCGM_FI_DEV_GPU_UTIL", - "Utilization", - "avg", - "GPUUtilizationPercentage", - }, - {"Memory Used", "DCGM_FI_DEV_FB_USED", "MemoryUsed", "sum", "GPUMemoryUsedMb"}, - {"Memory Free", "DCGM_FI_DEV_FB_FREE", "MemoryFree", "sum", "GPUMemoryFreeMb"}, - {"Power Usage", "DCGM_FI_DEV_POWER_USAGE", "PowerUsage", "sum", "GPUPowerUsageWatts"}, - {"Temperature", "DCGM_FI_DEV_GPU_TEMP", "Temperature", "avg", "GPUTemperatureCelsius"}, - {"SM Clock", "DCGM_FI_DEV_SM_CLOCK", "SMClock", "avg", "GPUSMClockMHz"}, - {"Memory Clock", "DCGM_FI_DEV_MEM_CLOCK", "MemClock", "avg", "GPUMemClockMHz"}, - } - - individualGPUs := 
make(map[string]map[string]interface{}) - gpuUUIDSet := make(map[string]bool) - gpuModels := make(map[string]int) - - // Process each metric with a single query - for _, def := range metricDefinitions { - query := fmt.Sprintf(`%s{%s}`, def.promMetric, baseLabels) - - result, _, err := c.prometheusAPI.Query(ctx, query, queryTime) - if err != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Error querying GPU metric", - err, - map[string]string{ - "namespace": pod.Namespace, - "pod": pod.Name, - "container": containerName, - "error_type": "prometheus_gpu_query_failed", - "query": query, - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - c.logger.Error(err, "Error querying GPU metric", - "metric", def.name, "query", query) - continue - } - - // Calculate aggregate metrics - var aggValue float64 - var validSamples float64 - - if result.Type() == model.ValVector { - vector := result.(model.Vector) - - for _, sample := range vector { - uuid := string(sample.Metric["UUID"]) - if uuid == "" { - continue - } - - gpuUUIDSet[uuid] = true - - model := string(sample.Metric["modelName"]) - if model != "" { - gpuModels[model]++ - } - - if _, exists := individualGPUs[uuid]; !exists { - individualGPUs[uuid] = map[string]interface{}{ - "UUID": uuid, - "ModelName": model, - "DeviceIndex": string(sample.Metric["device"]), - } - } - - value := float64(sample.Value) - individualGPUs[uuid][def.resultKey] = value - - if def.aggregateOp == "sum" { - aggValue += value - } else { // "avg" - aggValue += value - validSamples++ - } - } - - if def.aggregateOp == "avg" && validSamples > 0 { - aggValue /= validSamples - } - - metrics[def.metricKey] = aggValue - } - } - - for _, gpu := range individualGPUs { - memUsed, hasUsed := gpu["MemoryUsed"].(float64) - memFree, hasFree := gpu["MemoryFree"].(float64) - - if hasUsed && hasFree { - totalMem := memUsed + memFree - gpu["TotalMemory"] = totalMem - gpu["MemoryUtilizationPercentage"] = (memUsed / totalMem) * 100 - } - } - - gpuUUIDs := make([]string, 0, len(gpuUUIDSet)) - for uuid := range gpuUUIDSet { - gpuUUIDs = append(gpuUUIDs, uuid) - } - - modelSummary := make([]string, 0, len(gpuModels)) - for model, count := range gpuModels { - modelSummary = append(modelSummary, fmt.Sprintf("%dx %s", count, model)) - } - - metrics["GPUModels"] = modelSummary - metrics["GPUUUIDs"] = gpuUUIDs - - // Convert individual GPU map to array for JSON serialization - gpuArray := make([]map[string]interface{}, 0, len(individualGPUs)) - for _, gpu := range individualGPUs { - gpuArray = append(gpuArray, gpu) - } - metrics["IndividualGPUs"] = gpuArray - - // GPUUsage = (utilization percentage * GPU count) / 100 - if gpuCount > 0 { - if gpuUtil, ok := metrics["GPUUtilizationPercentage"].(float64); ok { - metrics["GPUUsage"] = (gpuUtil * gpuCount) / 100.0 - } - } - - // Calculate total memory - if memUsed, ok := metrics["GPUMemoryUsedMb"].(float64); ok { - if memFree, ok := metrics["GPUMemoryFreeMb"].(float64); ok { - metrics["GPUTotalMemoryMb"] = memUsed + memFree - } - } - - // Get resource requests and limits from container spec - for i := range pod.Spec.Containers { - if pod.Spec.Containers[i].Name == containerName { - requests := pod.Spec.Containers[i].Resources.Requests - limits := pod.Spec.Containers[i].Resources.Limits - - if gpuReq, ok := requests[gpuconst.GpuResource]; ok { - metrics["GPURequestCount"] = gpuReq.Value() - } - - if gpuLim, ok := limits[gpuconst.GpuResource]; ok { - 
metrics["GPULimitCount"] = gpuLim.Value() - } - break - } - } - - return metrics, nil -} // getPodFromCache retrieves a pod from the informer cache func (c *ContainerResourceCollector) getPodFromCache(namespace, name string) (*corev1.Pod, error) { @@ -1296,88 +877,12 @@ func (c *ContainerResourceCollector) GetType() string { return "container_resource" } -// IsAvailable always returns true - actual availability is checked during collection +// IsAvailable checks whether the collector can operate. +// Always returns true — nodemon pods are discovered dynamically and may not be +// ready at the instant IsAvailable is called during startup. The collection loop +// gracefully handles empty nodemon responses. func (c *ContainerResourceCollector) IsAvailable(ctx context.Context) bool { - if c.metricsClient == nil { - c.logger.Info("Metrics client is not available, cannot collect container resources") - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Metrics client is not available or set properly, cannot collect container resources", - fmt.Errorf("metrics server client is not available or set"), - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - // return false - } - - // Try to list pod metrics to check metrics API availability - _, err := c.metricsClient.MetricsV1beta1().PodMetricses("").List(ctx, metav1.ListOptions{ - Limit: 1, // Only request a single item to minimize load - }) - if err != nil { - c.logger.Info("Metrics server API not available", "error", err.Error()) - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Metrics server API not available", - err, - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - // return false - } - - // If network/IO and GPU metrics are not disabled, also check Prometheus availability - if !c.config.DisableNetworkIOMetrics && !c.config.DisableGPUMetrics { - if c.prometheusAPI == nil { - c.logger.Info("Prometheus client is not available for network/IO or GPU metrics") - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Prometheus client is not available for network/IO or GPU metrics", - fmt.Errorf("prometehus client not available or set properly"), - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - // Still return true since the main metrics are available - // return true - } - - // Try a simple query to check if Prometheus is available - if c.prometheusAPI != nil { - queryCtx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - _, _, err = c.prometheusAPI.Query(queryCtx, "up", time.Now()) - if err != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "ContainerResourceCollector", - "Prometheus API not available for network and I/O metrics", - err, - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - c.logger.Info( - "Prometheus API not available for network and I/O metrics", - "error", - err.Error(), - ) - // Still return true since the main metrics are available - } - } - } - - return true + return c.nodemonClient != nil } // AddResource is a no-op for container resource collector - we never sync individual containers diff --git a/internal/collector/historical_metrics_collector.go b/internal/collector/historical_metrics_collector.go deleted file mode 100644 
index 47dffddc..00000000 --- a/internal/collector/historical_metrics_collector.go +++ /dev/null @@ -1,318 +0,0 @@ -package collector - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/go-logr/logr" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" - "google.golang.org/protobuf/types/known/timestamppb" - - gen "github.com/devzero-inc/zxporter/gen/api/v1" - "github.com/devzero-inc/zxporter/internal/health" -) - -const ( - historicalWindow = 24 * time.Hour - historicalStepInterval = "1m" - maxConcurrentQueries = 10 -) - -// HistoricalWorkloadQuery defines what to query for a workload. -type HistoricalWorkloadQuery struct { - Namespace string - WorkloadName string - WorkloadKind string - PodRegex string // e.g., "web-app-.*" - Containers []string // container names to query -} - -// HistoricalMetricsCollector queries Prometheus for historical CPU/memory percentiles. -type HistoricalMetricsCollector struct { - logger logr.Logger - prometheusAPI v1.API - semaphore chan struct{} // limits concurrent Prometheus queries - healthManager *health.HealthManager -} - -// NewHistoricalMetricsCollector creates a new collector. -func NewHistoricalMetricsCollector( - logger logr.Logger, - prometheusAPI v1.API, - healthManager *health.HealthManager, -) *HistoricalMetricsCollector { - return &HistoricalMetricsCollector{ - logger: logger.WithName("historical-metrics"), - prometheusAPI: prometheusAPI, - semaphore: make(chan struct{}, maxConcurrentQueries), - healthManager: healthManager, - } -} - -// FetchPercentiles queries Prometheus for 24h percentiles for a workload. -func (c *HistoricalMetricsCollector) FetchPercentiles( - ctx context.Context, - workload HistoricalWorkloadQuery, -) (*gen.HistoricalMetricsSummary, error) { - now := time.Now() - windowStart := now.Add(-historicalWindow) - - containerResults := make([]*gen.ContainerHistoricalMetrics, 0, len(workload.Containers)) - var totalSamples int32 - - for _, containerName := range workload.Containers { - metrics, samples := c.fetchContainerPercentiles(ctx, workload, containerName, now) - containerResults = append(containerResults, metrics) - if samples > totalSamples { - totalSamples = samples - } - } - - c.updateHealthStatus( - health.HealthStatusHealthy, - "Prometheus queries succeeding", - map[string]string{"workload ->": workload.WorkloadName}, - ) - - return &gen.HistoricalMetricsSummary{ - Workload: &gen.MpaWorkloadIdentifier{ - Namespace: workload.Namespace, - Name: workload.WorkloadName, - Kind: workload.WorkloadKind, - }, - Containers: containerResults, - WindowStart: timestamppb.New(windowStart), - WindowEnd: timestamppb.New(now), - SampleCount: totalSamples, - }, nil -} - -// FetchPercentilesForAll queries Prometheus for all workloads concurrently with rate limiting. 
-func (c *HistoricalMetricsCollector) FetchPercentilesForAll( - ctx context.Context, - workloads []HistoricalWorkloadQuery, -) map[string]*gen.HistoricalMetricsSummary { - results := make(map[string]*gen.HistoricalMetricsSummary) - var mu sync.Mutex - var wg sync.WaitGroup - - for _, w := range workloads { - wg.Add(1) - go func(workload HistoricalWorkloadQuery) { - defer wg.Done() - - // Rate limit - c.semaphore <- struct{}{} - defer func() { <-c.semaphore }() - - summary, err := c.FetchPercentiles(ctx, workload) - if err != nil { - c.logger.Error(err, "Failed to fetch historical metrics", - "workload", workload.WorkloadName) - return - } - - key := fmt.Sprintf( - "%s/%s/%s", - workload.Namespace, - workload.WorkloadKind, - workload.WorkloadName, - ) - mu.Lock() - results[key] = summary - mu.Unlock() - }(w) - } - - wg.Wait() - return results -} - -// DiscoverContainers discovers container names for a workload from Prometheus. -func (c *HistoricalMetricsCollector) DiscoverContainers( - ctx context.Context, - namespace, podRegex string, -) ([]string, error) { - query := fmt.Sprintf( - `group by (container) (container_memory_working_set_bytes{namespace="%s", pod=~"%s", container!="", container!="POD"})`, - namespace, - podRegex, - ) - result, _, err := c.prometheusAPI.Query(ctx, query, time.Now()) - if err != nil { - c.updateHealthStatus( - health.HealthStatusDegraded, - "Prometheus query failed", - map[string]string{"error": err.Error()}, - ) - return nil, err - } - - vec, ok := result.(model.Vector) - if !ok { - return nil, fmt.Errorf("unexpected result type: %T", result) - } - - var containers []string - for _, sample := range vec { - if name := string(sample.Metric["container"]); name != "" { - containers = append(containers, name) - } - } - return containers, nil -} - -func (c *HistoricalMetricsCollector) fetchContainerPercentiles( - ctx context.Context, - workload HistoricalWorkloadQuery, - containerName string, - now time.Time, -) (*gen.ContainerHistoricalMetrics, int32) { - percentiles := []float64{0.50, 0.75, 0.80, 0.90, 0.99} - - cpuValues := make(map[float64]int64) - memValues := make(map[float64]int64) - - var sampleCount int32 - - for _, p := range percentiles { - // CPU query: rate over 5m, percentile over 24h - cpuQuery := fmt.Sprintf( - `quantile_over_time(%.2f, rate(container_cpu_usage_seconds_total{namespace="%s", pod=~"%s", container="%s"}[5m])[24h:%s])`, - p, - workload.Namespace, - workload.PodRegex, - containerName, - historicalStepInterval, - ) - - cpuVal, err := c.queryScalar(ctx, cpuQuery, now) - if err != nil { - c.logger.V(1).Info("CPU percentile query failed", - "percentile", p, "error", err) - } else { - // Convert from cores (float) to millicores (int) - cpuValues[p] = int64(cpuVal * 1000) - } - - // Memory query: direct percentile over 24h - memQuery := fmt.Sprintf( - `quantile_over_time(%.2f, container_memory_working_set_bytes{namespace="%s", pod=~"%s", container="%s"}[24h])`, - p, - workload.Namespace, - workload.PodRegex, - containerName, - ) - - memVal, err := c.queryScalar(ctx, memQuery, now) - if err != nil { - c.logger.V(1).Info("Memory percentile query failed", - "percentile", p, "error", err) - } else { - memValues[p] = int64(memVal) - } - } - - // Estimate sample count from Prometheus - countQuery := fmt.Sprintf( - `count_over_time(container_memory_working_set_bytes{namespace="%s", pod=~"%s", container="%s"}[24h])`, - workload.Namespace, - workload.PodRegex, - containerName, - ) - countVal, err := c.queryScalar(ctx, countQuery, now) - if err == nil { - 
sampleCount = int32(countVal) - } - - // Pmax queries: max observed value over 24h for spike protection - var cpuPmax, memPmax int64 - - cpuPmaxQuery := fmt.Sprintf( - `max_over_time(rate(container_cpu_usage_seconds_total{namespace="%s", pod=~"%s", container="%s"}[5m])[24h:%s])`, - workload.Namespace, - workload.PodRegex, - containerName, - historicalStepInterval, - ) - cpuPmaxVal, err := c.queryScalar(ctx, cpuPmaxQuery, now) - if err != nil { - c.logger.V(1).Info("CPU pmax query failed", "error", err) - } else { - cpuPmax = int64(cpuPmaxVal * 1000) // Convert cores to millicores - } - - memPmaxQuery := fmt.Sprintf( - `max_over_time(container_memory_working_set_bytes{namespace="%s", pod=~"%s", container="%s"}[24h])`, - workload.Namespace, - workload.PodRegex, - containerName, - ) - memPmaxVal, err := c.queryScalar(ctx, memPmaxQuery, now) - if err != nil { - c.logger.V(1).Info("Memory pmax query failed", "error", err) - } else { - memPmax = int64(memPmaxVal) - } - - return &gen.ContainerHistoricalMetrics{ - ContainerName: containerName, - CpuP50: cpuValues[0.50], - CpuP75: cpuValues[0.75], - CpuP80: cpuValues[0.80], - CpuP90: cpuValues[0.90], - CpuP99: cpuValues[0.99], - MemP50: memValues[0.50], - MemP75: memValues[0.75], - MemP80: memValues[0.80], - MemP90: memValues[0.90], - MemP99: memValues[0.99], - CpuPmax: cpuPmax, - MemPmax: memPmax, - }, sampleCount -} - -func (c *HistoricalMetricsCollector) queryScalar( - ctx context.Context, - query string, - ts time.Time, -) (float64, error) { - result, warnings, err := c.prometheusAPI.Query(ctx, query, ts) - if err != nil { - c.updateHealthStatus( - health.HealthStatusDegraded, - "Prometheus query failed", - map[string]string{"error": err.Error()}, - ) - return 0, fmt.Errorf("prometheus query failed: %w", err) - } - if len(warnings) > 0 { - c.logger.V(1).Info("Prometheus warnings", "warnings", warnings) - } - - switch v := result.(type) { - case *model.Scalar: - return float64(v.Value), nil - case model.Vector: - if len(v) > 0 { - return float64(v[0].Value), nil - } - return 0, fmt.Errorf("empty vector result") - default: - return 0, fmt.Errorf("unexpected result type: %T", result) - } -} - -// updateHealthStatus reports Prometheus component health if a HealthManager is configured. 
-func (c *HistoricalMetricsCollector) updateHealthStatus( - status health.HealthStatus, - message string, - metadata map[string]string, -) { - if c.healthManager != nil { - c.healthManager.UpdateStatus(health.ComponentPrometheus, status, message, metadata) - } -} diff --git a/internal/collector/historical_metrics_collector_test.go b/internal/collector/historical_metrics_collector_test.go deleted file mode 100644 index 375f0657..00000000 --- a/internal/collector/historical_metrics_collector_test.go +++ /dev/null @@ -1,176 +0,0 @@ -package collector - -import ( - "context" - "testing" - "time" - - "github.com/go-logr/logr" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" -) - -// mockPrometheusAPI implements v1.API for testing -type mockPrometheusAPI struct { - queryResults map[string]model.Value -} - -func (m *mockPrometheusAPI) Query( - ctx context.Context, - query string, - ts time.Time, - opts ...v1.Option, -) (model.Value, v1.Warnings, error) { - if result, ok := m.queryResults[query]; ok { - return result, nil, nil - } - return &model.Scalar{Value: 0, Timestamp: model.TimeFromUnix(time.Now().Unix())}, nil, nil -} - -func (m *mockPrometheusAPI) QueryRange( - ctx context.Context, - query string, - r v1.Range, - opts ...v1.Option, -) (model.Value, v1.Warnings, error) { - return nil, nil, nil -} - -func (m *mockPrometheusAPI) QueryExemplars( - ctx context.Context, - query string, - startTime, endTime time.Time, -) ([]v1.ExemplarQueryResult, error) { - return nil, nil -} - -func (m *mockPrometheusAPI) Series( - ctx context.Context, - matches []string, - startTime, endTime time.Time, - opts ...v1.Option, -) ([]model.LabelSet, v1.Warnings, error) { - return nil, nil, nil -} - -func (m *mockPrometheusAPI) LabelNames( - ctx context.Context, - matches []string, - startTime, endTime time.Time, - opts ...v1.Option, -) ([]string, v1.Warnings, error) { - return nil, nil, nil -} - -func (m *mockPrometheusAPI) LabelValues( - ctx context.Context, - label string, - matches []string, - startTime, endTime time.Time, - opts ...v1.Option, -) (model.LabelValues, v1.Warnings, error) { - return nil, nil, nil -} - -func (m *mockPrometheusAPI) Alerts(ctx context.Context) (v1.AlertsResult, error) { - return v1.AlertsResult{}, nil -} - -func (m *mockPrometheusAPI) AlertManagers(ctx context.Context) (v1.AlertManagersResult, error) { - return v1.AlertManagersResult{}, nil -} - -func (m *mockPrometheusAPI) Buildinfo(ctx context.Context) (v1.BuildinfoResult, error) { - return v1.BuildinfoResult{}, nil -} -func (m *mockPrometheusAPI) CleanTombstones(ctx context.Context) error { return nil } -func (m *mockPrometheusAPI) Config(ctx context.Context) (v1.ConfigResult, error) { - return v1.ConfigResult{}, nil -} - -func (m *mockPrometheusAPI) DeleteSeries( - ctx context.Context, - matches []string, - startTime, endTime time.Time, -) error { - return nil -} - -func (m *mockPrometheusAPI) Flags(ctx context.Context) (v1.FlagsResult, error) { - return v1.FlagsResult{}, nil -} - -func (m *mockPrometheusAPI) Metadata( - ctx context.Context, - metric, limit string, -) (map[string][]v1.Metadata, error) { - return nil, nil -} - -func (m *mockPrometheusAPI) Runtimeinfo(ctx context.Context) (v1.RuntimeinfoResult, error) { - return v1.RuntimeinfoResult{}, nil -} - -func (m *mockPrometheusAPI) Snapshot( - ctx context.Context, - skipHead bool, -) (v1.SnapshotResult, error) { - return v1.SnapshotResult{}, nil -} - -func (m *mockPrometheusAPI) Targets(ctx context.Context) 
(v1.TargetsResult, error) { - return v1.TargetsResult{}, nil -} - -func (m *mockPrometheusAPI) TargetsMetadata( - ctx context.Context, - matchTarget, metric, limit string, -) ([]v1.MetricMetadata, error) { - return nil, nil -} - -func (m *mockPrometheusAPI) TSDB(ctx context.Context, opts ...v1.Option) (v1.TSDBResult, error) { - return v1.TSDBResult{}, nil -} - -func (m *mockPrometheusAPI) WalReplay(ctx context.Context) (v1.WalReplayStatus, error) { - return v1.WalReplayStatus{}, nil -} - -func (m *mockPrometheusAPI) Rules(ctx context.Context) (v1.RulesResult, error) { - return v1.RulesResult{}, nil -} - -func TestHistoricalCollector_FetchPercentiles(t *testing.T) { - mock := &mockPrometheusAPI{ - queryResults: make(map[string]model.Value), - } - - hc := NewHistoricalMetricsCollector(logr.Discard(), mock, nil) - - workload := HistoricalWorkloadQuery{ - Namespace: "default", - WorkloadName: "web-app", - WorkloadKind: "Deployment", - PodRegex: "web-app-.*", - Containers: []string{"app"}, - } - - result, err := hc.FetchPercentiles(context.Background(), workload) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - if result == nil { - t.Fatal("expected non-nil result") - } - if result.Workload == nil { - t.Fatal("expected workload identifier") - } - if len(result.Containers) != 1 { - t.Fatalf("expected 1 container, got %d", len(result.Containers)) - } - if result.Containers[0].ContainerName != "app" { - t.Fatalf("expected container 'app', got '%s'", result.Containers[0].ContainerName) - } -} diff --git a/internal/collector/historical_metrics_health_test.go b/internal/collector/historical_metrics_health_test.go deleted file mode 100644 index 1545529f..00000000 --- a/internal/collector/historical_metrics_health_test.go +++ /dev/null @@ -1,69 +0,0 @@ -package collector - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/devzero-inc/zxporter/internal/health" - "github.com/go-logr/logr" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" - "github.com/stretchr/testify/assert" -) - -type errorPrometheusAPI struct { - mockPrometheusAPI -} - -func (m *errorPrometheusAPI) Query( - ctx context.Context, - query string, - ts time.Time, - opts ...v1.Option, -) (model.Value, v1.Warnings, error) { - return nil, nil, fmt.Errorf("prometheus unavailable") -} - -func TestHistoricalCollector_HealthyOnSuccess(t *testing.T) { - hm := health.NewHealthManager() - hm.Register(health.ComponentPrometheus) - - mock := &mockPrometheusAPI{queryResults: make(map[string]model.Value)} - hc := NewHistoricalMetricsCollector(logr.Discard(), mock, hm) - - workload := HistoricalWorkloadQuery{ - Namespace: "default", - WorkloadName: "web-app", - WorkloadKind: "Deployment", - PodRegex: "web-app-.*", - Containers: []string{"app"}, - } - - _, err := hc.FetchPercentiles(context.Background(), workload) - assert.NoError(t, err) - - status, _ := hm.GetStatus(health.ComponentPrometheus) - assert.Equal(t, health.HealthStatusHealthy, status.Status) -} - -func TestHistoricalCollector_DegradedOnQueryError(t *testing.T) { - hm := health.NewHealthManager() - hm.Register(health.ComponentPrometheus) - - mock := &errorPrometheusAPI{} - hc := NewHistoricalMetricsCollector(logr.Discard(), mock, hm) - - _, err := hc.DiscoverContainers(context.Background(), "default", "web-app-.*") - assert.Error(t, err) - - status, _ := hm.GetStatus(health.ComponentPrometheus) - assert.Equal(t, health.HealthStatusDegraded, status.Status) -} - -func TestHistoricalCollector_NilHealthManager(t 
*testing.T) { - mock := &mockPrometheusAPI{queryResults: make(map[string]model.Value)} - hc := NewHistoricalMetricsCollector(logr.Discard(), mock, nil) - assert.NotNil(t, hc) -} diff --git a/internal/collector/historical_percentile_cache.go b/internal/collector/historical_percentile_cache.go new file mode 100644 index 00000000..604233e4 --- /dev/null +++ b/internal/collector/historical_percentile_cache.go @@ -0,0 +1,233 @@ +package collector + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + "github.com/go-logr/logr" + + gen "github.com/devzero-inc/zxporter/gen/api/v1" + "github.com/devzero-inc/zxporter/internal/health" +) + +const ( + // cacheRefreshInterval is how often Refresh is called by the background loop. + cacheRefreshInterval = 15 * time.Minute + + // refreshTimeout caps a single Refresh call so a slow DAKR response cannot + // block the refresh loop indefinitely. + refreshTimeout = 30 * time.Second +) + +// PercentileFetcher abstracts the DAKR control plane call to fetch per-workload +// pre-computed percentiles. +type PercentileFetcher interface { + FetchWorkloadPercentiles( + ctx context.Context, + clusterID string, + workloads []HistoricalWorkloadQuery, + ) (map[string]*gen.HistoricalMetricsSummary, error) +} + +// HistoricalPercentileCache fetches pre-computed percentiles from the DAKR +// control plane and caches them in-memory. It implements +// HistoricalPercentileProvider as a drop-in replacement for +// HistoricalMetricsCollector without a local Prometheus dependency. +type HistoricalPercentileCache struct { + logger logr.Logger + fetcher PercentileFetcher + clusterID string + healthManager *health.HealthManager + + mu sync.RWMutex + cache map[string]*gen.HistoricalMetricsSummary // key: "namespace/name/kind" +} + +// NewHistoricalPercentileCache constructs a ready-to-use cache. Call Start to +// begin the periodic background refresh. +func NewHistoricalPercentileCache( + logger logr.Logger, + fetcher PercentileFetcher, + clusterID string, + healthManager *health.HealthManager, +) *HistoricalPercentileCache { + return &HistoricalPercentileCache{ + logger: logger.WithName("historical-percentile-cache"), + fetcher: fetcher, + clusterID: clusterID, + healthManager: healthManager, + cache: make(map[string]*gen.HistoricalMetricsSummary), + } +} + +// Start performs an initial Refresh and then refreshes on a 15-minute ticker +// until ctx is cancelled. It blocks; run it in a goroutine. +func (c *HistoricalPercentileCache) Start(ctx context.Context) { + c.Refresh(ctx) + + ticker := time.NewTicker(cacheRefreshInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + c.logger.V(1).Info("historical percentile cache shutting down") + return + case <-ticker.C: + c.Refresh(ctx) + } + } +} + +// Refresh fetches percentiles from DAKR for all currently-known workloads and +// atomically replaces the cache. On error the stale cache is preserved and +// health is reported as Degraded. This method is safe to call directly in +// tests. +func (c *HistoricalPercentileCache) Refresh(ctx context.Context) { + fetchCtx, cancel := context.WithTimeout(ctx, refreshTimeout) + defer cancel() + + // Collect the workload queries from the current cache so we refresh all + // known workloads. On the very first call the cache is empty — the fetcher + // is expected to return whatever DAKR knows about this cluster. 
+ c.mu.RLock() + workloads := make([]HistoricalWorkloadQuery, 0, len(c.cache)) + for key, summary := range c.cache { + if summary.Workload != nil { + workloads = append(workloads, HistoricalWorkloadQuery{ + Namespace: summary.Workload.Namespace, + WorkloadName: summary.Workload.Name, + WorkloadKind: summary.Workload.Kind, + }) + } else { + // Fallback: parse key "namespace/name/kind" + parts := strings.SplitN(key, "/", 3) + if len(parts) == 3 { + workloads = append(workloads, HistoricalWorkloadQuery{ + Namespace: parts[0], + WorkloadName: parts[1], + WorkloadKind: parts[2], + }) + } + } + } + c.mu.RUnlock() + + results, err := c.fetcher.FetchWorkloadPercentiles(fetchCtx, c.clusterID, workloads) + if err != nil { + c.logger.Error(err, "failed to fetch percentiles from DAKR; keeping stale cache") + c.updateHealthStatus( + health.HealthStatusDegraded, + "DAKR percentile fetch failed", + map[string]string{"error": err.Error()}, + ) + return + } + + // Atomically replace the entire cache. + c.mu.Lock() + c.cache = results + c.mu.Unlock() + + c.logger.V(1).Info("historical percentile cache refreshed", "workloads", len(results)) + c.updateHealthStatus( + health.HealthStatusHealthy, + "DAKR percentile fetch succeeded", + map[string]string{"workload_count": fmt.Sprintf("%d", len(results))}, + ) +} + +// FetchPercentiles returns the cached percentile summary for a single workload. +// If the workload is not in the cache an empty (non-nil) summary is returned +// rather than an error, so callers do not need to special-case cache misses. +func (c *HistoricalPercentileCache) FetchPercentiles( + ctx context.Context, + workload HistoricalWorkloadQuery, +) (*gen.HistoricalMetricsSummary, error) { + key := workloadKey(workload.Namespace, workload.WorkloadName, workload.WorkloadKind) + + c.mu.RLock() + summary, ok := c.cache[key] + c.mu.RUnlock() + + if !ok { + return &gen.HistoricalMetricsSummary{ + Workload: &gen.MpaWorkloadIdentifier{ + Namespace: workload.Namespace, + Name: workload.WorkloadName, + Kind: workload.WorkloadKind, + }, + }, nil + } + return summary, nil +} + +// FetchPercentilesForAll returns cached percentile summaries for all of the +// requested workloads. Only workloads present in the cache are included in the +// returned map. +func (c *HistoricalPercentileCache) FetchPercentilesForAll( + ctx context.Context, + workloads []HistoricalWorkloadQuery, +) map[string]*gen.HistoricalMetricsSummary { + out := make(map[string]*gen.HistoricalMetricsSummary, len(workloads)) + + c.mu.RLock() + defer c.mu.RUnlock() + + for _, w := range workloads { + key := workloadKey(w.Namespace, w.WorkloadName, w.WorkloadKind) + if summary, ok := c.cache[key]; ok { + out[key] = summary + } + } + return out +} + +// DiscoverContainers returns a deduplicated list of container names found +// across all cached workload summaries whose namespace matches the given +// namespace. The podRegex parameter is accepted for interface compatibility but +// is not used; filtering by namespace is sufficient for the cache-backed path. 
+func (c *HistoricalPercentileCache) DiscoverContainers( + ctx context.Context, + namespace, podRegex string, +) ([]string, error) { + seen := make(map[string]struct{}) + + c.mu.RLock() + for _, summary := range c.cache { + if summary.Workload == nil || summary.Workload.Namespace != namespace { + continue + } + for _, cm := range summary.Containers { + if cm.ContainerName != "" { + seen[cm.ContainerName] = struct{}{} + } + } + } + c.mu.RUnlock() + + containers := make([]string, 0, len(seen)) + for name := range seen { + containers = append(containers, name) + } + return containers, nil +} + +// workloadKey returns the canonical cache key for a workload. +func workloadKey(namespace, name, kind string) string { + return namespace + "/" + name + "/" + kind +} + +// updateHealthStatus reports status to the HealthManager if one was provided. +func (c *HistoricalPercentileCache) updateHealthStatus( + status health.HealthStatus, + message string, + metadata map[string]string, +) { + if c.healthManager != nil { + c.healthManager.UpdateStatus(health.ComponentPrometheus, status, message, metadata) + } +} diff --git a/internal/collector/historical_percentile_cache_test.go b/internal/collector/historical_percentile_cache_test.go new file mode 100644 index 00000000..df6201d7 --- /dev/null +++ b/internal/collector/historical_percentile_cache_test.go @@ -0,0 +1,280 @@ +package collector + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/go-logr/logr" + "google.golang.org/protobuf/types/known/timestamppb" + + gen "github.com/devzero-inc/zxporter/gen/api/v1" + "github.com/devzero-inc/zxporter/internal/health" +) + +// mockPercentileFetcher is a test double for PercentileFetcher. +type mockPercentileFetcher struct { + results map[string]*gen.HistoricalMetricsSummary + err error +} + +func (m *mockPercentileFetcher) FetchWorkloadPercentiles( + _ context.Context, + _ string, + _ []HistoricalWorkloadQuery, +) (map[string]*gen.HistoricalMetricsSummary, error) { + if m.err != nil { + return nil, m.err + } + // Return a copy so tests cannot mutate the mock's state. + out := make(map[string]*gen.HistoricalMetricsSummary, len(m.results)) + for k, v := range m.results { + out[k] = v + } + return out, nil +} + +// helpers ------------------------------------------------------------------- + +func makeTestSummary(namespace, name, kind string, containers ...string) *gen.HistoricalMetricsSummary { + now := time.Now() + cs := make([]*gen.ContainerHistoricalMetrics, 0, len(containers)) + for _, c := range containers { + cs = append(cs, &gen.ContainerHistoricalMetrics{ + ContainerName: c, + CpuP50: 100, + MemP50: 1024 * 1024, + }) + } + return &gen.HistoricalMetricsSummary{ + Workload: &gen.MpaWorkloadIdentifier{ + Namespace: namespace, + Name: name, + Kind: kind, + }, + Containers: cs, + WindowStart: timestamppb.New(now.Add(-24 * time.Hour)), + WindowEnd: timestamppb.New(now), + SampleCount: 42, + } +} + +func cacheKey(namespace, name, kind string) string { + return namespace + "/" + name + "/" + kind +} + +// --------------------------------------------------------------------------- +// Test 1: cache is populated on Refresh and FetchPercentiles returns data. 
+// --------------------------------------------------------------------------- + +func TestHistoricalPercentileCache_ServesFromCache(t *testing.T) { + key := cacheKey("default", "web-app", "Deployment") + summary := makeTestSummary("default", "web-app", "Deployment", "app", "sidecar") + + fetcher := &mockPercentileFetcher{ + results: map[string]*gen.HistoricalMetricsSummary{key: summary}, + } + + hm := health.NewHealthManager() + hm.Register(health.ComponentPrometheus) + + cache := NewHistoricalPercentileCache(logr.Discard(), fetcher, "cluster-1", hm) + cache.Refresh(context.Background()) + + workload := HistoricalWorkloadQuery{ + Namespace: "default", + WorkloadName: "web-app", + WorkloadKind: "Deployment", + } + got, err := cache.FetchPercentiles(context.Background(), workload) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got == nil { + t.Fatal("expected non-nil summary") + } + if got.Workload == nil { + t.Fatal("expected workload identifier in summary") + } + if got.Workload.Name != "web-app" { + t.Fatalf("expected workload name 'web-app', got %q", got.Workload.Name) + } + if len(got.Containers) != 2 { + t.Fatalf("expected 2 containers, got %d", len(got.Containers)) + } + if got.SampleCount != 42 { + t.Fatalf("expected SampleCount 42, got %d", got.SampleCount) + } +} + +// --------------------------------------------------------------------------- +// Test 2: FetchPercentilesForAll returns only the requested workloads. +// --------------------------------------------------------------------------- + +func TestHistoricalPercentileCache_FetchPercentilesForAll(t *testing.T) { + k1 := cacheKey("ns1", "svc-a", "Deployment") + k2 := cacheKey("ns1", "svc-b", "Deployment") + k3 := cacheKey("ns2", "svc-c", "StatefulSet") + + fetcher := &mockPercentileFetcher{ + results: map[string]*gen.HistoricalMetricsSummary{ + k1: makeTestSummary("ns1", "svc-a", "Deployment", "app"), + k2: makeTestSummary("ns1", "svc-b", "Deployment", "proxy"), + k3: makeTestSummary("ns2", "svc-c", "StatefulSet", "db"), + }, + } + + cache := NewHistoricalPercentileCache(logr.Discard(), fetcher, "cluster-1", nil) + cache.Refresh(context.Background()) + + workloads := []HistoricalWorkloadQuery{ + {Namespace: "ns1", WorkloadName: "svc-a", WorkloadKind: "Deployment"}, + {Namespace: "ns2", WorkloadName: "svc-c", WorkloadKind: "StatefulSet"}, + } + results := cache.FetchPercentilesForAll(context.Background(), workloads) + + if len(results) != 2 { + t.Fatalf("expected 2 results, got %d", len(results)) + } + if _, ok := results[k1]; !ok { + t.Errorf("expected result for key %q", k1) + } + if _, ok := results[k3]; !ok { + t.Errorf("expected result for key %q", k3) + } + // svc-b was NOT requested — it must not appear. + if _, ok := results[k2]; ok { + t.Errorf("unexpected result for key %q", k2) + } +} + +// --------------------------------------------------------------------------- +// Test 3: DiscoverContainers extracts container names from cached summaries. 
+// --------------------------------------------------------------------------- + +func TestHistoricalPercentileCache_DiscoverContainers(t *testing.T) { + k1 := cacheKey("prod", "api", "Deployment") + k2 := cacheKey("prod", "worker", "Deployment") + k3 := cacheKey("staging", "api", "Deployment") // different namespace + + fetcher := &mockPercentileFetcher{ + results: map[string]*gen.HistoricalMetricsSummary{ + k1: makeTestSummary("prod", "api", "Deployment", "web", "sidecar"), + k2: makeTestSummary("prod", "worker", "Deployment", "worker", "sidecar"), + k3: makeTestSummary("staging", "api", "Deployment", "web"), + }, + } + + cache := NewHistoricalPercentileCache(logr.Discard(), fetcher, "cluster-1", nil) + cache.Refresh(context.Background()) + + containers, err := cache.DiscoverContainers(context.Background(), "prod", "") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Expect "web", "sidecar", "worker" — all unique names from "prod" namespace. + // Order is not guaranteed. + if len(containers) != 3 { + t.Fatalf("expected 3 unique container names, got %d: %v", len(containers), containers) + } + seen := make(map[string]bool, len(containers)) + for _, c := range containers { + seen[c] = true + } + for _, want := range []string{"web", "sidecar", "worker"} { + if !seen[want] { + t.Errorf("expected container %q in results", want) + } + } + // staging namespace must not leak through. + if seen[""] { + t.Error("empty container name should not appear") + } +} + +// --------------------------------------------------------------------------- +// Test 4: Querying a non-cached workload returns an empty summary, not error. +// --------------------------------------------------------------------------- + +func TestHistoricalPercentileCache_MissingWorkloadReturnsEmptySummary(t *testing.T) { + fetcher := &mockPercentileFetcher{ + results: map[string]*gen.HistoricalMetricsSummary{}, + } + + cache := NewHistoricalPercentileCache(logr.Discard(), fetcher, "cluster-1", nil) + cache.Refresh(context.Background()) + + workload := HistoricalWorkloadQuery{ + Namespace: "missing", + WorkloadName: "ghost", + WorkloadKind: "Deployment", + } + got, err := cache.FetchPercentiles(context.Background(), workload) + if err != nil { + t.Fatalf("expected no error for missing workload, got: %v", err) + } + if got == nil { + t.Fatal("expected non-nil (empty) summary for missing workload") + } + if len(got.Containers) != 0 { + t.Fatalf("expected 0 containers in empty summary, got %d", len(got.Containers)) + } +} + +// --------------------------------------------------------------------------- +// Test 5: A fetcher error on second Refresh keeps stale data in the cache. +// --------------------------------------------------------------------------- + +func TestHistoricalPercentileCache_FetcherErrorKeepsStaleCache(t *testing.T) { + key := cacheKey("default", "stable-app", "Deployment") + summary := makeTestSummary("default", "stable-app", "Deployment", "app") + + fetcher := &mockPercentileFetcher{ + results: map[string]*gen.HistoricalMetricsSummary{key: summary}, + } + + hm := health.NewHealthManager() + hm.Register(health.ComponentPrometheus) + + cache := NewHistoricalPercentileCache(logr.Discard(), fetcher, "cluster-1", hm) + + // First refresh: success — cache is warm. 
+ cache.Refresh(context.Background()) + + workload := HistoricalWorkloadQuery{ + Namespace: "default", + WorkloadName: "stable-app", + WorkloadKind: "Deployment", + } + got, err := cache.FetchPercentiles(context.Background(), workload) + if err != nil || got == nil { + t.Fatalf("expected stale data after first refresh, got err=%v, summary=%v", err, got) + } + + // Inject a fetcher error for the second refresh. + fetcher.err = errors.New("DAKR unavailable") + cache.Refresh(context.Background()) + + // Stale data must still be served. + got, err = cache.FetchPercentiles(context.Background(), workload) + if err != nil { + t.Fatalf("unexpected error after failed refresh: %v", err) + } + if got == nil { + t.Fatal("expected non-nil stale summary after fetcher error") + } + if got.Workload == nil || got.Workload.Name != "stable-app" { + t.Fatalf("stale summary has wrong workload: %+v", got.Workload) + } + + // Health status should reflect degraded state. + status, ok := hm.GetStatus(health.ComponentPrometheus) + if !ok { + t.Fatal("expected prometheus component to be registered") + } + if status.Status != health.HealthStatusDegraded { + t.Fatalf("expected Degraded health after fetch error, got %v", status.Status) + } +} diff --git a/internal/collector/interface.go b/internal/collector/interface.go index 9c5db8df..3a39e369 100644 --- a/internal/collector/interface.go +++ b/internal/collector/interface.go @@ -410,3 +410,20 @@ const ( type MpaMetricsPublisher interface { PublishMetrics(metrics *ContainerMetricsSnapshot, timestamp time.Time) } + +// HistoricalWorkloadQuery defines what to query for a workload. +type HistoricalWorkloadQuery struct { + Namespace string + WorkloadName string + WorkloadKind string + PodRegex string // e.g., "web-app-.*" + Containers []string // container names to query +} + +// HistoricalPercentileProvider abstracts historical percentile data retrieval. +// Implemented by HistoricalPercentileCache (DAKR-backed). 
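+//
+// A minimal wiring sketch (hedged: dakrClient, clusterID and healthMgr are
+// placeholder names for whatever the caller already holds, not identifiers
+// introduced by this change; error handling is omitted):
+//
+//	cache := NewHistoricalPercentileCache(logger, dakrClient, clusterID, healthMgr)
+//	go cache.Start(ctx) // blocking loop: initial Refresh, then one every 15 minutes
+//
+//	var provider HistoricalPercentileProvider = cache
+//	summary, _ := provider.FetchPercentiles(ctx, HistoricalWorkloadQuery{
+//		Namespace: "default", WorkloadName: "web-app", WorkloadKind: "Deployment",
+//	})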
+type HistoricalPercentileProvider interface { + FetchPercentiles(ctx context.Context, workload HistoricalWorkloadQuery) (*gen.HistoricalMetricsSummary, error) + FetchPercentilesForAll(ctx context.Context, workloads []HistoricalWorkloadQuery) map[string]*gen.HistoricalMetricsSummary + DiscoverContainers(ctx context.Context, namespace, podRegex string) ([]string, error) +} diff --git a/internal/collector/node_collector.go b/internal/collector/node_collector.go index 4ce9cdaa..f862e134 100644 --- a/internal/collector/node_collector.go +++ b/internal/collector/node_collector.go @@ -6,7 +6,6 @@ import ( "fmt" "os" "reflect" - "strings" "sync" "time" @@ -15,14 +14,13 @@ import ( telemetry_logger "github.com/devzero-inc/zxporter/internal/logger" "github.com/devzero-inc/zxporter/internal/version" "github.com/go-logr/logr" - "github.com/prometheus/client_golang/api" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + metricsapisv1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1" metricsv1 "k8s.io/metrics/pkg/client/clientset/versioned" ) @@ -31,17 +29,7 @@ type NodeCollectorConfig struct { // UpdateInterval specifies how often to collect metrics UpdateInterval time.Duration - // PrometheusURL specifies the URL of the Prometheus instance to query - // If empty, defaults to in-cluster Prometheus at http://prometheus.monitoring:9090 - PrometheusURL string - - // QueryTimeout specifies the timeout for Prometheus queries - QueryTimeout time.Duration - - // DisableNetworkIOMetrics determines whether to disable network and I/O metrics collection - // Default is false, so metrics are collected by default - DisableNetworkIOMetrics bool - + // DisableGPUMetrics determines whether to disable GPU metrics collection // Default is false, so metrics are collected by default DisableGPUMetrics bool } @@ -50,7 +38,6 @@ type NodeCollectorConfig struct { type NodeCollector struct { k8sClient kubernetes.Interface metricsClient *metricsv1.Clientset - prometheusAPI v1.API nodemonClient *NodemonClient informerFactory informers.SharedInformerFactory nodeInformer cache.SharedIndexInformer @@ -93,16 +80,6 @@ func NewNodeCollector( config.UpdateInterval = 10 * time.Second } - // Default Prometheus URL if not specified - if config.PrometheusURL == "" { - config.PrometheusURL = "http://prometheus.monitoring:9090" - } - - // Default query timeout if not specified - if config.QueryTimeout <= 0 { - config.QueryTimeout = 10 * time.Second - } - // Create channels batchChan := make(chan CollectedResource, 100) // For metrics resourceChan := make(chan []CollectedResource, 100) // For events and batched metrics @@ -116,9 +93,16 @@ func NewNodeCollector( logger, ) + ns := os.Getenv("POD_NAMESPACE") + if ns == "" { + ns = "devzero-system" + } + nodemonClient := NewNodemonClient(k8sClient, ns, logger) + return &NodeCollector{ k8sClient: k8sClient, metricsClient: metricsClient, + nodemonClient: nodemonClient, batchChan: batchChan, resourceChan: resourceChan, batcher: batcher, @@ -128,112 +112,16 @@ func NewNodeCollector( logger: logger.WithName("node-collector"), metrics: metrics, telemetryLogger: telemetryLogger, - nodeToPodsMap: make(map[string]map[string]*corev1.Pod), - } -} - -// initPrometheusClient initializes the Prometheus client with fallback mechanisms -func (c *NodeCollector) 
initPrometheusClient(ctx context.Context) error { - c.logger.Info("Initializing Prometheus client", - "prometheusURL", c.config.PrometheusURL) - - // Create a custom HTTP client with metrics - httpClient := NewPrometheusClient(c.metrics) - - client, err := api.NewClient(api.Config{ - Address: c.config.PrometheusURL, - Client: httpClient, - }) - if err != nil { - if c.telemetryLogger != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "NodeCollector", - "Failed to create Prometheus client", - err, - map[string]string{ - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - } - c.logger.Error( - err, - "Failed to create Prometheus client, node network, I/O and GPU metrics will be disabled", - ) - return err - } - - // Set the API client - c.prometheusAPI = v1.NewAPI(client) - - // Verify access with a simple query - queryCtx, cancel := context.WithTimeout(ctx, c.config.QueryTimeout) - defer cancel() - - _, _, err = c.prometheusAPI.Query(queryCtx, "up", time.Now()) - if err != nil { - // Check if this is a permission error - if strings.Contains(err.Error(), "forbidden") || - strings.Contains(err.Error(), "unauthorized") { - c.logger.Error( - err, - "Permission denied when accessing Prometheus. Please ensure the service account has proper RBAC permissions", - ) - c.logger.Info( - "You may need to apply the 'zxporter-prometheus-reader' ClusterRole and ClusterRoleBinding", - ) - - // Continue with basic metrics only - c.prometheusAPI = nil - return fmt.Errorf("permission error accessing Prometheus: %w", err) - } - - // Handle other connection errors - c.logger.Error( - err, - "Failed to connect to Prometheus, node network, I/O and GPU metrics will be disabled", - ) - c.prometheusAPI = nil - return err + nodeToPodsMap: make(map[string]map[string]*corev1.Pod), } - - c.logger.Info("Successfully connected to Prometheus for node network, I/O and GPU metrics") - return nil } // Start begins the node collection process func (c *NodeCollector) Start(ctx context.Context) error { c.logger.Info("Starting node collector", "updateInterval", c.config.UpdateInterval, - "disableNetworkIOMetrics", c.config.DisableNetworkIOMetrics, "disableGPUMetrics", c.config.DisableGPUMetrics) - // Initialize nodemon client for auto-discovery - // It discovers DaemonSet pods by well-known label — no config needed. - if !c.config.DisableGPUMetrics { - ns := os.Getenv("POD_NAMESPACE") - if ns == "" { - ns = "devzero-system" - } - c.nodemonClient = NewNodemonClient(c.k8sClient, ns, c.logger) - c.logger.Info("Initialized nodemon client (auto-discovery)", "namespace", ns) - } - - // Initialize Prometheus client if network/IO metrics are not disabled - // Always init Prometheus when nodemon is set — we need it for comparison mode - needPrometheus := !c.config.DisableNetworkIOMetrics || !c.config.DisableGPUMetrics - if needPrometheus { - if err := c.initPrometheusClient(ctx); err != nil { - c.logger.Error( - err, - "Failed to initialize Prometheus client, continuing with basic metrics", - ) - // Log but continue - we can still collect CPU/memory metrics - c.logger.Info("Continuing with basic node metrics collection only") - } - } - // Create informer factory c.informerFactory = informers.NewSharedInformerFactory(c.k8sClient, 0) @@ -565,27 +453,32 @@ func (c *NodeCollector) collectAllNodeResources(ctx context.Context) { return } - // Fetch node metrics from the metrics server - nodeMetricsList, err := c.metricsClient.MetricsV1beta1(). - NodeMetricses(). 
- List(ctx, metav1.ListOptions{}) - if err != nil { - if c.telemetryLogger != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "NodeCollector", - "Failed to get node metrics from metrics server", - err, - map[string]string{ - "excluded_nodes": fmt.Sprintf("%v", c.excludedNodes), - "error_type": "metrics_server_query_failed", - "zxporter_version": version.Get().String(), - }, - ) + // Build node metrics from nodemon /node/metrics endpoint (uses kubelet stats/summary + // node-level data which includes system processes, not just container aggregation) + nodeMetricsList := &metricsapisv1beta1.NodeMetricsList{} + + nodes := c.nodeInformer.GetIndexer().List() + for _, obj := range nodes { + node, ok := obj.(*corev1.Node) + if !ok { + continue } - c.logger.Error(err, "Failed to get node metrics from metrics server") - return + nm := metricsapisv1beta1.NodeMetrics{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name}, + Usage: corev1.ResourceList{}, + } + // Fetch node-level metrics from nodemon (includes system process CPU/memory) + nodeMetric, err := c.nodemonClient.FetchNodeMetricsByNode(ctx, node.Name) + if err != nil { + c.logger.V(1).Info("Failed to fetch node metrics from nodemon, skipping CPU/memory", "node", node.Name, "error", err) + } else if nodeMetric != nil { + cpuMillis := int64(nodeMetric.CPUUsageNanoCores / 1_000_000) + nm.Usage[corev1.ResourceCPU] = *resource.NewMilliQuantity(cpuMillis, resource.DecimalSI) + nm.Usage[corev1.ResourceMemory] = *resource.NewQuantity(int64(nodeMetric.MemoryWorkingSet), resource.BinarySI) + } + nodeMetricsList.Items = append(nodeMetricsList.Items, nm) } + c.logger.V(1).Info("Built node metrics from nodemon container data", "nodes", len(nodeMetricsList.Items)) if c.telemetryLogger != nil { c.telemetryLogger.Report( @@ -683,129 +576,29 @@ func (c *NodeCollector) collectAllNodeResources(ctx context.Context) { cpuUtilizationPercent := float64(cpuUsage) / float64(cpuAllocatable) * 100 memoryUtilizationPercent := float64(memoryUsage) / float64(memoryAllocatable) * 100 + // Fetch network and I/O metrics from nodemon var networkMetrics map[string]float64 var gpuMetrics map[string]interface{} - // If there are no prometheus, don't even bother on checking metrics - if c.prometheusAPI != nil { - - // Create a context with timeout for Prometheus only when needed - var queryCtx context.Context - var cancel context.CancelFunc - - queryCtx, cancel = context.WithTimeout(ctx, c.config.QueryTimeout) - defer cancel() - - // Fetch network metrics for the node if enabled - if !c.config.DisableNetworkIOMetrics { - networkMetrics, err = c.collectNodeNetworkIOMetrics(queryCtx, node.Name) - if queryCtx.Err() != nil { - c.logger.Error( - queryCtx.Err(), - "Query context for node network metrics failed", - "node", - node.Name, - ) - } - if err != nil { - if c.telemetryLogger != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "NodeCollector", - "Failed to collect node network and I/O metrics from Prometheus", - err, - map[string]string{ - "node": node.Name, - "error_type": "prometheus_network_io_query_failed", - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - } - c.logger.Error(err, "Failed to collect node network and io metrics", - "name", node.Name) - // Continue with basic metrics - networkMetrics = make(map[string]float64) - } - } + if netMetrics, netErr := c.collectNodeNetworkIOMetrics(ctx, node.Name); netErr != nil { + c.logger.Error(netErr, "Failed to collect node network and io metrics 
from nodemon", + "name", node.Name) + networkMetrics = make(map[string]float64) + } else { + networkMetrics = netMetrics + } - // Fetch GPU metrics for the node if enabled - if !c.config.DisableGPUMetrics { - if c.nodemonClient != nil { - // Primary: nodemon - nodemonMetrics, fetchErr := c.nodemonClient.FetchMetricsByNode( - queryCtx, - node.Name, - ) - if fetchErr != nil { - c.logger.Error( - fetchErr, - "nodemon failed, falling back to Prometheus", - "node", - node.Name, - ) - // Fallback to Prometheus on exporter failure - if c.prometheusAPI != nil { - gpuMetrics, err = c.collectNodeGPUMetrics(queryCtx, node.Name) - if err != nil { - c.logger.Error( - err, - "Prometheus fallback also failed for node GPU metrics", - "node", - node.Name, - ) - gpuMetrics = make(map[string]interface{}) - } - } else { - gpuMetrics = make(map[string]interface{}) - } - } else if len(nodemonMetrics) > 0 { - // nodemon returned data - gpuMetrics = NodeGPUMetricsFromNodemon(nodemonMetrics) - } else if c.prometheusAPI != nil { - // nodemon returned no data (no exporter on this node) — fallback to Prometheus - gpuMetrics, err = c.collectNodeGPUMetrics(queryCtx, node.Name) - if err != nil { - c.logger.Info( - "Prometheus fallback returned no GPU metrics for node (node may not have GPUs)", - "node", node.Name, - "error", err.Error(), - ) - gpuMetrics = make(map[string]interface{}) - } - } else { - gpuMetrics = make(map[string]interface{}) - } - } else if c.prometheusAPI != nil { - // No nodemon client initialized — use Prometheus - gpuMetrics, err = c.collectNodeGPUMetrics(queryCtx, node.Name) - if queryCtx.Err() != nil { - c.logger.Error(queryCtx.Err(), "Query context for node GPU metrics failed", "node", node.Name) - } - if err != nil { - if c.telemetryLogger != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_WARN, - "NodeCollector", - "Failed to collect node GPU metrics from Prometheus", - err, - map[string]string{ - "node": node.Name, - "error_type": "prometheus_gpu_query_failed", - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - } - c.logger.Error(err, "Failed to collect node GPU metrics", - "name", node.Name) - gpuMetrics = make(map[string]interface{}) - } - } else { - gpuMetrics = make(map[string]interface{}) - } + // Fetch GPU metrics from nodemon if enabled + if !c.config.DisableGPUMetrics && c.nodemonClient != nil { + nodemonMetrics, fetchErr := c.nodemonClient.FetchMetricsByNode(ctx, node.Name) + if fetchErr != nil { + c.logger.Error(fetchErr, "Failed to fetch GPU metrics from nodemon", "node", node.Name) + gpuMetrics = make(map[string]interface{}) + } else if len(nodemonMetrics) > 0 { + gpuMetrics = NodeGPUMetricsFromNodemon(nodemonMetrics) + } else { + gpuMetrics = make(map[string]interface{}) } - } // Create resource data @@ -905,248 +698,34 @@ func (c *NodeCollector) collectAllNodeResources(ctx context.Context) { } } -// collectNodeNetworkIOMetrics collects network metrics for a node using Prometheus queries -// -//nolint:unparam +// collectNodeNetworkIOMetrics collects network and I/O metrics for a node +// using the nodemon DaemonSet. 
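+// The returned map reuses the exact keys the previous Prometheus-backed
+// implementation produced (NetworkReceiveBytes, NetworkTransmitBytes,
+// NetworkReceivePackets, NetworkTransmitPackets, NetworkReceiveErrors,
+// NetworkTransmitErrors, NetworkReceiveDropped, NetworkTransmitDropped,
+// FSReadBytes, FSWriteBytes, FSReads, FSWrites), so downstream consumers of
+// the map should not need to change.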
func (c *NodeCollector) collectNodeNetworkIOMetrics( ctx context.Context, nodeName string, ) (map[string]float64, error) { - metrics := make(map[string]float64) - - queries := map[string]string{ - // Define queries for network metrics - "NetworkReceiveBytes": fmt.Sprintf( - `sum(rate(node_network_receive_bytes_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkTransmitBytes": fmt.Sprintf( - `sum(rate(node_network_transmit_bytes_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkReceivePackets": fmt.Sprintf( - `sum(rate(node_network_receive_packets_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkTransmitPackets": fmt.Sprintf( - `sum(rate(node_network_transmit_packets_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkReceiveErrors": fmt.Sprintf( - `sum(rate(node_network_receive_errs_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkTransmitErrors": fmt.Sprintf( - `sum(rate(node_network_transmit_errs_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkReceiveDropped": fmt.Sprintf( - `sum(rate(node_network_receive_drop_total{node="%s"}[5m]))`, - nodeName, - ), - "NetworkTransmitDropped": fmt.Sprintf( - `sum(rate(node_network_transmit_drop_total{node="%s"}[5m]))`, - nodeName, - ), - // Define queries for I/O metrics - "FSReadBytes": fmt.Sprintf( - `sum(rate(node_disk_read_bytes_total{node="%s"}[5m]))`, - nodeName, - ), - "FSWriteBytes": fmt.Sprintf( - `sum(rate(node_disk_written_bytes_total{node="%s"}[5m]))`, - nodeName, - ), - "FSReads": fmt.Sprintf( - `sum(rate(node_disk_reads_completed_total{node="%s"}[5m]))`, - nodeName, - ), - "FSWrites": fmt.Sprintf( - `sum(rate(node_disk_writes_completed_total{node="%s"}[5m]))`, - nodeName, - ), - } - - // Execute each query and store the result - for metricName, query := range queries { - c.logger.V(2).Info("Querying node network and IO metric: ", "metric_name", metricName) - metrics[metricName] = 0 // Default to 0 for all metrics - - result, _, err := c.prometheusAPI.Query(ctx, query, time.Now()) - if err != nil { - c.logger.Error(err, "Error querying Prometheus", - "metric", metricName, - "query", query) - continue - } - - // Extract value from result (if any) - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - metrics[metricName] = float64(vector[0].Value) - } - } - } - - return metrics, nil -} - -// collectNodeGPUMetrics collects GPU metrics for a node using Prometheus queries -func (c *NodeCollector) collectNodeGPUMetrics( - ctx context.Context, - nodeName string, -) (map[string]interface{}, error) { - metrics := make(map[string]interface{}) - - // First query to check if this node has any GPUs - nodeGPUQuery := fmt.Sprintf(`count(DCGM_FI_DEV_GPU_UTIL{node="%s"})`, nodeName) - - result, _, err := c.prometheusAPI.Query(ctx, nodeGPUQuery, time.Now()) + m, err := c.nodemonClient.FetchNodeMetricsByNode(ctx, nodeName) if err != nil { - return nil, fmt.Errorf("error querying GPU availability: %w", err) - } - - // Check if node has GPU metrics - hasGPU := false - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 && float64(vector[0].Value) > 0 { - hasGPU = true - } - } - - if !hasGPU { - // Return empty metrics if no GPU is on this node - return metrics, nil - } - - // Node has GPUs, collect metrics - queries := map[string]string{ - "GPUCount": fmt.Sprintf( - `count(DCGM_FI_DEV_GPU_UTIL{node="%s"})`, - nodeName, - ), - "GPUUtilizationAvg": fmt.Sprintf(`avg(DCGM_FI_DEV_GPU_UTIL{node="%s"})`, nodeName), - "GPUUtilizationMax": 
fmt.Sprintf(`max(DCGM_FI_DEV_GPU_UTIL{node="%s"})`, nodeName), - "GPUMemoryUsedTotal": fmt.Sprintf(`sum(DCGM_FI_DEV_FB_USED{node="%s"})`, nodeName), - "GPUMemoryFreeTotal": fmt.Sprintf(`sum(DCGM_FI_DEV_FB_FREE{node="%s"})`, nodeName), - "GPUPowerUsageTotal": fmt.Sprintf( - `sum(DCGM_FI_DEV_POWER_USAGE{node="%s"})`, - nodeName, - ), - "GPUTemperatureAvg": fmt.Sprintf(`avg(DCGM_FI_DEV_GPU_TEMP{node="%s"})`, nodeName), - "GPUTemperatureMax": fmt.Sprintf(`max(DCGM_FI_DEV_GPU_TEMP{node="%s"})`, nodeName), - "GPUMemoryTemperatureAvg": fmt.Sprintf( - `avg(DCGM_FI_DEV_MEMORY_TEMP{node="%s"})`, - nodeName, - ), - "GPUMemoryTemperatureMax": fmt.Sprintf( - `max(DCGM_FI_DEV_MEMORY_TEMP{node="%s"})`, - nodeName, - ), - "GPUTensorUtilizationAvg": fmt.Sprintf( - `avg(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{node="%s"})`, - nodeName, - ), - "GPUDramUtilizationAvg": fmt.Sprintf( - `avg(DCGM_FI_PROF_DRAM_ACTIVE{node="%s"})`, - nodeName, - ), - "GPUPCIeTxBytesTotal": fmt.Sprintf( - `sum(DCGM_FI_PROF_PCIE_TX_BYTES{node="%s"})`, - nodeName, - ), - "GPUPCIeRxBytesTotal": fmt.Sprintf( - `sum(DCGM_FI_PROF_PCIE_RX_BYTES{node="%s"})`, - nodeName, - ), - "GPUGraphicsUtilizationAvg": fmt.Sprintf( - `avg(DCGM_FI_PROF_GR_ENGINE_ACTIVE{node="%s"})`, - nodeName, - ), - } - - gpuCountValue := 0.0 - gpuUtilValue := 0.0 - // Execute each query and store the result - for metricName, query := range queries { - c.logger.V(2).Info("Querying node GPU metric: ", "metric_name", metricName) - result, _, err := c.prometheusAPI.Query(ctx, query, time.Now()) - if err != nil { - c.logger.Error(err, "Error querying Prometheus for GPU metrics", - "metric", metricName, - "query", query, - "node", nodeName) - continue - } - - // Extract value from result (if any) - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - metrics[metricName] = float64(vector[0].Value) - if metricName == "GPUCount" { - gpuCountValue = float64(vector[0].Value) - } else if metricName == "GPUUtilizationAvg" { - gpuUtilValue = float64(vector[0].Value) - } - } - } - } - - // If we found no GPU metrics, return an empty map - if len(metrics) == 0 { - return metrics, nil - } - - // Get GPU models on this node - this requires a specific query and parsing (not sure if parsing is working or not :)) - modelQuery := fmt.Sprintf(`DCGM_FI_DEV_GPU_UTIL{node="%s"}`, nodeName) - result, _, err = c.prometheusAPI.Query(ctx, modelQuery, time.Now()) - if err == nil && result.Type() == model.ValVector { - vector := result.(model.Vector) - - // Store unique GPU models - gpuModels := make(map[string]int) - gpuUUIDs := make([]string, 0) - - for _, sample := range vector { - model := string(sample.Metric["modelName"]) - if model != "" { - gpuModels[model]++ - } - - // Collect UUID if available - uuid := string(sample.Metric["UUID"]) - if uuid != "" { - gpuUUIDs = append(gpuUUIDs, uuid) - } - } - - // Convert model map to a summarized string - modelSummary := make([]string, 0) - for model, count := range gpuModels { - modelSummary = append(modelSummary, fmt.Sprintf("%dx %s", count, model)) - } - - metrics["GPUModels"] = modelSummary - metrics["GPUUUIDs"] = gpuUUIDs - } - - // Calculate total GPU memory - if memUsed, ok := metrics["GPUMemoryUsedTotal"].(float64); ok { - if memFree, ok := metrics["GPUMemoryFreeTotal"].(float64); ok { - metrics["GPUMemoryTotalMb"] = memUsed + memFree - } - } - - // GPUUsage = GPUMetricsCount * GPUUtilizationPercentage / 100 - if gpuCountValue > 0 { - gpuUsage := (gpuUtilValue * gpuCountValue) / 100.0 - metrics["GPUUsage"] = 
gpuUsage - } - - return metrics, nil + return nil, fmt.Errorf("fetching node metrics from nodemon for %s: %w", nodeName, err) + } + if m == nil { + return nil, fmt.Errorf("no nodemon pod found on node %s", nodeName) + } + + return map[string]float64{ + "NetworkReceiveBytes": m.NetworkRxBytesPerSec, + "NetworkTransmitBytes": m.NetworkTxBytesPerSec, + "NetworkReceivePackets": m.NetworkRxPacketsPerSec, + "NetworkTransmitPackets": m.NetworkTxPacketsPerSec, + "NetworkReceiveErrors": m.NetworkRxErrorsPerSec, + "NetworkTransmitErrors": m.NetworkTxErrorsPerSec, + "NetworkReceiveDropped": m.NetworkRxDropsPerSec, + "NetworkTransmitDropped": m.NetworkTxDropsPerSec, + "FSReadBytes": m.DiskReadBytesPerSec, + "FSWriteBytes": m.DiskWriteBytesPerSec, + "FSReads": m.DiskReadOpsPerSec, + "FSWrites": m.DiskWriteOpsPerSec, + }, nil } // isExcluded checks if a node should be excluded from collection @@ -1208,44 +787,10 @@ func (c *NodeCollector) GetType() string { return "node" } -// IsAvailable checks if Node resources can be accessed in the cluster +// IsAvailable checks if Node resources can be accessed in the cluster. +// Always returns true — nodemon pods are discovered dynamically. func (c *NodeCollector) IsAvailable(ctx context.Context) bool { - // Check if the metrics client is available - this is required for basic metrics - if c.metricsClient == nil { - c.logger.Info("Metrics client is not available, cannot collect node metrics") - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_WARN, - "NodeCollector", - "Metrics client is not available, cannot collect node metrics", - fmt.Errorf("metrics client is not available or properly set"), - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - // return false - } - - // Try a simple query to check if the metrics server is available - _, err := c.metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{ - Limit: 1, // Only request a single item to minimize load - }) - if err != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_WARN, - "NodeCollector", - "Metrics server API not available for node metrics", - err, - map[string]string{ - "collector_type": c.GetType(), - "zxporter_version": version.Get().String(), - }, - ) - c.logger.Info("Metrics server API not available for node metrics", "error", err.Error()) - // return false - } - - return true + return c.nodemonClient != nil } // AddResource manually adds a node resource to be processed by the collector diff --git a/internal/collector/nodemon_client.go b/internal/collector/nodemon_client.go index 3f058473..27a964c6 100644 --- a/internal/collector/nodemon_client.go +++ b/internal/collector/nodemon_client.go @@ -14,6 +14,72 @@ import ( "k8s.io/client-go/kubernetes" ) +// UnifiedContainerMetric mirrors nodemon's ContainerMetricsResponse. +// Defined here to avoid a direct import dependency on the nodemon package. 
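+//
+// Each element of the JSON array served by nodemon's /v2/container/metrics
+// endpoint decodes into this struct. An illustrative element, with made-up
+// values that mirror the unit tests in this change:
+//
+//	{"node_name":"node-1","namespace":"default","pod":"web-abc","container":"nginx",
+//	 "cpu_usage_nanocores":250000000,"memory_working_set_bytes":134217728,
+//	 "network_rx_bytes":1048576,"disk_read_bytes_per_sec":4096}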
+type UnifiedContainerMetric struct { + NodeName string `json:"node_name"` + Namespace string `json:"namespace"` + Pod string `json:"pod"` + Container string `json:"container"` + Timestamp time.Time `json:"timestamp"` + CPUUsageNanoCores uint64 `json:"cpu_usage_nanocores"` + MemoryWorkingSet uint64 `json:"memory_working_set_bytes"` + MemoryUsageBytes uint64 `json:"memory_usage_bytes"` + MemoryRSSBytes uint64 `json:"memory_rss_bytes"` + NetworkRxBytes uint64 `json:"network_rx_bytes"` + NetworkTxBytes uint64 `json:"network_tx_bytes"` + // cAdvisor rates + NetworkRxPacketsPerSec float64 `json:"network_rx_packets_per_sec"` + NetworkTxPacketsPerSec float64 `json:"network_tx_packets_per_sec"` + NetworkRxErrorsPerSec float64 `json:"network_rx_errors_per_sec"` + NetworkTxErrorsPerSec float64 `json:"network_tx_errors_per_sec"` + NetworkRxDropsPerSec float64 `json:"network_rx_drops_per_sec"` + NetworkTxDropsPerSec float64 `json:"network_tx_drops_per_sec"` + DiskReadBytesPerSec float64 `json:"disk_read_bytes_per_sec"` + DiskWriteBytesPerSec float64 `json:"disk_write_bytes_per_sec"` + DiskReadOpsPerSec float64 `json:"disk_read_ops_per_sec"` + DiskWriteOpsPerSec float64 `json:"disk_write_ops_per_sec"` + CPUThrottleFraction float64 `json:"cpu_throttle_fraction"` + // GPU (optional) + GPUUtilization float64 `json:"gpu_utilization,omitempty"` + GPUMemoryUsedMiB float64 `json:"gpu_memory_used_mib,omitempty"` + GPUMemoryFreeMiB float64 `json:"gpu_memory_free_mib,omitempty"` + GPUPowerWatts float64 `json:"gpu_power_watts,omitempty"` + GPUTemperature float64 `json:"gpu_temperature_celsius,omitempty"` +} + +// UnifiedNodeMetric mirrors nodemon's NodeMetricsResponse. +// Defined here to avoid a direct import dependency on the nodemon package. +type UnifiedNodeMetric struct { + NodeName string `json:"node_name"` + Timestamp time.Time `json:"timestamp"` + CPUUsageNanoCores uint64 `json:"cpu_usage_nanocores"` + MemoryWorkingSet uint64 `json:"memory_working_set_bytes"` + NetworkRxBytesPerSec float64 `json:"network_rx_bytes_per_sec"` + NetworkTxBytesPerSec float64 `json:"network_tx_bytes_per_sec"` + NetworkRxPacketsPerSec float64 `json:"network_rx_packets_per_sec"` + NetworkTxPacketsPerSec float64 `json:"network_tx_packets_per_sec"` + NetworkRxErrorsPerSec float64 `json:"network_rx_errors_per_sec"` + NetworkTxErrorsPerSec float64 `json:"network_tx_errors_per_sec"` + NetworkRxDropsPerSec float64 `json:"network_rx_drops_per_sec"` + NetworkTxDropsPerSec float64 `json:"network_tx_drops_per_sec"` + DiskReadBytesPerSec float64 `json:"disk_read_bytes_per_sec"` + DiskWriteBytesPerSec float64 `json:"disk_write_bytes_per_sec"` + DiskReadOpsPerSec float64 `json:"disk_read_ops_per_sec"` + DiskWriteOpsPerSec float64 `json:"disk_write_ops_per_sec"` +} + +// UnifiedPVCMetric mirrors nodemon's PVCMetricsResponse. +// Defined here to avoid a direct import dependency on the nodemon package. +type UnifiedPVCMetric struct { + Namespace string `json:"namespace"` + Pod string `json:"pod"` + PVCName string `json:"pvc_name"` + UsedBytes uint64 `json:"used_bytes"` + CapacityBytes uint64 `json:"capacity_bytes"` + AvailableBytes uint64 `json:"available_bytes"` +} + // NodemonMetric represents a single GPU metric entry returned by the nodemon. // This mirrors nodemon.GPUMetricResponse but is defined here to avoid a direct import // dependency on the nodemon package (which is a separate binary). 
@@ -265,6 +331,170 @@ func (c *NodemonClient) fetchMetrics( return metrics, nil } +// fetchContainerMetrics fetches from a single nodemon pod's /v2/container/metrics endpoint. +func (c *NodemonClient) fetchContainerMetrics( + ctx context.Context, + baseURL string, +) ([]UnifiedContainerMetric, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/v2/container/metrics", nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("HTTP request to nodemon failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("nodemon returned status %d", resp.StatusCode) + } + + var metrics []UnifiedContainerMetric + if err := json.NewDecoder(resp.Body).Decode(&metrics); err != nil { + return nil, fmt.Errorf("decoding nodemon response: %w", err) + } + + return metrics, nil +} + +// FetchAllContainerMetrics fetches container metrics from all discovered nodemon pods, +// merging the results into a single slice. +func (c *NodemonClient) FetchAllContainerMetrics(ctx context.Context) ([]UnifiedContainerMetric, error) { + nodeToIP, err := c.refreshCache(ctx) + if err != nil { + return nil, err + } + if len(nodeToIP) == 0 { + return nil, nil + } + + var allMetrics []UnifiedContainerMetric + for nodeName, podIP := range nodeToIP { + baseURL := fmt.Sprintf("http://%s:%d", podIP, c.port) + metrics, fetchErr := c.fetchContainerMetrics(ctx, baseURL) + if fetchErr != nil { + c.log.Error( + fetchErr, + "Failed to fetch container metrics from nodemon pod", + "node", nodeName, + "podIP", podIP, + ) + continue + } + allMetrics = append(allMetrics, metrics...) + } + + return allMetrics, nil +} + +// fetchNodeMetrics fetches from a single nodemon pod's /node/metrics endpoint. +func (c *NodemonClient) fetchNodeMetrics( + ctx context.Context, + baseURL string, +) (*UnifiedNodeMetric, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/node/metrics", nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("HTTP request to nodemon failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("nodemon returned status %d", resp.StatusCode) + } + + var metric UnifiedNodeMetric + if err := json.NewDecoder(resp.Body).Decode(&metric); err != nil { + return nil, fmt.Errorf("decoding nodemon response: %w", err) + } + + return &metric, nil +} + +// FetchNodeMetricsByNode fetches node metrics from the nodemon pod running on the given node. +// Returns nil if no nodemon pod is running on that node. +func (c *NodemonClient) FetchNodeMetricsByNode( + ctx context.Context, + nodeName string, +) (*UnifiedNodeMetric, error) { + nodeToIP, err := c.refreshCache(ctx) + if err != nil { + return nil, err + } + + podIP, ok := nodeToIP[nodeName] + if !ok { + return nil, nil + } + + baseURL := fmt.Sprintf("http://%s:%d", podIP, c.port) + return c.fetchNodeMetrics(ctx, baseURL) +} + +// fetchPVCMetrics fetches from a single nodemon pod's /pvc/metrics endpoint. 
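+// The endpoint returns a JSON array; each element decodes into UnifiedPVCMetric.
+// An illustrative element (values are made up, not taken from a real cluster):
+//
+//	{"namespace":"default","pod":"web-abc","pvc_name":"data",
+//	 "used_bytes":1073741824,"capacity_bytes":10737418240,"available_bytes":9663676416}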
+func (c *NodemonClient) fetchPVCMetrics( + ctx context.Context, + baseURL string, +) ([]UnifiedPVCMetric, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/pvc/metrics", nil) + if err != nil { + return nil, fmt.Errorf("creating request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("HTTP request to nodemon failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("nodemon returned status %d", resp.StatusCode) + } + + var metrics []UnifiedPVCMetric + if err := json.NewDecoder(resp.Body).Decode(&metrics); err != nil { + return nil, fmt.Errorf("decoding nodemon response: %w", err) + } + + return metrics, nil +} + +// FetchAllPVCMetrics fetches PVC metrics from all discovered nodemon pods, +// merging the results into a single slice. +func (c *NodemonClient) FetchAllPVCMetrics(ctx context.Context) ([]UnifiedPVCMetric, error) { + nodeToIP, err := c.refreshCache(ctx) + if err != nil { + return nil, err + } + if len(nodeToIP) == 0 { + return nil, nil + } + + var allMetrics []UnifiedPVCMetric + for nodeName, podIP := range nodeToIP { + baseURL := fmt.Sprintf("http://%s:%d", podIP, c.port) + metrics, fetchErr := c.fetchPVCMetrics(ctx, baseURL) + if fetchErr != nil { + c.log.Error( + fetchErr, + "Failed to fetch PVC metrics from nodemon pod", + "node", nodeName, + "podIP", podIP, + ) + continue + } + allMetrics = append(allMetrics, metrics...) + } + + return allMetrics, nil +} + // IndexByContainer indexes GPU metrics by (pod, container, namespace) for O(1) lookup. // Multiple GPUs for the same container are grouped together. func IndexByContainer(metrics []NodemonMetric) map[gpuContainerKey][]NodemonMetric { diff --git a/internal/collector/nodemon_client_unified_test.go b/internal/collector/nodemon_client_unified_test.go new file mode 100644 index 00000000..25d1ee04 --- /dev/null +++ b/internal/collector/nodemon_client_unified_test.go @@ -0,0 +1,338 @@ +package collector + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-logr/logr" +) + +// TestNodemonClient_FetchContainerMetrics verifies JSON parsing of /v2/container/metrics. 
+func TestNodemonClient_FetchContainerMetrics(t *testing.T) { + ts := time.Date(2024, 1, 15, 10, 0, 0, 0, time.UTC) + + want := []UnifiedContainerMetric{ + { + NodeName: "node-1", + Namespace: "default", + Pod: "web-abc", + Container: "nginx", + Timestamp: ts, + CPUUsageNanoCores: 250000000, + MemoryWorkingSet: 134217728, + MemoryUsageBytes: 150000000, + MemoryRSSBytes: 120000000, + NetworkRxBytes: 1048576, + NetworkTxBytes: 524288, + NetworkRxPacketsPerSec: 100.5, + NetworkTxPacketsPerSec: 80.2, + NetworkRxErrorsPerSec: 0.0, + NetworkTxErrorsPerSec: 0.0, + NetworkRxDropsPerSec: 0.1, + NetworkTxDropsPerSec: 0.0, + DiskReadBytesPerSec: 4096.0, + DiskWriteBytesPerSec: 8192.0, + DiskReadOpsPerSec: 10.0, + DiskWriteOpsPerSec: 20.0, + CPUThrottleFraction: 0.05, + }, + { + NodeName: "node-1", + Namespace: "ml", + Pod: "trainer-xyz", + Container: "pytorch", + Timestamp: ts, + CPUUsageNanoCores: 4000000000, + MemoryWorkingSet: 8589934592, + MemoryUsageBytes: 9000000000, + MemoryRSSBytes: 8000000000, + NetworkRxBytes: 10485760, + NetworkTxBytes: 5242880, + GPUUtilization: 85.5, + GPUMemoryUsedMiB: 20480.0, + GPUMemoryFreeMiB: 20480.0, + GPUPowerWatts: 300.0, + GPUTemperature: 72.0, + CPUThrottleFraction: 0.0, + }, + } + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v2/container/metrics" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(want) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchContainerMetrics(t.Context(), srv.URL) + if err != nil { + t.Fatalf("fetchContainerMetrics returned error: %v", err) + } + + if len(got) != len(want) { + t.Fatalf("got %d metrics, want %d", len(got), len(want)) + } + + // Verify first metric (non-GPU container) + g0 := got[0] + if g0.NodeName != "node-1" { + t.Errorf("NodeName: got %q, want %q", g0.NodeName, "node-1") + } + if g0.Namespace != "default" { + t.Errorf("Namespace: got %q, want %q", g0.Namespace, "default") + } + if g0.Pod != "web-abc" { + t.Errorf("Pod: got %q, want %q", g0.Pod, "web-abc") + } + if g0.Container != "nginx" { + t.Errorf("Container: got %q, want %q", g0.Container, "nginx") + } + if g0.CPUUsageNanoCores != 250000000 { + t.Errorf("CPUUsageNanoCores: got %d, want %d", g0.CPUUsageNanoCores, 250000000) + } + if g0.MemoryWorkingSet != 134217728 { + t.Errorf("MemoryWorkingSet: got %d, want %d", g0.MemoryWorkingSet, 134217728) + } + if g0.NetworkRxBytes != 1048576 { + t.Errorf("NetworkRxBytes: got %d, want %d", g0.NetworkRxBytes, 1048576) + } + if g0.DiskReadBytesPerSec != 4096.0 { + t.Errorf("DiskReadBytesPerSec: got %f, want %f", g0.DiskReadBytesPerSec, 4096.0) + } + if g0.CPUThrottleFraction != 0.05 { + t.Errorf("CPUThrottleFraction: got %f, want %f", g0.CPUThrottleFraction, 0.05) + } + + // Verify second metric (GPU container) + g1 := got[1] + if g1.GPUUtilization != 85.5 { + t.Errorf("GPUUtilization: got %f, want %f", g1.GPUUtilization, 85.5) + } + if g1.GPUMemoryUsedMiB != 20480.0 { + t.Errorf("GPUMemoryUsedMiB: got %f, want %f", g1.GPUMemoryUsedMiB, 20480.0) + } + if g1.GPUPowerWatts != 300.0 { + t.Errorf("GPUPowerWatts: got %f, want %f", g1.GPUPowerWatts, 300.0) + } + if g1.GPUTemperature != 72.0 { + t.Errorf("GPUTemperature: got %f, want %f", g1.GPUTemperature, 72.0) + } +} + +func TestNodemonClient_FetchContainerMetrics_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, 
"service unavailable", http.StatusServiceUnavailable) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchContainerMetrics(t.Context(), srv.URL) + if err == nil { + t.Fatal("expected error for non-200 status, got nil") + } + if got != nil { + t.Errorf("expected nil metrics on error, got %v", got) + } +} + +// TestNodemonClient_FetchNodeMetrics verifies JSON parsing of /node/metrics. +func TestNodemonClient_FetchNodeMetrics(t *testing.T) { + ts := time.Date(2024, 1, 15, 10, 0, 0, 0, time.UTC) + + want := UnifiedNodeMetric{ + NodeName: "node-2", + Timestamp: ts, + NetworkRxBytesPerSec: 10485760.0, + NetworkTxBytesPerSec: 5242880.0, + DiskReadBytesPerSec: 204800.0, + DiskWriteBytesPerSec: 409600.0, + DiskReadOpsPerSec: 50.0, + DiskWriteOpsPerSec: 100.0, + } + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/node/metrics" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(want) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchNodeMetrics(t.Context(), srv.URL) + if err != nil { + t.Fatalf("fetchNodeMetrics returned error: %v", err) + } + if got == nil { + t.Fatal("expected non-nil metric, got nil") + } + + if got.NodeName != "node-2" { + t.Errorf("NodeName: got %q, want %q", got.NodeName, "node-2") + } + if got.NetworkRxBytesPerSec != 10485760.0 { + t.Errorf("NetworkRxBytesPerSec: got %f, want %f", got.NetworkRxBytesPerSec, 10485760.0) + } + if got.NetworkTxBytesPerSec != 5242880.0 { + t.Errorf("NetworkTxBytesPerSec: got %f, want %f", got.NetworkTxBytesPerSec, 5242880.0) + } + if got.DiskReadBytesPerSec != 204800.0 { + t.Errorf("DiskReadBytesPerSec: got %f, want %f", got.DiskReadBytesPerSec, 204800.0) + } + if got.DiskWriteBytesPerSec != 409600.0 { + t.Errorf("DiskWriteBytesPerSec: got %f, want %f", got.DiskWriteBytesPerSec, 409600.0) + } + if got.DiskReadOpsPerSec != 50.0 { + t.Errorf("DiskReadOpsPerSec: got %f, want %f", got.DiskReadOpsPerSec, 50.0) + } + if got.DiskWriteOpsPerSec != 100.0 { + t.Errorf("DiskWriteOpsPerSec: got %f, want %f", got.DiskWriteOpsPerSec, 100.0) + } + if !got.Timestamp.Equal(ts) { + t.Errorf("Timestamp: got %v, want %v", got.Timestamp, ts) + } +} + +func TestNodemonClient_FetchNodeMetrics_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "not found", http.StatusNotFound) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchNodeMetrics(t.Context(), srv.URL) + if err == nil { + t.Fatal("expected error for non-200 status, got nil") + } + if got != nil { + t.Errorf("expected nil metric on error, got %v", got) + } +} + +// TestNodemonClient_FetchPVCMetrics verifies JSON parsing of /pvc/metrics. 
+func TestNodemonClient_FetchPVCMetrics(t *testing.T) { + want := []UnifiedPVCMetric{ + { + Namespace: "default", + Pod: "db-0", + PVCName: "data-db-0", + UsedBytes: 10737418240, + CapacityBytes: 107374182400, + AvailableBytes: 96636764160, + }, + { + Namespace: "monitoring", + Pod: "prometheus-0", + PVCName: "prometheus-data", + UsedBytes: 53687091200, + CapacityBytes: 107374182400, + AvailableBytes: 53687091200, + }, + } + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/pvc/metrics" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(want) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchPVCMetrics(t.Context(), srv.URL) + if err != nil { + t.Fatalf("fetchPVCMetrics returned error: %v", err) + } + + if len(got) != len(want) { + t.Fatalf("got %d metrics, want %d", len(got), len(want)) + } + + // Verify first PVC + g0 := got[0] + if g0.Namespace != "default" { + t.Errorf("Namespace: got %q, want %q", g0.Namespace, "default") + } + if g0.Pod != "db-0" { + t.Errorf("Pod: got %q, want %q", g0.Pod, "db-0") + } + if g0.PVCName != "data-db-0" { + t.Errorf("PVCName: got %q, want %q", g0.PVCName, "data-db-0") + } + if g0.UsedBytes != 10737418240 { + t.Errorf("UsedBytes: got %d, want %d", g0.UsedBytes, 10737418240) + } + if g0.CapacityBytes != 107374182400 { + t.Errorf("CapacityBytes: got %d, want %d", g0.CapacityBytes, 107374182400) + } + if g0.AvailableBytes != 96636764160 { + t.Errorf("AvailableBytes: got %d, want %d", g0.AvailableBytes, 96636764160) + } + + // Verify second PVC + g1 := got[1] + if g1.Namespace != "monitoring" { + t.Errorf("Namespace: got %q, want %q", g1.Namespace, "monitoring") + } + if g1.PVCName != "prometheus-data" { + t.Errorf("PVCName: got %q, want %q", g1.PVCName, "prometheus-data") + } + if g1.UsedBytes != 53687091200 { + t.Errorf("UsedBytes: got %d, want %d", g1.UsedBytes, 53687091200) + } +} + +func TestNodemonClient_FetchPVCMetrics_Empty(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte("[]")) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchPVCMetrics(t.Context(), srv.URL) + if err != nil { + t.Fatalf("fetchPVCMetrics returned error for empty response: %v", err) + } + if len(got) != 0 { + t.Errorf("expected 0 metrics, got %d", len(got)) + } +} + +func TestNodemonClient_FetchPVCMetrics_HTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "internal server error", http.StatusInternalServerError) + })) + defer srv.Close() + + client := newTestNodemonClient(t) + got, err := client.fetchPVCMetrics(t.Context(), srv.URL) + if err == nil { + t.Fatal("expected error for non-200 status, got nil") + } + if got != nil { + t.Errorf("expected nil metrics on error, got %v", got) + } +} + +// newTestNodemonClient returns a NodemonClient suitable for unit-testing the fetch* methods. +// The k8sClient and namespace are not used by the direct fetch* methods. 
+func newTestNodemonClient(t *testing.T) *NodemonClient { + t.Helper() + return &NodemonClient{ + port: 6061, + httpClient: &http.Client{}, + log: logr.Discard(), + } +} diff --git a/internal/collector/pvc_metrics_collector.go b/internal/collector/pvc_metrics_collector.go index d8b73ef7..be52d71d 100644 --- a/internal/collector/pvc_metrics_collector.go +++ b/internal/collector/pvc_metrics_collector.go @@ -3,6 +3,7 @@ package collector import ( "context" "fmt" + "os" "sync" "time" @@ -10,9 +11,6 @@ import ( telemetry_logger "github.com/devzero-inc/zxporter/internal/logger" "github.com/devzero-inc/zxporter/internal/version" "github.com/go-logr/logr" - "github.com/prometheus/client_golang/api" - v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" @@ -25,19 +23,12 @@ import ( type PersistentVolumeClaimMetricsCollectorConfig struct { // UpdateInterval specifies how often to collect PVC metrics UpdateInterval time.Duration - - // PrometheusURL specifies the URL of the Prometheus instance to query - // If empty, defaults to in-cluster Prometheus at http://prometheus.monitoring:9090 - PrometheusURL string - - // QueryTimeout specifies the timeout for Prometheus queries - QueryTimeout time.Duration } // PersistentVolumeClaimMetricsCollector collects PVC storage usage metrics type PersistentVolumeClaimMetricsCollector struct { k8sClient kubernetes.Interface - prometheusAPI v1.API + nodemonClient *NodemonClient informerFactory informers.SharedInformerFactory pvcInformer cache.SharedIndexInformer pvInformer cache.SharedIndexInformer @@ -79,15 +70,6 @@ func NewPersistentVolumeClaimMetricsCollector( config.UpdateInterval = 60 * time.Second } - // Default Prometheus URL if not specified - if config.PrometheusURL == "" { - config.PrometheusURL = "http://prometheus-service.monitoring.svc.cluster.local:9090" - } - - if config.QueryTimeout <= 0 { - config.QueryTimeout = 10 * time.Second - } - batchChan := make(chan CollectedResource, 500) resourceChan := make(chan []CollectedResource, 200) @@ -99,8 +81,15 @@ func NewPersistentVolumeClaimMetricsCollector( logger, ) + ns := os.Getenv("POD_NAMESPACE") + if ns == "" { + ns = "devzero-system" + } + nodemonClient := NewNodemonClient(k8sClient, ns, logger) + return &PersistentVolumeClaimMetricsCollector{ k8sClient: k8sClient, + nodemonClient: nodemonClient, batchChan: batchChan, resourceChan: resourceChan, batcher: batcher, @@ -118,35 +107,7 @@ func NewPersistentVolumeClaimMetricsCollector( func (c *PersistentVolumeClaimMetricsCollector) Start(ctx context.Context) error { c.logger.Info("Starting PVC metrics collector", "namespaces", c.namespaces, - "updateInterval", c.config.UpdateInterval, - "prometheusURL", c.config.PrometheusURL) - - c.logger.Info("Initializing Prometheus client for PVC metrics", - "prometheusURL", c.config.PrometheusURL) - - httpClient := NewPrometheusClient(c.metrics) - - client, err := api.NewClient(api.Config{ - Address: c.config.PrometheusURL, - Client: httpClient, - }) - if err != nil { - c.telemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "PVCMetricsCollector", - "Failed to create Prometheus client", - err, - map[string]string{ - "prometheus_url": c.config.PrometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - - c.logger.Error(err, "Failed to create Prometheus client, PVC metrics will be unavailable") - return fmt.Errorf("failed to create Prometheus client: %w", err) - } - - 
c.prometheusAPI = v1.NewAPI(client) + "updateInterval", c.config.UpdateInterval) if len(c.namespaces) == 1 && c.namespaces[0] != "" { c.informerFactory = informers.NewSharedInformerFactoryWithOptions( @@ -386,9 +347,6 @@ func (c *PersistentVolumeClaimMetricsCollector) processPVCMetrics( ctx context.Context, pvc *corev1.PersistentVolumeClaim, ) (*PersistentVolumeClaimMetricsSnapshot, error) { - queryCtx, cancel := context.WithTimeout(ctx, c.config.QueryTimeout) - defer cancel() - c.logger.V(1).Info("Processing PVC metrics", "namespace", pvc.Namespace, "name", pvc.Name, @@ -439,32 +397,31 @@ func (c *PersistentVolumeClaimMetricsCollector) processPVCMetrics( return metricsSnapshot, nil } - usage, err := c.getFilesystemUsageFromPrometheus(queryCtx, pvc) + usage, err := c.getFilesystemUsage(ctx, pvc) if err != nil { - c.logger.V(1).Info("Failed to get filesystem usage from Prometheus", + c.logger.V(1).Info("Failed to get filesystem usage from nodemon", "namespace", pvc.Namespace, "name", pvc.Name, "error", err) - metricsSnapshot.UnavailableReason = fmt.Sprintf("prometheus_query_failed: %v", err) + metricsSnapshot.UnavailableReason = fmt.Sprintf("nodemon_query_failed: %v", err) if c.telemetryLogger != nil { c.telemetryLogger.Report( gen.LogLevel_LOG_LEVEL_WARN, "PVCMetricsCollector", - "Failed to collect PVC metrics from Prometheus", + "Failed to collect PVC metrics from nodemon", err, map[string]string{ "namespace": pvc.Namespace, "pvc": pvc.Name, - "error_type": "prometheus_pvc_query_failed", - "prometheus_url": c.config.PrometheusURL, + "error_type": "nodemon_pvc_query_failed", "zxporter_version": version.Get().String(), }, ) } } else if usage != nil { metricsSnapshot.StatsAvailable = true - metricsSnapshot.StatsSource = "prometheus" + metricsSnapshot.StatsSource = "nodemon" metricsSnapshot.UsedBytes = usage.UsedBytes metricsSnapshot.CapacityBytes = usage.CapacityBytes metricsSnapshot.AvailableBytes = usage.AvailableBytes @@ -497,80 +454,27 @@ type filesystemUsage struct { AvailableBytes int64 } -// getFilesystemUsageFromPrometheus retrieves filesystem usage from Prometheus -func (c *PersistentVolumeClaimMetricsCollector) getFilesystemUsageFromPrometheus( +// getFilesystemUsage retrieves PVC filesystem usage from the nodemon DaemonSet. 
+func (c *PersistentVolumeClaimMetricsCollector) getFilesystemUsage( ctx context.Context, pvc *corev1.PersistentVolumeClaim, ) (*filesystemUsage, error) { - if c.prometheusAPI == nil { - return nil, fmt.Errorf("prometheus API not initialized") - } - - // Prometheus queries for PVC volume stats - queries := map[string]string{ - "used": fmt.Sprintf( - `kubelet_volume_stats_used_bytes{namespace="%s", persistentvolumeclaim="%s"}`, - pvc.Namespace, - pvc.Name, - ), - "capacity": fmt.Sprintf( - `kubelet_volume_stats_capacity_bytes{namespace="%s", persistentvolumeclaim="%s"}`, - pvc.Namespace, - pvc.Name, - ), - "available": fmt.Sprintf( - `kubelet_volume_stats_available_bytes{namespace="%s", persistentvolumeclaim="%s"}`, - pvc.Namespace, - pvc.Name, - ), - } - - usage := &filesystemUsage{} - queryTime := time.Now() - - result, _, err := c.prometheusAPI.Query(ctx, queries["used"], queryTime) - if err != nil { - return nil, fmt.Errorf("failed to query used bytes: %w", err) - } - - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - usage.UsedBytes = int64(vector[0].Value) - } else { - return nil, fmt.Errorf("no used bytes metric found") - } - } - - result, _, err = c.prometheusAPI.Query(ctx, queries["capacity"], queryTime) - if err != nil { - return nil, fmt.Errorf("failed to query capacity bytes: %w", err) - } - - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - usage.CapacityBytes = int64(vector[0].Value) - } else { - return nil, fmt.Errorf("no capacity bytes metric found") - } - } - - result, _, err = c.prometheusAPI.Query(ctx, queries["available"], queryTime) + allMetrics, err := c.nodemonClient.FetchAllPVCMetrics(ctx) if err != nil { - return nil, fmt.Errorf("failed to query available bytes: %w", err) + return nil, fmt.Errorf("fetching PVC metrics from nodemon: %w", err) } - if result.Type() == model.ValVector { - vector := result.(model.Vector) - if len(vector) > 0 { - usage.AvailableBytes = int64(vector[0].Value) - } else { - return nil, fmt.Errorf("no available bytes metric found") + for _, m := range allMetrics { + if m.Namespace == pvc.Namespace && m.PVCName == pvc.Name { + return &filesystemUsage{ + UsedBytes: int64(m.UsedBytes), + CapacityBytes: int64(m.CapacityBytes), + AvailableBytes: int64(m.AvailableBytes), + }, nil } } - return usage, nil + return nil, fmt.Errorf("no nodemon metrics found for PVC %s/%s", pvc.Namespace, pvc.Name) } // emitSnapshot sends the metrics snapshot to the batch channel @@ -698,22 +602,10 @@ func (c *PersistentVolumeClaimMetricsCollector) GetType() string { return "pvc_metrics" } +// IsAvailable checks if PVC metrics can be collected. +// Always returns true — nodemon pods are discovered dynamically. 
func (c *PersistentVolumeClaimMetricsCollector) IsAvailable(ctx context.Context) bool { - if c.prometheusAPI == nil { - c.logger.Info("Prometheus API not available for PVC metrics") - return true - } - - queryCtx, cancel := context.WithTimeout(ctx, 10*time.Second) - defer cancel() - - _, _, err := c.prometheusAPI.Query(queryCtx, "up", time.Now()) - if err != nil { - c.logger.Info("Prometheus API not responding", "error", err.Error()) - return true - } - - return true + return c.nodemonClient != nil } // AddResource is a no-op for PVC metrics collector diff --git a/internal/controller/collectionpolicy_controller.go b/internal/controller/collectionpolicy_controller.go index 834aa88c..4c97a247 100644 --- a/internal/controller/collectionpolicy_controller.go +++ b/internal/controller/collectionpolicy_controller.go @@ -19,9 +19,7 @@ package controller import ( "context" "fmt" - "net/http" "reflect" - "strings" "sync" "time" @@ -148,13 +146,11 @@ type PolicyConfig struct { DisabledCollectors []string - KubeContextName string - DakrURL string - ClusterToken string - PrometheusURL string - DisableNetworkIOMetrics bool - DisableGPUMetrics bool - UpdateInterval time.Duration + KubeContextName string + DakrURL string + ClusterToken string + DisableGPUMetrics bool + UpdateInterval time.Duration NodeMetricsInterval time.Duration ClusterSnapshotInterval time.Duration BufferSize int @@ -397,13 +393,11 @@ func (r *CollectionPolicyReconciler) createNewConfig( ExcludedCSINodes: envSpec.Exclusions.ExcludedNodes, // Same as nodes // Policies - KubeContextName: envSpec.Policies.KubeContextName, - DakrURL: envSpec.Policies.DakrURL, - ClusterToken: envSpec.Policies.ClusterToken, - PrometheusURL: envSpec.Policies.PrometheusURL, - DisableNetworkIOMetrics: envSpec.Policies.DisableNetworkIOMetrics, - DisableGPUMetrics: envSpec.Policies.DisableGPUMetrics, - MaskSecretData: envSpec.Policies.MaskSecretData, + KubeContextName: envSpec.Policies.KubeContextName, + DakrURL: envSpec.Policies.DakrURL, + ClusterToken: envSpec.Policies.ClusterToken, + DisableGPUMetrics: envSpec.Policies.DisableGPUMetrics, + MaskSecretData: envSpec.Policies.MaskSecretData, DisabledCollectors: envSpec.Policies.DisabledCollectors, BufferSize: envSpec.Policies.BufferSize, } @@ -1000,8 +994,6 @@ func (r *CollectionPolicyReconciler) identifyAffectedCollectors( // Check if the special node collectors are affected by the update interval change if oldConfig.UpdateInterval != newConfig.UpdateInterval || - oldConfig.PrometheusURL != newConfig.PrometheusURL || - oldConfig.DisableNetworkIOMetrics != newConfig.DisableNetworkIOMetrics || oldConfig.DisableGPUMetrics != newConfig.DisableGPUMetrics { affectedCollectors["node"] = true affectedCollectors["container_resource"] = true @@ -1036,39 +1028,6 @@ func (r *CollectionPolicyReconciler) restartCollectors( r.RestartInProgress = false }() - // Check Prometheus availability if URL changed or metrics configuration changed - if newConfig.PrometheusURL != "" && - (r.CurrentConfig.PrometheusURL != newConfig.PrometheusURL || - r.CurrentConfig.DisableNetworkIOMetrics != newConfig.DisableNetworkIOMetrics || - r.CurrentConfig.DisableGPUMetrics != newConfig.DisableGPUMetrics) { - logger.Info( - "Prometheus configuration changed, checking availability", - "url", - newConfig.PrometheusURL, - ) - - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_DEBUG, - "CollectionPolicyReconciler_restartCollectors", - "Prometheus or metrics configuration changed", - nil, - map[string]string{ - "prometheus_url": fmt.Sprintf("%v", 
newConfig.PrometheusURL), - "zxporter_version": version.Get().String(), - }, - ) - - prometheusAvailable := r.waitForPrometheusAvailability(ctx, newConfig.PrometheusURL) - if !prometheusAvailable { - logger.Info( - "Prometheus is not available after waiting, will continue with restart but metrics may be limited", - ) - // We continue with restart, but log the warning - } else { - logger.Info("Prometheus is available, continuing with full metrics collection") - } - } - // Check if the DisabledCollectors list has changed if !reflect.DeepEqual(r.CurrentConfig.DisabledCollectors, newConfig.DisabledCollectors) { logger.Info("Disabled collectors configuration changed, updating affected collectors") @@ -1341,11 +1300,8 @@ func (r *CollectionPolicyReconciler) restartCollectors( r.K8sClient, metricsClient, collector.ContainerResourceCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, - UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: newConfig.DisableNetworkIOMetrics, - DisableGPUMetrics: newConfig.DisableGPUMetrics, + UpdateInterval: newConfig.UpdateInterval, + DisableGPUMetrics: newConfig.DisableGPUMetrics, }, newConfig.TargetNamespaces, newConfig.ExcludedPods, @@ -1360,11 +1316,8 @@ func (r *CollectionPolicyReconciler) restartCollectors( r.K8sClient, metricsClient, collector.NodeCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, - UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: newConfig.DisableNetworkIOMetrics, - DisableGPUMetrics: newConfig.DisableGPUMetrics, + UpdateInterval: newConfig.UpdateInterval, + DisableGPUMetrics: newConfig.DisableGPUMetrics, }, newConfig.ExcludedNodes, collector.DefaultMaxBatchSize, @@ -1396,9 +1349,7 @@ func (r *CollectionPolicyReconciler) restartCollectors( replacedCollector = collector.NewPersistentVolumeClaimMetricsCollector( r.K8sClient, collector.PersistentVolumeClaimMetricsCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, }, newConfig.TargetNamespaces, newConfig.ExcludedPVCs, @@ -1855,51 +1806,6 @@ func (r *CollectionPolicyReconciler) initializeCollectors( logger := r.Log.WithName("initialize") logger.Info("Initializing collectors", "config", fmt.Sprintf("%+v", config)) - // Check if Prometheus is available if URL is configured - if config.PrometheusURL != "" { - logger.Info("Prometheus URL configured, checking availability", "url", config.PrometheusURL) - prometheusAvailable := r.waitForPrometheusAvailability(ctx, config.PrometheusURL) - if !prometheusAvailable { - logger.Info( - "Prometheus is not available after waiting, will continue initialization but metrics may be limited", - ) - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_DEBUG, - "CollectionPolicyReconciler_initializeCollectors", - "Prometheus is not available after waiting, will continue initialization but metrics may be limited", - nil, - map[string]string{ - "prometheus_url": fmt.Sprintf("%v", config.PrometheusURL), - "zxporter_version": version.Get().String(), - }, - ) - // We continue initialization, but log the warning - } else { - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_INFO, - "CollectionPolicyReconciler_initializeCollectors", - "Prometheus is available, continuing with full metrics collection", - nil, - map[string]string{ - "prometheus_url": fmt.Sprintf("%v", config.PrometheusURL), - "zxporter_version": version.Get().String(), - }, - ) - logger.Info("Prometheus is 
available, continuing with full metrics collection") - } - } else { - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_WARN, - "CollectionPolicyReconciler_initializeCollectors", - "Prometheus URL is empty in config, metrics will be limited", - nil, - map[string]string{ - "prometheus_url": fmt.Sprintf("%v", config.PrometheusURL), - "zxporter_version": version.Get().String(), - }, - ) - } - // Setup collection manager and basic services if err := r.setupCollectionManager(ctx, config, logger); err != nil { if r.TelemetryLogger != nil { @@ -1919,7 +1825,7 @@ func (r *CollectionPolicyReconciler) initializeCollectors( } // Setup and start MPA Server - if err := r.setupMpaServer(); err != nil { + if err := r.setupMpaServer(ctx, config); err != nil { logger.Error(err, "Failed to setup MPA server") // Not fatal } @@ -2071,11 +1977,21 @@ func (r *CollectionPolicyReconciler) setupCollectionManager( } // setupMpaServer initializes and starts the gRPC server -func (r *CollectionPolicyReconciler) setupMpaServer() error { +func (r *CollectionPolicyReconciler) setupMpaServer(ctx context.Context, config *PolicyConfig) error { if r.MpaServer != nil { return nil } - r.MpaServer = server.NewMpaServer(r.Log, nil, r.HealthManager) + + var historicalProvider collector.HistoricalPercentileProvider + if config != nil && r.DakrClient != nil { + fetcher := r.DakrClient.NewPercentileFetcher() + cache := collector.NewHistoricalPercentileCache(r.Log, fetcher, "" /* clusterID resolved from auth token */, r.HealthManager) + go cache.Start(ctx) + historicalProvider = cache + r.Log.Info("Historical percentile cache started with DAKR fetcher") + } + + r.MpaServer = server.NewMpaServer(r.Log, historicalProvider, r.HealthManager) return r.MpaServer.Start(r.MpaServerPort) } @@ -2675,9 +2591,7 @@ func (r *CollectionPolicyReconciler) registerResourceCollectors( collector: collector.NewPersistentVolumeClaimMetricsCollector( r.K8sClient, collector.PersistentVolumeClaimMetricsCollectorConfig{ - PrometheusURL: config.PrometheusURL, UpdateInterval: config.UpdateInterval, - QueryTimeout: 10 * time.Second, }, config.TargetNamespaces, config.ExcludedPVCs, @@ -2757,11 +2671,8 @@ func (r *CollectionPolicyReconciler) registerResourceCollectors( r.K8sClient, metricsClient, collector.ContainerResourceCollectorConfig{ - PrometheusURL: config.PrometheusURL, - UpdateInterval: config.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: config.DisableNetworkIOMetrics, - DisableGPUMetrics: config.DisableGPUMetrics, + UpdateInterval: config.UpdateInterval, + DisableGPUMetrics: config.DisableGPUMetrics, }, config.TargetNamespaces, config.ExcludedPods, @@ -2778,11 +2689,8 @@ func (r *CollectionPolicyReconciler) registerResourceCollectors( r.K8sClient, metricsClient, collector.NodeCollectorConfig{ - PrometheusURL: config.PrometheusURL, - UpdateInterval: config.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: config.DisableNetworkIOMetrics, - DisableGPUMetrics: config.DisableGPUMetrics, + UpdateInterval: config.UpdateInterval, + DisableGPUMetrics: config.DisableGPUMetrics, }, config.ExcludedNodes, collector.DefaultMaxBatchSize, @@ -3515,16 +3423,12 @@ func (r *CollectionPolicyReconciler) handleDisabledCollectorsChange( r.TelemetryLogger, ) case "container_resource": - // Use the reconciler's shared Prometheus metrics instance replacedCollector = collector.NewContainerResourceCollector( r.K8sClient, metricsClient, collector.ContainerResourceCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, - 
UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: newConfig.DisableNetworkIOMetrics, - DisableGPUMetrics: newConfig.DisableGPUMetrics, + UpdateInterval: newConfig.UpdateInterval, + DisableGPUMetrics: newConfig.DisableGPUMetrics, }, newConfig.TargetNamespaces, newConfig.ExcludedPods, @@ -3548,9 +3452,7 @@ func (r *CollectionPolicyReconciler) handleDisabledCollectorsChange( replacedCollector = collector.NewPersistentVolumeClaimMetricsCollector( r.K8sClient, collector.PersistentVolumeClaimMetricsCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, }, newConfig.TargetNamespaces, newConfig.ExcludedPVCs, @@ -3736,17 +3638,12 @@ func (r *CollectionPolicyReconciler) handleDisabledCollectorsChange( /// Cluster wide resources ////////////////////////////////////////////////////////////////////////////////// case "node": - // Use the reconciler's shared Prometheus metrics instance - replacedCollector = collector.NewNodeCollector( r.K8sClient, metricsClient, collector.NodeCollectorConfig{ - PrometheusURL: newConfig.PrometheusURL, - UpdateInterval: newConfig.UpdateInterval, - QueryTimeout: 10 * time.Second, - DisableNetworkIOMetrics: newConfig.DisableNetworkIOMetrics, - DisableGPUMetrics: newConfig.DisableGPUMetrics, + UpdateInterval: newConfig.UpdateInterval, + DisableGPUMetrics: newConfig.DisableGPUMetrics, }, newConfig.ExcludedNodes, collector.DefaultMaxBatchSize, @@ -3993,136 +3890,6 @@ func (r *CollectionPolicyReconciler) handleDisabledCollectorsChange( return nil } -// waitForPrometheusAvailability checks if Prometheus is available with retry logic -// Returns true if Prometheus is available, false if not after maxRetries -func (r *CollectionPolicyReconciler) waitForPrometheusAvailability( - ctx context.Context, - prometheusURL string, -) bool { - logger := r.Log.WithName("prometheus-check") - - if prometheusURL == "" { - logger.Info("No Prometheus URL configured, skipping availability check") - return true - } - - logger.Info("Checking Prometheus availability", "url", prometheusURL) - - // Configuration for retry mechanism - initialBackoff := 5 * time.Second - maxBackoff := 2 * time.Minute - backoff := initialBackoff - maxRetries := 12 // About 25 minutes total with exponential backoff - - // Clone DefaultTransport to preserve proxy settings, TLS config, and connection pooling - tr := http.DefaultTransport.(*http.Transport).Clone() - tr.DisableCompression = false // Ensure gzip compression is enabled for responses - client := &http.Client{ - Timeout: 5 * time.Second, - Transport: tr, - } - - // Endpoint to verify prometheus is ready - healthEndpoint := fmt.Sprintf("%s/-/ready", prometheusURL) - if !strings.HasPrefix(prometheusURL, "http") { - healthEndpoint = fmt.Sprintf("http://%s/-/ready", prometheusURL) - } - - for i := 0; i < maxRetries; i++ { - select { - case <-ctx.Done(): - logger.Info("Context cancelled while checking Prometheus availability") - return false - default: - // Try to contact Prometheus - req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthEndpoint, nil) - if err != nil { - logger.Error(err, "Failed to create request for Prometheus health check") - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "CollectionPolicyReconciler_waitForPrometheusAvailability", - "Failed to start collector", - err, - map[string]string{ - "prometheus_url": prometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - 
time.Sleep(backoff) - backoff = min(backoff*2, maxBackoff) - continue - } - - resp, err := client.Do(req) - if err != nil { - logger.Info("Prometheus not yet available", - "attempt", i+1, - "maxRetries", maxRetries, - "backoff", backoff.String(), - "error", err.Error()) - time.Sleep(backoff) - backoff = min(backoff*2, maxBackoff) - continue - } - - // Check response status - if resp.StatusCode == http.StatusOK { - logger.Info("Prometheus is available", "statusCode", resp.StatusCode) - resp.Body.Close() - r.updateHealthStatus( - health.HealthStatusHealthy, - "Prometheus available", - map[string]string{"url": prometheusURL}, - ) - return true - } - - logger.Info("Prometheus returned non-OK status", - "statusCode", resp.StatusCode, - "attempt", i+1, - "maxRetries", maxRetries) - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "CollectionPolicyReconciler_waitForPrometheusAvailability", - "Prometheus returned non-OK status", - err, - map[string]string{ - "prometheus_url": prometheusURL, - "zxporter_version": version.Get().String(), - }, - ) - resp.Body.Close() - time.Sleep(backoff) - backoff = min(backoff*2, maxBackoff) - } - } - - logger.Info( - "Prometheus availability check failed after maximum retries", - "maxRetries", - maxRetries, - ) - r.TelemetryLogger.Report( - gen.LogLevel_LOG_LEVEL_ERROR, - "CollectionPolicyReconciler_waitForPrometheusAvailability", - "Prometheus availability check failed after maximum retries", - fmt.Errorf("prometheus not available"), - map[string]string{ - "prometheus_url": prometheusURL, - "max_retries": fmt.Sprintf("%v", maxRetries), - "zxporter_version": version.Get().String(), - }, - ) - r.updateHealthStatus( - health.HealthStatusUnhealthy, - "Prometheus unavailable after retries", - map[string]string{ - "url": prometheusURL, - "max_retries": fmt.Sprintf("%d", maxRetries), - }, - ) - return false -} // SetupWithManager sets up the controller with the Manager. func (r *CollectionPolicyReconciler) SetupWithManager(mgr ctrl.Manager) error { diff --git a/internal/nodemon/cadvisor_scraper.go b/internal/nodemon/cadvisor_scraper.go new file mode 100644 index 00000000..0024baa7 --- /dev/null +++ b/internal/nodemon/cadvisor_scraper.go @@ -0,0 +1,299 @@ +package nodemon + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/go-logr/logr" + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" +) + +// podNetworkSentinel is a placeholder container name used as the grouping key +// for network metrics that carry no container label (only namespace + pod). +const podNetworkSentinel = "__pod__" + +// containerKey is the composite identity for grouping per-container counters. +type containerKey struct { + namespace string + pod string + container string +} + +// rawCounters accumulates the latest scraped counter values for a single key. +type rawCounters struct { + cpuThrottledPeriods float64 + cpuTotalPeriods float64 + + fsReadBytes float64 + fsWriteBytes float64 + fsReadOps float64 + fsWriteOps float64 + + netRxPackets float64 + netTxPackets float64 + netRxErrors float64 + netTxErrors float64 + netRxDrops float64 + netTxDrops float64 +} + +// CAdvisorScraper fetches the kubelet /metrics/cadvisor endpoint and computes +// per-second rates for CPU throttle, disk I/O, and network counters. 
+type CAdvisorScraper struct { + url string + httpClient HTTPClient + rates *RateCalculator + log logr.Logger +} + +// NewCAdvisorScraper constructs a CAdvisorScraper that will scrape +// baseURL + "/metrics/cadvisor". +func NewCAdvisorScraper(baseURL string, httpClient HTTPClient, log logr.Logger) *CAdvisorScraper { + return &CAdvisorScraper{ + url: baseURL + "/metrics/cadvisor", + httpClient: httpClient, + rates: NewRateCalculator(), + log: log.WithName("cadvisor-scraper"), + } +} + +// Scrape fetches cAdvisor metrics, computes rates, and returns per-container +// results. The first call returns an empty slice because no baseline has been +// established for rate computation yet. +func (s *CAdvisorScraper) Scrape(ctx context.Context, now time.Time) ([]CAdvisorContainerMetrics, error) { + families, err := s.fetchMetrics(ctx) + if err != nil { + return nil, err + } + + counters := s.groupCounters(families) + return s.computeRates(counters, now), nil +} + +// fetchMetrics performs the HTTP request and parses the Prometheus text body. +func (s *CAdvisorScraper) fetchMetrics(ctx context.Context) (map[string]*dto.MetricFamily, error) { + ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctxWithTimeout, http.MethodGet, s.url, nil) + if err != nil { + return nil, fmt.Errorf("cadvisor: cannot create request: %w", err) + } + + resp, err := s.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("cadvisor: HTTP request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("cadvisor: request failed with status %d", resp.StatusCode) + } + + parser := expfmt.NewTextParser(model.UTF8Validation) + families, err := parser.TextToMetricFamilies(resp.Body) + if err != nil { + return nil, fmt.Errorf("cadvisor: cannot parse metrics: %w", err) + } + + return families, nil +} + +// labelValue returns the value of a label by name from a metric, or "" if absent. +func labelValue(m *dto.Metric, name string) string { + for _, lp := range m.GetLabel() { + if lp.GetName() == name { + return lp.GetValue() + } + } + return "" +} + +// counterValue returns the float64 value of a counter metric. +func counterValue(m *dto.Metric) float64 { + if c := m.GetCounter(); c != nil { + return c.GetValue() + } + return 0 +} + +// groupCounters walks the parsed metric families and accumulates raw counter +// values into a map keyed by (namespace, pod, container). +func (s *CAdvisorScraper) groupCounters(families map[string]*dto.MetricFamily) map[containerKey]*rawCounters { + groups := make(map[containerKey]*rawCounters) + + // Helper that returns (or lazily creates) the rawCounters entry for a key. + get := func(k containerKey) *rawCounters { + if groups[k] == nil { + groups[k] = &rawCounters{} + } + return groups[k] + } + + // Per-container metrics (have a "container" label). + applyContainerMetric := func(family *dto.MetricFamily, apply func(rc *rawCounters, v float64)) { + if family == nil { + return + } + for _, m := range family.GetMetric() { + ns := labelValue(m, "namespace") + pod := labelValue(m, "pod") + container := labelValue(m, "container") + if ns == "" || pod == "" || container == "" { + continue + } + apply(get(containerKey{ns, pod, container}), counterValue(m)) + } + } + + // Per-pod network metrics (no "container" label — use sentinel). 
+ applyNetworkMetric := func(family *dto.MetricFamily, apply func(rc *rawCounters, v float64)) { + if family == nil { + return + } + for _, m := range family.GetMetric() { + ns := labelValue(m, "namespace") + pod := labelValue(m, "pod") + if ns == "" || pod == "" { + continue + } + apply(get(containerKey{ns, pod, podNetworkSentinel}), counterValue(m)) + } + } + + applyContainerMetric(families["container_cpu_cfs_throttled_periods_total"], + func(rc *rawCounters, v float64) { rc.cpuThrottledPeriods = v }) + applyContainerMetric(families["container_cpu_cfs_periods_total"], + func(rc *rawCounters, v float64) { rc.cpuTotalPeriods = v }) + + applyContainerMetric(families["container_fs_reads_bytes_total"], + func(rc *rawCounters, v float64) { rc.fsReadBytes = v }) + applyContainerMetric(families["container_fs_writes_bytes_total"], + func(rc *rawCounters, v float64) { rc.fsWriteBytes = v }) + applyContainerMetric(families["container_fs_reads_total"], + func(rc *rawCounters, v float64) { rc.fsReadOps = v }) + applyContainerMetric(families["container_fs_writes_total"], + func(rc *rawCounters, v float64) { rc.fsWriteOps = v }) + + applyNetworkMetric(families["container_network_receive_packets_total"], + func(rc *rawCounters, v float64) { rc.netRxPackets = v }) + applyNetworkMetric(families["container_network_transmit_packets_total"], + func(rc *rawCounters, v float64) { rc.netTxPackets = v }) + applyNetworkMetric(families["container_network_receive_errors_total"], + func(rc *rawCounters, v float64) { rc.netRxErrors = v }) + applyNetworkMetric(families["container_network_transmit_errors_total"], + func(rc *rawCounters, v float64) { rc.netTxErrors = v }) + applyNetworkMetric(families["container_network_receive_packets_dropped_total"], + func(rc *rawCounters, v float64) { rc.netRxDrops = v }) + applyNetworkMetric(families["container_network_transmit_packets_dropped_total"], + func(rc *rawCounters, v float64) { rc.netTxDrops = v }) + + return groups +} + +// computeRates turns raw counter snapshots into per-second rate metrics. +// It merges per-container CPU/disk entries with their corresponding pod-level +// network entries so that the final result has one record per real container. +func (s *CAdvisorScraper) computeRates(groups map[containerKey]*rawCounters, now time.Time) []CAdvisorContainerMetrics { + // First pass: compute network rates from the pod-level sentinel entries and index them by (namespace, pod). + // Second pass: compute CPU throttle and disk rates for each real container entry and attach the matching network rates. + + // Build a pod→network-rates lookup from sentinel entries.
+ type netRates struct { + rxPkts float64 + txPkts float64 + rxErr float64 + txErr float64 + rxDrop float64 + txDrop float64 + } + podNet := make(map[[2]string]netRates) // key: [namespace, pod] + + for k, rc := range groups { + if k.container != podNetworkSentinel { + continue + } + entity := "cadvisor/" + k.namespace + "/" + k.pod + "/" + k.container + nr := netRates{ + rxPkts: s.rates.Rate(entity, "net_rx_packets", rc.netRxPackets, now), + txPkts: s.rates.Rate(entity, "net_tx_packets", rc.netTxPackets, now), + rxErr: s.rates.Rate(entity, "net_rx_errors", rc.netRxErrors, now), + txErr: s.rates.Rate(entity, "net_tx_errors", rc.netTxErrors, now), + rxDrop: s.rates.Rate(entity, "net_rx_drops", rc.netRxDrops, now), + txDrop: s.rates.Rate(entity, "net_tx_drops", rc.netTxDrops, now), + } + podNet[[2]string{k.namespace, k.pod}] = nr + } + + // Second pass — real container entries. + var results []CAdvisorContainerMetrics + + for k, rc := range groups { + if k.container == podNetworkSentinel { + continue + } + + entity := "cadvisor/" + k.namespace + "/" + k.pod + "/" + k.container + + throttledRate := s.rates.Rate(entity, "cpu_throttled_periods", rc.cpuThrottledPeriods, now) + totalRate := s.rates.Rate(entity, "cpu_total_periods", rc.cpuTotalPeriods, now) + diskRB := s.rates.Rate(entity, "fs_read_bytes", rc.fsReadBytes, now) + diskWB := s.rates.Rate(entity, "fs_write_bytes", rc.fsWriteBytes, now) + diskRO := s.rates.Rate(entity, "fs_read_ops", rc.fsReadOps, now) + diskWO := s.rates.Rate(entity, "fs_write_ops", rc.fsWriteOps, now) + + // All counter rates are 0 on first scrape — skip this entry. + if throttledRate == 0 && totalRate == 0 && + diskRB == 0 && diskWB == 0 && diskRO == 0 && diskWO == 0 { + // Check if there are any network rates too. + nr := podNet[[2]string{k.namespace, k.pod}] + if nr.rxPkts == 0 && nr.txPkts == 0 && nr.rxErr == 0 && + nr.txErr == 0 && nr.rxDrop == 0 && nr.txDrop == 0 { + continue + } + } + + var throttleFraction float64 + if totalRate > 0 { + throttleFraction = throttledRate / totalRate + if throttleFraction > 1 { + throttleFraction = 1 + } + } + + nr := podNet[[2]string{k.namespace, k.pod}] + + results = append(results, CAdvisorContainerMetrics{ + Namespace: k.namespace, + Pod: k.pod, + Container: k.container, + + NetworkRxPacketsPerSec: nr.rxPkts, + NetworkTxPacketsPerSec: nr.txPkts, + NetworkRxErrorsPerSec: nr.rxErr, + NetworkTxErrorsPerSec: nr.txErr, + NetworkRxDropsPerSec: nr.rxDrop, + NetworkTxDropsPerSec: nr.txDrop, + + DiskReadBytesPerSec: diskRB, + DiskWriteBytesPerSec: diskWB, + DiskReadOpsPerSec: diskRO, + DiskWriteOpsPerSec: diskWO, + + CPUThrottleFraction: throttleFraction, + }) + } + + return results +} diff --git a/internal/nodemon/cadvisor_scraper_test.go b/internal/nodemon/cadvisor_scraper_test.go new file mode 100644 index 00000000..509b1a95 --- /dev/null +++ b/internal/nodemon/cadvisor_scraper_test.go @@ -0,0 +1,204 @@ +package nodemon_test + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/go-logr/zapr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/devzero-inc/zxporter/internal/nodemon" +) + +// sample1 is the first scrape — sets the counter baseline. +const cadvisorSample1 = `# HELP container_cpu_cfs_throttled_periods_total Total throttled CPU periods. 
+# TYPE container_cpu_cfs_throttled_periods_total counter +container_cpu_cfs_throttled_periods_total{container="nginx",namespace="default",pod="web-abc"} 300 +# HELP container_cpu_cfs_periods_total Total CPU CFS periods. +# TYPE container_cpu_cfs_periods_total counter +container_cpu_cfs_periods_total{container="nginx",namespace="default",pod="web-abc"} 1000 +# HELP container_fs_reads_bytes_total Total bytes read from filesystem. +# TYPE container_fs_reads_bytes_total counter +container_fs_reads_bytes_total{container="nginx",namespace="default",pod="web-abc"} 409600 +# HELP container_fs_writes_bytes_total Total bytes written to filesystem. +# TYPE container_fs_writes_bytes_total counter +container_fs_writes_bytes_total{container="nginx",namespace="default",pod="web-abc"} 0 +# HELP container_fs_reads_total Total filesystem read operations. +# TYPE container_fs_reads_total counter +container_fs_reads_total{container="nginx",namespace="default",pod="web-abc"} 100 +# HELP container_fs_writes_total Total filesystem write operations. +# TYPE container_fs_writes_total counter +container_fs_writes_total{container="nginx",namespace="default",pod="web-abc"} 50 +# HELP container_network_receive_packets_total Total network packets received. +# TYPE container_network_receive_packets_total counter +container_network_receive_packets_total{namespace="default",pod="web-abc"} 1000 +# HELP container_network_transmit_packets_total Total network packets transmitted. +# TYPE container_network_transmit_packets_total counter +container_network_transmit_packets_total{namespace="default",pod="web-abc"} 800 +# HELP container_network_receive_errors_total Total network receive errors. +# TYPE container_network_receive_errors_total counter +container_network_receive_errors_total{namespace="default",pod="web-abc"} 10 +# HELP container_network_transmit_errors_total Total network transmit errors. +# TYPE container_network_transmit_errors_total counter +container_network_transmit_errors_total{namespace="default",pod="web-abc"} 5 +# HELP container_network_receive_packets_dropped_total Total network receive packet drops. +# TYPE container_network_receive_packets_dropped_total counter +container_network_receive_packets_dropped_total{namespace="default",pod="web-abc"} 2 +# HELP container_network_transmit_packets_dropped_total Total network transmit packet drops. +# TYPE container_network_transmit_packets_dropped_total counter +container_network_transmit_packets_dropped_total{namespace="default",pod="web-abc"} 1 +` + +// sample2 is the second scrape — 30 seconds later with incremented counters. +// throttle fraction: (330-300)/(1100-1000) = 30/100 = 0.30 +// disk read bytes/sec: (532480-409600)/30 = 122880/30 = 4096.0 +// disk write bytes/sec: (61440-0)/30 = 2048.0 +// disk read ops/sec: (130-100)/30 ≈ 1.0 +// disk write ops/sec: (80-50)/30 = 1.0 +// network rx packets/sec: (1300-1000)/30 = 10.0 +// network tx packets/sec: (1100-800)/30 ≈ 10.0 +const cadvisorSample2 = `# HELP container_cpu_cfs_throttled_periods_total Total throttled CPU periods. +# TYPE container_cpu_cfs_throttled_periods_total counter +container_cpu_cfs_throttled_periods_total{container="nginx",namespace="default",pod="web-abc"} 330 +# HELP container_cpu_cfs_periods_total Total CPU CFS periods. +# TYPE container_cpu_cfs_periods_total counter +container_cpu_cfs_periods_total{container="nginx",namespace="default",pod="web-abc"} 1100 +# HELP container_fs_reads_bytes_total Total bytes read from filesystem. 
+# TYPE container_fs_reads_bytes_total counter +container_fs_reads_bytes_total{container="nginx",namespace="default",pod="web-abc"} 532480 +# HELP container_fs_writes_bytes_total Total bytes written to filesystem. +# TYPE container_fs_writes_bytes_total counter +container_fs_writes_bytes_total{container="nginx",namespace="default",pod="web-abc"} 61440 +# HELP container_fs_reads_total Total filesystem read operations. +# TYPE container_fs_reads_total counter +container_fs_reads_total{container="nginx",namespace="default",pod="web-abc"} 130 +# HELP container_fs_writes_total Total filesystem write operations. +# TYPE container_fs_writes_total counter +container_fs_writes_total{container="nginx",namespace="default",pod="web-abc"} 80 +# HELP container_network_receive_packets_total Total network packets received. +# TYPE container_network_receive_packets_total counter +container_network_receive_packets_total{namespace="default",pod="web-abc"} 1300 +# HELP container_network_transmit_packets_total Total network packets transmitted. +# TYPE container_network_transmit_packets_total counter +container_network_transmit_packets_total{namespace="default",pod="web-abc"} 1100 +# HELP container_network_receive_errors_total Total network receive errors. +# TYPE container_network_receive_errors_total counter +container_network_receive_errors_total{namespace="default",pod="web-abc"} 10 +# HELP container_network_transmit_errors_total Total network transmit errors. +# TYPE container_network_transmit_errors_total counter +container_network_transmit_errors_total{namespace="default",pod="web-abc"} 5 +# HELP container_network_receive_packets_dropped_total Total network receive packet drops. +# TYPE container_network_receive_packets_dropped_total counter +container_network_receive_packets_dropped_total{namespace="default",pod="web-abc"} 2 +# HELP container_network_transmit_packets_dropped_total Total network transmit packet drops. +# TYPE container_network_transmit_packets_dropped_total counter +container_network_transmit_packets_dropped_total{namespace="default",pod="web-abc"} 1 +` + +func newCAdvisorTestLogger() logr.Logger { + zapLog, _ := zap.NewDevelopment() + return zapr.NewLogger(zapLog) +} + +// TestCAdvisorScraper_FirstScrapeReturnsEmpty verifies that the first scrape +// returns an empty slice because there is no baseline for rate computation. +func TestCAdvisorScraper_FirstScrapeReturnsEmpty(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + _, _ = w.Write([]byte(cadvisorSample1)) + })) + defer srv.Close() + + log := newCAdvisorTestLogger() + scraper := nodemon.NewCAdvisorScraper(srv.URL, srv.Client(), log) + + now := time.Now() + results, err := scraper.Scrape(context.Background(), now) + + require.NoError(t, err) + assert.Empty(t, results, "first scrape should return empty slice (no baseline for rates)") +} + +// TestCAdvisorScraper_ComputesRatesAfterTwoScrapes verifies that the second +// scrape returns correct per-second rates computed from the counter deltas. 
+func TestCAdvisorScraper_ComputesRatesAfterTwoScrapes(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + callCount++ + if callCount == 1 { + _, _ = w.Write([]byte(cadvisorSample1)) + } else { + _, _ = w.Write([]byte(cadvisorSample2)) + } + })) + defer srv.Close() + + log := newCAdvisorTestLogger() + scraper := nodemon.NewCAdvisorScraper(srv.URL, srv.Client(), log) + + base := time.Now() + // First scrape — seeds the baseline counters. + _, err := scraper.Scrape(context.Background(), base) + require.NoError(t, err) + + // Second scrape — 30 seconds later with incremented counters. + results, err := scraper.Scrape(context.Background(), base.Add(30*time.Second)) + require.NoError(t, err) + require.Len(t, results, 1, "should return one container entry") + + m := results[0] + assert.Equal(t, "default", m.Namespace) + assert.Equal(t, "web-abc", m.Pod) + assert.Equal(t, "nginx", m.Container) + + // CPU throttle: (330-300)/(1100-1000) = 30/100 = 0.30 + assert.InDelta(t, 0.30, m.CPUThrottleFraction, 0.0001, "CPU throttle fraction") + + // Disk read bytes/sec: (532480-409600)/30 = 4096.0 + assert.InDelta(t, 4096.0, m.DiskReadBytesPerSec, 0.01, "disk read bytes/sec") + + // Disk write bytes/sec: (61440-0)/30 = 2048.0 + assert.InDelta(t, 2048.0, m.DiskWriteBytesPerSec, 0.01, "disk write bytes/sec") + + // Disk read ops/sec: (130-100)/30 = 1.0 + assert.InDelta(t, 1.0, m.DiskReadOpsPerSec, 0.001, "disk read ops/sec") + + // Disk write ops/sec: (80-50)/30 = 1.0 + assert.InDelta(t, 1.0, m.DiskWriteOpsPerSec, 0.001, "disk write ops/sec") + + // Network rx packets/sec: (1300-1000)/30 = 10.0 + assert.InDelta(t, 10.0, m.NetworkRxPacketsPerSec, 0.001, "network rx packets/sec") + + // Network tx packets/sec: (1100-800)/30 = 10.0 + assert.InDelta(t, 10.0, m.NetworkTxPacketsPerSec, 0.001, "network tx packets/sec") + + // Network errors unchanged → rate = 0 + assert.InDelta(t, 0.0, m.NetworkRxErrorsPerSec, 0.001, "network rx errors/sec (no change)") + assert.InDelta(t, 0.0, m.NetworkTxErrorsPerSec, 0.001, "network tx errors/sec (no change)") + + // Network drops unchanged → rate = 0 + assert.InDelta(t, 0.0, m.NetworkRxDropsPerSec, 0.001, "network rx drops/sec (no change)") + assert.InDelta(t, 0.0, m.NetworkTxDropsPerSec, 0.001, "network tx drops/sec (no change)") +} + +// TestCAdvisorScraper_HandlesHTTPError verifies that an HTTP error from the +// cAdvisor endpoint is propagated as an error (not silently ignored). +func TestCAdvisorScraper_HandlesHTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + log := newCAdvisorTestLogger() + scraper := nodemon.NewCAdvisorScraper(srv.URL, srv.Client(), log) + + _, err := scraper.Scrape(context.Background(), time.Now()) + require.Error(t, err, "HTTP 500 should result in an error") +} diff --git a/internal/nodemon/cadvisor_types.go b/internal/nodemon/cadvisor_types.go new file mode 100644 index 00000000..6f65cd38 --- /dev/null +++ b/internal/nodemon/cadvisor_types.go @@ -0,0 +1,25 @@ +package nodemon + +// CAdvisorContainerMetrics holds rate-computed metrics from cAdvisor for a single container. 
+type CAdvisorContainerMetrics struct { + Namespace string + Pod string + Container string + + // Network rates (per second) + NetworkRxPacketsPerSec float64 + NetworkTxPacketsPerSec float64 + NetworkRxErrorsPerSec float64 + NetworkTxErrorsPerSec float64 + NetworkRxDropsPerSec float64 + NetworkTxDropsPerSec float64 + + // Disk I/O rates (per second) + DiskReadBytesPerSec float64 + DiskWriteBytesPerSec float64 + DiskReadOpsPerSec float64 + DiskWriteOpsPerSec float64 + + // CPU throttle (ratio 0-1) + CPUThrottleFraction float64 +} diff --git a/internal/nodemon/integration_test.go b/internal/nodemon/integration_test.go new file mode 100644 index 00000000..fe2ab882 --- /dev/null +++ b/internal/nodemon/integration_test.go @@ -0,0 +1,306 @@ +package nodemon_test + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/devzero-inc/zxporter/internal/nodemon" +) + +// integrationStatsSummaryJSON is the stats/summary payload used throughout the +// integration test. It contains one pod ("web-abc" in "default") with one +// container ("nginx"), a network interface, and a PVC-backed volume. +const integrationStatsSummaryJSON = `{ + "node": { + "nodeName": "integration-node", + "network": { + "interfaces": [ + {"name": "eth0", "rxBytes": 5000, "txBytes": 6000} + ] + } + }, + "pods": [ + { + "podRef": { + "name": "web-abc", + "namespace": "default" + }, + "containers": [ + { + "name": "nginx", + "cpu": { + "time": "2024-01-01T00:00:00Z", + "usageNanoCores": 50000000, + "usageCoreNanoSeconds": 9000000000 + }, + "memory": { + "time": "2024-01-01T00:00:00Z", + "usageBytes": 209715200, + "workingSetBytes": 104857600, + "rssBytes": 52428800, + "availableBytes": 1073741824, + "pageFaults": 5, + "majorPageFaults": 0 + } + } + ], + "network": { + "interfaces": [ + {"name": "eth0", "rxBytes": 1024, "txBytes": 2048} + ] + }, + "volume": [ + { + "name": "data-pvc", + "pvcRef": { + "name": "data-claim", + "namespace": "default" + }, + "usedBytes": 536870912, + "capacityBytes": 5368709120, + "availableBytes": 4831838208 + } + ] + } + ] +}` + +// integrationCAdvisorBaseline is the first cAdvisor scrape — it seeds counter +// baselines without producing any rate output. +const integrationCAdvisorBaseline = `# HELP container_cpu_cfs_throttled_periods_total Total throttled CPU periods. +# TYPE container_cpu_cfs_throttled_periods_total counter +container_cpu_cfs_throttled_periods_total{container="nginx",namespace="default",pod="web-abc"} 300 +# HELP container_cpu_cfs_periods_total Total CPU CFS periods. +# TYPE container_cpu_cfs_periods_total counter +container_cpu_cfs_periods_total{container="nginx",namespace="default",pod="web-abc"} 1000 +# HELP container_fs_reads_bytes_total Total bytes read from filesystem. +# TYPE container_fs_reads_bytes_total counter +container_fs_reads_bytes_total{container="nginx",namespace="default",pod="web-abc"} 409600 +# HELP container_fs_writes_bytes_total Total bytes written to filesystem. +# TYPE container_fs_writes_bytes_total counter +container_fs_writes_bytes_total{container="nginx",namespace="default",pod="web-abc"} 0 +# HELP container_fs_reads_total Total filesystem read operations. +# TYPE container_fs_reads_total counter +container_fs_reads_total{container="nginx",namespace="default",pod="web-abc"} 100 +# HELP container_fs_writes_total Total filesystem write operations. 
+# TYPE container_fs_writes_total counter +container_fs_writes_total{container="nginx",namespace="default",pod="web-abc"} 50 +# HELP container_network_receive_packets_total Total network packets received. +# TYPE container_network_receive_packets_total counter +container_network_receive_packets_total{namespace="default",pod="web-abc"} 1000 +# HELP container_network_transmit_packets_total Total network packets transmitted. +# TYPE container_network_transmit_packets_total counter +container_network_transmit_packets_total{namespace="default",pod="web-abc"} 800 +# HELP container_network_receive_errors_total Total network receive errors. +# TYPE container_network_receive_errors_total counter +container_network_receive_errors_total{namespace="default",pod="web-abc"} 10 +# HELP container_network_transmit_errors_total Total network transmit errors. +# TYPE container_network_transmit_errors_total counter +container_network_transmit_errors_total{namespace="default",pod="web-abc"} 5 +# HELP container_network_receive_packets_dropped_total Total network receive drops. +# TYPE container_network_receive_packets_dropped_total counter +container_network_receive_packets_dropped_total{namespace="default",pod="web-abc"} 2 +# HELP container_network_transmit_packets_dropped_total Total network transmit drops. +# TYPE container_network_transmit_packets_dropped_total counter +container_network_transmit_packets_dropped_total{namespace="default",pod="web-abc"} 1 +` + +// integrationCAdvisorSecond is the second cAdvisor scrape — counters are +// incremented 30 seconds after the baseline. Expected rates: +// +// CPU throttle fraction : (330-300)/(1100-1000) = 30/100 = 0.30 +// disk read bytes/sec : (532480-409600)/30 = 4096.0 +// disk write bytes/sec : (61440-0)/30 = 2048.0 +// disk read ops/sec : (130-100)/30 ≈ 1.0 +// disk write ops/sec : (80-50)/30 = 1.0 +// net rx packets/sec : (1300-1000)/30 = 10.0 +// net tx packets/sec : (1100-800)/30 = 10.0 +const integrationCAdvisorSecond = `# HELP container_cpu_cfs_throttled_periods_total Total throttled CPU periods. +# TYPE container_cpu_cfs_throttled_periods_total counter +container_cpu_cfs_throttled_periods_total{container="nginx",namespace="default",pod="web-abc"} 330 +# HELP container_cpu_cfs_periods_total Total CPU CFS periods. +# TYPE container_cpu_cfs_periods_total counter +container_cpu_cfs_periods_total{container="nginx",namespace="default",pod="web-abc"} 1100 +# HELP container_fs_reads_bytes_total Total bytes read from filesystem. +# TYPE container_fs_reads_bytes_total counter +container_fs_reads_bytes_total{container="nginx",namespace="default",pod="web-abc"} 532480 +# HELP container_fs_writes_bytes_total Total bytes written to filesystem. +# TYPE container_fs_writes_bytes_total counter +container_fs_writes_bytes_total{container="nginx",namespace="default",pod="web-abc"} 61440 +# HELP container_fs_reads_total Total filesystem read operations. +# TYPE container_fs_reads_total counter +container_fs_reads_total{container="nginx",namespace="default",pod="web-abc"} 130 +# HELP container_fs_writes_total Total filesystem write operations. +# TYPE container_fs_writes_total counter +container_fs_writes_total{container="nginx",namespace="default",pod="web-abc"} 80 +# HELP container_network_receive_packets_total Total network packets received. +# TYPE container_network_receive_packets_total counter +container_network_receive_packets_total{namespace="default",pod="web-abc"} 1300 +# HELP container_network_transmit_packets_total Total network packets transmitted. 
+# TYPE container_network_transmit_packets_total counter +container_network_transmit_packets_total{namespace="default",pod="web-abc"} 1100 +# HELP container_network_receive_errors_total Total network receive errors. +# TYPE container_network_receive_errors_total counter +container_network_receive_errors_total{namespace="default",pod="web-abc"} 10 +# HELP container_network_transmit_errors_total Total network transmit errors. +# TYPE container_network_transmit_errors_total counter +container_network_transmit_errors_total{namespace="default",pod="web-abc"} 5 +# HELP container_network_receive_packets_dropped_total Total network receive drops. +# TYPE container_network_receive_packets_dropped_total counter +container_network_receive_packets_dropped_total{namespace="default",pod="web-abc"} 2 +# HELP container_network_transmit_packets_dropped_total Total network transmit drops. +# TYPE container_network_transmit_packets_dropped_total counter +container_network_transmit_packets_dropped_total{namespace="default",pod="web-abc"} 1 +` + +// TestIntegration_FullMetricsFlow exercises the complete pipeline end-to-end: +// +// 1. Stats/summary poller parses kubelet JSON and returns structured pod data. +// 2. CAdvisor scraper seeds baselines on the first call (no rates yet). +// 3. CAdvisor scraper computes per-second rates on the second call. +// 4. The expected CPU/memory/network fields from stats/summary are correct. +// 5. The expected throttle, disk-I/O, and network-packet rates from cAdvisor +// are computed with the right formulas. +func TestIntegration_FullMetricsFlow(t *testing.T) { + // --- stats/summary mock server ------------------------------------------ + statsSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/stats/summary" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(integrationStatsSummaryJSON)) + })) + defer statsSrv.Close() + + // --- cAdvisor mock server ------------------------------------------------ + cadvisorCallCount := 0 + cadvisorSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + w.WriteHeader(http.StatusOK) + if cadvisorCallCount == 0 { + _, _ = w.Write([]byte(integrationCAdvisorBaseline)) + } else { + _, _ = w.Write([]byte(integrationCAdvisorSecond)) + } + cadvisorCallCount++ + })) + defer cadvisorSrv.Close() + + // --- real components (not mocks) ---------------------------------------- + log := logr.Discard() + poller := nodemon.NewStatsPoller(statsSrv.URL, &http.Client{}, log) + scraper := nodemon.NewCAdvisorScraper(cadvisorSrv.URL, &http.Client{}, log) + + ctx := context.Background() + + // --- first scrape: seeds stats and cAdvisor baseline -------------------- + stats, err := poller.Poll(ctx) + require.NoError(t, err, "first stats/summary poll must succeed") + + t0 := time.Now() + cadvisorResults1, err := scraper.Scrape(ctx, t0) + require.NoError(t, err, "first cAdvisor scrape must not error") + assert.Empty(t, cadvisorResults1, "first cAdvisor scrape should return no rates (baseline only)") + + // --- second scrape: 30 seconds later, rates now available --------------- + stats2, err := poller.Poll(ctx) + require.NoError(t, err, "second stats/summary poll must succeed") + + t1 := t0.Add(30 * time.Second) + cadvisorResults2, err := scraper.Scrape(ctx, t1) + require.NoError(t, err, "second cAdvisor scrape must not 
error") + require.NotEmpty(t, cadvisorResults2, "second cAdvisor scrape should return rate results") + + // ========================================================================= + // Verify stats/summary fields (both polls return the same data) + // ========================================================================= + + for _, s := range []*nodemon.StatsSummary{stats, stats2} { + require.Len(t, s.Pods, 1, "should have exactly one pod") + pod := s.Pods[0] + + // Pod identity + assert.Equal(t, "web-abc", pod.PodRef.Name) + assert.Equal(t, "default", pod.PodRef.Namespace) + + // Container CPU (from stats/summary) + require.Len(t, pod.Containers, 1) + c := pod.Containers[0] + assert.Equal(t, "nginx", c.Name) + require.NotNil(t, c.CPU.UsageNanoCores, "usageNanoCores must be present") + assert.Equal(t, uint64(50000000), *c.CPU.UsageNanoCores, "usageNanoCores") + require.NotNil(t, c.CPU.UsageCoreNanoSeconds) + assert.Equal(t, uint64(9000000000), *c.CPU.UsageCoreNanoSeconds) + + // Container memory (from stats/summary) + require.NotNil(t, c.Memory.WorkingSetBytes, "workingSetBytes must be present") + assert.Equal(t, uint64(104857600), *c.Memory.WorkingSetBytes, "workingSetBytes") + require.NotNil(t, c.Memory.UsageBytes) + assert.Equal(t, uint64(209715200), *c.Memory.UsageBytes, "usageBytes") + + // Pod network bytes (from stats/summary) + require.Len(t, pod.Network.Interfaces, 1) + iface := pod.Network.Interfaces[0] + require.NotNil(t, iface.RxBytes) + assert.Equal(t, uint64(1024), *iface.RxBytes, "network rxBytes") + require.NotNil(t, iface.TxBytes) + assert.Equal(t, uint64(2048), *iface.TxBytes, "network txBytes") + + // PVC volume (from stats/summary) + require.Len(t, pod.VolumeStats, 1) + vol := pod.VolumeStats[0] + assert.Equal(t, "data-pvc", vol.Name) + require.NotNil(t, vol.PVCRef, "pvcRef must be present") + assert.Equal(t, "data-claim", vol.PVCRef.Name) + assert.Equal(t, "default", vol.PVCRef.Namespace) + require.NotNil(t, vol.UsedBytes) + assert.Equal(t, uint64(536870912), *vol.UsedBytes, "volume usedBytes") + require.NotNil(t, vol.CapacityBytes) + assert.Equal(t, uint64(5368709120), *vol.CapacityBytes, "volume capacityBytes") + } + + // ========================================================================= + // Verify cAdvisor rates (second scrape only) + // ========================================================================= + + require.Len(t, cadvisorResults2, 1, "exactly one container entry expected from cAdvisor") + cm := cadvisorResults2[0] + + assert.Equal(t, "default", cm.Namespace) + assert.Equal(t, "web-abc", cm.Pod) + assert.Equal(t, "nginx", cm.Container) + + // CPU throttle fraction: (330-300)/(1100-1000) = 30/100 = 0.30 + assert.InDelta(t, 0.30, cm.CPUThrottleFraction, 0.01, "CPU throttle fraction") + + // Disk I/O rates + // read bytes/sec: (532480 - 409600) / 30 = 4096.0 + assert.InDelta(t, 4096.0, cm.DiskReadBytesPerSec, 1.0, "disk read bytes/sec") + // write bytes/sec: (61440 - 0) / 30 = 2048.0 + assert.InDelta(t, 2048.0, cm.DiskWriteBytesPerSec, 1.0, "disk write bytes/sec") + // read ops/sec: (130 - 100) / 30 ≈ 1.0 + assert.InDelta(t, 1.0, cm.DiskReadOpsPerSec, 0.01, "disk read ops/sec") + // write ops/sec: (80 - 50) / 30 = 1.0 + assert.InDelta(t, 1.0, cm.DiskWriteOpsPerSec, 0.01, "disk write ops/sec") + + // Network packet rates + // rx packets/sec: (1300 - 1000) / 30 = 10.0 + assert.InDelta(t, 10.0, cm.NetworkRxPacketsPerSec, 0.01, "network rx packets/sec") + // tx packets/sec: (1100 - 800) / 30 = 10.0 + assert.InDelta(t, 10.0, 
cm.NetworkTxPacketsPerSec, 0.01, "network tx packets/sec") + + // Error and drop counters are unchanged between scrapes → rates must be 0 + assert.InDelta(t, 0.0, cm.NetworkRxErrorsPerSec, 0.001, "network rx errors/sec (unchanged)") + assert.InDelta(t, 0.0, cm.NetworkTxErrorsPerSec, 0.001, "network tx errors/sec (unchanged)") + assert.InDelta(t, 0.0, cm.NetworkRxDropsPerSec, 0.001, "network rx drops/sec (unchanged)") + assert.InDelta(t, 0.0, cm.NetworkTxDropsPerSec, 0.001, "network tx drops/sec (unchanged)") +} diff --git a/internal/nodemon/rate_calculator.go b/internal/nodemon/rate_calculator.go new file mode 100644 index 00000000..5267abfd --- /dev/null +++ b/internal/nodemon/rate_calculator.go @@ -0,0 +1,75 @@ +package nodemon + +import ( + "sync" + "time" +) + +type counterSample struct { + value float64 + timestamp time.Time +} + +// RateCalculator converts monotonically-increasing counter observations into +// per-second rates. It is safe for concurrent use. +type RateCalculator struct { + mu sync.Mutex + previous map[string]counterSample // key: "entity\x00metric" +} + +// NewRateCalculator returns an initialised RateCalculator. +func NewRateCalculator() *RateCalculator { + return &RateCalculator{ + previous: make(map[string]counterSample), + } +} + +// Rate records a counter value and returns the per-second rate since the last +// observation for the same (entity, metric) pair. +// +// Returns 0 on: +// - first call for a key (no baseline yet) +// - counter reset (current value < previous value) +// - zero elapsed time between observations +func (rc *RateCalculator) Rate(entity, metric string, value float64, ts time.Time) float64 { + key := entity + "\x00" + metric + + rc.mu.Lock() + defer rc.mu.Unlock() + + prev, exists := rc.previous[key] + rc.previous[key] = counterSample{value: value, timestamp: ts} + + if !exists { + return 0 + } + + elapsed := ts.Sub(prev.timestamp).Seconds() + if elapsed <= 0 { + return 0 + } + + delta := value - prev.value + if delta < 0 { + // Counter reset — new baseline is already stored above. + return 0 + } + + return delta / elapsed +} + +// EvictOlderThan removes entries whose last observation timestamp is older than +// maxAge relative to the current wall-clock time. This prevents unbounded +// memory growth for short-lived workloads. 
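+//
+// Illustrative usage: a periodic collection loop could call
+// rc.EvictOlderThan(5*time.Minute) once per cycle so entries for pods that no
+// longer exist are dropped.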
+func (rc *RateCalculator) EvictOlderThan(maxAge time.Duration) { + cutoff := time.Now().Add(-maxAge) + + rc.mu.Lock() + defer rc.mu.Unlock() + + for key, sample := range rc.previous { + if sample.timestamp.Before(cutoff) { + delete(rc.previous, key) + } + } +} diff --git a/internal/nodemon/rate_calculator_test.go b/internal/nodemon/rate_calculator_test.go new file mode 100644 index 00000000..2401d740 --- /dev/null +++ b/internal/nodemon/rate_calculator_test.go @@ -0,0 +1,80 @@ +package nodemon + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestRateCalculator_FirstSampleReturnsZero(t *testing.T) { + rc := NewRateCalculator() + ts := time.Now() + rate := rc.Rate("pod/default/nginx", "cpu_total", 1000.0, ts) + assert.Equal(t, 0.0, rate, "first call should return 0") +} + +func TestRateCalculator_SecondSampleComputesRate(t *testing.T) { + rc := NewRateCalculator() + base := time.Now() + + rc.Rate("pod/default/nginx", "cpu_total", 1000.0, base) + rate := rc.Rate("pod/default/nginx", "cpu_total", 1300.0, base.Add(30*time.Second)) + + assert.InDelta(t, 10.0, rate, 0.0001, "expected (1300-1000)/30 = 10.0 per second") +} + +func TestRateCalculator_CounterResetReturnsZero(t *testing.T) { + rc := NewRateCalculator() + base := time.Now() + + rc.Rate("pod/default/nginx", "cpu_total", 5000.0, base) + rate := rc.Rate("pod/default/nginx", "cpu_total", 100.0, base.Add(10*time.Second)) + + assert.Equal(t, 0.0, rate, "counter reset (current < previous) should return 0") +} + +func TestRateCalculator_IndependentKeys(t *testing.T) { + rc := NewRateCalculator() + base := time.Now() + + // Seed two independent entities + rc.Rate("pod/default/alpha", "net_bytes", 0.0, base) + rc.Rate("pod/default/beta", "net_bytes", 0.0, base) + + rateAlpha := rc.Rate("pod/default/alpha", "net_bytes", 600.0, base.Add(60*time.Second)) + rateBeta := rc.Rate("pod/default/beta", "net_bytes", 300.0, base.Add(60*time.Second)) + + assert.InDelta(t, 10.0, rateAlpha, 0.0001, "alpha: 600/60 = 10 per second") + assert.InDelta(t, 5.0, rateBeta, 0.0001, "beta: 300/60 = 5 per second") +} + +func TestRateCalculator_ZeroElapsedTimeReturnsZero(t *testing.T) { + rc := NewRateCalculator() + ts := time.Now() + + rc.Rate("node/worker-1", "disk_reads", 500.0, ts) + rate := rc.Rate("node/worker-1", "disk_reads", 600.0, ts) // same timestamp + + assert.Equal(t, 0.0, rate, "zero elapsed time should return 0") +} + +func TestRateCalculator_EvictStaleEntries(t *testing.T) { + rc := NewRateCalculator() + now := time.Now() + + // Two entries seeded at different times + rc.Rate("entity/stale", "metric_a", 100.0, now.Add(-10*time.Minute)) + rc.Rate("entity/fresh", "metric_b", 200.0, now.Add(-1*time.Minute)) + + // Evict entries older than 5 minutes + rc.EvictOlderThan(5 * time.Minute) + + // Stale entry should be gone — first call after eviction returns 0 + rateStale := rc.Rate("entity/stale", "metric_a", 999.0, now) + assert.Equal(t, 0.0, rateStale, "evicted entry should behave like first call") + + // Fresh entry should still be tracked — returns a non-zero rate + rateFresh := rc.Rate("entity/fresh", "metric_b", 260.0, now) + assert.InDelta(t, 1.0, rateFresh, 0.0001, "fresh entry: 60 delta / 60s = 1.0 per second") +} diff --git a/internal/nodemon/stats_poller.go b/internal/nodemon/stats_poller.go new file mode 100644 index 00000000..167e1548 --- /dev/null +++ b/internal/nodemon/stats_poller.go @@ -0,0 +1,64 @@ +package nodemon + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + 
"github.com/go-logr/logr" +) + +const statsSummaryPath = "/stats/summary" + +// StatsPoller fetches the kubelet /stats/summary endpoint and parses the response. +type StatsPoller struct { + baseURL string + httpClient HTTPClient + log logr.Logger +} + +// NewStatsPoller creates a new StatsPoller targeting the given kubelet base URL. +func NewStatsPoller(baseURL string, httpClient HTTPClient, log logr.Logger) *StatsPoller { + return &StatsPoller{ + baseURL: baseURL, + httpClient: httpClient, + log: log.WithName("stats-poller"), + } +} + +// Poll fetches and parses /stats/summary from the kubelet. Uses a 10-second timeout. +func (p *StatsPoller) Poll(ctx context.Context) (*StatsSummary, error) { + ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + url := p.baseURL + statsSummaryPath + + req, err := http.NewRequestWithContext(ctxWithTimeout, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("stats poller: cannot create request: %w", err) + } + + resp, err := p.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("stats poller: HTTP request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("stats poller: request failed with status %d", resp.StatusCode) + } + + var summary StatsSummary + if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil { + return nil, fmt.Errorf("stats poller: failed to decode response: %w", err) + } + + p.log.V(1).Info("Polled kubelet stats/summary", + "node", summary.Node.NodeName, + "podCount", len(summary.Pods), + ) + + return &summary, nil +} diff --git a/internal/nodemon/stats_poller_test.go b/internal/nodemon/stats_poller_test.go new file mode 100644 index 00000000..1e57073f --- /dev/null +++ b/internal/nodemon/stats_poller_test.go @@ -0,0 +1,180 @@ +package nodemon_test + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/go-logr/zapr" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/devzero-inc/zxporter/internal/nodemon" +) + +const fullStatsSummaryJSON = `{ + "node": { + "nodeName": "test-node", + "network": { + "interfaces": [ + {"name": "eth0", "rxBytes": 1000, "txBytes": 2000} + ] + } + }, + "pods": [ + { + "podRef": { + "name": "my-pod", + "namespace": "default" + }, + "containers": [ + { + "name": "my-container", + "cpu": { + "time": "2024-01-01T00:00:00Z", + "usageNanoCores": 500000, + "usageCoreNanoSeconds": 1234567890 + }, + "memory": { + "time": "2024-01-01T00:00:00Z", + "usageBytes": 104857600, + "workingSetBytes": 52428800, + "rssBytes": 26214400, + "availableBytes": 524288000, + "pageFaults": 10, + "majorPageFaults": 0 + } + } + ], + "network": { + "interfaces": [ + {"name": "eth0", "rxBytes": 300, "txBytes": 400} + ] + }, + "volume": [ + { + "name": "my-pvc", + "pvcRef": { + "name": "my-claim", + "namespace": "default" + }, + "usedBytes": 1073741824, + "capacityBytes": 10737418240, + "availableBytes": 9663676416 + } + ] + } + ] +}` + +func TestStatsPoller_ParsesKubeletResponse(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/stats/summary" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(fullStatsSummaryJSON)) + })) + defer srv.Close() + + zapLog, _ := zap.NewDevelopment() + log := zapr.NewLogger(zapLog) + + poller := nodemon.NewStatsPoller(srv.URL, 
srv.Client(), log) + + summary, err := poller.Poll(context.Background()) + + r := require.New(t) + r.NoError(err) + r.NotNil(summary) + + // Node + r.Equal("test-node", summary.Node.NodeName) + + // Pod + r.Len(summary.Pods, 1) + pod := summary.Pods[0] + r.Equal("my-pod", pod.PodRef.Name) + r.Equal("default", pod.PodRef.Namespace) + + // Container CPU + r.Len(pod.Containers, 1) + c := pod.Containers[0] + r.Equal("my-container", c.Name) + r.NotNil(c.CPU.UsageNanoCores) + r.Equal(uint64(500000), *c.CPU.UsageNanoCores) + r.NotNil(c.CPU.UsageCoreNanoSeconds) + r.Equal(uint64(1234567890), *c.CPU.UsageCoreNanoSeconds) + + // Container Memory + r.NotNil(c.Memory.UsageBytes) + r.Equal(uint64(104857600), *c.Memory.UsageBytes) + r.NotNil(c.Memory.WorkingSetBytes) + r.Equal(uint64(52428800), *c.Memory.WorkingSetBytes) + r.NotNil(c.Memory.RSSBytes) + r.Equal(uint64(26214400), *c.Memory.RSSBytes) + + // Pod Network + r.Len(pod.Network.Interfaces, 1) + iface := pod.Network.Interfaces[0] + r.Equal("eth0", iface.Name) + r.NotNil(iface.RxBytes) + r.Equal(uint64(300), *iface.RxBytes) + r.NotNil(iface.TxBytes) + r.Equal(uint64(400), *iface.TxBytes) + + // Volume + r.Len(pod.VolumeStats, 1) + vol := pod.VolumeStats[0] + r.Equal("my-pvc", vol.Name) + r.NotNil(vol.PVCRef) + r.Equal("my-claim", vol.PVCRef.Name) + r.Equal("default", vol.PVCRef.Namespace) + r.NotNil(vol.UsedBytes) + r.Equal(uint64(1073741824), *vol.UsedBytes) + r.NotNil(vol.CapacityBytes) + r.Equal(uint64(10737418240), *vol.CapacityBytes) +} + +func TestStatsPoller_HandlesEmptyResponse(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"node":{"nodeName":"empty-node","network":{}},"pods":[]}`)) + })) + defer srv.Close() + + zapLog, _ := zap.NewDevelopment() + log := zapr.NewLogger(zapLog) + + poller := nodemon.NewStatsPoller(srv.URL, srv.Client(), log) + + summary, err := poller.Poll(context.Background()) + + r := require.New(t) + r.NoError(err) + r.NotNil(summary) + r.Empty(summary.Pods) + r.Equal("empty-node", summary.Node.NodeName) +} + +func TestStatsPoller_HandlesServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "internal server error", http.StatusInternalServerError) + })) + defer srv.Close() + + zapLog, _ := zap.NewDevelopment() + log := zapr.NewLogger(zapLog) + + poller := nodemon.NewStatsPoller(srv.URL, srv.Client(), log) + + summary, err := poller.Poll(context.Background()) + + r := require.New(t) + r.Error(err) + r.Nil(summary) +} diff --git a/internal/nodemon/stats_types.go b/internal/nodemon/stats_types.go new file mode 100644 index 00000000..ccee9056 --- /dev/null +++ b/internal/nodemon/stats_types.go @@ -0,0 +1,80 @@ +package nodemon + +import "time" + +// StatsSummary is the top-level response from the kubelet /stats/summary endpoint. +type StatsSummary struct { + Pods []PodStats `json:"pods"` + Node NodeStats `json:"node"` +} + +// PodStats holds per-pod resource metrics. +type PodStats struct { + PodRef struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + } `json:"podRef"` + Containers []ContainerStats `json:"containers"` + Network NetworkStats `json:"network"` + VolumeStats []VolumeStats `json:"volume"` +} + +// ContainerStats holds CPU and memory metrics for a single container. 
+type ContainerStats struct { + Name string `json:"name"` + CPU CPUStats `json:"cpu"` + Memory MemStats `json:"memory"` +} + +// CPUStats holds CPU usage metrics. Pointer fields are omitted by kubelet when unavailable. +type CPUStats struct { + Time time.Time `json:"time"` + UsageNanoCores *uint64 `json:"usageNanoCores"` + UsageCoreNanoSeconds *uint64 `json:"usageCoreNanoSeconds"` +} + +// MemStats holds memory usage metrics. Pointer fields are omitted by kubelet when unavailable. +type MemStats struct { + Time time.Time `json:"time"` + AvailableBytes *uint64 `json:"availableBytes"` + UsageBytes *uint64 `json:"usageBytes"` + WorkingSetBytes *uint64 `json:"workingSetBytes"` + RSSBytes *uint64 `json:"rssBytes"` + PageFaults *uint64 `json:"pageFaults"` + MajorPageFaults *uint64 `json:"majorPageFaults"` +} + +// NetworkStats holds network interface metrics. +type NetworkStats struct { + Interfaces []InterfaceStats `json:"interfaces"` +} + +// InterfaceStats holds per-interface RX/TX byte counters. +type InterfaceStats struct { + Name string `json:"name"` + RxBytes *uint64 `json:"rxBytes"` + TxBytes *uint64 `json:"txBytes"` +} + +// VolumeStats holds PVC/volume usage metrics. +type VolumeStats struct { + Name string `json:"name"` + PVCRef *PVCRef `json:"pvcRef,omitempty"` + UsedBytes *uint64 `json:"usedBytes"` + CapacityBytes *uint64 `json:"capacityBytes"` + AvailableBytes *uint64 `json:"availableBytes"` +} + +// PVCRef identifies the PersistentVolumeClaim backing a volume. +type PVCRef struct { + Name string `json:"name"` + Namespace string `json:"namespace"` +} + +// NodeStats holds node-level metrics from kubelet stats/summary. +type NodeStats struct { + NodeName string `json:"nodeName"` + CPU CPUStats `json:"cpu"` + Memory MemStats `json:"memory"` + Network NetworkStats `json:"network"` +} diff --git a/internal/nodemon/unified_exporter.go b/internal/nodemon/unified_exporter.go new file mode 100644 index 00000000..21c73574 --- /dev/null +++ b/internal/nodemon/unified_exporter.go @@ -0,0 +1,283 @@ +package nodemon + +import ( + "context" + "sync" + "time" + + "github.com/go-logr/logr" +) + +// UnifiedExporter combines stats/summary, cAdvisor, and GPU data into the +// unified response types consumed by the HTTP handlers. It implements UnifiedQuerier. +type UnifiedExporter struct { + statsPoller *StatsPoller + cadvisorScraper *CAdvisorScraper + gpuExporter *Exporter // existing GPU exporter, may be nil + nodeName string + log logr.Logger + + nodeNetRates *RateCalculator // for computing node network byte rates from cumulative counters + + mu sync.RWMutex + containerMetrics []ContainerMetricsResponse + nodeMetrics *NodeMetricsResponse + pvcMetrics []PVCMetricsResponse + lastCollected time.Time +} + +// NewUnifiedExporter creates a UnifiedExporter. +func NewUnifiedExporter( + statsPoller *StatsPoller, + cadvisorScraper *CAdvisorScraper, + gpuExporter *Exporter, + nodeName string, + log logr.Logger, +) *UnifiedExporter { + return &UnifiedExporter{ + statsPoller: statsPoller, + cadvisorScraper: cadvisorScraper, + gpuExporter: gpuExporter, + nodeName: nodeName, + log: log.WithName("unified-exporter"), + nodeNetRates: NewRateCalculator(), + } +} + +// StartCollectionLoop runs periodic collection. Call in a goroutine. 
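+//
+// Illustrative call site (names are an example, not prescriptive):
+//
+//	go exporter.StartCollectionLoop(ctx, 30*time.Second)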
+func (u *UnifiedExporter) StartCollectionLoop(ctx context.Context, interval time.Duration) { + // Initial collection + u.Collect(ctx) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + u.Collect(ctx) + } + } +} + +// Collect fetches from all sources and updates cached results. +func (u *UnifiedExporter) Collect(ctx context.Context) { + now := time.Now() + + // Fetch stats/summary + var stats *StatsSummary + if u.statsPoller != nil { + var err error + stats, err = u.statsPoller.Poll(ctx) + if err != nil { + u.log.Error(err, "Failed to poll stats/summary") + } + } + + // Fetch cAdvisor rates + var cadvisorMetrics []CAdvisorContainerMetrics + if u.cadvisorScraper != nil { + var err error + cadvisorMetrics, err = u.cadvisorScraper.Scrape(ctx, now) + if err != nil { + u.log.Error(err, "Failed to scrape cAdvisor") + } + } + + // Fetch GPU metrics + var gpuMetrics []GPUMetric + if u.gpuExporter != nil { + var err error + gpuMetrics, err = u.gpuExporter.QueryMetrics(ctx) + if err != nil { + u.log.V(1).Info("Failed to query GPU metrics", "error", err) + } + } + + // Index cAdvisor metrics by pod/container + cadvisorIndex := make(map[string]*CAdvisorContainerMetrics) + for i := range cadvisorMetrics { + m := &cadvisorMetrics[i] + key := m.Namespace + "/" + m.Pod + "/" + m.Container + cadvisorIndex[key] = m + } + + // Index GPU metrics by pod/container + gpuIndex := make(map[string]*GPUMetric) + for i := range gpuMetrics { + m := &gpuMetrics[i] + key := m.Namespace + "/" + m.Pod + "/" + m.Container + gpuIndex[key] = m + } + + // Build container metrics from stats/summary + var containerResults []ContainerMetricsResponse + var pvcResults []PVCMetricsResponse + + if stats != nil { + for _, pod := range stats.Pods { + // Aggregate pod-level network bytes + var rxBytes, txBytes uint64 + for _, iface := range pod.Network.Interfaces { + if iface.RxBytes != nil { + rxBytes += *iface.RxBytes + } + if iface.TxBytes != nil { + txBytes += *iface.TxBytes + } + } + + for _, container := range pod.Containers { + resp := ContainerMetricsResponse{ + NodeName: u.nodeName, + Namespace: pod.PodRef.Namespace, + Pod: pod.PodRef.Name, + Container: container.Name, + Timestamp: now, + NetworkRxBytes: rxBytes, + NetworkTxBytes: txBytes, + } + + if container.CPU.UsageNanoCores != nil { + resp.CPUUsageNanoCores = *container.CPU.UsageNanoCores + } + if container.Memory.WorkingSetBytes != nil { + resp.MemoryWorkingSet = *container.Memory.WorkingSetBytes + } + if container.Memory.UsageBytes != nil { + resp.MemoryUsageBytes = *container.Memory.UsageBytes + } + if container.Memory.RSSBytes != nil { + resp.MemoryRSSBytes = *container.Memory.RSSBytes + } + + // Merge cAdvisor rates + cadKey := pod.PodRef.Namespace + "/" + pod.PodRef.Name + "/" + container.Name + if cm, ok := cadvisorIndex[cadKey]; ok { + resp.NetworkRxPacketsPerSec = cm.NetworkRxPacketsPerSec + resp.NetworkTxPacketsPerSec = cm.NetworkTxPacketsPerSec + resp.NetworkRxErrorsPerSec = cm.NetworkRxErrorsPerSec + resp.NetworkTxErrorsPerSec = cm.NetworkTxErrorsPerSec + resp.NetworkRxDropsPerSec = cm.NetworkRxDropsPerSec + resp.NetworkTxDropsPerSec = cm.NetworkTxDropsPerSec + resp.DiskReadBytesPerSec = cm.DiskReadBytesPerSec + resp.DiskWriteBytesPerSec = cm.DiskWriteBytesPerSec + resp.DiskReadOpsPerSec = cm.DiskReadOpsPerSec + resp.DiskWriteOpsPerSec = cm.DiskWriteOpsPerSec + resp.CPUThrottleFraction = cm.CPUThrottleFraction + } + + // Merge GPU metrics + gpuKey := pod.PodRef.Namespace + "/" + 
pod.PodRef.Name + "/" + container.Name + if gm, ok := gpuIndex[gpuKey]; ok { + resp.GPUUtilization = gm.GPUUtilization + resp.GPUMemoryUsedMiB = gm.FramebufferUsed + resp.GPUMemoryFreeMiB = gm.FramebufferFree + resp.GPUPowerWatts = gm.PowerUsage + resp.GPUTemperature = gm.Temperature + } + + containerResults = append(containerResults, resp) + } + + // PVC metrics from volume stats + for _, vol := range pod.VolumeStats { + if vol.PVCRef == nil { + continue + } + pvc := PVCMetricsResponse{ + Namespace: pod.PodRef.Namespace, + Pod: pod.PodRef.Name, + PVCName: vol.PVCRef.Name, + } + if vol.UsedBytes != nil { + pvc.UsedBytes = *vol.UsedBytes + } + if vol.CapacityBytes != nil { + pvc.CapacityBytes = *vol.CapacityBytes + } + if vol.AvailableBytes != nil { + pvc.AvailableBytes = *vol.AvailableBytes + } + pvcResults = append(pvcResults, pvc) + } + } + } + + // Build node metrics by aggregating cAdvisor per-container rates + nodeResult := &NodeMetricsResponse{ + NodeName: u.nodeName, + Timestamp: now, + } + for _, cm := range cadvisorMetrics { + nodeResult.NetworkRxPacketsPerSec += cm.NetworkRxPacketsPerSec + nodeResult.NetworkTxPacketsPerSec += cm.NetworkTxPacketsPerSec + nodeResult.NetworkRxErrorsPerSec += cm.NetworkRxErrorsPerSec + nodeResult.NetworkTxErrorsPerSec += cm.NetworkTxErrorsPerSec + nodeResult.NetworkRxDropsPerSec += cm.NetworkRxDropsPerSec + nodeResult.NetworkTxDropsPerSec += cm.NetworkTxDropsPerSec + nodeResult.DiskReadBytesPerSec += cm.DiskReadBytesPerSec + nodeResult.DiskWriteBytesPerSec += cm.DiskWriteBytesPerSec + nodeResult.DiskReadOpsPerSec += cm.DiskReadOpsPerSec + nodeResult.DiskWriteOpsPerSec += cm.DiskWriteOpsPerSec + } + // Node-level CPU/memory from stats/summary (includes system processes, not just containers) + if stats != nil { + if stats.Node.CPU.UsageNanoCores != nil { + nodeResult.CPUUsageNanoCores = *stats.Node.CPU.UsageNanoCores + } + if stats.Node.Memory.WorkingSetBytes != nil { + nodeResult.MemoryWorkingSet = *stats.Node.Memory.WorkingSetBytes + } + } + + // Node-level network bytes rate from stats/summary node section + if stats != nil { + var nodeRxBytes, nodeTxBytes uint64 + for _, iface := range stats.Node.Network.Interfaces { + if iface.RxBytes != nil { + nodeRxBytes += *iface.RxBytes + } + if iface.TxBytes != nil { + nodeTxBytes += *iface.TxBytes + } + } + nodeResult.NetworkRxBytesPerSec = u.nodeNetRates.Rate(u.nodeName, "rx_bytes", float64(nodeRxBytes), now) + nodeResult.NetworkTxBytesPerSec = u.nodeNetRates.Rate(u.nodeName, "tx_bytes", float64(nodeTxBytes), now) + } + + // Update cache + u.mu.Lock() + u.containerMetrics = containerResults + u.nodeMetrics = nodeResult + u.pvcMetrics = pvcResults + u.lastCollected = now + u.mu.Unlock() + + u.log.V(1).Info("Collected unified metrics", + "containers", len(containerResults), + "pvcs", len(pvcResults)) +} + +// QueryContainerMetrics implements UnifiedQuerier. +func (u *UnifiedExporter) QueryContainerMetrics() []ContainerMetricsResponse { + u.mu.RLock() + defer u.mu.RUnlock() + return u.containerMetrics +} + +// QueryNodeMetrics implements UnifiedQuerier. +func (u *UnifiedExporter) QueryNodeMetrics() *NodeMetricsResponse { + u.mu.RLock() + defer u.mu.RUnlock() + return u.nodeMetrics +} + +// QueryPVCMetrics implements UnifiedQuerier. 
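+// As with the other Query methods, the returned slice is the cached value
+// itself rather than a copy, so callers should treat it as read-only.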
+func (u *UnifiedExporter) QueryPVCMetrics() []PVCMetricsResponse { + u.mu.RLock() + defer u.mu.RUnlock() + return u.pvcMetrics +} diff --git a/internal/nodemon/unified_handler.go b/internal/nodemon/unified_handler.go new file mode 100644 index 00000000..ef0aaf88 --- /dev/null +++ b/internal/nodemon/unified_handler.go @@ -0,0 +1,153 @@ +package nodemon + +import ( + "encoding/json" + "net/http" + + "github.com/go-logr/logr" +) + +// UnifiedQuerier provides merged metrics from all sources. +type UnifiedQuerier interface { + QueryContainerMetrics() []ContainerMetricsResponse + QueryNodeMetrics() *NodeMetricsResponse + QueryPVCMetrics() []PVCMetricsResponse +} + +// unifiedContainerHandler serves GET /v2/container/metrics. +type unifiedContainerHandler struct { + querier UnifiedQuerier + log logr.Logger +} + +// NewUnifiedContainerHandler creates an HTTP handler for GET /v2/container/metrics. +// Supports query parameters: namespace, pod, container. +func NewUnifiedContainerHandler(querier UnifiedQuerier, log logr.Logger) http.Handler { + return &unifiedContainerHandler{ + querier: querier, + log: log.WithName("unified-container-handler"), + } +} + +func (h *unifiedContainerHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + namespace := r.URL.Query().Get("namespace") + pod := r.URL.Query().Get("pod") + container := r.URL.Query().Get("container") + + all := h.querier.QueryContainerMetrics() + result := filterContainerMetrics(all, namespace, pod, container) + + w.Header().Set("Content-Type", "application/json") + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + if err := enc.Encode(result); err != nil { + h.log.Error(err, "Failed to encode unified container metrics response") + } +} + +// filterContainerMetrics returns the subset of metrics matching all non-empty filter values. +func filterContainerMetrics(metrics []ContainerMetricsResponse, namespace, pod, container string) []ContainerMetricsResponse { + if namespace == "" && pod == "" && container == "" { + return metrics + } + result := make([]ContainerMetricsResponse, 0, len(metrics)) + for _, m := range metrics { + if namespace != "" && m.Namespace != namespace { + continue + } + if pod != "" && m.Pod != pod { + continue + } + if container != "" && m.Container != container { + continue + } + result = append(result, m) + } + return result +} + +// nodeMetricsHandler serves GET /node/metrics. +type nodeMetricsHandler struct { + querier UnifiedQuerier + log logr.Logger +} + +// NewNodeMetricsHandler creates an HTTP handler for GET /node/metrics. +func NewNodeMetricsHandler(querier UnifiedQuerier, log logr.Logger) http.Handler { + return &nodeMetricsHandler{ + querier: querier, + log: log.WithName("node-metrics-handler"), + } +} + +func (h *nodeMetricsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + node := h.querier.QueryNodeMetrics() + if node == nil { + http.Error(w, "node metrics not available", http.StatusNotFound) + return + } + + w.Header().Set("Content-Type", "application/json") + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + if err := enc.Encode(node); err != nil { + h.log.Error(err, "Failed to encode node metrics response") + } +} + +// pvcMetricsHandler serves GET /pvc/metrics. 
+type pvcMetricsHandler struct { + querier UnifiedQuerier + log logr.Logger +} + +// NewPVCMetricsHandler creates an HTTP handler for GET /pvc/metrics. +// Supports query parameter: namespace. +func NewPVCMetricsHandler(querier UnifiedQuerier, log logr.Logger) http.Handler { + return &pvcMetricsHandler{ + querier: querier, + log: log.WithName("pvc-metrics-handler"), + } +} + +func (h *pvcMetricsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + namespace := r.URL.Query().Get("namespace") + all := h.querier.QueryPVCMetrics() + result := filterPVCMetrics(all, namespace) + + w.Header().Set("Content-Type", "application/json") + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + if err := enc.Encode(result); err != nil { + h.log.Error(err, "Failed to encode PVC metrics response") + } +} + +// filterPVCMetrics returns the subset of PVC metrics matching the given namespace (empty = all). +func filterPVCMetrics(metrics []PVCMetricsResponse, namespace string) []PVCMetricsResponse { + if namespace == "" { + return metrics + } + result := make([]PVCMetricsResponse, 0, len(metrics)) + for _, m := range metrics { + if m.Namespace == namespace { + result = append(result, m) + } + } + return result +} diff --git a/internal/nodemon/unified_handler_test.go b/internal/nodemon/unified_handler_test.go new file mode 100644 index 00000000..0b8e4ad0 --- /dev/null +++ b/internal/nodemon/unified_handler_test.go @@ -0,0 +1,323 @@ +package nodemon_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/go-logr/zapr" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/devzero-inc/zxporter/internal/nodemon" +) + +// mockUnifiedQuerier is a test double for UnifiedQuerier. 
+type mockUnifiedQuerier struct { + containers []nodemon.ContainerMetricsResponse + node *nodemon.NodeMetricsResponse + pvcs []nodemon.PVCMetricsResponse +} + +func (m *mockUnifiedQuerier) QueryContainerMetrics() []nodemon.ContainerMetricsResponse { + return m.containers +} + +func (m *mockUnifiedQuerier) QueryNodeMetrics() *nodemon.NodeMetricsResponse { + return m.node +} + +func (m *mockUnifiedQuerier) QueryPVCMetrics() []nodemon.PVCMetricsResponse { + return m.pvcs +} + +func sampleContainerMetrics() []nodemon.ContainerMetricsResponse { + now := time.Date(2026, 2, 23, 12, 0, 0, 0, time.UTC) + return []nodemon.ContainerMetricsResponse{ + { + NodeName: "node-1", + Namespace: "production", + Pod: "web-pod-abc", + Container: "nginx", + Timestamp: now, + CPUUsageNanoCores: 500_000_000, + MemoryWorkingSet: 256 * 1024 * 1024, + MemoryUsageBytes: 300 * 1024 * 1024, + MemoryRSSBytes: 200 * 1024 * 1024, + NetworkRxBytes: 1_000_000, + NetworkTxBytes: 500_000, + DiskReadBytesPerSec: 1024.0, + CPUThrottleFraction: 0.05, + }, + { + NodeName: "node-1", + Namespace: "staging", + Pod: "api-pod-xyz", + Container: "go-server", + Timestamp: now, + CPUUsageNanoCores: 200_000_000, + MemoryWorkingSet: 128 * 1024 * 1024, + }, + } +} + +func sampleNodeMetrics() *nodemon.NodeMetricsResponse { + now := time.Date(2026, 2, 23, 12, 0, 0, 0, time.UTC) + return &nodemon.NodeMetricsResponse{ + NodeName: "node-1", + Timestamp: now, + NetworkRxBytesPerSec: 10_000.0, + NetworkTxBytesPerSec: 5_000.0, + DiskReadBytesPerSec: 2048.0, + DiskWriteBytesPerSec: 1024.0, + } +} + +func samplePVCMetrics() []nodemon.PVCMetricsResponse { + return []nodemon.PVCMetricsResponse{ + { + Namespace: "production", + Pod: "web-pod-abc", + PVCName: "data-pvc", + UsedBytes: 5 * 1024 * 1024 * 1024, + CapacityBytes: 10 * 1024 * 1024 * 1024, + AvailableBytes: 5 * 1024 * 1024 * 1024, + }, + { + Namespace: "staging", + Pod: "api-pod-xyz", + PVCName: "logs-pvc", + UsedBytes: 1 * 1024 * 1024 * 1024, + CapacityBytes: 2 * 1024 * 1024 * 1024, + AvailableBytes: 1 * 1024 * 1024 * 1024, + }, + } +} + +func newTestLogger() logr.Logger { + zapLog, _ := zap.NewDevelopment() + return zapr.NewLogger(zapLog) +} + +// TestUnifiedContainerHandler_ReturnsJSON verifies the handler encodes ContainerMetricsResponse correctly. +func TestUnifiedContainerHandler_ReturnsJSON(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{containers: sampleContainerMetrics()} + handler := nodemon.NewUnifiedContainerHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/v2/container/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + r.Equal("application/json", rec.Header().Get("Content-Type")) + + var result []nodemon.ContainerMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 2) + + first := result[0] + r.Equal("node-1", first.NodeName) + r.Equal("production", first.Namespace) + r.Equal("web-pod-abc", first.Pod) + r.Equal("nginx", first.Container) + r.Equal(uint64(500_000_000), first.CPUUsageNanoCores) + r.Equal(uint64(256*1024*1024), first.MemoryWorkingSet) + r.InDelta(0.05, first.CPUThrottleFraction, 0.0001) +} + +// TestUnifiedContainerHandler_FiltersNamespace verifies namespace query param filtering. 
+func TestUnifiedContainerHandler_FiltersNamespace(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{containers: sampleContainerMetrics()} + handler := nodemon.NewUnifiedContainerHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/v2/container/metrics?namespace=production", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + + var result []nodemon.ContainerMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 1) + r.Equal("production", result[0].Namespace) + r.Equal("nginx", result[0].Container) +} + +// TestUnifiedContainerHandler_FiltersPod verifies pod query param filtering. +func TestUnifiedContainerHandler_FiltersPod(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{containers: sampleContainerMetrics()} + handler := nodemon.NewUnifiedContainerHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/v2/container/metrics?pod=api-pod-xyz", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + + var result []nodemon.ContainerMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 1) + r.Equal("api-pod-xyz", result[0].Pod) +} + +// TestUnifiedContainerHandler_FiltersContainer verifies container query param filtering. +func TestUnifiedContainerHandler_FiltersContainer(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{containers: sampleContainerMetrics()} + handler := nodemon.NewUnifiedContainerHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/v2/container/metrics?container=nginx", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + + var result []nodemon.ContainerMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 1) + r.Equal("nginx", result[0].Container) +} + +// TestUnifiedContainerHandler_MethodNotAllowed verifies POST returns HTTP 405. +func TestUnifiedContainerHandler_MethodNotAllowed(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{containers: sampleContainerMetrics()} + handler := nodemon.NewUnifiedContainerHandler(q, log) + + req := httptest.NewRequest(http.MethodPost, "/v2/container/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusMethodNotAllowed, rec.Code) +} + +// TestNodeMetricsHandler_ReturnsJSON verifies the node metrics handler encodes NodeMetricsResponse. +func TestNodeMetricsHandler_ReturnsJSON(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{node: sampleNodeMetrics()} + handler := nodemon.NewNodeMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/node/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + r.Equal("application/json", rec.Header().Get("Content-Type")) + + var result nodemon.NodeMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Equal("node-1", result.NodeName) + r.InDelta(10_000.0, result.NetworkRxBytesPerSec, 0.001) + r.InDelta(2048.0, result.DiskReadBytesPerSec, 0.001) +} + +// TestNodeMetricsHandler_NotFound verifies 404 is returned when no node metrics are available. 
+func TestNodeMetricsHandler_NotFound(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{node: nil} + handler := nodemon.NewNodeMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/node/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusNotFound, rec.Code) +} + +// TestNodeMetricsHandler_MethodNotAllowed verifies POST returns HTTP 405. +func TestNodeMetricsHandler_MethodNotAllowed(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{node: sampleNodeMetrics()} + handler := nodemon.NewNodeMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodPost, "/node/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusMethodNotAllowed, rec.Code) +} + +// TestPVCMetricsHandler_ReturnsJSON verifies the PVC metrics handler encodes PVCMetricsResponse. +func TestPVCMetricsHandler_ReturnsJSON(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{pvcs: samplePVCMetrics()} + handler := nodemon.NewPVCMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/pvc/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + r.Equal("application/json", rec.Header().Get("Content-Type")) + + var result []nodemon.PVCMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 2) + + first := result[0] + r.Equal("production", first.Namespace) + r.Equal("web-pod-abc", first.Pod) + r.Equal("data-pvc", first.PVCName) + r.Equal(uint64(5*1024*1024*1024), first.UsedBytes) + r.Equal(uint64(10*1024*1024*1024), first.CapacityBytes) +} + +// TestPVCMetricsHandler_FiltersNamespace verifies namespace filtering on PVC endpoint. +func TestPVCMetricsHandler_FiltersNamespace(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{pvcs: samplePVCMetrics()} + handler := nodemon.NewPVCMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodGet, "/pvc/metrics?namespace=staging", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusOK, rec.Code) + + var result []nodemon.PVCMetricsResponse + r.NoError(json.Unmarshal(rec.Body.Bytes(), &result)) + r.Len(result, 1) + r.Equal("staging", result[0].Namespace) + r.Equal("logs-pvc", result[0].PVCName) +} + +// TestPVCMetricsHandler_MethodNotAllowed verifies POST returns HTTP 405. +func TestPVCMetricsHandler_MethodNotAllowed(t *testing.T) { + r := require.New(t) + log := newTestLogger() + + q := &mockUnifiedQuerier{pvcs: samplePVCMetrics()} + handler := nodemon.NewPVCMetricsHandler(q, log) + + req := httptest.NewRequest(http.MethodPost, "/pvc/metrics", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + r.Equal(http.StatusMethodNotAllowed, rec.Code) +} diff --git a/internal/nodemon/unified_types.go b/internal/nodemon/unified_types.go new file mode 100644 index 00000000..dd96aec8 --- /dev/null +++ b/internal/nodemon/unified_types.go @@ -0,0 +1,82 @@ +package nodemon + +import "time" + +// ContainerMetricsResponse is the JSON response for GET /v2/container/metrics. 
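+//
+// Abridged example entry (illustrative values):
+//
+//	{
+//	  "node_name": "node-1",
+//	  "namespace": "production",
+//	  "pod": "web-pod-abc",
+//	  "container": "nginx",
+//	  "cpu_usage_nanocores": 500000000,
+//	  "memory_working_set_bytes": 268435456,
+//	  "network_rx_packets_per_sec": 10.0,
+//	  "cpu_throttle_fraction": 0.05
+//	}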
+type ContainerMetricsResponse struct { + NodeName string `json:"node_name"` + Namespace string `json:"namespace"` + Pod string `json:"pod"` + Container string `json:"container"` + Timestamp time.Time `json:"timestamp"` + + // From stats/summary + CPUUsageNanoCores uint64 `json:"cpu_usage_nanocores"` + MemoryWorkingSet uint64 `json:"memory_working_set_bytes"` + MemoryUsageBytes uint64 `json:"memory_usage_bytes"` + MemoryRSSBytes uint64 `json:"memory_rss_bytes"` + NetworkRxBytes uint64 `json:"network_rx_bytes"` + NetworkTxBytes uint64 `json:"network_tx_bytes"` + + // From cAdvisor (rates) + NetworkRxPacketsPerSec float64 `json:"network_rx_packets_per_sec"` + NetworkTxPacketsPerSec float64 `json:"network_tx_packets_per_sec"` + NetworkRxErrorsPerSec float64 `json:"network_rx_errors_per_sec"` + NetworkTxErrorsPerSec float64 `json:"network_tx_errors_per_sec"` + NetworkRxDropsPerSec float64 `json:"network_rx_drops_per_sec"` + NetworkTxDropsPerSec float64 `json:"network_tx_drops_per_sec"` + DiskReadBytesPerSec float64 `json:"disk_read_bytes_per_sec"` + DiskWriteBytesPerSec float64 `json:"disk_write_bytes_per_sec"` + DiskReadOpsPerSec float64 `json:"disk_read_ops_per_sec"` + DiskWriteOpsPerSec float64 `json:"disk_write_ops_per_sec"` + CPUThrottleFraction float64 `json:"cpu_throttle_fraction"` + + // From GPU (optional) + GPUUtilization float64 `json:"gpu_utilization,omitempty"` + GPUMemoryUsedMiB float64 `json:"gpu_memory_used_mib,omitempty"` + GPUMemoryFreeMiB float64 `json:"gpu_memory_free_mib,omitempty"` + GPUPowerWatts float64 `json:"gpu_power_watts,omitempty"` + GPUTemperature float64 `json:"gpu_temperature_celsius,omitempty"` +} + +// NodeMetricsResponse is the JSON response for GET /node/metrics. +type NodeMetricsResponse struct { + NodeName string `json:"node_name"` + Timestamp time.Time `json:"timestamp"` + + // Node-level CPU/memory from kubelet stats/summary (includes system processes) + CPUUsageNanoCores uint64 `json:"cpu_usage_nanocores"` + MemoryWorkingSet uint64 `json:"memory_working_set_bytes"` + + // Network rates (per second) + NetworkRxBytesPerSec float64 `json:"network_rx_bytes_per_sec"` + NetworkTxBytesPerSec float64 `json:"network_tx_bytes_per_sec"` + NetworkRxPacketsPerSec float64 `json:"network_rx_packets_per_sec"` + NetworkTxPacketsPerSec float64 `json:"network_tx_packets_per_sec"` + NetworkRxErrorsPerSec float64 `json:"network_rx_errors_per_sec"` + NetworkTxErrorsPerSec float64 `json:"network_tx_errors_per_sec"` + NetworkRxDropsPerSec float64 `json:"network_rx_drops_per_sec"` + NetworkTxDropsPerSec float64 `json:"network_tx_drops_per_sec"` + + // Disk I/O rates (per second) + DiskReadBytesPerSec float64 `json:"disk_read_bytes_per_sec"` + DiskWriteBytesPerSec float64 `json:"disk_write_bytes_per_sec"` + DiskReadOpsPerSec float64 `json:"disk_read_ops_per_sec"` + DiskWriteOpsPerSec float64 `json:"disk_write_ops_per_sec"` + + // GPU aggregates (optional) + GPUUtilizationAvg float64 `json:"gpu_utilization_avg,omitempty"` + GPUMemoryUsedMiBSum float64 `json:"gpu_memory_used_mib_sum,omitempty"` + GPUPowerWattsSum float64 `json:"gpu_power_watts_sum,omitempty"` + GPUTemperatureMax float64 `json:"gpu_temperature_max_celsius,omitempty"` +} + +// PVCMetricsResponse is the JSON response for GET /pvc/metrics. 
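+//
+// Abridged example entry (illustrative values):
+//
+//	{
+//	  "namespace": "production",
+//	  "pod": "web-pod-abc",
+//	  "pvc_name": "data-pvc",
+//	  "used_bytes": 5368709120,
+//	  "capacity_bytes": 10737418240,
+//	  "available_bytes": 5368709120
+//	}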
+type PVCMetricsResponse struct { + Namespace string `json:"namespace"` + Pod string `json:"pod"` + PVCName string `json:"pvc_name"` + UsedBytes uint64 `json:"used_bytes"` + CapacityBytes uint64 `json:"capacity_bytes"` + AvailableBytes uint64 `json:"available_bytes"` +} diff --git a/internal/server/mpa_server.go b/internal/server/mpa_server.go index 50d1a957..0b9f419f 100644 --- a/internal/server/mpa_server.go +++ b/internal/server/mpa_server.go @@ -21,7 +21,7 @@ type MpaServer struct { logger logr.Logger subscriptionManager *SubscriptionManager grpcServer *grpc.Server - historicalCollector *collector.HistoricalMetricsCollector + historicalCollector collector.HistoricalPercentileProvider healthManager *health.HealthManager } @@ -29,7 +29,7 @@ type MpaServer struct { // historicalCollector may be nil if Prometheus is not available. func NewMpaServer( logger logr.Logger, - historicalCollector *collector.HistoricalMetricsCollector, + historicalCollector collector.HistoricalPercentileProvider, healthManager *health.HealthManager, ) *MpaServer { return &MpaServer{ diff --git a/internal/transport/dakr_client.go b/internal/transport/dakr_client.go index 69ce59f7..049c37fe 100644 --- a/internal/transport/dakr_client.go +++ b/internal/transport/dakr_client.go @@ -123,6 +123,7 @@ type RealDakrClient struct { clusterClient genconnect.ClusterServiceClient clientHeaders *ClientHeaders operatorHealthClient genconnect.OperatorHealthServiceClient + k8sClient genconnect.K8SServiceClient } // NewDakrClient creates a new client for Dakr service @@ -255,12 +256,19 @@ func NewDakrClient(dakrBaseURL string, clusterToken string, logger logr.Logger) clientOptions..., ) + k8sClient := genconnect.NewK8SServiceClient( + httpClient, + dakrBaseURL, + clientOptions..., + ) + return &RealDakrClient{ logger: logger.WithName("dakr-client"), client: client, clusterClient: clusterClient, clientHeaders: clientHeaders, operatorHealthClient: operatorHealthClient, + k8sClient: k8sClient, } } @@ -714,3 +722,10 @@ func (c *RealDakrClient) ReportHealth(ctx context.Context, req *gen.ReportHealth _, err := c.operatorHealthClient.ReportHealth(ctx, connect.NewRequest(req)) return err } + +// NewPercentileFetcher creates a DakrPercentileFetcher that uses the DAKR +// control plane to retrieve pre-computed workload percentiles, replacing the +// local Prometheus dependency for historical metrics. +func (c *RealDakrClient) NewPercentileFetcher() *DakrPercentileFetcher { + return NewDakrPercentileFetcher(c.k8sClient, c.clientHeaders, c.logger) +} diff --git a/internal/transport/interface.go b/internal/transport/interface.go index 04cb0e72..3f19b3d7 100644 --- a/internal/transport/interface.go +++ b/internal/transport/interface.go @@ -46,6 +46,10 @@ type DakrClient interface { // ReportHealth ReportHealth(ctx context.Context, req *gen.ReportHealthRequest) error + + // NewPercentileFetcher creates a PercentileFetcher that retrieves + // pre-computed workload percentiles from the DAKR control plane. 
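+ // The returned fetcher satisfies collector.PercentileFetcher; its
+ // implementation lives in internal/transport/percentile_fetcher.go.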
+ NewPercentileFetcher() *DakrPercentileFetcher } // Sender defines methods for sending data to external systems diff --git a/internal/transport/percentile_fetcher.go b/internal/transport/percentile_fetcher.go new file mode 100644 index 00000000..1d6643bb --- /dev/null +++ b/internal/transport/percentile_fetcher.go @@ -0,0 +1,194 @@ +// internal/transport/percentile_fetcher.go +package transport + +import ( + "context" + "fmt" + "sync" + "time" + + "connectrpc.com/connect" + "github.com/go-logr/logr" + "google.golang.org/protobuf/types/known/timestamppb" + + gen "github.com/devzero-inc/zxporter/gen/api/v1" + genconnect "github.com/devzero-inc/zxporter/gen/api/v1/apiv1connect" + "github.com/devzero-inc/zxporter/internal/collector" +) + +const ( + // maxConcurrentPercentileRequests limits how many DAKR API calls run in parallel. + maxConcurrentPercentileRequests = 10 +) + +// percentileClient is the subset of genconnect.K8SServiceClient that the +// fetcher actually needs. Using a narrow interface makes the code easier to +// test without stubbing dozens of unrelated RPCs. +type percentileClient interface { + GetWorkloadContainerPercentiles( + context.Context, + *connect.Request[gen.GetWorkloadContainerPercentilesRequest], + ) (*connect.Response[gen.GetWorkloadContainerPercentilesResponse], error) +} + +// Compile-time check: the generated K8SServiceClient satisfies our narrow +// interface. +var _ percentileClient = (genconnect.K8SServiceClient)(nil) + +// DakrPercentileFetcher implements collector.PercentileFetcher using the DAKR +// control plane's GetWorkloadContainerPercentiles RPC. +type DakrPercentileFetcher struct { + client percentileClient + clientHeaders *ClientHeaders + logger logr.Logger +} + +// NewDakrPercentileFetcher creates a new fetcher that calls the DAKR control +// plane to retrieve pre-computed percentiles for workload containers. +func NewDakrPercentileFetcher( + client genconnect.K8SServiceClient, + headers *ClientHeaders, + logger logr.Logger, +) *DakrPercentileFetcher { + return &DakrPercentileFetcher{ + client: client, + clientHeaders: headers, + logger: logger.WithName("dakr-percentile-fetcher"), + } +} + +// FetchWorkloadPercentiles calls the DAKR control plane for each workload and +// maps the response into gen.HistoricalMetricsSummary entries keyed by +// "namespace/workloadName/workloadKind". +func (f *DakrPercentileFetcher) FetchWorkloadPercentiles( + ctx context.Context, + clusterID string, + workloads []collector.HistoricalWorkloadQuery, +) (map[string]*gen.HistoricalMetricsSummary, error) { + if len(workloads) == 0 { + return make(map[string]*gen.HistoricalMetricsSummary), nil + } + + results := make(map[string]*gen.HistoricalMetricsSummary, len(workloads)) + var mu sync.Mutex + var wg sync.WaitGroup + + semaphore := make(chan struct{}, maxConcurrentPercentileRequests) + + now := time.Now() + windowStart := now.Add(-24 * time.Hour) + + for _, w := range workloads { + wg.Add(1) + go func(workload collector.HistoricalWorkloadQuery) { + defer wg.Done() + + // Rate-limit concurrent requests. 
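+			// Sending on the buffered channel blocks once
+			// maxConcurrentPercentileRequests goroutines hold a slot; the
+			// deferred receive below frees the slot when this goroutine
+			// returns, whether or not the RPC succeeds.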
+ semaphore <- struct{}{} + defer func() { <-semaphore }() + + summary, err := f.fetchSingleWorkload(ctx, clusterID, workload, windowStart, now) + if err != nil { + f.logger.Error(err, "failed to fetch percentiles for workload", + "namespace", workload.Namespace, + "workload", workload.WorkloadName, + "kind", workload.WorkloadKind, + ) + return + } + + key := workload.Namespace + "/" + workload.WorkloadName + "/" + workload.WorkloadKind + mu.Lock() + results[key] = summary + mu.Unlock() + }(w) + } + + wg.Wait() + return results, nil +} + +// fetchSingleWorkload issues one GetWorkloadContainerPercentiles RPC and +// converts the response to a gen.HistoricalMetricsSummary. +func (f *DakrPercentileFetcher) fetchSingleWorkload( + ctx context.Context, + clusterID string, + workload collector.HistoricalWorkloadQuery, + windowStart, windowEnd time.Time, +) (*gen.HistoricalMetricsSummary, error) { + req := connect.NewRequest(&gen.GetWorkloadContainerPercentilesRequest{ + ClusterId: clusterID, + Kind: workload.WorkloadKind, + // TeamId and Uid are not available from HistoricalWorkloadQuery; + // the DAKR server resolves the workload from the cluster token + kind. + StartTime: timestamppb.New(windowStart), + EndTime: timestamppb.New(windowEnd), + }) + + // Attach auth and operator headers. + f.clientHeaders.AttachToRequest(req.Header()) + + resp, err := f.client.GetWorkloadContainerPercentiles(ctx, req) + if err != nil { + return nil, fmt.Errorf("GetWorkloadContainerPercentiles RPC failed: %w", err) + } + + containers := mapContainerPercentiles(resp.Msg.GetContainers()) + + return &gen.HistoricalMetricsSummary{ + Workload: &gen.MpaWorkloadIdentifier{ + Namespace: workload.Namespace, + Name: workload.WorkloadName, + Kind: workload.WorkloadKind, + }, + Containers: containers, + WindowStart: timestamppb.New(windowStart), + WindowEnd: timestamppb.New(windowEnd), + }, nil +} + +// mapContainerPercentiles converts a slice of DAKR ContainerPercentileSummary +// (float64 values) into the MPA ContainerHistoricalMetrics (int64 millicores / +// bytes). +func mapContainerPercentiles( + summaries []*gen.ContainerPercentileSummary, +) []*gen.ContainerHistoricalMetrics { + out := make([]*gen.ContainerHistoricalMetrics, 0, len(summaries)) + + for _, s := range summaries { + m := &gen.ContainerHistoricalMetrics{ + ContainerName: s.GetContainerName(), + } + + // Map CPU usage percentiles (DAKR returns cores as float64, MPA wants millicores). + if cpu := s.GetCpuUsage(); cpu != nil { + m.CpuP50 = cpuToMillicores(cpu.GetP50()) + m.CpuP75 = cpuToMillicores(cpu.GetP75()) + m.CpuP80 = cpuToMillicores(cpu.GetP80()) + m.CpuP90 = cpuToMillicores(cpu.GetP90()) + m.CpuP95 = cpuToMillicores(cpu.GetP95()) + m.CpuP99 = cpuToMillicores(cpu.GetP99()) + m.CpuPmax = cpuToMillicores(cpu.GetMax()) + } + + // Map memory usage percentiles (DAKR returns bytes as float64, MPA wants bytes as int64). + if mem := s.GetMemoryUsage(); mem != nil { + m.MemP50 = int64(mem.GetP50()) + m.MemP75 = int64(mem.GetP75()) + m.MemP80 = int64(mem.GetP80()) + m.MemP90 = int64(mem.GetP90()) + m.MemP95 = int64(mem.GetP95()) + m.MemP99 = int64(mem.GetP99()) + m.MemPmax = int64(mem.GetMax()) + } + + out = append(out, m) + } + + return out +} + +// cpuToMillicores converts CPU cores (float64) to millicores (int64). 
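+// The conversion truncates toward zero, so e.g. 0.2509 cores becomes 250m
+// rather than rounding up to 251m.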
+func cpuToMillicores(cores float64) int64 { + return int64(cores * 1000) +} diff --git a/internal/transport/percentile_fetcher_test.go b/internal/transport/percentile_fetcher_test.go new file mode 100644 index 00000000..26079cc8 --- /dev/null +++ b/internal/transport/percentile_fetcher_test.go @@ -0,0 +1,256 @@ +package transport + +import ( + "context" + "errors" + "testing" + + "connectrpc.com/connect" + "github.com/go-logr/logr" + "github.com/go-logr/zapr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + + gen "github.com/devzero-inc/zxporter/gen/api/v1" + "github.com/devzero-inc/zxporter/internal/collector" +) + +// mockK8SServiceClient is a minimal mock for the K8SServiceClient interface +// that only implements GetWorkloadContainerPercentiles. +type mockK8SServiceClient struct { + // resp is returned for every call; override per-test. + resp *connect.Response[gen.GetWorkloadContainerPercentilesResponse] + err error + // calls records the requests received. + calls []*connect.Request[gen.GetWorkloadContainerPercentilesRequest] +} + +func (m *mockK8SServiceClient) GetWorkloadContainerPercentiles( + _ context.Context, + req *connect.Request[gen.GetWorkloadContainerPercentilesRequest], +) (*connect.Response[gen.GetWorkloadContainerPercentilesResponse], error) { + m.calls = append(m.calls, req) + return m.resp, m.err +} + +// --- Tests --- + +// newTestFetcher constructs a DakrPercentileFetcher with the narrow +// percentileClient interface, allowing tests to use a lightweight mock. +func newTestFetcher(client percentileClient, logger logr.Logger) *DakrPercentileFetcher { + return &DakrPercentileFetcher{ + client: client, + clientHeaders: NewClientHeaders("test-token"), + logger: logger.WithName("dakr-percentile-fetcher"), + } +} + +func TestFetchWorkloadPercentiles_EmptyWorkloads(t *testing.T) { + logger := zapr.NewLogger(zaptest.NewLogger(t)) + + fetcher := newTestFetcher(&mockK8SServiceClient{}, logger) + + results, err := fetcher.FetchWorkloadPercentiles(context.Background(), "cluster-1", nil) + require.NoError(t, err) + assert.Empty(t, results) + + results, err = fetcher.FetchWorkloadPercentiles(context.Background(), "cluster-1", []collector.HistoricalWorkloadQuery{}) + require.NoError(t, err) + assert.Empty(t, results) +} + +func TestFetchWorkloadPercentiles_MapsPercentiles(t *testing.T) { + logger := zapr.NewLogger(zaptest.NewLogger(t)) + + mock := &mockK8SServiceClient{ + resp: connect.NewResponse(&gen.GetWorkloadContainerPercentilesResponse{ + Containers: []*gen.ContainerPercentileSummary{ + { + ContainerName: "app", + CpuUsage: &gen.MetricPercentiles{ + P50: 0.250, // 250m + P75: 0.500, + P80: 0.600, + P90: 0.800, + P95: 0.900, + P99: 0.950, + Max: 1.000, + }, + MemoryUsage: &gen.MetricPercentiles{ + P50: 100_000_000, // ~100MB + P75: 200_000_000, + P80: 250_000_000, + P90: 300_000_000, + P95: 350_000_000, + P99: 400_000_000, + Max: 500_000_000, + }, + }, + }, + }), + } + + fetcher := newTestFetcher(mock, logger) + + workloads := []collector.HistoricalWorkloadQuery{ + { + Namespace: "default", + WorkloadName: "web-app", + WorkloadKind: "Deployment", + }, + } + + results, err := fetcher.FetchWorkloadPercentiles(context.Background(), "cluster-1", workloads) + require.NoError(t, err) + require.Len(t, results, 1) + + summary, ok := results["default/web-app/Deployment"] + require.True(t, ok, "expected key default/web-app/Deployment in results") + + // Verify workload identifier. 
+ require.NotNil(t, summary.Workload) + assert.Equal(t, "default", summary.Workload.Namespace) + assert.Equal(t, "web-app", summary.Workload.Name) + assert.Equal(t, "Deployment", summary.Workload.Kind) + + // Verify container mapping. + require.Len(t, summary.Containers, 1) + c := summary.Containers[0] + assert.Equal(t, "app", c.ContainerName) + + // CPU: cores → millicores + assert.Equal(t, int64(250), c.CpuP50) + assert.Equal(t, int64(500), c.CpuP75) + assert.Equal(t, int64(600), c.CpuP80) + assert.Equal(t, int64(800), c.CpuP90) + assert.Equal(t, int64(900), c.CpuP95) + assert.Equal(t, int64(950), c.CpuP99) + assert.Equal(t, int64(1000), c.CpuPmax) + + // Memory: bytes passthrough + assert.Equal(t, int64(100_000_000), c.MemP50) + assert.Equal(t, int64(200_000_000), c.MemP75) + assert.Equal(t, int64(250_000_000), c.MemP80) + assert.Equal(t, int64(300_000_000), c.MemP90) + assert.Equal(t, int64(350_000_000), c.MemP95) + assert.Equal(t, int64(400_000_000), c.MemP99) + assert.Equal(t, int64(500_000_000), c.MemPmax) + + // Verify the RPC received correct parameters. + require.Len(t, mock.calls, 1) + assert.Equal(t, "cluster-1", mock.calls[0].Msg.ClusterId) + assert.Equal(t, "Deployment", mock.calls[0].Msg.Kind) +} + +func TestFetchWorkloadPercentiles_RPCError(t *testing.T) { + logger := zapr.NewLogger(zaptest.NewLogger(t)) + + mock := &mockK8SServiceClient{ + err: errors.New("connection refused"), + } + + fetcher := newTestFetcher(mock, logger) + + workloads := []collector.HistoricalWorkloadQuery{ + { + Namespace: "prod", + WorkloadName: "api-server", + WorkloadKind: "Deployment", + }, + } + + // The fetcher logs errors but does not propagate them for individual + // workloads — it returns a partial result map instead. + results, err := fetcher.FetchWorkloadPercentiles(context.Background(), "cluster-1", workloads) + require.NoError(t, err) + assert.Empty(t, results, "expected empty results when RPC fails") +} + +func TestFetchWorkloadPercentiles_NilCPUAndMemory(t *testing.T) { + logger := zapr.NewLogger(zaptest.NewLogger(t)) + + mock := &mockK8SServiceClient{ + resp: connect.NewResponse(&gen.GetWorkloadContainerPercentilesResponse{ + Containers: []*gen.ContainerPercentileSummary{ + { + ContainerName: "sidecar", + // CpuUsage and MemoryUsage are nil + }, + }, + }), + } + + fetcher := newTestFetcher(mock, logger) + + workloads := []collector.HistoricalWorkloadQuery{ + { + Namespace: "default", + WorkloadName: "app", + WorkloadKind: "StatefulSet", + }, + } + + results, err := fetcher.FetchWorkloadPercentiles(context.Background(), "cluster-1", workloads) + require.NoError(t, err) + require.Len(t, results, 1) + + summary := results["default/app/StatefulSet"] + require.NotNil(t, summary) + require.Len(t, summary.Containers, 1) + + c := summary.Containers[0] + assert.Equal(t, "sidecar", c.ContainerName) + // All values should be zero when usage is nil. 
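+	// (mapContainerPercentiles only sets these fields when CpuUsage and
+	// MemoryUsage are non-nil, so they keep their zero values here.)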
+ assert.Equal(t, int64(0), c.CpuP50) + assert.Equal(t, int64(0), c.MemP50) +} + +func TestMapContainerPercentiles(t *testing.T) { + summaries := []*gen.ContainerPercentileSummary{ + { + ContainerName: "main", + CpuUsage: &gen.MetricPercentiles{ + P50: 1.5, + Max: 3.0, + }, + MemoryUsage: &gen.MetricPercentiles{ + P50: 1024, + Max: 2048, + }, + }, + { + ContainerName: "init", + // No metrics + }, + } + + result := mapContainerPercentiles(summaries) + require.Len(t, result, 2) + + assert.Equal(t, "main", result[0].ContainerName) + assert.Equal(t, int64(1500), result[0].CpuP50) // 1.5 cores = 1500m + assert.Equal(t, int64(3000), result[0].CpuPmax) + assert.Equal(t, int64(1024), result[0].MemP50) + assert.Equal(t, int64(2048), result[0].MemPmax) + + assert.Equal(t, "init", result[1].ContainerName) + assert.Equal(t, int64(0), result[1].CpuP50) + assert.Equal(t, int64(0), result[1].MemP50) +} + +func TestCpuToMillicores(t *testing.T) { + tests := []struct { + cores float64 + want int64 + }{ + {0, 0}, + {0.001, 1}, + {0.250, 250}, + {1.0, 1000}, + {2.5, 2500}, + } + for _, tt := range tests { + assert.Equal(t, tt.want, cpuToMillicores(tt.cores), "cores=%v", tt.cores) + } +} diff --git a/internal/transport/sender.go b/internal/transport/sender.go index c055f9f5..e1ff423b 100644 --- a/internal/transport/sender.go +++ b/internal/transport/sender.go @@ -247,3 +247,10 @@ func (c *SimpleDakrClient) ReportHealth(ctx context.Context, req *gen.ReportHeal ) return nil } + +// NewPercentileFetcher returns nil for the simple client since it has no +// backing K8S service connection. Callers must nil-check before use. +func (c *SimpleDakrClient) NewPercentileFetcher() *DakrPercentileFetcher { + c.logger.Info("NewPercentileFetcher called on simple client, returning nil") + return nil +} diff --git a/internal/util/env.go b/internal/util/env.go index f07a2961..ff4c5cff 100644 --- a/internal/util/env.go +++ b/internal/util/env.go @@ -67,6 +67,10 @@ const ( // Default value: false _ENV_DISABLE_GPU_METRICS = "DISABLE_GPU_METRICS" + // ENABLE_NODEMON_METRICS determines whether to enable node monitor metrics. + // Default value: false + _ENV_ENABLE_NODEMON_METRICS = "ENABLE_NODEMON_METRICS" + // MASK_SECRET_DATA determines whether to redact secret values. // Default value: false _ENV_MASK_SECRET_DATA = "MASK_SECRET_DATA" @@ -526,9 +530,6 @@ func LoadCollectionPolicySpecFromEnv() (v1.CollectionPolicySpec, error) { if v := getEnv(_ENV_DAKR_URL); v != "" { newSpec.Policies.DakrURL = v } - if v := getEnv(_ENV_PROMETHEUS_URL); v != "" { - newSpec.Policies.PrometheusURL = v - } if v := getEnv(_ENV_COLLECTION_FREQUENCY); v != "" { newSpec.Policies.Frequency = v } @@ -539,13 +540,6 @@ func LoadCollectionPolicySpecFromEnv() (v1.CollectionPolicySpec, error) { newSpec.Policies.BufferSize = i } } - if v := getEnv(_ENV_DISABLE_NETWORK_IO_METRICS); v != "" { - if b, err := strconv.ParseBool(v); err != nil { - return newSpec, fmt.Errorf("invalid %s: %w", _ENV_DISABLE_NETWORK_IO_METRICS, err) - } else { - newSpec.Policies.DisableNetworkIOMetrics = b - } - } if v := getEnv(_ENV_DISABLE_GPU_METRICS); v != "" { if b, err := strconv.ParseBool(v); err != nil { return newSpec, fmt.Errorf("invalid %s: %w", _ENV_DISABLE_GPU_METRICS, err)