From 0a00d1dde4087f597d5082e95dd3d0cc52ece328 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 8 Apr 2026 20:40:07 +0200 Subject: [PATCH 1/3] feat: add component contributor test harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Validate AICR components end-to-end with a single command: make component-test COMPONENT=cert-manager Three test tiers, auto-detected from registry.yaml: - scheduling: redirects to existing KWOK infrastructure - deploy: Kind cluster + aicr bundle + chainsaw health check - gpu-aware: Kind + nvml-mock DaemonSet + deploy + health check New files: - tools/component-test/{detect-tier,ensure-cluster,setup-gpu-mock, deploy-component,run-health-check,cleanup}.sh - tools/component-test/{kind-config.yaml,manifests/nvml-mock.yaml,README.md} Makefile targets: component-test, component-detect, component-cluster, component-deploy, component-health, component-cleanup. Uses ghcr.io/nvidia/nvml-mock:0.1.0 for GPU simulation in Kind clusters (arm64+amd64, includes nvidia-smi). 
Tested end-to-end: - deploy tier: cert-manager (build → deploy → health check → cleanup) - gpu-aware tier: gpu-operator (build → nvml-mock → deploy → health check → cleanup) Signed-off-by: Carlos Eduardo Arango Gutierrez --- .settings.yaml | 11 + CONTRIBUTING.md | 16 ++ DEVELOPMENT.md | 42 ++++ Makefile | 64 +++++ recipes/registry.yaml | 2 + tools/component-test/README.md | 192 +++++++++++++++ tools/component-test/cleanup.sh | 119 ++++++++++ tools/component-test/deploy-component.sh | 221 ++++++++++++++++++ tools/component-test/detect-tier.sh | 126 ++++++++++ tools/component-test/ensure-cluster.sh | 78 +++++++ tools/component-test/kind-config.yaml | 25 ++ tools/component-test/manifests/nvml-mock.yaml | 137 +++++++++++ tools/component-test/run-health-check.sh | 89 +++++++ tools/component-test/setup-gpu-mock.sh | 137 +++++++++++ 14 files changed, 1259 insertions(+) create mode 100644 tools/component-test/README.md create mode 100755 tools/component-test/cleanup.sh create mode 100755 tools/component-test/deploy-component.sh create mode 100755 tools/component-test/detect-tier.sh create mode 100755 tools/component-test/ensure-cluster.sh create mode 100644 tools/component-test/kind-config.yaml create mode 100644 tools/component-test/manifests/nvml-mock.yaml create mode 100755 tools/component-test/run-health-check.sh create mode 100755 tools/component-test/setup-gpu-mock.sh diff --git a/.settings.yaml b/.settings.yaml index f4591bbab..567224855 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -84,3 +84,14 @@ docs_tools: # Testing Configuration testing: kind_node_image: 'kindest/node:v1.32.0' + + # Component test harness configuration + # Used by tools/component-test/ scripts to validate individual components + component_test: + nvml_mock_version: '0.1.0' + nvml_mock_image: 'ghcr.io/nvidia/nvml-mock' + default_gpu_profile: 'a100' + default_gpu_count: 8 + cluster_name: 'aicr-component-test' + helm_timeout: '300s' + health_check_timeout: '5m' diff --git a/CONTRIBUTING.md 
b/CONTRIBUTING.md index 1f0a69b2b..01860b51f 100755 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -84,6 +84,22 @@ This creates three files with TODOs guiding implementation: **See [docs/contributor/validator.md](docs/contributor/validator.md) for complete guide with examples, architecture overview, and troubleshooting.** +#### Adding a Component + +AICR components are declarative — add an entry to `recipes/registry.yaml` with +Helm or Kustomize settings, create a `values.yaml`, and optionally add a health +check. No Go code needed. + +**Validate your component:** +```bash +make build +make component-test COMPONENT=my-component +``` + +This auto-detects the right test tier, creates a Kind cluster, deploys the +component, and runs its health check. See +[tools/component-test/README.md](tools/component-test/README.md) for details. + ## Design Principles These principles guide all design decisions in AICR. When faced with trade-offs, these principles take precedence. diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index baf59ec99..600a2deaa 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -802,6 +802,48 @@ make validate-local RECIPE=recipe.yaml IMAGE_TAG=dev make qualify ``` +## Testing a New Component + +The component test harness validates that a component deploys and passes its +health check in an isolated Kind cluster. No GPU hardware required for most +components. + +### Quick Start + +```bash +# Build aicr, then test your component +make build +make component-test COMPONENT=cert-manager +``` + +The harness auto-detects the test tier (`scheduling`, `deploy`, or `gpu-aware`), +creates a Kind cluster, deploys the component, and runs its health check. 
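The tier auto-detection boils down to a small decision matrix. As a simplified sketch (the `decide_tier` helper below is illustrative only, not part of the harness; the real logic in `tools/component-test/detect-tier.sh` derives both flags from `recipes/registry.yaml` and the component's `values.yaml`):

```bash
# Sketch of the tier decision matrix, assuming two pre-computed
# "true"/"false" flags. GPU references take precedence, then the
# presence of a health check decides deploy vs. scheduling.
decide_tier() {
  local has_health_check="$1" has_gpu_refs="$2"
  if [[ "$has_gpu_refs" == "true" ]]; then
    echo "gpu-aware"      # GPU references always win
  elif [[ "$has_health_check" == "true" ]]; then
    echo "deploy"         # health check present, no GPU references
  else
    echo "scheduling"     # neither: scheduling-only validation
  fi
}

decide_tier true false    # prints: deploy
```

Setting `TIER=` or `testTier:` in the registry short-circuits this matrix entirely.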
+ +### Available Targets + +```bash +make component-test COMPONENT=cert-manager # Full end-to-end test +make component-detect COMPONENT=cert-manager # Show detected tier +make component-cluster # Create/reuse cluster +make component-deploy COMPONENT=cert-manager # Deploy only +make component-health COMPONENT=cert-manager # Health check only +make component-cleanup COMPONENT=cert-manager # Uninstall component +``` + +### Debugging + +```bash +# Keep cluster for inspection +KEEP_CLUSTER=true make component-test COMPONENT=cert-manager + +# Inspect and re-run +kubectl -n cert-manager get pods +make component-health COMPONENT=cert-manager +``` + +See [tools/component-test/README.md](tools/component-test/README.md) for full +environment variable reference and troubleshooting. + ## Validator Development For detailed information on adding validation checks and constraint validators, see: diff --git a/Makefile b/Makefile index 2de42430b..f88103201 100644 --- a/Makefile +++ b/Makefile @@ -587,6 +587,70 @@ endif kwok-test-all: build ## Run all KWOK recipe tests in a shared cluster @bash kwok/scripts/run-all-recipes.sh +# ============================================================================= +# Component Testing +# ============================================================================= + +.PHONY: component-test +component-test: build ## Test a single component end-to-end (COMPONENT=cert-manager [TIER=deploy]) +ifndef COMPONENT + @echo "Error: COMPONENT is required" + @echo "Usage: make component-test COMPONENT=cert-manager" + @echo " make component-test COMPONENT=gpu-operator TIER=gpu-aware" + @exit 1 +endif + @set -e; \ + TIER=$${TIER:-$$(bash tools/component-test/detect-tier.sh $(COMPONENT))}; \ + echo "[INFO] Detected tier: $$TIER"; \ + do_cleanup() { \ + if [ "$${KEEP_CLUSTER:-false}" != "true" ]; then \ + COMPONENT=$(COMPONENT) bash tools/component-test/cleanup.sh || true; \ + fi; \ + }; \ + trap do_cleanup EXIT; \ + TIER=$$TIER bash 
tools/component-test/ensure-cluster.sh; \ + if [ "$$TIER" = "gpu-aware" ]; then \ + GPU_PROFILE=$${GPU_PROFILE:-} GPU_COUNT=$${GPU_COUNT:-} bash tools/component-test/setup-gpu-mock.sh; \ + fi; \ + if [ "$$TIER" = "scheduling" ]; then \ + exit 0; \ + fi; \ + COMPONENT=$(COMPONENT) HELM_NAMESPACE=$${HELM_NAMESPACE:-} bash tools/component-test/deploy-component.sh; \ + COMPONENT=$(COMPONENT) bash tools/component-test/run-health-check.sh + +.PHONY: component-detect +component-detect: ## Show detected test tier for a component (COMPONENT=cert-manager) +ifndef COMPONENT + @echo "Error: COMPONENT is required" + @echo "Usage: make component-detect COMPONENT=cert-manager" + @exit 1 +endif + @bash tools/component-test/detect-tier.sh $(COMPONENT) + +.PHONY: component-cluster +component-cluster: ## Create or reuse the component test Kind cluster + @TIER=$${TIER:-deploy} bash tools/component-test/ensure-cluster.sh + +.PHONY: component-deploy +component-deploy: build ## Deploy a single component (COMPONENT=cert-manager) +ifndef COMPONENT + @echo "Error: COMPONENT is required" + @exit 1 +endif + @COMPONENT=$(COMPONENT) HELM_NAMESPACE=$${HELM_NAMESPACE:-} bash tools/component-test/deploy-component.sh + +.PHONY: component-health +component-health: ## Run health check for a deployed component (COMPONENT=cert-manager) +ifndef COMPONENT + @echo "Error: COMPONENT is required" + @exit 1 +endif + @COMPONENT=$(COMPONENT) bash tools/component-test/run-health-check.sh + +.PHONY: component-cleanup +component-cleanup: ## Clean up component test resources (COMPONENT=cert-manager [DELETE_CLUSTER=true]) + @COMPONENT=$${COMPONENT:-} DELETE_CLUSTER=$${DELETE_CLUSTER:-false} KEEP_CLUSTER=$${KEEP_CLUSTER:-false} bash tools/component-test/cleanup.sh + # ============================================================================= # Combined Development Targets # ============================================================================= diff --git a/recipes/registry.yaml b/recipes/registry.yaml 
index cc86eb282..21c1bce86 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -29,6 +29,8 @@ # defaultSource: Git repository or OCI reference # defaultPath: Path within the repository to the kustomization # defaultTag: Git tag, branch, or commit +# testTier: Optional override for component test tier detection +# (scheduling, deploy, or gpu-aware). Used by tools/component-test/. # nodeScheduling: Paths in Helm values where node selectors/tolerations are injected # # Note: A component must have either 'helm' OR 'kustomize' configuration, not both. diff --git a/tools/component-test/README.md b/tools/component-test/README.md new file mode 100644 index 000000000..7faa0fdce --- /dev/null +++ b/tools/component-test/README.md @@ -0,0 +1,192 @@ +# Component Test Harness + +Validate AICR components end-to-end with a single command. No GPU hardware +required for most components. + +## Quick Start + +```bash +# 1. Build aicr +make build + +# 2. Test your component +make component-test COMPONENT=cert-manager +``` + +That's it. The harness auto-detects the right test tier, creates a Kind cluster, +deploys the component, and runs its health check. + +## Test Tiers + +| Tier | What it validates | Cluster needs | Example | +|------|-------------------|---------------|---------| +| `scheduling` | Pods schedule on correct nodes | Kind + KWOK | Any component with overlays | +| `deploy` | Component deploys and health checks pass | Kind | cert-manager, kai-scheduler | +| `gpu-aware` | GPU-dependent components init against fake GPUs | Kind + nvml-mock | gpu-operator | + +### Auto-Detection + +The harness reads `recipes/registry.yaml` to determine the tier: + +| Has health check? | GPU references? 
| Detected tier | +|--------------------|-----------------|---------------| +| No | No | `scheduling` | +| Yes | No | `deploy` | +| Yes | Yes | `gpu-aware` | +| No | Yes | `gpu-aware` (warns about missing health check) | + +Override with `TIER=`: + +```bash +make component-test COMPONENT=gpu-operator TIER=gpu-aware +``` + +Or set `testTier` in `registry.yaml`: + +```yaml +- name: my-component + testTier: gpu-aware + helm: ... +``` + +## Make Targets + +```bash +# Full test (auto-detect tier, create cluster, deploy, health check, cleanup) +make component-test COMPONENT=cert-manager + +# Individual steps (for debugging) +make component-detect COMPONENT=cert-manager # Show detected tier +make component-cluster # Create/reuse Kind cluster +make component-deploy COMPONENT=cert-manager # Deploy component only +make component-health COMPONENT=cert-manager # Run health check only +make component-cleanup COMPONENT=cert-manager # Uninstall component + +# Keep cluster for debugging +KEEP_CLUSTER=true make component-test COMPONENT=cert-manager + +# Delete cluster entirely +make component-cleanup DELETE_CLUSTER=true +``` + +## Environment Variables + +### Global + +| Variable | Default | Purpose | +|----------|---------|---------| +| `COMPONENT` | (required) | Component name from registry.yaml | +| `TIER` | (auto-detected) | Override: `scheduling`, `deploy`, `gpu-aware` | +| `CLUSTER_NAME` | `aicr-component-test` | Kind cluster name | +| `KUBECONFIG` | (auto) | Path to kubeconfig | +| `KEEP_CLUSTER` | `false` | Preserve cluster after test | +| `DEBUG` | `false` | Extra debug logging | + +### Cluster (ensure-cluster.sh) + +| Variable | Default | Purpose | +|----------|---------|---------| +| `KIND_NODE_IMAGE` | from `.settings.yaml` | Kind node image | +| `KIND_CONFIG` | `tools/component-test/kind-config.yaml` | Kind config file | +| `CLUSTER_WAIT_TIMEOUT` | `120s` | Node readiness timeout | + +### GPU Mock (setup-gpu-mock.sh) + +| Variable | Default | Purpose | 
+|----------|---------|---------| +| `NVML_MOCK_VERSION` | from `.settings.yaml` | nvml-mock version | +| `NVML_MOCK_IMAGE` | `ghcr.io/nvidia/nvml-mock` | Image override | +| `GPU_PROFILE` | `a100` | GPU profile: `a100`, `h100`, `gb200` | +| `GPU_COUNT` | `8` | GPUs per node | +| `DRIVER_VERSION` | auto from profile | Mock driver version (e.g., `550.163.01`) | +| `MOCK_READY_TIMEOUT` | `300s` | DaemonSet readiness timeout | + +### Deploy (deploy-component.sh) + +| Variable | Default | Purpose | +|----------|---------|---------| +| `HELM_TIMEOUT` | `300s` | Helm install timeout | +| `HELM_NAMESPACE` | from registry.yaml | Override namespace | +| `HELM_VALUES` | (none) | Extra `--values` file | +| `HELM_SET` | (none) | Extra `--set` overrides (comma-separated) | +| `AICR_BIN` | auto-detected from `dist/` | Path to aicr binary | + +### Health Check (run-health-check.sh) + +| Variable | Default | Purpose | +|----------|---------|---------| +| `HEALTH_CHECK_TIMEOUT` | `5m` | Chainsaw assert timeout | +| `HEALTH_CHECK_FILE` | from registry.yaml | Override health check path | +| `CHAINSAW_BIN` | `chainsaw` | Path to chainsaw binary | + +### Cleanup (cleanup.sh) + +| Variable | Default | Purpose | +|----------|---------|---------| +| `DELETE_CLUSTER` | `false` | Delete the Kind cluster | +| `FORCE_CLEANUP` | `false` | Skip confirmation prompts | + +## Debugging a Failure + +```bash +# 1. Run with KEEP_CLUSTER to preserve state +KEEP_CLUSTER=true make component-test COMPONENT=cert-manager + +# 2. Inspect pods +kubectl -n cert-manager get pods +kubectl -n cert-manager describe pod +kubectl -n cert-manager logs + +# 3. Re-run just the health check +make component-health COMPONENT=cert-manager + +# 4. Re-deploy after fixing +COMPONENT=cert-manager bash tools/component-test/cleanup.sh +make component-deploy COMPONENT=cert-manager +make component-health COMPONENT=cert-manager + +# 5. 
Clean up when done +make component-cleanup COMPONENT=cert-manager DELETE_CLUSTER=true +``` + +## Adding GPU-Aware Testing + +For components that require GPU resources: + +1. Ensure `.settings.yaml` has `component_test.nvml_mock_version` set +2. The harness auto-detects GPU references in `values.yaml` or the registry entry +3. Override with `TIER=gpu-aware` or set `testTier: gpu-aware` in registry.yaml +4. Customize the GPU profile: `GPU_PROFILE=h100 GPU_COUNT=4 make component-test ...` + +## Troubleshooting + +| Issue | Check | +|-------|-------| +| `aicr binary not found` | Run `make build` first | +| `Component not found in registry` | Verify the component name matches `recipes/registry.yaml` | +| `Health check file not found` | Create `recipes/checks/<component>/health-check.yaml` | +| `Kind cluster creation fails` | Check that Docker is running and `kind` is installed | +| `Helm install timeout` | Increase `HELM_TIMEOUT`, check pod events | +| `chainsaw not found` | Run `make tools-setup` | +| `nvml-mock not ready` | Increase `MOCK_READY_TIMEOUT`, check DaemonSet logs | + +## Architecture + +``` +make component-test COMPONENT=cert-manager + │ + detect-tier.sh → scheduling | deploy | gpu-aware + │ + ensure-cluster.sh → Reuse or create Kind cluster + │ + setup-gpu-mock.sh → (gpu-aware only) Deploy nvml-mock + │ + deploy-component.sh → Bundle + helm install + │ + run-health-check.sh → Chainsaw health check + │ + cleanup.sh → Uninstall + optionally delete cluster +``` + +All scripts are independently runnable and accept environment variables for +override. Configuration defaults come from `.settings.yaml`. diff --git a/tools/component-test/cleanup.sh b/tools/component-test/cleanup.sh new file mode 100755 index 000000000..68fe788b7 --- /dev/null +++ b/tools/component-test/cleanup.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cleanup.sh - Clean up after component testing +# +# Usage: +# COMPONENT=cert-manager ./cleanup.sh # Uninstall component only +# COMPONENT=cert-manager DELETE_CLUSTER=true ./cleanup.sh # Delete entire cluster +# KEEP_CLUSTER=true ./cleanup.sh # Skip cleanup (for debugging) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source common utilities +# shellcheck source=../common +. 
"${REPO_ROOT}/tools/common" + +has_tools helm kubectl yq + +COMPONENT="${COMPONENT:-}" +SETTINGS="${REPO_ROOT}/.settings.yaml" +REGISTRY="${REPO_ROOT}/recipes/registry.yaml" + +CLUSTER_NAME="${CLUSTER_NAME:-$(yq -r '.testing.component_test.cluster_name // "aicr-component-test"' "$SETTINGS" 2>/dev/null)}" +KEEP_CLUSTER="${KEEP_CLUSTER:-false}" +DELETE_CLUSTER="${DELETE_CLUSTER:-false}" +FORCE_CLEANUP="${FORCE_CLEANUP:-false}" + +# If KEEP_CLUSTER is true, skip everything +if [[ "$KEEP_CLUSTER" == "true" ]]; then + log_info "KEEP_CLUSTER=true, skipping cleanup" + log_info "Cluster: $CLUSTER_NAME" + if [[ -n "$COMPONENT" ]]; then + log_info "Component '$COMPONENT' is still deployed" + log_info "To inspect: kubectl -n get pods" + fi + exit 0 +fi + +# Uninstall the component if specified +if [[ -n "$COMPONENT" ]]; then + # Determine namespace + helm_namespace=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .helm.defaultNamespace // .kustomize.defaultNamespace // \"${COMPONENT}\"" "$REGISTRY" 2>/dev/null) + + log_info "Uninstalling component: $COMPONENT (namespace: $helm_namespace)" + + if helm status "$COMPONENT" -n "$helm_namespace" &>/dev/null; then + helm uninstall "$COMPONENT" -n "$helm_namespace" --wait --timeout 120s 2>/dev/null || { + log_warning "Helm uninstall timed out, force removing..." + helm uninstall "$COMPONENT" -n "$helm_namespace" --no-hooks 2>/dev/null || true + } + log_info "Component '$COMPONENT' uninstalled" + else + log_info "Component '$COMPONENT' not found (already uninstalled?)" + fi + + # Clean up nvml-mock if it was deployed + if helm status nvml-mock -n nvml-mock &>/dev/null; then + log_info "Uninstalling nvml-mock..." + helm uninstall nvml-mock -n nvml-mock --wait 2>/dev/null || true + elif kubectl get daemonset nvml-mock -n nvml-mock &>/dev/null; then + log_info "Removing nvml-mock resources..." 
+ kubectl delete daemonset nvml-mock -n nvml-mock --ignore-not-found 2>/dev/null || true + kubectl delete configmap nvml-mock-config -n nvml-mock --ignore-not-found 2>/dev/null || true + fi + + # Delete component namespace if empty + if kubectl get namespace "$helm_namespace" &>/dev/null; then + pod_count=$(kubectl get pods -n "$helm_namespace" --no-headers 2>/dev/null | wc -l | tr -d ' ') + if [[ "$pod_count" -eq 0 ]]; then + log_info "Deleting empty namespace: $helm_namespace" + kubectl delete namespace "$helm_namespace" --wait=true --timeout=60s 2>/dev/null || true + fi + fi + + # Delete nvml-mock namespace if it exists + kubectl delete namespace nvml-mock --ignore-not-found --wait=true --timeout=60s 2>/dev/null || true +fi + +# Delete the Kind cluster if requested +if [[ "$DELETE_CLUSTER" == "true" ]]; then + if ! command -v kind &>/dev/null; then + log_error "kind not installed, cannot delete cluster" + exit 1 + fi + + if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then + if [[ "$FORCE_CLEANUP" != "true" ]]; then + log_info "About to delete Kind cluster: $CLUSTER_NAME" + read -r -p "Continue? [y/N] " confirm + if [[ "$confirm" != [yY] ]]; then + log_info "Aborted" + exit 0 + fi + fi + + log_info "Deleting Kind cluster: $CLUSTER_NAME" + kind delete cluster --name "$CLUSTER_NAME" + log_info "Cluster deleted" + else + log_info "Cluster '$CLUSTER_NAME' not found (already deleted?)" + fi +fi + +log_info "Cleanup complete" diff --git a/tools/component-test/deploy-component.sh b/tools/component-test/deploy-component.sh new file mode 100755 index 000000000..11ea6f02b --- /dev/null +++ b/tools/component-test/deploy-component.sh @@ -0,0 +1,221 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# deploy-component.sh - Bundle and deploy a single component +# +# Usage: +# COMPONENT=cert-manager ./deploy-component.sh +# COMPONENT=gpu-operator HELM_NAMESPACE=gpu-operator ./deploy-component.sh +# +# Generates a minimal single-component recipe, runs aicr bundle, and +# helm-installs the result into the test cluster. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source common utilities +# shellcheck source=../common +. "${REPO_ROOT}/tools/common" + +has_tools helm kubectl yq + +COMPONENT="${COMPONENT:?COMPONENT is required}" +SETTINGS="${REPO_ROOT}/.settings.yaml" +REGISTRY="${REPO_ROOT}/recipes/registry.yaml" + +HELM_TIMEOUT="${HELM_TIMEOUT:-$(yq -r '.testing.component_test.helm_timeout // "300s"' "$SETTINGS" 2>/dev/null)}" +HELM_NAMESPACE="${HELM_NAMESPACE:-}" +HELM_VALUES="${HELM_VALUES:-}" +HELM_SET="${HELM_SET:-}" + +# Find aicr binary (same pattern as kwok/scripts/validate-scheduling.sh) +find_aicr_binary() { + local aicr_bin="${AICR_BIN:-}" + if [[ -n "$aicr_bin" ]] && [[ -x "$aicr_bin" ]]; then + echo "$aicr_bin" + return 0 + fi + + local candidates=( + "${REPO_ROOT}/dist/aicr" + "${REPO_ROOT}/dist/aicr_darwin_arm64_v8.0/aicr" + "${REPO_ROOT}/dist/aicr_darwin_all/aicr" + "${REPO_ROOT}/dist/aicr_linux_amd64_v1/aicr" + ) + + for candidate in "${candidates[@]}"; do + if [[ -x "$candidate" ]]; then + echo "$candidate" + return 0 + fi + done + + # Glob fallback + local found + found=$(find "${REPO_ROOT}/dist" -name "aicr" -type f -perm /111 2>/dev/null | head 
-1) + if [[ -n "$found" ]]; then + echo "$found" + return 0 + fi + + return 1 +} + +AICR_BIN=$(find_aicr_binary) || { + log_error "aicr binary not found in dist/" + log_error "Run 'make build' first" + exit 1 +} +log_info "Using aicr binary: $AICR_BIN" + +# Verify component exists in registry +component_entry=$(yq eval ".components[] | select(.name == \"${COMPONENT}\")" "$REGISTRY") +if [[ -z "$component_entry" ]]; then + log_error "Component '$COMPONENT' not found in $REGISTRY" + exit 1 +fi + +# Determine namespace: env override > registry defaultNamespace > component name +if [[ -z "$HELM_NAMESPACE" ]]; then + HELM_NAMESPACE=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .helm.defaultNamespace // .kustomize.defaultNamespace // \"${COMPONENT}\"" "$REGISTRY") +fi + +# Create temp working directory +WORK_DIR=$(mktemp -d) +trap 'rm -rf "$WORK_DIR"' EXIT + +# Generate a minimal single-component recipe +log_info "Generating single-component recipe for: $COMPONENT" + +# Extract component details from registry — detect type by checking which key exists +has_helm=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | has(\"helm\")" "$REGISTRY") +has_kustomize=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | has(\"kustomize\")" "$REGISTRY") +has_manifest=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | has(\"manifest\")" "$REGISTRY") + +if [[ "$has_manifest" == "true" ]]; then + log_error "Component '$COMPONENT' uses manifest type, which is not supported by this harness" + log_error "Manifest components are deployed via raw YAML, not bundled via Helm/Kustomize" + exit 1 +fi + +if [[ "$has_helm" == "true" ]]; then + chart_type="Helm" +elif [[ "$has_kustomize" == "true" ]]; then + chart_type="Kustomize" +else + log_error "Component '$COMPONENT' has no helm, kustomize, or manifest configuration in registry" + exit 1 +fi + +chart_source=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .helm.defaultRepository // 
.kustomize.defaultSource // \"\"" "$REGISTRY") +chart_name_raw=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .helm.defaultChart // \"\"" "$REGISTRY") +# Strip repo prefix from chart name (e.g., "jetstack/cert-manager" → "cert-manager") +# Mirrors the Go recipe resolver logic in pkg/recipe/metadata.go +chart_name="${chart_name_raw##*/}" +chart_version=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .helm.defaultVersion // .kustomize.defaultTag // \"\"" "$REGISTRY") + +# Find the component's values file by searching overlays (base.yaml first, then others) +values_file="" +checked_base=false +for overlay in "${REPO_ROOT}"/recipes/overlays/base.yaml "${REPO_ROOT}"/recipes/overlays/*.yaml; do + [[ -f "$overlay" ]] || continue + # base.yaml appears in both the explicit path and the glob; skip the duplicate + if [[ "$(basename "$overlay")" == "base.yaml" ]]; then + if [[ "$checked_base" == "true" ]]; then continue; fi + checked_base=true + fi + candidate=$(yq eval ".spec.componentRefs[] | select(.name == \"${COMPONENT}\") | .valuesFile // \"\"" "$overlay" 2>/dev/null) + if [[ -n "$candidate" ]]; then + values_file="$candidate" + break + fi +done +if [[ -z "$values_file" ]]; then + # Try component default values + if [[ -f "${REPO_ROOT}/recipes/components/${COMPONENT}/values.yaml" ]]; then + values_file="components/${COMPONENT}/values.yaml" + fi +fi + +# Build a minimal resolved recipe (RecipeResult format, which aicr bundle expects) +cat > "${WORK_DIR}/recipe.yaml" <> "${WORK_DIR}/recipe.yaml" +fi + +if [[ -n "$chart_version" ]]; then + echo " version: ${chart_version}" >> "${WORK_DIR}/recipe.yaml" +fi + +if [[ -n "$values_file" ]]; then + echo " valuesFile: ${values_file}" >> "${WORK_DIR}/recipe.yaml" +fi + +log_info "Recipe:" +cat "${WORK_DIR}/recipe.yaml" + +# Generate bundle +log_info "Generating bundle..." +if ! 
"$AICR_BIN" bundle \ + --recipe "${WORK_DIR}/recipe.yaml" \ + --output "${WORK_DIR}/bundle" 2>&1; then + log_error "Bundle generation failed" + exit 1 +fi + +if [[ ! -d "${WORK_DIR}/bundle" ]]; then + log_error "Bundle directory not created" + exit 1 +fi + +log_info "Bundle contents:" +ls -1 "${WORK_DIR}/bundle" + +# Deploy using the generated deploy.sh script (same approach as KWOK validation) +DEPLOY_SCRIPT="${WORK_DIR}/bundle/deploy.sh" +if [[ ! -f "$DEPLOY_SCRIPT" ]]; then + log_error "deploy.sh not found in bundle directory" + log_error "Bundle generation may have failed" + exit 1 +fi + +chmod +x "$DEPLOY_SCRIPT" + +# Pass --no-wait to deploy.sh; readiness is verified by the health check step +DEPLOY_ARGS="--no-wait" + +log_info "Installing $COMPONENT into namespace $HELM_NAMESPACE..." + +if ! "$DEPLOY_SCRIPT" $DEPLOY_ARGS 2>&1; then + log_error "Deploy script failed" + log_error "Debug with: kubectl -n $HELM_NAMESPACE get pods" + log_error " kubectl -n $HELM_NAMESPACE describe pods" + exit 1 +fi + +log_info "Component '$COMPONENT' deployed successfully in namespace '$HELM_NAMESPACE'" diff --git a/tools/component-test/detect-tier.sh b/tools/component-test/detect-tier.sh new file mode 100755 index 000000000..7df315673 --- /dev/null +++ b/tools/component-test/detect-tier.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# detect-tier.sh - Determine the test tier for a component +# +# Usage: +# ./detect-tier.sh <component> +# TIER=deploy ./detect-tier.sh <component> +# +# Output: Single word to stdout: scheduling, deploy, or gpu-aware +# +# Detection matrix: +# Has health check? | GPU references? | Detected tier +# No | No | scheduling +# Yes | No | deploy +# Yes | Yes | gpu-aware +# No | Yes | gpu-aware (warn: no health check) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source common utilities +# shellcheck source=../common +. "${REPO_ROOT}/tools/common" + +has_tools yq + +COMPONENT="${1:-${COMPONENT:-}}" +if [[ -z "$COMPONENT" ]]; then + log_error "COMPONENT is required" + echo "Usage: $0 <component>" >&2 + exit 1 +fi + +REGISTRY="${REPO_ROOT}/recipes/registry.yaml" + +# 1. If TIER env var is set, use it (contributor override) +if [[ -n "${TIER:-}" ]]; then + case "$TIER" in + scheduling|deploy|gpu-aware) + echo "$TIER" + exit 0 + ;; + *) + log_error "Invalid TIER: $TIER (must be scheduling, deploy, or gpu-aware)" + exit 1 + ;; + esac +fi + +# 2. If testTier field exists in registry.yaml, use it +test_tier=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .testTier // \"\"" "$REGISTRY") +if [[ -n "$test_tier" ]]; then + case "$test_tier" in + scheduling|deploy|gpu-aware) + echo "$test_tier" + exit 0 + ;; + *) + log_error "Invalid testTier in registry.yaml for $COMPONENT: $test_tier" + exit 1 + ;; + esac +fi + +# Verify component exists in registry +component_exists=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .name" "$REGISTRY") +if [[ -z "$component_exists" ]]; then + log_error "Component '$COMPONENT' not found in $REGISTRY" + exit 1 +fi + +# 3.
Check if health check exists +has_health_check=false +health_check_ref=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .healthCheck.assertFile // \"\"" "$REGISTRY") +if [[ -n "$health_check_ref" ]] && [[ -f "${REPO_ROOT}/recipes/${health_check_ref}" ]]; then + has_health_check=true +elif [[ -f "${REPO_ROOT}/recipes/checks/${COMPONENT}/health-check.yaml" ]]; then + has_health_check=true +fi + +# 4. Check for GPU resource references +has_gpu_refs=false + +# Check component values.yaml for nvidia.com/gpu +values_file="${REPO_ROOT}/recipes/components/${COMPONENT}/values.yaml" +if [[ -f "$values_file" ]] && grep -q 'nvidia\.com/gpu' "$values_file" 2>/dev/null; then + has_gpu_refs=true +fi + +# Check registry entry for GPU-related nodeScheduling paths +if [[ "$has_gpu_refs" == "false" ]]; then + gpu_scheduling=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .nodeScheduling.accelerated // \"\"" "$REGISTRY") + if [[ -n "$gpu_scheduling" ]]; then + # Check if the component's values reference GPU resources + if [[ -f "$values_file" ]] && grep -qE '(gpu|nvidia|cuda)' "$values_file" 2>/dev/null; then + has_gpu_refs=true + fi + fi +fi + +# 5. Apply decision matrix +if [[ "$has_gpu_refs" == "true" ]]; then + if [[ "$has_health_check" == "false" ]]; then + log_warning "Component '$COMPONENT' has GPU references but no health check" >&2 + fi + echo "gpu-aware" +elif [[ "$has_health_check" == "true" ]]; then + echo "deploy" +else + echo "scheduling" +fi diff --git a/tools/component-test/ensure-cluster.sh b/tools/component-test/ensure-cluster.sh new file mode 100755 index 000000000..e4c0bd8e2 --- /dev/null +++ b/tools/component-test/ensure-cluster.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ensure-cluster.sh - Ensure a suitable Kind cluster is running
+#
+# Usage:
+#   TIER=deploy ./ensure-cluster.sh
+#   TIER=scheduling COMPONENT=cert-manager ./ensure-cluster.sh
+#
+# For the scheduling tier: delegates to the existing KWOK infrastructure (make kwok-cluster)
+# For deploy/gpu-aware: creates or reuses an aicr-component-test Kind cluster
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+# Source common utilities
+# shellcheck source=../common
+. "${REPO_ROOT}/tools/common"
+
+has_tools kind kubectl yq
+
+TIER="${TIER:-deploy}"
+SETTINGS="${REPO_ROOT}/.settings.yaml"
+
+CLUSTER_NAME="${CLUSTER_NAME:-$(yq -r '.testing.component_test.cluster_name // "aicr-component-test"' "$SETTINGS" 2>/dev/null)}"
+KIND_NODE_IMAGE="${KIND_NODE_IMAGE:-$(yq -r '.testing.kind_node_image // "kindest/node:v1.32.0"' "$SETTINGS" 2>/dev/null)}"
+KIND_CONFIG="${KIND_CONFIG:-${SCRIPT_DIR}/kind-config.yaml}"
+CLUSTER_WAIT_TIMEOUT="${CLUSTER_WAIT_TIMEOUT:-120s}"
+
+CONTEXT="kind-${CLUSTER_NAME}"
+
+# For the scheduling tier, delegate to the KWOK cluster infrastructure
+if [[ "$TIER" == "scheduling" ]]; then
+    log_info "Scheduling tier detected"
+    log_info "Component scheduling validation uses the KWOK infrastructure, not this harness"
+    log_info ""
+    log_info "To validate scheduling, use:"
+    log_info "  make kwok-e2e RECIPE=<recipe>"
+    log_info ""
+    log_info "See: kwok/README.md for details"
+    exit 0
+fi
+
+# For deploy/gpu-aware: create or reuse the component-test cluster
+if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then
+    log_info "Reusing existing cluster: ${CLUSTER_NAME}"
+    kubectl config use-context "$CONTEXT"
+else
+    log_info "Creating Kind cluster: ${CLUSTER_NAME}"
+    kind create cluster \
+        --name "$CLUSTER_NAME" \
+        --image "$KIND_NODE_IMAGE" \
+        --config "$KIND_CONFIG" \
+        --wait 60s
+
+    kubectl config use-context "$CONTEXT"
+fi
+
+# Wait for all nodes to be ready
+log_info "Waiting for nodes to be ready (timeout: ${CLUSTER_WAIT_TIMEOUT})..."
+kubectl wait --for=condition=Ready node --all --timeout="$CLUSTER_WAIT_TIMEOUT"
+
+log_info "Cluster '${CLUSTER_NAME}' is ready"
+log_info "Context: ${CONTEXT}"
diff --git a/tools/component-test/kind-config.yaml b/tools/component-test/kind-config.yaml
new file mode 100644
index 000000000..4e9cf4611
--- /dev/null
+++ b/tools/component-test/kind-config.yaml
@@ -0,0 +1,25 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kind configuration for component testing
+#
+# This cluster runs real workloads (deploy and gpu-aware tiers), so it
+# includes CoreDNS and a single worker node, unlike the KWOK config,
+# which only validates scheduling.
+
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+  - role: control-plane
+  - role: worker
diff --git a/tools/component-test/manifests/nvml-mock.yaml b/tools/component-test/manifests/nvml-mock.yaml
new file mode 100644
index 000000000..b9829fa9e
--- /dev/null
+++ b/tools/component-test/manifests/nvml-mock.yaml
@@ -0,0 +1,137 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# nvml-mock DaemonSet - Fallback manifest for GPU simulation
+#
+# This manifest is used when the nvml-mock Helm chart from
+# NVIDIA/k8s-test-infra is not available via OCI registry.
+#
+# Placeholder values (NVML_MOCK_IMAGE_PLACEHOLDER, NVML_MOCK_VERSION_PLACEHOLDER,
+# GPU_COUNT_PLACEHOLDER, GPU_PROFILE_PLACEHOLDER, DRIVER_VERSION_PLACEHOLDER)
+# are substituted by setup-gpu-mock.sh at deploy time.
+--- +apiVersion: v1 +kind: Namespace +metadata: + name: nvml-mock +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvml-mock + namespace: nvml-mock +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvml-mock +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvml-mock +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvml-mock +subjects: + - kind: ServiceAccount + name: nvml-mock + namespace: nvml-mock +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvml-mock-config + namespace: nvml-mock +data: + config.yaml: | + profile: GPU_PROFILE_PLACEHOLDER + gpuCount: GPU_COUNT_PLACEHOLDER +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvml-mock + namespace: nvml-mock + labels: + app: nvml-mock +spec: + selector: + matchLabels: + app: nvml-mock + template: + metadata: + labels: + app: nvml-mock + spec: + serviceAccountName: nvml-mock + containers: + - name: nvml-mock + image: NVML_MOCK_IMAGE_PLACEHOLDER:NVML_MOCK_VERSION_PLACEHOLDER + command: ["/scripts/entrypoint.sh"] + env: + - name: GPU_COUNT + value: "GPU_COUNT_PLACEHOLDER" + - name: DRIVER_VERSION + value: "DRIVER_VERSION_PLACEHOLDER" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: host-nvml-mock + mountPath: /host/var/lib/nvml-mock + - name: gpu-config + mountPath: /config + readOnly: true + - name: host-cdi + mountPath: /host/var/run/cdi + - name: host-run-nvidia + mountPath: /host/run/nvidia + securityContext: + privileged: true + lifecycle: + preStop: + exec: + command: ["/scripts/cleanup.sh"] + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi + volumes: + - name: host-nvml-mock + hostPath: + path: /var/lib/nvml-mock + type: DirectoryOrCreate + - name: gpu-config + configMap: + name: nvml-mock-config + - name: 
host-cdi + hostPath: + path: /var/run/cdi + type: DirectoryOrCreate + - name: host-run-nvidia + hostPath: + path: /run/nvidia + type: DirectoryOrCreate + tolerations: + - operator: Exists diff --git a/tools/component-test/run-health-check.sh b/tools/component-test/run-health-check.sh new file mode 100755 index 000000000..cb678b483 --- /dev/null +++ b/tools/component-test/run-health-check.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# run-health-check.sh - Execute a component's health check +# +# Usage: +# COMPONENT=cert-manager ./run-health-check.sh +# COMPONENT=gpu-operator HEALTH_CHECK_TIMEOUT=10m ./run-health-check.sh +# +# Runs chainsaw test against the Kind cluster (not --no-cluster) to validate +# the component is actually healthy. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# Source common utilities +# shellcheck source=../common +. 
"${REPO_ROOT}/tools/common" + +has_tools yq + +COMPONENT="${COMPONENT:?COMPONENT is required}" +SETTINGS="${REPO_ROOT}/.settings.yaml" +REGISTRY="${REPO_ROOT}/recipes/registry.yaml" + +HEALTH_CHECK_TIMEOUT="${HEALTH_CHECK_TIMEOUT:-$(yq -r '.testing.component_test.health_check_timeout // "5m"' "$SETTINGS" 2>/dev/null)}" +CHAINSAW_BIN="${CHAINSAW_BIN:-chainsaw}" + +# Resolve health check file path +if [[ -n "${HEALTH_CHECK_FILE:-}" ]]; then + HEALTH_CHECK="$HEALTH_CHECK_FILE" +else + # Try registry.yaml reference first + health_ref=$(yq eval ".components[] | select(.name == \"${COMPONENT}\") | .healthCheck.assertFile // \"\"" "$REGISTRY") + if [[ -n "$health_ref" ]]; then + HEALTH_CHECK="${REPO_ROOT}/recipes/${health_ref}" + else + HEALTH_CHECK="${REPO_ROOT}/recipes/checks/${COMPONENT}/health-check.yaml" + fi +fi + +if [[ ! -f "$HEALTH_CHECK" ]]; then + log_error "Health check file not found: $HEALTH_CHECK" + log_error "Component '$COMPONENT' may not have a health check defined" + exit 1 +fi + +log_info "Running health check for: $COMPONENT" +log_info "Health check file: $HEALTH_CHECK" +log_info "Timeout: $HEALTH_CHECK_TIMEOUT" + +# Verify chainsaw is available +if ! 
command -v "$CHAINSAW_BIN" &>/dev/null; then
+    log_error "chainsaw not found: $CHAINSAW_BIN"
+    log_error "Install with: make tools-setup"
+    exit 1
+fi
+
+# Run chainsaw against the live cluster (no --no-cluster flag)
+health_check_dir=$(dirname "$HEALTH_CHECK")
+
+if "$CHAINSAW_BIN" test "$health_check_dir" \
+    --test-file "$(basename "$HEALTH_CHECK")" \
+    --assert-timeout "$HEALTH_CHECK_TIMEOUT" \
+    --no-color=false 2>&1; then
+    log_info "Health check PASSED for: $COMPONENT"
+else
+    log_error "Health check FAILED for: $COMPONENT"
+    log_error ""
+    log_error "Debug with:"
+    log_error "  kubectl get pods -A"
+    log_error "  kubectl describe pods -n <namespace>"
+    log_error "  kubectl logs -n <namespace> <pod>"
+    exit 1
+fi
diff --git a/tools/component-test/setup-gpu-mock.sh b/tools/component-test/setup-gpu-mock.sh
new file mode 100755
index 000000000..106b96ce4
--- /dev/null
+++ b/tools/component-test/setup-gpu-mock.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# setup-gpu-mock.sh - Deploy the nvml-mock DaemonSet for GPU simulation
+#
+# Usage:
+#   ./setup-gpu-mock.sh
+#   GPU_PROFILE=h100 GPU_COUNT=4 ./setup-gpu-mock.sh
+#
+# Deploys nvml-mock to simulate GPU hardware in Kind clusters.
+# Tries the OCI Helm chart first, then falls back to kubectl apply
+# with the bundled manifest.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+
+# Source common utilities
+# shellcheck source=../common
+. "${REPO_ROOT}/tools/common"
+
+has_tools kubectl yq
+
+SETTINGS="${REPO_ROOT}/.settings.yaml"
+
+NVML_MOCK_VERSION="${NVML_MOCK_VERSION:-$(yq -r '.testing.component_test.nvml_mock_version // "0.1.0"' "$SETTINGS" 2>/dev/null)}"
+NVML_MOCK_IMAGE="${NVML_MOCK_IMAGE:-$(yq -r '.testing.component_test.nvml_mock_image // "ghcr.io/nvidia/nvml-mock"' "$SETTINGS" 2>/dev/null)}"
+GPU_PROFILE="${GPU_PROFILE:-$(yq -r '.testing.component_test.default_gpu_profile // "a100"' "$SETTINGS" 2>/dev/null)}"
+GPU_COUNT="${GPU_COUNT:-$(yq -r '.testing.component_test.default_gpu_count // 8' "$SETTINGS" 2>/dev/null)}"
+MOCK_READY_TIMEOUT="${MOCK_READY_TIMEOUT:-300s}"
+MANIFEST_FILE="${SCRIPT_DIR}/manifests/nvml-mock.yaml"
+
+# Map GPU profile to driver version (matches nvml-mock Helm chart defaults)
+profile_to_driver_version() {
+    case "$1" in
+        a100|l40s|t4) echo "550.163.01" ;;
+        h100|b200|gb200) echo "570.86.16" ;;
+        *) echo "550.163.01" ;;
+    esac
+}
+DRIVER_VERSION="${DRIVER_VERSION:-$(profile_to_driver_version "$GPU_PROFILE")}"
+
+log_info "Setting up GPU mock: profile=${GPU_PROFILE}, count=${GPU_COUNT}, driver=${DRIVER_VERSION}"
+log_info "Image: ${NVML_MOCK_IMAGE}:${NVML_MOCK_VERSION}"
+
+# Check if nvml-mock is already deployed and healthy
+if kubectl get daemonset nvml-mock -n nvml-mock &>/dev/null; then
+    desired=$(kubectl get daemonset nvml-mock -n nvml-mock -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0")
+    ready=$(kubectl get daemonset nvml-mock -n nvml-mock -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0")
+    if [[ "$desired" -gt 0 ]] && [[ "$ready" -eq "$desired" ]]; then
+        log_info "nvml-mock DaemonSet already running (${ready}/${desired} ready)"
+        exit 0
+    fi
+    log_info "nvml-mock exists but is not fully ready, redeploying..."
+ kubectl delete daemonset nvml-mock -n nvml-mock --ignore-not-found 2>/dev/null || true +fi + +# Try Helm chart first (preferred when OCI chart is published) +deploy_via_helm() { + if ! command -v helm &>/dev/null; then + return 1 + fi + + local chart_ref="oci://${NVML_MOCK_IMAGE}" + log_info "Attempting Helm install from: ${chart_ref}" + + if helm install nvml-mock "$chart_ref" \ + --version "$NVML_MOCK_VERSION" \ + --namespace nvml-mock --create-namespace \ + --set gpu.profile="$GPU_PROFILE" \ + --set gpu.count="$GPU_COUNT" \ + --wait --timeout "$MOCK_READY_TIMEOUT" 2>/dev/null; then + return 0 + fi + + log_info "Helm chart not available, falling back to manifest" + # Clean up partial Helm install + helm uninstall nvml-mock -n nvml-mock 2>/dev/null || true + return 1 +} + +# Fallback: deploy via kubectl with bundled manifest +deploy_via_manifest() { + if [[ ! -f "$MANIFEST_FILE" ]]; then + log_error "Fallback manifest not found: $MANIFEST_FILE" + exit 1 + fi + + log_info "Deploying nvml-mock via manifest: $MANIFEST_FILE" + + # Substitute placeholders in manifest + sed \ + -e "s|NVML_MOCK_IMAGE_PLACEHOLDER|${NVML_MOCK_IMAGE}|g" \ + -e "s|NVML_MOCK_VERSION_PLACEHOLDER|${NVML_MOCK_VERSION}|g" \ + -e "s|GPU_PROFILE_PLACEHOLDER|${GPU_PROFILE}|g" \ + -e "s|GPU_COUNT_PLACEHOLDER|${GPU_COUNT}|g" \ + -e "s|DRIVER_VERSION_PLACEHOLDER|${DRIVER_VERSION}|g" \ + "$MANIFEST_FILE" | kubectl apply -f - +} + +# Try Helm, fall back to manifest +if ! deploy_via_helm; then + deploy_via_manifest +fi + +# Wait for DaemonSet readiness +log_info "Waiting for nvml-mock DaemonSet to be ready (timeout: ${MOCK_READY_TIMEOUT})..." +kubectl rollout status daemonset/nvml-mock -n nvml-mock --timeout="$MOCK_READY_TIMEOUT" + +# Verify nvml-mock labeled the nodes +log_info "Verifying nvml-mock node labels..." 
+gpu_nodes=$(kubectl get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null) + +if [[ -z "$gpu_nodes" ]]; then + log_warning "No nodes have nvidia.com/gpu.present=true label yet" + log_warning "nvml-mock may need additional time to label nodes" + log_warning "Check: kubectl get nodes --show-labels | grep nvidia" +else + log_info "Nodes with nvml-mock GPU simulation:" + echo "$gpu_nodes" | while IFS= read -r line; do + log_info " $line" + done +fi + +log_info "GPU mock setup complete" From 48c34e2e3fe493552b4c6ca93a97fb5f7b843634 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 8 Apr 2026 20:40:27 +0200 Subject: [PATCH 2/3] fix: conditionally include --version in deploy.sh template The deploy.sh template unconditionally included '--version {{ .Version }}' which produced a broken helm command when Version was empty (e.g., gpu-operator has no defaultVersion in registry.yaml). Helm 4 treats the empty --version as a missing required argument. The template now conditionally includes --version only when Version is non-empty, allowing components without pinned versions to install the latest chart from the repository. 
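The omit-when-empty behavior the `{{ if .Version }}` guard introduces can also be sketched outside the template layer; the following is a minimal illustrative sketch in plain shell (the `build_helm_args` helper and the `demo`/`demo-chart` names are hypothetical, not part of the patch):

```shell
#!/usr/bin/env bash
# Sketch: build helm arguments so that an empty version never yields
# a bare --version flag (which helm would reject).
build_helm_args() {
    local version="$1"
    local args=(upgrade --install demo demo-chart)
    # Only append --version when a version is actually pinned.
    if [[ -n "$version" ]]; then
        args+=(--version "$version")
    fi
    echo "${args[@]}"
}

build_helm_args ""          # no --version flag emitted
build_helm_args "v1.17.2"   # --version v1.17.2 appended
```

This mirrors the template change: the flag and its value are emitted together or not at all, so unpinned components install the latest chart.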
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/bundler/deployer/helm/helm_test.go | 69 +++++++++++++++++++ .../deployer/helm/templates/deploy.sh.tmpl | 6 +- 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index a74b288e1..0b65a1d9a 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -1378,6 +1378,75 @@ func TestGenerateDeployScript(t *testing.T) { } } +func TestGenerateDeployScript_EmptyVersionOmitsFlag(t *testing.T) { + g := NewGenerator() + ctx := context.Background() + dir := t.TempDir() + + components := []ComponentData{ + { + Name: "gpu-operator", + Namespace: "gpu-operator", + Repository: "https://helm.ngc.nvidia.com/nvidia", + ChartName: "gpu-operator", + Version: "", // empty version — should not produce --version flag + HasChart: true, + }, + } + + input := &GeneratorInput{Version: "v1.0.0"} + path, _, err := g.generateDeployScript(ctx, input, components, dir) + if err != nil { + t.Fatalf("generateDeployScript failed: %v", err) + } + + content, err := os.ReadFile(path) + if err != nil { + t.Fatalf("reading deploy.sh: %v", err) + } + + script := string(content) + if strings.Contains(script, "--version") { + t.Errorf("deploy.sh should not contain --version when Version is empty, got:\n%s", script) + } + if !strings.Contains(script, "helm upgrade --install gpu-operator gpu-operator") { + t.Errorf("deploy.sh should contain helm install command for gpu-operator") + } +} + +func TestGenerateDeployScript_WithVersionIncludesFlag(t *testing.T) { + g := NewGenerator() + ctx := context.Background() + dir := t.TempDir() + + components := []ComponentData{ + { + Name: "cert-manager", + Namespace: "cert-manager", + Repository: "https://charts.jetstack.io", + ChartName: "cert-manager", + Version: "v1.17.2", + HasChart: true, + }, + } + + input := &GeneratorInput{Version: "v1.0.0"} + path, _, err := 
g.generateDeployScript(ctx, input, components, dir) + if err != nil { + t.Fatalf("generateDeployScript failed: %v", err) + } + + content, err := os.ReadFile(path) + if err != nil { + t.Fatalf("reading deploy.sh: %v", err) + } + + script := string(content) + if !strings.Contains(script, "--version v1.17.2") { + t.Errorf("deploy.sh should contain --version v1.17.2, got:\n%s", script) + } +} + func TestGenerateUndeployScript(t *testing.T) { tests := []struct { name string diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 6a6b1cf7b..bd9762a96 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -282,7 +282,8 @@ fi {{ if .IsOCI -}} retry "{{ .Name }} helm install" \ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ - --version {{ .Version }} \ + {{ if .Version }}--version {{ .Version }} \ + {{ end -}} -n {{ .Namespace }} --create-namespace \ -f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \ ${COMPONENT_WAIT_ARGS} \ @@ -291,7 +292,8 @@ retry "{{ .Name }} helm install" \ retry "{{ .Name }} helm install" \ helm upgrade --install {{ .Name }} {{ .ChartName }} \ --repo {{ .Repository }} \ - --version {{ .Version }} \ + {{ if .Version }}--version {{ .Version }} \ + {{ end -}} -n {{ .Namespace }} --create-namespace \ -f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \ ${COMPONENT_WAIT_ARGS} \ From 4584c25072bd069c20d0db4dbd286f53ef4159c7 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 9 Apr 2026 08:08:27 +0200 Subject: [PATCH 3/3] fix: address review feedback on cleanup and scheduling tier - cleanup.sh: Detect non-interactive mode (no TTY) and fail with a clear error instead of hanging on 'read' when DELETE_CLUSTER=true without FORCE_CLEANUP=true. - Makefile: Scheduling tier now exits with code 2 instead of 0 to signal that no test was executed, with guidance to use make kwok-e2e. 
- README: Clarify that the scheduling tier redirects to KWOK and does not
  create a Kind cluster.

Signed-off-by: Carlos Eduardo Arango Gutierrez
---
 Makefile                        | 5 ++++-
 tools/component-test/README.md  | 4 +++-
 tools/component-test/cleanup.sh | 4 ++++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index f88103201..243d167fa 100644
--- a/Makefile
+++ b/Makefile
@@ -613,7 +613,10 @@ endif
 		GPU_PROFILE=$${GPU_PROFILE:-} GPU_COUNT=$${GPU_COUNT:-} bash tools/component-test/setup-gpu-mock.sh; \
 	fi; \
 	if [ "$$TIER" = "scheduling" ]; then \
-		exit 0; \
+		echo "[INFO] Scheduling tier uses KWOK, not this harness."; \
+		echo "[INFO] Run: make kwok-e2e RECIPE=<recipe>"; \
+		echo "[INFO] No test was executed. Exiting with code 2."; \
+		exit 2; \
 	fi; \
 	COMPONENT=$(COMPONENT) HELM_NAMESPACE=$${HELM_NAMESPACE:-} bash tools/component-test/deploy-component.sh; \
 	COMPONENT=$(COMPONENT) bash tools/component-test/run-health-check.sh
diff --git a/tools/component-test/README.md b/tools/component-test/README.md
index 7faa0fdce..8c502e650 100644
--- a/tools/component-test/README.md
+++ b/tools/component-test/README.md
@@ -14,7 +14,9 @@ make component-test COMPONENT=cert-manager
 ```
 
 That's it. The harness auto-detects the right test tier, creates a Kind cluster,
-deploys the component, and runs its health check.
+deploys the component, and runs its health check. Components detected as
+`scheduling` tier are redirected to the KWOK infrastructure (`make kwok-e2e`)
+and exit with code 2; no Kind cluster is created for those.
 
 ## Test Tiers
diff --git a/tools/component-test/cleanup.sh b/tools/component-test/cleanup.sh
index 68fe788b7..240160d43 100755
--- a/tools/component-test/cleanup.sh
+++ b/tools/component-test/cleanup.sh
@@ -100,6 +100,10 @@
 if [[ "$DELETE_CLUSTER" == "true" ]]; then
     if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then
         if [[ "$FORCE_CLEANUP" != "true" ]]; then
+            if [[ ! -t 0 ]]; then
+                log_error "DELETE_CLUSTER=true requires FORCE_CLEANUP=true in non-interactive mode"
+                exit 1
+            fi
             log_info "About to delete Kind cluster: $CLUSTER_NAME"
             read -r -p "Continue? [y/N] " confirm
             if [[ "$confirm" != [yY] ]]; then
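The non-interactive guard added to cleanup.sh follows a reusable pattern: never block on `read` when stdin is not a terminal. A minimal self-contained sketch of that pattern (the `confirm_delete` name is illustrative; `FORCE_CLEANUP` mirrors the script's environment variable):

```shell
#!/usr/bin/env bash
# Sketch: fail fast in non-interactive contexts instead of hanging on a prompt.
confirm_delete() {
    if [[ "${FORCE_CLEANUP:-false}" == "true" ]]; then
        return 0                  # explicit opt-in, no prompt needed
    fi
    if [[ ! -t 0 ]]; then         # no TTY: a read here would hang CI forever
        echo "error: set FORCE_CLEANUP=true in non-interactive mode" >&2
        return 1
    fi
    read -r -p "Continue? [y/N] " confirm
    [[ "$confirm" == [yY] ]]
}
```

In CI, stdin is typically a pipe or `/dev/null`, so `[[ -t 0 ]]` is false and the function returns an error immediately rather than waiting for input that will never arrive.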