NVIDIA · ksapru · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 20, 2026
diff --git a/README.md b/README.md
@@ -194,6 +194,49 @@ For security bulletins and PSIRT policies, visit the [NVIDIA Product Security](h
 
 This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk.
 
+---
+
+## Observability
+
+NemoClaw includes an optional observability layer for tracking agent execution and blueprint performance.
+
+### Enabling Metrics
+
+Metrics are disabled by default. To enable them, set the `NEMOCLAW_METRICS_ENABLED` environment variable to exactly `true` (other values, such as `1` or `TRUE`, leave metrics disabled):
+
+```bash
+export NEMOCLAW_METRICS_ENABLED=true
+export NEMOCLAW_METRICS_PORT=9090  # Optional, defaults to 9090
+```
+
+### Accessing Metrics
+
+When enabled, NemoClaw starts a lightweight Prometheus-compatible metrics server. You can view the metrics by curling the `/metrics` endpoint:
+
+```bash
+curl http://localhost:9090/metrics
+```
+
+### Example Output
+
+```text
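+# HELP blueprint_execution_total Total count of blueprint_execution_total
+# TYPE blueprint_execution_total counter
+blueprint_execution_total{action="apply",profile="default",status="success"} 1
+
+# HELP blueprint_execution_latency_seconds Latency histogram for blueprint_execution_latency_seconds
+# TYPE blueprint_execution_latency_seconds histogram
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.01"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.05"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="2"} 0
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="10"} 1
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="30"} 1
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="60"} 1
+blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1
+blueprint_execution_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42
+blueprint_execution_latency_seconds_count{action="apply",profile="default",status="success"} 1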
+```
+
+---
+
 ## License
 
 Apache 2.0. See [LICENSE](LICENSE).
diff --git a/nemoclaw/src/index.ts b/nemoclaw/src/index.ts
@@ -17,6 +17,8 @@ import {
   describeOnboardProvider,
   loadOnboardConfig,
 } from "./onboard/config.js";
+import { metrics } from "./observability/metrics.js";
+import { createServer } from "node:http";
 
 // ---------------------------------------------------------------------------
 // OpenClaw Plugin SDK compatible types (mirrors openclaw/plugin-sdk)
@@ -243,7 +245,37 @@ export default function register(api: OpenClawPluginApi): void {
     handler: (ctx) => handleSlashCommand(ctx, api),
   });
 
-  // 2. Register nvidia-nim provider — use onboard config if available
+  // 2. Register Metrics Service if enabled
+  //
+  // Serves the registry's Prometheus text output over plain HTTP at /metrics.
+  // Environment variables read when the service starts:
+  //   NEMOCLAW_METRICS_PORT  TCP port; defaults to 9090. Values that are not an
+  //                          integer in 0..65535 fall back to 9090 instead of
+  //                          making listen() throw.
+  //   NEMOCLAW_METRICS_HOST  Bind address; defaults to 127.0.0.1 because the
+  //                          endpoint is unauthenticated. Set it to 0.0.0.0 to
+  //                          allow scraping from other hosts.
+  if (metrics.isEnabled()) {
+    const defaultPort = 9090;
+    let server: ReturnType<typeof createServer> | undefined;
+
+    api.registerService({
+      id: "nemoclaw-metrics",
+      start: ({ logger }) => {
+        const rawPort = process.env.NEMOCLAW_METRICS_PORT;
+        let port = rawPort ? Number(rawPort) : defaultPort;
+        if (!Number.isInteger(port) || port < 0 || port > 65535) {
+          logger.info(
+            `NemoClaw metrics: ignoring invalid NEMOCLAW_METRICS_PORT "${rawPort}", using ${defaultPort}`,
+          );
+          port = defaultPort;
+        }
+        const host = process.env.NEMOCLAW_METRICS_HOST || "127.0.0.1";
+
+        const httpServer = createServer((req, res) => {
+          // Compare only the path so that "/metrics?foo=bar" is still served.
+          const requestPath = (req.url ?? "").split("?")[0];
+          if (requestPath === "/metrics") {
+            // Content type of the Prometheus text exposition format.
+            res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4; charset=utf-8" });
+            res.end(metrics.getPrometheusMetrics());
+          } else {
+            res.writeHead(404);
+            res.end();
+          }
+        });
+
+        // Without an 'error' listener a failed bind (for example, the port is
+        // already in use) is raised as an uncaught exception in the host process.
+        httpServer.on("error", (err: Error) => {
+          logger.info(`NemoClaw metrics server error: ${err.message}`);
+        });
+
+        httpServer.listen(port, host, () => {
+          logger.info(`NemoClaw metrics server listening on port ${port}`);
+        });
+        server = httpServer;
+      },
+      stop: ({ logger }) => {
+        if (server) {
+          server.close();
+          // Drop the reference so that a repeated stop is a no-op.
+          server = undefined;
+          logger.info("NemoClaw metrics server stopped");
+        }
+      },
+    });
+  }
+
+  // 3. Register nvidia-nim provider — use onboard config if available
   const onboardCfg = loadOnboardConfig();
   const providerCredentialEnv = onboardCfg?.credentialEnv ?? "NVIDIA_API_KEY";
   api.registerProvider(registeredProviderForConfig(onboardCfg, providerCredentialEnv));

diff --git a/nemoclaw/src/observability/metrics.ts b/nemoclaw/src/observability/metrics.ts
@@ -0,0 +1,153 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Lightweight metrics implementation for NemoClaw.
+ *
+ * Provides counters and histograms for request tracking and latency observation.
+ * Enabled only when NEMOCLAW_METRICS_ENABLED=true.
+ */
+
+/**
+ * Description of a single metric sample.
+ *
+ * NOTE(review): nothing in the visible part of this file constructs or reads
+ * this type; confirm it has consumers elsewhere.
+ */
+export interface MetricValue {
+  name: string;
+  // Presumably the text shown on the "# HELP" line; verify against callers.
+  help: string;
+  type: "counter" | "histogram";
+  labels: Record<string, string>;
+  value: number;
+  // Unit (seconds vs. milliseconds) is not established by this file; TODO confirm.
+  timestamp: number;
+}
+
+/**
+ * A MetricValue narrowed to type "histogram", with per-bucket counts plus the
+ * sum and count fields added.
+ */
+export interface HistogramValue extends MetricValue {
+  type: "histogram";
+  // Keyed by a number; presumably the bucket upper bound; verify against callers.
+  buckets: Record<number, number>;
+  sum: number;
+  count: number;
+}
+
+/**
+ * In-memory registry of counters and histograms, rendered on demand in the
+ * Prometheus text exposition format.
+ *
+ * Recording methods are no-ops unless NEMOCLAW_METRICS_ENABLED is exactly
+ * "true"; the variable is read on every call rather than once at load time.
+ * Series are grouped by metric name so that each name gets a single HELP/TYPE
+ * header even when it has several label sets.
+ */
+class MetricsRegistry {
+  /** Counter values: metric name -> rendered label set -> count. */
+  private counters: Map<string, Map<string, number>> = new Map();
+
+  /**
+   * Histogram state: metric name -> rendered label set -> state.
+   * `counts[i]` is the cumulative number of observations <= `bounds[i]`.
+   */
+  private histograms: Map<
+    string,
+    Map<string, { sum: number; count: number; bounds: number[]; counts: number[] }>
+  > = new Map();
+
+  // Standard buckets for latency (seconds)
+  private defaultBuckets = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60];
+
+  /** Returns true only when NEMOCLAW_METRICS_ENABLED is the exact string "true". */
+  public isEnabled(): boolean {
+    return process.env.NEMOCLAW_METRICS_ENABLED === "true";
+  }
+
+  /**
+   * Adds 1 to the counter identified by `name` and `labels`, starting from 0
+   * the first time the combination is seen.
+   *
+   * @param name Metric name, used verbatim in the output.
+   * @param labels Label names and values; the order of the keys is irrelevant.
+   */
+  public incrementCounter(name: string, labels: Record<string, string> = {}): void {
+    if (!this.isEnabled()) return;
+    const labelStr = this.formatLabels(labels);
+    let family = this.counters.get(name);
+    if (!family) {
+      family = new Map();
+      this.counters.set(name, family);
+    }
+    family.set(labelStr, (family.get(labelStr) ?? 0) + 1);
+  }
+
+  /**
+   * Records one observation in the histogram identified by `name` and `labels`.
+   *
+   * @param name Metric name, used verbatim in the output.
+   * @param value Observed value; NaN and infinite values are ignored because
+   *   they would make the running sum unusable.
+   * @param labels Label names and values; the order of the keys is irrelevant.
+   * @param buckets Bucket upper bounds. Only consulted the first time a
+   *   name/label combination is observed; later calls reuse the bounds fixed at
+   *   creation so the series stays consistent. Non-finite bounds are dropped
+   *   because the +Inf bucket is always emitted from the total count.
+   */
+  public observeHistogram(
+    name: string,
+    value: number,
+    labels: Record<string, string> = {},
+    buckets = this.defaultBuckets,
+  ): void {
+    if (!this.isEnabled()) return;
+    if (!Number.isFinite(value)) return;
+
+    const labelStr = this.formatLabels(labels);
+    let family = this.histograms.get(name);
+    if (!family) {
+      family = new Map();
+      this.histograms.set(name, family);
+    }
+
+    let hist = family.get(labelStr);
+    if (!hist) {
+      const bounds = [...new Set(buckets)]
+        .filter((bound) => Number.isFinite(bound))
+        .sort((a, b) => a - b);
+      hist = { sum: 0, count: 0, bounds, counts: bounds.map(() => 0) };
+      family.set(labelStr, hist);
+    }
+
+    hist.sum += value;
+    hist.count += 1;
+    // Buckets are cumulative: every bound at or above the value is incremented.
+    for (let i = 0; i < hist.bounds.length; i++) {
+      const bound = hist.bounds[i];
+      if (bound !== undefined && value <= bound) {
+        hist.counts[i] = (hist.counts[i] ?? 0) + 1;
+      }
+    }
+  }
+
+  /**
+   * Renders every recorded metric as Prometheus text: counters first, then
+   * histograms, each metric name introduced by one HELP and one TYPE line and
+   * followed by a blank line.
+   *
+   * @returns The exposition text, or an empty string when nothing was recorded.
+   */
+  public getPrometheusMetrics(): string {
+    let output = "";
+
+    // Export counters
+    for (const [name, family] of this.counters) {
+      output += `# HELP ${name} Total count of ${name}\n`;
+      output += `# TYPE ${name} counter\n`;
+      for (const [labelStr, value] of family) {
+        output += `${name}${labelStr} ${value}\n`;
+      }
+      output += "\n";
+    }
+
+    // Export histograms
+    for (const [name, family] of this.histograms) {
+      output += `# HELP ${name} Latency histogram for ${name}\n`;
+      output += `# TYPE ${name} histogram\n`;
+      for (const [labelStr, hist] of family) {
+        // Strip the surrounding braces so the "le" label can be appended.
+        const labelsBase = labelStr ? labelStr.slice(1, -1) + "," : "";
+        hist.bounds.forEach((bound, i) => {
+          output += `${name}_bucket{${labelsBase}le="${bound}"} ${hist.counts[i] ?? 0}\n`;
+        });
+        output += `${name}_bucket{${labelsBase}le="+Inf"} ${hist.count}\n`;
+        output += `${name}_sum${labelStr} ${hist.sum}\n`;
+        output += `${name}_count${labelStr} ${hist.count}\n`;
+      }
+      output += "\n";
+    }
+
+    return output;
+  }
+
+  /**
+   * Renders a label set as `{k1="v1",k2="v2"}`, or an empty string when there
+   * are no labels. Keys are sorted so that the same labels passed in a
+   * different order identify the same series.
+   */
+  private formatLabels(labels: Record<string, string>): string {
+    const pairs = Object.entries(labels)
+      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
+      .map(([key, value]) => `${key}="${this.escapeLabelValue(value)}"`);
+    return pairs.length > 0 ? `{${pairs.join(",")}}` : "";
+  }
+
+  /** Escapes backslash, double quote and line feed as the text format requires. */
+  private escapeLabelValue(value: string): string {
+    return String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
+  }
+}
+
+/** Process-wide registry instance shared by every importer of this module. */
+export const metrics = new MetricsRegistry();
+
+/**
+ * Runs `fn` and records how long it took to settle.
+ *
+ * When metrics are enabled, one observation is added to the
+ * `${name}_latency_seconds` histogram and the `${name}_total` counter is
+ * incremented, both tagged with `labels` plus a `status` label of "success"
+ * or "error". When metrics are disabled, `fn` is called and its promise is
+ * returned without any timing.
+ *
+ * @param name Prefix for the two metric names.
+ * @param labels Labels applied to both metrics.
+ * @param fn Operation to time.
+ * @returns The value `fn` resolves with.
+ * @throws Whatever `fn` rejects with, after the error metrics are recorded.
+ */
+export async function observeLatency<T>(
+  name: string,
+  labels: Record<string, string>,
+  fn: () => Promise<T>,
+): Promise<T> {
+  if (!metrics.isEnabled()) return fn();
+
+  const startedAt = process.hrtime.bigint();
+
+  // Shared by both outcomes: measure the elapsed time, then update both metrics.
+  const record = (status: "success" | "error"): void => {
+    const elapsedNs = process.hrtime.bigint() - startedAt;
+    const elapsedSec = Number(elapsedNs) / 1e9;
+    const tagged = { ...labels, status };
+    metrics.observeHistogram(`${name}_latency_seconds`, elapsedSec, tagged);
+    metrics.incrementCounter(`${name}_total`, tagged);
+  };
+
+  try {
+    const outcome = await fn();
+    record("success");
+    return outcome;
+  } catch (error) {
+    record("error");
+    throw error;
+  }
+}
diff --git a/test/metrics.test.js b/test/metrics.test.js
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+const test = require("node:test");
+const assert = require("node:assert");
+const path = require("node:path");
+
+// Load the compiled metrics module
+const metricsPath = path.resolve(__dirname, "../nemoclaw/dist/observability/metrics.js");
+const { metrics, observeLatency } = require(metricsPath);
+
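+// Enable metrics for testing. Setting this after the require above is fine:
+// the registry reads the variable on every call, not when the module loads.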
+process.env.NEMOCLAW_METRICS_ENABLED = "true";
+
+// A counter incremented once must appear with its TYPE header and a value of 1.
+test("MetricsRegistry stores and exports counters", () => {
+  metrics.incrementCounter("test_counter", { foo: "bar" });
+
+  const rendered = metrics.getPrometheusMetrics();
+  const expectedLines = [/# TYPE test_counter counter/, /test_counter\{foo="bar"\} 1/];
+  for (const pattern of expectedLines) {
+    assert.match(rendered, pattern);
+  }
+});
+
+// One observation of 0.5 must land in the le="0.5" bucket and set sum and count.
+test("MetricsRegistry stores and exports histograms", () => {
+  metrics.observeHistogram("test_hist", 0.5, { abc: "123" });
+
+  const rendered = metrics.getPrometheusMetrics();
+  const expectedLines = [
+    /# TYPE test_hist histogram/,
+    /test_hist_bucket\{abc="123",le="0\.5"\} 1/,
+    /test_hist_sum\{abc="123"\} 0\.5/,
+    /test_hist_count\{abc="123"\} 1/,
+  ];
+  for (const pattern of expectedLines) {
+    assert.match(rendered, pattern);
+  }
+});
+
+// A resolved operation returns its value and is recorded with status="success".
+test("observeLatency tracks success metrics", async () => {
+  const operation = async () => "done";
+  const returned = await observeLatency("test_op", { op: "success" }, operation);
+  assert.strictEqual(returned, "done");
+
+  const rendered = metrics.getPrometheusMetrics();
+  const expectedLines = [
+    /test_op_total\{op="success",status="success"\} 1/,
+    /test_op_latency_seconds_count\{op="success",status="success"\} 1/,
+  ];
+  for (const pattern of expectedLines) {
+    assert.match(rendered, pattern);
+  }
+});
+
+// A rejected operation must propagate its error and be recorded with status="error".
+test("observeLatency tracks error metrics", async () => {
+  // assert.rejects fails the test when the promise resolves; a bare try/catch
+  // would skip its assertion and let a swallowed error go unnoticed.
+  await assert.rejects(
+    observeLatency("test_op_err", { op: "fail" }, async () => {
+      throw new Error("oops");
+    }),
+    { message: "oops" },
+  );
+
+  const output = metrics.getPrometheusMetrics();
+  assert.match(output, /test_op_err_total\{op="fail",status="error"\} 1/);
+  assert.match(output, /test_op_err_latency_seconds_count\{op="fail",status="error"\} 1/);
+});