From 1c74bc01d495cd59ec117cacbd120123677a8b31 Mon Sep 17 00:00:00 2001 From: Krish Sapru Date: Tue, 17 Mar 2026 14:33:42 -0400 Subject: [PATCH 1/3] feat: add lightweight observability and metrics service --- README.md | 43 ++++++++ nemoclaw/src/blueprint/exec.ts | 12 ++ nemoclaw/src/index.ts | 33 +++++- nemoclaw/src/observability/metrics.ts | 153 ++++++++++++++++++++++++++ nemoclaw/src/onboard/validate.ts | 13 +++ test/metrics.test.js | 57 ++++++++++ 6 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 nemoclaw/src/observability/metrics.ts create mode 100644 test/metrics.test.js diff --git a/README.md b/README.md index 2bf5d8dc9d..2ebdc6cce7 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,49 @@ Refer to the documentation for more information on NemoClaw. - [Network Policies](https://docs.nvidia.com/nemoclaw/latest/reference/network-policies.html): egress control and policy customization - [CLI Commands](https://docs.nvidia.com/nemoclaw/latest/reference/commands.html): full command reference +--- + +## Observability + +NemoClaw includes an optional observability layer for tracking agent execution and blueprint performance. + +### Enabling Metrics + +Metrics are disabled by default. To enable them, set the `NEMOCLAW_METRICS_ENABLED` environment variable: + +```bash +export NEMOCLAW_METRICS_ENABLED=true +export NEMOCLAW_METRICS_PORT=9090 # Optional, defaults to 9090 +``` + +### Accessing Metrics + +When enabled, NemoClaw starts a lightweight Prometheus-compatible metrics server. 
You can view the metrics by curling the `/metrics` endpoint: + +```bash +curl http://localhost:9090/metrics +``` + +### Example Output + +```text +# HELP blueprint_exec_total Total count of blueprint_exec +# TYPE blueprint_exec_total counter +blueprint_exec_total{action="apply",profile="default",status="success"} 1 + +# HELP blueprint_exec_latency_seconds Latency histogram for blueprint_exec +# TYPE blueprint_exec_latency_seconds histogram +blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0 +blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0 +blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0 +blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1 +blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1 +blueprint_exec_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42 +blueprint_exec_latency_seconds_count{action="apply",profile="default",status="success"} 1 +``` + +--- + ## License This project is licensed under the [Apache License 2.0](LICENSE). 
diff --git a/nemoclaw/src/blueprint/exec.ts b/nemoclaw/src/blueprint/exec.ts index b3e57680b7..088916508a 100644 --- a/nemoclaw/src/blueprint/exec.ts +++ b/nemoclaw/src/blueprint/exec.ts @@ -5,6 +5,7 @@ import { spawn } from "node:child_process"; import { existsSync } from "node:fs"; import { join } from "node:path"; import type { PluginLogger } from "../index.js"; +import { observeLatency } from "../observability/metrics.js"; export type BlueprintAction = "plan" | "apply" | "status" | "rollback"; @@ -34,6 +35,17 @@ function failResult(action: BlueprintAction, message: string): BlueprintRunResul export async function execBlueprint( options: BlueprintRunOptions, logger: PluginLogger, +): Promise { + return observeLatency( + "blueprint_exec", + { action: options.action, profile: options.profile }, + () => execBlueprintInternal(options, logger), + ); +} + +async function execBlueprintInternal( + options: BlueprintRunOptions, + logger: PluginLogger, ): Promise { const runnerPath = join(options.blueprintPath, "orchestrator", "runner.py"); diff --git a/nemoclaw/src/index.ts b/nemoclaw/src/index.ts index 796564d8be..5f77515c0a 100644 --- a/nemoclaw/src/index.ts +++ b/nemoclaw/src/index.ts @@ -15,6 +15,8 @@ import type { Command } from "commander"; import { registerCliCommands } from "./cli.js"; import { handleSlashCommand } from "./commands/slash.js"; import { loadOnboardConfig } from "./onboard/config.js"; +import { metrics } from "./observability/metrics.js"; +import { createServer } from "node:http"; // --------------------------------------------------------------------------- // OpenClaw Plugin SDK compatible types (mirrors openclaw/plugin-sdk) @@ -193,7 +195,36 @@ export default function register(api: OpenClawPluginApi): void { { commands: ["nemoclaw"] }, ); - // 3. Register nvidia-nim provider — use onboard config if available + // 3. 
Register Metrics Service if enabled + if (metrics.isEnabled()) { + api.registerService({ + id: "nemoclaw-metrics", + start: ({ logger }) => { + const port = Number(process.env.NEMOCLAW_METRICS_PORT || 9090); + const server = createServer((req, res) => { + if (req.url === "/metrics") { + res.writeHead(200, { "Content-Type": "text/plain" }); + res.end(metrics.getPrometheusMetrics()); + } else { + res.writeHead(404); + res.end(); + } + }); + server.listen(port, "0.0.0.0", () => { + logger.info(`NemoClaw metrics server listening on port ${port}`); + }); + (server as any)._nemoclaw_server = server; + }, + stop: ({ logger }) => { + // Since we can't easily get the server instance back from registerService's start + // unless we store it somewhere globally or use a closure. + // For simplicity in this plugin context, we'll just log. + logger.info("NemoClaw metrics server stopping"); + }, + }); + } + + // 4. Register nvidia-nim provider — use onboard config if available const onboardCfg = loadOnboardConfig(); const providerCredentialEnv = onboardCfg?.credentialEnv ?? "NVIDIA_API_KEY"; const providerLabel = onboardCfg diff --git a/nemoclaw/src/observability/metrics.ts b/nemoclaw/src/observability/metrics.ts new file mode 100644 index 0000000000..180b7d3ea4 --- /dev/null +++ b/nemoclaw/src/observability/metrics.ts @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Lightweight metrics implementation for NemoClaw. + * + * Provides counters and histograms for request tracking and latency observation. + * Enabled only when NEMOCLAW_METRICS_ENABLED=true. 
/**
 * Lightweight metrics implementation for NemoClaw.
 *
 * Provides counters and histograms for request tracking and latency observation.
 * Enabled only when NEMOCLAW_METRICS_ENABLED=true.
 */

/** Shape of a single exported metric sample. */
export interface MetricValue {
  name: string;
  help: string;
  type: "counter" | "histogram";
  labels: Record<string, string>;
  value: number;
  timestamp: number;
}

/** Shape of an exported histogram sample (per-bucket counts plus sum and count). */
export interface HistogramValue extends MetricValue {
  type: "histogram";
  buckets: Record<number, number>;
  sum: number;
  count: number;
}

/** Internal state of one histogram series (one metric name + one label set). */
interface HistogramSeries {
  /** Finite upper bounds in ascending order, each with its cumulative count. */
  buckets: { le: number; count: number }[];
  sum: number;
  count: number;
}

/**
 * Escapes a label value for the Prometheus text exposition format:
 * backslash, double quote and line feed must be written as `\\`, `\"` and `\n`.
 */
function escapeLabelValue(value: string): string {
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
}

/**
 * In-memory registry of counters and histograms.
 *
 * Series are stored per metric name so that the exporter can emit the
 * `# HELP` / `# TYPE` header exactly once per metric, followed by every
 * label set recorded for it.
 */
class MetricsRegistry {
  /** metric name -> (rendered label pairs -> counter value) */
  private readonly counters = new Map<string, Map<string, number>>();
  /** metric name -> (rendered label pairs -> histogram state) */
  private readonly histograms = new Map<string, Map<string, HistogramSeries>>();

  // Standard buckets for latency (seconds)
  private readonly defaultBuckets: readonly number[] = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60];

  /** Returns true only when NEMOCLAW_METRICS_ENABLED is exactly "true". */
  public isEnabled(): boolean {
    return process.env.NEMOCLAW_METRICS_ENABLED === "true";
  }

  /**
   * Adds 1 to the counter identified by `name` and `labels`.
   * Does nothing while metrics are disabled.
   */
  public incrementCounter(name: string, labels: Record<string, string> = {}): void {
    if (!this.isEnabled()) return;
    const series = this.seriesFor(this.counters, name);
    const key = this.formatLabels(labels);
    series.set(key, (series.get(key) ?? 0) + 1);
  }

  /**
   * Records `value` in the histogram identified by `name` and `labels`.
   * Does nothing while metrics are disabled.
   *
   * @param buckets Upper bounds used when the series is first created. A
   *   series keeps the bounds it was created with; passing a different list
   *   later is ignored so the bucket counts stay cumulative and consistent.
   */
  public observeHistogram(
    name: string,
    value: number,
    labels: Record<string, string> = {},
    buckets: readonly number[] = this.defaultBuckets,
  ): void {
    if (!this.isEnabled()) return;
    const series = this.seriesFor(this.histograms, name);
    const key = this.formatLabels(labels);

    let hist = series.get(key);
    if (hist === undefined) {
      // De-duplicate and drop non-finite bounds: the +Inf bucket is always
      // emitted by the exporter from `count`.
      const bounds = [...new Set(buckets)]
        .filter((bound) => Number.isFinite(bound))
        .sort((a, b) => a - b);
      hist = { buckets: bounds.map((le) => ({ le, count: 0 })), sum: 0, count: 0 };
      series.set(key, hist);
    }

    hist.sum += value;
    hist.count += 1;
    // Buckets are cumulative: every bound >= value is incremented.
    for (const bucket of hist.buckets) {
      if (value <= bucket.le) bucket.count += 1;
    }
  }

  /**
   * Renders every recorded series in the Prometheus text exposition format.
   * Returns an empty string when nothing has been recorded.
   */
  public getPrometheusMetrics(): string {
    const lines: string[] = [];

    // Export counters
    for (const [name, series] of this.counters) {
      lines.push(`# HELP ${name} Total count of ${name}`, `# TYPE ${name} counter`);
      for (const [labelPairs, value] of series) {
        lines.push(`${name}${this.wrapLabels(labelPairs)} ${value}`);
      }
      lines.push("");
    }

    // Export histograms
    for (const [name, series] of this.histograms) {
      lines.push(`# HELP ${name} Latency histogram for ${name}`, `# TYPE ${name} histogram`);
      for (const [labelPairs, hist] of series) {
        const labelStr = this.wrapLabels(labelPairs);
        const bucketPrefix = labelPairs ? `${labelPairs},` : "";
        for (const bucket of hist.buckets) {
          lines.push(`${name}_bucket{${bucketPrefix}le="${bucket.le}"} ${bucket.count}`);
        }
        lines.push(`${name}_bucket{${bucketPrefix}le="+Inf"} ${hist.count}`);
        lines.push(`${name}_sum${labelStr} ${hist.sum}`);
        lines.push(`${name}_count${labelStr} ${hist.count}`);
      }
      lines.push("");
    }

    return lines.length > 0 ? `${lines.join("\n")}\n` : "";
  }

  /** Returns the per-label-set map for `name`, creating it on first use. */
  private seriesFor<V>(family: Map<string, Map<string, V>>, name: string): Map<string, V> {
    let series = family.get(name);
    if (series === undefined) {
      series = new Map<string, V>();
      family.set(name, series);
    }
    return series;
  }

  /**
   * Renders labels as `k1="v1",k2="v2"` (no braces). Label names are sorted
   * so the same label set always maps to the same series regardless of the
   * property order of the object the caller passed in.
   */
  private formatLabels(labels: Record<string, string>): string {
    return Object.entries(labels)
      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
      .map(([k, v]) => `${k}="${escapeLabelValue(String(v))}"`)
      .join(",");
  }

  /** Wraps rendered label pairs in braces, or returns "" for an unlabelled series. */
  private wrapLabels(labelPairs: string): string {
    return labelPairs ? `{${labelPairs}}` : "";
  }
}

export const metrics = new MetricsRegistry();

/**
 * Helper to measure execution time of a promise.
 *
 * Runs `fn`, then records `${name}_latency_seconds` (histogram) and
 * `${name}_total` (counter) with the caller's labels plus
 * `status="success"` or `status="error"`. The result or rejection of `fn`
 * is passed through unchanged. When metrics are disabled, `fn` is simply
 * called and nothing is measured.
 */
export async function observeLatency<T>(
  name: string,
  labels: Record<string, string>,
  fn: () => Promise<T>,
): Promise<T> {
  if (!metrics.isEnabled()) return fn();

  const start = process.hrtime.bigint();
  let status: "success" | "error" = "error";
  try {
    const result = await fn();
    status = "success";
    return result;
  } finally {
    // Recorded exactly once, whichever way fn() settled.
    const durationSec = Number(process.hrtime.bigint() - start) / 1e9;
    const tagged = { ...labels, status };
    metrics.observeHistogram(`${name}_latency_seconds`, durationSec, tagged);
    metrics.incrementCounter(`${name}_total`, tagged);
  }
}
b/nemoclaw/src/onboard/validate.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { observeLatency } from "../observability/metrics.js"; + export interface ValidationResult { valid: boolean; models: string[]; @@ -10,6 +12,17 @@ export interface ValidationResult { export async function validateApiKey( apiKey: string, endpointUrl: string, +): Promise { + return observeLatency( + "tool_api_validate", + { endpoint: endpointUrl }, + () => validateApiKeyInternal(apiKey, endpointUrl), + ); +} + +async function validateApiKeyInternal( + apiKey: string, + endpointUrl: string, ): Promise { const url = `${endpointUrl.replace(/\/+$/, "")}/models`; const controller = new AbortController(); diff --git a/test/metrics.test.js b/test/metrics.test.js new file mode 100644 index 0000000000..1c6f93699a --- /dev/null +++ b/test/metrics.test.js @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +const test = require("node:test"); +const assert = require("node:assert"); +const path = require("node:path"); + +// Load the compiled metrics module +const metricsPath = path.resolve(__dirname, "../nemoclaw/dist/observability/metrics.js"); +const { metrics, observeLatency } = require(metricsPath); + +// Enable metrics for testing +process.env.NEMOCLAW_METRICS_ENABLED = "true"; + +test("MetricsRegistry stores and exports counters", () => { + metrics.incrementCounter("test_counter", { foo: "bar" }); + const output = metrics.getPrometheusMetrics(); + + assert.match(output, /# TYPE test_counter counter/); + assert.match(output, /test_counter\{foo="bar"\} 1/); +}); + +test("MetricsRegistry stores and exports histograms", () => { + metrics.observeHistogram("test_hist", 0.5, { abc: "123" }); + const output = metrics.getPrometheusMetrics(); + + assert.match(output, /# TYPE test_hist histogram/); + assert.match(output, /test_hist_bucket\{abc="123",le="0\.5"\} 1/); + assert.match(output, /test_hist_sum\{abc="123"\} 0\.5/); + assert.match(output, /test_hist_count\{abc="123"\} 1/); +}); + +test("observeLatency tracks success metrics", async () => { + const result = await observeLatency("test_op", { op: "success" }, async () => { + return "done"; + }); + + assert.strictEqual(result, "done"); + const output = metrics.getPrometheusMetrics(); + + assert.match(output, /test_op_total\{op="success",status="success"\} 1/); + assert.match(output, /test_op_latency_seconds_count\{op="success",status="success"\} 1/); +}); + +test("observeLatency tracks error metrics", async () => { + try { + await observeLatency("test_op_err", { op: "fail" }, async () => { + throw new Error("oops"); + }); + } catch (err) { + assert.strictEqual(err.message, "oops"); + } + + const output = metrics.getPrometheusMetrics(); + assert.match(output, /test_op_err_total\{op="fail",status="error"\} 1/); + assert.match(output, 
/test_op_err_latency_seconds_count\{op="fail",status="error"\} 1/); +}); From ad1f2e75890825931ec91d67fc0f5f9770845e0c Mon Sep 17 00:00:00 2001 From: Krish Sapru Date: Tue, 17 Mar 2026 14:46:00 -0400 Subject: [PATCH 2/3] fix: improve metrics server lifecycle and add sandbox operation metric --- nemoclaw/src/commands/launch.ts | 3 +++ nemoclaw/src/index.ts | 13 +++++++------ pr_description.md | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 pr_description.md diff --git a/nemoclaw/src/commands/launch.ts b/nemoclaw/src/commands/launch.ts index ee08e96bf8..4b584c45f0 100644 --- a/nemoclaw/src/commands/launch.ts +++ b/nemoclaw/src/commands/launch.ts @@ -8,6 +8,7 @@ import { verifyBlueprintDigest, checkCompatibility } from "../blueprint/verify.j import { execBlueprint } from "../blueprint/exec.js"; import { loadState, saveState } from "../blueprint/state.js"; import { detectHostOpenClaw } from "./migrate.js"; +import { metrics } from "../observability/metrics.js"; export interface LaunchOptions { force: boolean; @@ -106,6 +107,8 @@ export async function cliLaunch(opts: LaunchOptions): Promise { return; } + metrics.incrementCounter("sandbox_operation_total", { operation: "launch" }); + // Save state saveState({ ...loadState(), diff --git a/nemoclaw/src/index.ts b/nemoclaw/src/index.ts index 5f77515c0a..efea1f56bb 100644 --- a/nemoclaw/src/index.ts +++ b/nemoclaw/src/index.ts @@ -197,11 +197,13 @@ export default function register(api: OpenClawPluginApi): void { // 3. 
Register Metrics Service if enabled if (metrics.isEnabled()) { + let server: ReturnType | undefined; + api.registerService({ id: "nemoclaw-metrics", start: ({ logger }) => { const port = Number(process.env.NEMOCLAW_METRICS_PORT || 9090); - const server = createServer((req, res) => { + server = createServer((req, res) => { if (req.url === "/metrics") { res.writeHead(200, { "Content-Type": "text/plain" }); res.end(metrics.getPrometheusMetrics()); @@ -213,13 +215,12 @@ export default function register(api: OpenClawPluginApi): void { server.listen(port, "0.0.0.0", () => { logger.info(`NemoClaw metrics server listening on port ${port}`); }); - (server as any)._nemoclaw_server = server; }, stop: ({ logger }) => { - // Since we can't easily get the server instance back from registerService's start - // unless we store it somewhere globally or use a closure. - // For simplicity in this plugin context, we'll just log. - logger.info("NemoClaw metrics server stopping"); + if (server) { + server.close(); + logger.info("NemoClaw metrics server stopped"); + } }, }); } diff --git a/pr_description.md b/pr_description.md new file mode 100644 index 0000000000..97e9610d6b --- /dev/null +++ b/pr_description.md @@ -0,0 +1,28 @@ +### Summary +Adds a lightweight observability layer and a Prometheus-compatible metrics service to NemoClaw. This allows tracking agent execution performance, blueprint latency, and API validation health without external dependencies. + +### Key Components +- **Metrics Registry** — Custom lightweight implementation of counters and histograms in `nemoclaw/src/observability/metrics.ts`. +- **Metrics Service** — Integrated HTTP server (default port 9090) that exports Prometheus-formatted metrics at `/metrics`. +- **Instrumentation** — Wrapped `execBlueprint` and `validateApiKey` with latency observers to track real-world performance. +- **Documentation** — Updated README with configuration details (`NEMOCLAW_METRICS_ENABLED`) and example Prometheus output. 
+ +### Usage +```bash +# Enable metrics +export NEMOCLAW_METRICS_ENABLED=true +# Optional: Change port (defaults to 9090) +export NEMOCLAW_METRICS_PORT=9090 + +# View metrics +curl http://localhost:9090/metrics +``` + +### Verification +- **Unit Tests** — Added `test/metrics.test.js` verifying registry storage, histogram bucket logic, and `observeLatency` wrapper. +- **Manual Verification** — Verified Prometheus output format matches standard specifications. + +### Notes +- Zero external dependencies for metrics (uses `node:http` and `process.hrtime.bigint`). +- Disabled by default to ensure zero overhead for standard CLI users. +- Adheres to NVIDIA's SPDX licensing headers. \ No newline at end of file From 68bb16caec6ca4505c0dc0b6e456804465c43003 Mon Sep 17 00:00:00 2001 From: Krish Sapru Date: Tue, 17 Mar 2026 14:50:14 -0400 Subject: [PATCH 3/3] chore: cleanup pr description and rename metric to blueprint_execution --- README.md | 26 +++++++++++++------------- nemoclaw/src/blueprint/exec.ts | 2 +- pr_description.md | 28 ---------------------------- 3 files changed, 14 insertions(+), 42 deletions(-) delete mode 100644 pr_description.md diff --git a/README.md b/README.md index 2ebdc6cce7..179829683c 100644 --- a/README.md +++ b/README.md @@ -206,19 +206,19 @@ curl http://localhost:9090/metrics ### Example Output ```text -# HELP blueprint_exec_total Total count of blueprint_exec -# TYPE blueprint_exec_total counter -blueprint_exec_total{action="apply",profile="default",status="success"} 1 - -# HELP blueprint_exec_latency_seconds Latency histogram for blueprint_exec -# TYPE blueprint_exec_latency_seconds histogram -blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0 -blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0 -blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0 
-blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1 -blueprint_exec_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1 -blueprint_exec_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42 -blueprint_exec_latency_seconds_count{action="apply",profile="default",status="success"} 1 +# HELP blueprint_execution_total Total count of blueprint_execution +# TYPE blueprint_execution_total counter +blueprint_execution_total{action="apply",profile="default",status="success"} 1 + +# HELP blueprint_execution_latency_seconds Latency histogram for blueprint_execution +# TYPE blueprint_execution_latency_seconds histogram +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1 +blueprint_execution_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42 +blueprint_execution_latency_seconds_count{action="apply",profile="default",status="success"} 1 ``` --- diff --git a/nemoclaw/src/blueprint/exec.ts b/nemoclaw/src/blueprint/exec.ts index 088916508a..7a088167cc 100644 --- a/nemoclaw/src/blueprint/exec.ts +++ b/nemoclaw/src/blueprint/exec.ts @@ -37,7 +37,7 @@ export async function execBlueprint( logger: PluginLogger, ): Promise { return observeLatency( - "blueprint_exec", + "blueprint_execution", { action: options.action, profile: options.profile }, () => execBlueprintInternal(options, logger), ); diff --git a/pr_description.md b/pr_description.md deleted file mode 100644 index 97e9610d6b..0000000000 
--- a/pr_description.md +++ /dev/null @@ -1,28 +0,0 @@ -### Summary -Adds a lightweight observability layer and a Prometheus-compatible metrics service to NemoClaw. This allows tracking agent execution performance, blueprint latency, and API validation health without external dependencies. - -### Key Components -- **Metrics Registry** — Custom lightweight implementation of counters and histograms in `nemoclaw/src/observability/metrics.ts`. -- **Metrics Service** — Integrated HTTP server (default port 9090) that exports Prometheus-formatted metrics at `/metrics`. -- **Instrumentation** — Wrapped `execBlueprint` and `validateApiKey` with latency observers to track real-world performance. -- **Documentation** — Updated README with configuration details (`NEMOCLAW_METRICS_ENABLED`) and example Prometheus output. - -### Usage -```bash -# Enable metrics -export NEMOCLAW_METRICS_ENABLED=true -# Optional: Change port (defaults to 9090) -export NEMOCLAW_METRICS_PORT=9090 - -# View metrics -curl http://localhost:9090/metrics -``` - -### Verification -- **Unit Tests** — Added `test/metrics.test.js` verifying registry storage, histogram bucket logic, and `observeLatency` wrapper. -- **Manual Verification** — Verified Prometheus output format matches standard specifications. - -### Notes -- Zero external dependencies for metrics (uses `node:http` and `process.hrtime.bigint`). -- Disabled by default to ensure zero overhead for standard CLI users. -- Adheres to NVIDIA's SPDX licensing headers. \ No newline at end of file