diff --git a/README.md b/README.md index fedb3b31f2..0f46612d9e 100644 --- a/README.md +++ b/README.md @@ -194,6 +194,49 @@ For security bulletins and PSIRT policies, visit the [NVIDIA Product Security](h This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk. +--- + +## Observability + +NemoClaw includes an optional observability layer for tracking agent execution and blueprint performance. + +### Enabling Metrics + +Metrics are disabled by default. To enable them, set the `NEMOCLAW_METRICS_ENABLED` environment variable: + +```bash +export NEMOCLAW_METRICS_ENABLED=true +export NEMOCLAW_METRICS_PORT=9090 # Optional, defaults to 9090 +``` + +### Accessing Metrics + +When enabled, NemoClaw starts a lightweight Prometheus-compatible metrics server. 
You can view the metrics by curling the `/metrics` endpoint: + +```bash +curl http://localhost:9090/metrics +``` + +### Example Output + +```text +# HELP blueprint_execution_total Total count of blueprint_execution_total +# TYPE blueprint_execution_total counter +blueprint_execution_total{action="apply",profile="default",status="success"} 1 + +# HELP blueprint_execution_latency_seconds Latency histogram for blueprint_execution_latency_seconds +# TYPE blueprint_execution_latency_seconds histogram +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1 +blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1 +blueprint_execution_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42 +blueprint_execution_latency_seconds_count{action="apply",profile="default",status="success"} 1 +``` + +--- + ## License Apache 2.0. See [LICENSE](LICENSE). diff --git a/nemoclaw/src/index.ts b/nemoclaw/src/index.ts index 8a8ac9f09c..c58e1b8355 100644 --- a/nemoclaw/src/index.ts +++ b/nemoclaw/src/index.ts @@ -17,6 +17,8 @@ import { describeOnboardProvider, loadOnboardConfig, } from "./onboard/config.js"; +import { metrics } from "./observability/metrics.js"; +import { createServer } from "node:http"; // --------------------------------------------------------------------------- // OpenClaw Plugin SDK compatible types (mirrors openclaw/plugin-sdk) @@ -243,7 +245,37 @@ export default function register(api: OpenClawPluginApi): void { handler: (ctx) => handleSlashCommand(ctx, api), }); - // 2. Register nvidia-nim provider — use onboard config if available + // 2. 
Register Metrics Service if enabled + if (metrics.isEnabled()) { + let server: ReturnType | undefined; + + api.registerService({ + id: "nemoclaw-metrics", + start: ({ logger }) => { + const port = Number(process.env.NEMOCLAW_METRICS_PORT || 9090); + server = createServer((req, res) => { + if (req.url === "/metrics") { + res.writeHead(200, { "Content-Type": "text/plain" }); + res.end(metrics.getPrometheusMetrics()); + } else { + res.writeHead(404); + res.end(); + } + }); + server.listen(port, "0.0.0.0", () => { + logger.info(`NemoClaw metrics server listening on port ${port}`); + }); + }, + stop: ({ logger }) => { + if (server) { + server.close(); + logger.info("NemoClaw metrics server stopped"); + } + }, + }); + } + + // 3. Register nvidia-nim provider — use onboard config if available const onboardCfg = loadOnboardConfig(); const providerCredentialEnv = onboardCfg?.credentialEnv ?? "NVIDIA_API_KEY"; api.registerProvider(registeredProviderForConfig(onboardCfg, providerCredentialEnv)); diff --git a/nemoclaw/src/observability/metrics.ts b/nemoclaw/src/observability/metrics.ts new file mode 100644 index 0000000000..180b7d3ea4 --- /dev/null +++ b/nemoclaw/src/observability/metrics.ts @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Lightweight metrics implementation for NemoClaw. + * + * Provides counters and histograms for request tracking and latency observation. + * Enabled only when NEMOCLAW_METRICS_ENABLED=true. 
/**
 * Lightweight metrics implementation for NemoClaw.
 *
 * Provides counters and histograms and renders them in the Prometheus text
 * exposition format. Recording is a no-op unless the environment variable
 * NEMOCLAW_METRICS_ENABLED is exactly "true".
 */

/**
 * Description of a single metric sample.
 *
 * Not referenced by the registry in this module; kept as exported surface.
 * NOTE(review): the type arguments of `labels` were reconstructed — confirm.
 */
export interface MetricValue {
  name: string;
  help: string;
  type: "counter" | "histogram";
  labels: Record<string, string>;
  value: number;
  timestamp: number;
}

/**
 * Histogram flavour of {@link MetricValue}.
 *
 * NOTE(review): the type arguments of `buckets` were reconstructed — confirm.
 */
export interface HistogramValue extends MetricValue {
  type: "histogram";
  buckets: Record<number, number>;
  sum: number;
  count: number;
}

/** One counter time series: a metric name plus one rendered label set. */
interface CounterSeries {
  name: string;
  labelText: string;
  value: number;
}

/** One cumulative histogram bucket (`count` observations were `<= le`). */
interface HistogramBucket {
  le: number;
  count: number;
}

/** One histogram time series: a metric name plus one rendered label set. */
interface HistogramSeries {
  name: string;
  labelText: string;
  sum: number;
  count: number;
  buckets: HistogramBucket[];
}

/** Escapes a label value for the text format: backslash, double quote, newline. */
function escapeLabelValue(value: string): string {
  // Backslashes first, otherwise the escapes added below would be re-escaped.
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
}

/** Groups series by metric name, preserving first-seen order of names. */
function groupByName<T extends { name: string }>(series: Iterable<T>): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const entry of series) {
    const group = groups.get(entry.name);
    if (group) {
      group.push(entry);
    } else {
      groups.set(entry.name, [entry]);
    }
  }
  return groups;
}

class MetricsRegistry {
  private readonly counters = new Map<string, CounterSeries>();
  private readonly histograms = new Map<string, HistogramSeries>();

  // Standard buckets for latency (seconds)
  private readonly defaultBuckets: readonly number[] = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60];

  /** True only when NEMOCLAW_METRICS_ENABLED is exactly "true"; read on every call. */
  public isEnabled(): boolean {
    return process.env.NEMOCLAW_METRICS_ENABLED === "true";
  }

  /**
   * Adds 1 to the counter identified by `name` and `labels`.
   * Does nothing while metrics are disabled.
   */
  public incrementCounter(name: string, labels: Record<string, string> = {}): void {
    if (!this.isEnabled()) return;
    const labelText = this.formatLabels(labels);
    const key = this.seriesKey(name, labelText);
    const series = this.counters.get(key);
    if (series) {
      series.value += 1;
    } else {
      this.counters.set(key, { name, labelText, value: 1 });
    }
  }

  /**
   * Records `value` in the histogram identified by `name` and `labels`.
   *
   * @param buckets Upper bounds used when the series is first created. They
   *   are de-duplicated, sorted, and stripped of non-finite entries (the
   *   `+Inf` bucket is always emitted from the total count). Later calls for
   *   the same series reuse the original bounds, because mixing bucket
   *   layouts would make the cumulative counts inconsistent.
   *
   * Does nothing while metrics are disabled or when `value` is NaN (a NaN
   * would permanently corrupt the running sum).
   */
  public observeHistogram(
    name: string,
    value: number,
    labels: Record<string, string> = {},
    buckets: readonly number[] = this.defaultBuckets,
  ): void {
    if (!this.isEnabled() || Number.isNaN(value)) return;
    const labelText = this.formatLabels(labels);
    const key = this.seriesKey(name, labelText);

    let series = this.histograms.get(key);
    if (!series) {
      const bounds = [...new Set(buckets)]
        .filter((bound) => Number.isFinite(bound))
        .sort((a, b) => a - b);
      series = {
        name,
        labelText,
        sum: 0,
        count: 0,
        buckets: bounds.map((le) => ({ le, count: 0 })),
      };
      this.histograms.set(key, series);
    }

    series.sum += value;
    series.count += 1;
    // Buckets are cumulative: every bound at or above the value is incremented.
    for (const bucket of series.buckets) {
      if (value <= bucket.le) bucket.count += 1;
    }
  }

  /**
   * Renders every recorded metric in the Prometheus text exposition format.
   *
   * HELP/TYPE headers are written once per metric name, followed by the
   * samples of all label sets of that metric, then a blank line.
   *
   * @returns The exposition text, or an empty string when nothing was recorded.
   */
  public getPrometheusMetrics(): string {
    const lines: string[] = [];

    // Export counters
    for (const [name, group] of groupByName(this.counters.values())) {
      lines.push(`# HELP ${name} Total count of ${name}`, `# TYPE ${name} counter`);
      for (const series of group) {
        lines.push(`${name}${this.braced(series.labelText)} ${series.value}`);
      }
      lines.push("");
    }

    // Export histograms
    for (const [name, group] of groupByName(this.histograms.values())) {
      lines.push(`# HELP ${name} Latency histogram for ${name}`, `# TYPE ${name} histogram`);
      for (const series of group) {
        const labelPrefix = series.labelText ? `${series.labelText},` : "";
        const labelSuffix = this.braced(series.labelText);
        for (const bucket of series.buckets) {
          lines.push(`${name}_bucket{${labelPrefix}le="${bucket.le}"} ${bucket.count}`);
        }
        lines.push(`${name}_bucket{${labelPrefix}le="+Inf"} ${series.count}`);
        lines.push(`${name}_sum${labelSuffix} ${series.sum}`);
        lines.push(`${name}_count${labelSuffix} ${series.count}`);
      }
      lines.push("");
    }

    return lines.length > 0 ? `${lines.join("\n")}\n` : "";
  }

  /**
   * Renders labels as `k="v",k2="v2"` with keys sorted, so the same label
   * set always maps to the same series regardless of property order.
   */
  private formatLabels(labels: Record<string, string>): string {
    return Object.entries(labels)
      .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0))
      .map(([key, value]) => `${key}="${escapeLabelValue(String(value))}"`)
      .join(",");
  }

  /** Map key for one series; the braces separate the name from the label text. */
  private seriesKey(name: string, labelText: string): string {
    return `${name}{${labelText}}`;
  }

  /** Wraps non-empty label text in braces; unlabelled series get no braces. */
  private braced(labelText: string): string {
    return labelText ? `{${labelText}}` : "";
  }
}

export const metrics = new MetricsRegistry();

/**
 * Runs `fn` and records its wall-clock duration and outcome.
 *
 * Records one observation in `${name}_latency_seconds` and one increment of
 * `${name}_total`, both labelled with `labels` plus `status` ("success" when
 * `fn` resolves, "error" when it throws or rejects). When metrics are
 * disabled, `fn` is called directly and nothing is recorded.
 *
 * @returns Whatever `fn` resolves to.
 * @throws Re-throws whatever `fn` throws, after recording the error sample.
 */
export async function observeLatency<T>(
  name: string,
  labels: Record<string, string>,
  fn: () => Promise<T>,
): Promise<T> {
  if (!metrics.isEnabled()) return fn();

  const start = process.hrtime.bigint();
  // Assume failure until fn() has actually resolved.
  let status: "success" | "error" = "error";
  try {
    const result = await fn();
    status = "success";
    return result;
  } finally {
    const durationSec = Number(process.hrtime.bigint() - start) / 1e9;
    const tagged = { ...labels, status };
    metrics.observeHistogram(`${name}_latency_seconds`, durationSec, tagged);
    metrics.incrementCounter(`${name}_total`, tagged);
  }
}
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

// Tests for the NemoClaw metrics registry and the observeLatency helper.
// They run against the compiled module, so nemoclaw must be built first.

const test = require("node:test");
const assert = require("node:assert");
const path = require("node:path");

// Load the compiled metrics module
const metricsPath = path.resolve(__dirname, "../nemoclaw/dist/observability/metrics.js");
const { metrics, observeLatency } = require(metricsPath);

// Enable metrics for testing. The registry reads the variable on every call,
// so setting it after require() is sufficient.
process.env.NEMOCLAW_METRICS_ENABLED = "true";

test("MetricsRegistry stores and exports counters", () => {
  metrics.incrementCounter("test_counter", { foo: "bar" });
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /# TYPE test_counter counter/);
  assert.match(output, /test_counter\{foo="bar"\} 1/);
});

test("MetricsRegistry stores and exports histograms", () => {
  metrics.observeHistogram("test_hist", 0.5, { abc: "123" });
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /# TYPE test_hist histogram/);
  assert.match(output, /test_hist_bucket\{abc="123",le="0\.5"\} 1/);
  assert.match(output, /test_hist_sum\{abc="123"\} 0\.5/);
  assert.match(output, /test_hist_count\{abc="123"\} 1/);
});

test("observeLatency tracks success metrics", async () => {
  const result = await observeLatency("test_op", { op: "success" }, async () => {
    return "done";
  });

  assert.strictEqual(result, "done");
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /test_op_total\{op="success",status="success"\} 1/);
  assert.match(output, /test_op_latency_seconds_count\{op="success",status="success"\} 1/);
});

test("observeLatency tracks error metrics", async () => {
  // assert.rejects fails the test when the promise resolves; a bare try/catch
  // would pass silently if observeLatency swallowed the error.
  await assert.rejects(
    () =>
      observeLatency("test_op_err", { op: "fail" }, async () => {
        throw new Error("oops");
      }),
    { message: "oops" },
  );

  const output = metrics.getPrometheusMetrics();
  assert.match(output, /test_op_err_total\{op="fail",status="error"\} 1/);
  assert.match(output, /test_op_err_latency_seconds_count\{op="fail",status="error"\} 1/);
});