Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,49 @@ For security bulletins and PSIRT policies, visit the [NVIDIA Product Security](h

This software automatically retrieves, accesses or interacts with external materials. Those retrieved materials are not distributed with this software and are governed solely by separate terms, conditions and licenses. You are solely responsible for finding, reviewing and complying with all applicable terms, conditions, and licenses, and for verifying the security, integrity and suitability of any retrieved materials for your specific use case. This software is provided "AS IS", without warranty of any kind. The author makes no representations or warranties regarding any retrieved materials, and assumes no liability for any losses, damages, liabilities or legal consequences from your use or inability to use this software or any retrieved materials. Use this software and the retrieved materials at your own risk.

---

## Observability

NemoClaw includes an optional observability layer for tracking agent execution and blueprint performance.

### Enabling Metrics

Metrics are disabled by default. To enable them, set the `NEMOCLAW_METRICS_ENABLED` environment variable to `true`:

```bash
export NEMOCLAW_METRICS_ENABLED=true
export NEMOCLAW_METRICS_PORT=9090 # Optional, defaults to 9090
```

### Accessing Metrics

When enabled, NemoClaw starts a lightweight Prometheus-compatible metrics server. You can view the metrics by querying the `/metrics` endpoint, for example with `curl`:

```bash
curl http://localhost:9090/metrics
```

### Example Output

```text
# HELP blueprint_execution_total Total count of blueprint_execution
# TYPE blueprint_execution_total counter
blueprint_execution_total{action="apply",profile="default",status="success"} 1

# HELP blueprint_execution_latency_seconds Latency histogram for blueprint_execution
# TYPE blueprint_execution_latency_seconds histogram
blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.1"} 0
blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="0.5"} 0
blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="1"} 0
blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="5"} 1
blueprint_execution_latency_seconds_bucket{action="apply",profile="default",status="success",le="+Inf"} 1
blueprint_execution_latency_seconds_sum{action="apply",profile="default",status="success"} 3.42
blueprint_execution_latency_seconds_count{action="apply",profile="default",status="success"} 1
```

---

## License

Apache 2.0. See [LICENSE](LICENSE).
34 changes: 33 additions & 1 deletion nemoclaw/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import {
describeOnboardProvider,
loadOnboardConfig,
} from "./onboard/config.js";
import { metrics } from "./observability/metrics.js";
import { createServer } from "node:http";

// ---------------------------------------------------------------------------
// OpenClaw Plugin SDK compatible types (mirrors openclaw/plugin-sdk)
Expand Down Expand Up @@ -243,7 +245,37 @@ export default function register(api: OpenClawPluginApi): void {
handler: (ctx) => handleSlashCommand(ctx, api),
});

// 2. Register nvidia-nim provider — use onboard config if available
// 2. Register Metrics Service if enabled
if (metrics.isEnabled()) {
let server: ReturnType<typeof createServer> | undefined;

api.registerService({
id: "nemoclaw-metrics",
start: ({ logger }) => {
const port = Number(process.env.NEMOCLAW_METRICS_PORT || 9090);
server = createServer((req, res) => {
if (req.url === "/metrics") {
res.writeHead(200, { "Content-Type": "text/plain" });
res.end(metrics.getPrometheusMetrics());
Comment thread
coderabbitai[bot] marked this conversation as resolved.
} else {
res.writeHead(404);
res.end();
}
});
server.listen(port, "0.0.0.0", () => {
logger.info(`NemoClaw metrics server listening on port ${port}`);
});
Comment thread
ksapru marked this conversation as resolved.
},
stop: ({ logger }) => {
if (server) {
server.close();
logger.info("NemoClaw metrics server stopped");
}
},
});
}

// 3. Register nvidia-nim provider — use onboard config if available
const onboardCfg = loadOnboardConfig();
const providerCredentialEnv = onboardCfg?.credentialEnv ?? "NVIDIA_API_KEY";
api.registerProvider(registeredProviderForConfig(onboardCfg, providerCredentialEnv));
Expand Down
153 changes: 153 additions & 0 deletions nemoclaw/src/observability/metrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

/**
* Lightweight metrics implementation for NemoClaw.
*
* Provides counters and histograms for request tracking and latency observation.
* Enabled only when NEMOCLAW_METRICS_ENABLED=true.
*/

/**
 * One exported metric sample: metric name, help text, Prometheus type,
 * label set, and current numeric value.
 *
 * NOTE(review): neither this interface nor HistogramValue is referenced
 * elsewhere in this file; presumably they describe the public API surface
 * for consumers — confirm before removing.
 */
export interface MetricValue {
  name: string;
  help: string;
  type: "counter" | "histogram";
  labels: Record<string, string>;
  value: number;
  // NOTE(review): no writer of this field is visible in this file; units
  // (ms vs s since epoch) cannot be confirmed from here — verify at call site.
  timestamp: number;
}

/**
 * Histogram sample: cumulative bucket counts keyed by upper bound
 * (seconds, per the default latency buckets below), plus the running
 * sum of observed values and total observation count.
 */
export interface HistogramValue extends MetricValue {
  type: "histogram";
  buckets: Record<number, number>;
  sum: number;
  count: number;
}

/**
 * In-memory registry of counters and histograms with Prometheus text export.
 *
 * Every record method is a no-op unless NEMOCLAW_METRICS_ENABLED=true, so
 * callers may invoke the registry unconditionally from hot paths.
 */
class MetricsRegistry {
  /** Counter value keyed by the serialized series `name{k="v",...}`. */
  private counters: Map<string, number> = new Map();
  /** Histogram state keyed the same way; buckets map upper bound -> count. */
  private histograms: Map<string, { sum: number; count: number; buckets: Record<number, number> }> =
    new Map();

  // Standard buckets for latency (seconds)
  private defaultBuckets = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60];

  /** Opt-in flag; re-read on every call so tests can toggle it at runtime. */
  public isEnabled(): boolean {
    return process.env.NEMOCLAW_METRICS_ENABLED === "true";
  }

  /** Increment the counter identified by `name` + `labels` by one. */
  public incrementCounter(name: string, labels: Record<string, string> = {}): void {
    if (!this.isEnabled()) return;
    const key = this.formatKey(name, labels);
    this.counters.set(key, (this.counters.get(key) || 0) + 1);
  }

  /**
   * Record one observation into a cumulative-bucket histogram.
   *
   * Buckets are initialized on the first observation of a given series;
   * each call increments every bucket whose upper bound is >= value.
   */
  public observeHistogram(
    name: string,
    value: number,
    labels: Record<string, string> = {},
    buckets = this.defaultBuckets,
  ): void {
    if (!this.isEnabled()) return;
    const key = this.formatKey(name, labels);
    let hist = this.histograms.get(key);
    if (!hist) {
      const initial: Record<number, number> = {};
      for (const b of buckets) initial[b] = 0;
      hist = { sum: 0, count: 0, buckets: initial };
      this.histograms.set(key, hist);
    }

    hist.sum += value;
    hist.count += 1;
    for (const b of buckets) {
      if (value <= b) {
        hist.buckets[b] = (hist.buckets[b] || 0) + 1;
      }
    }
  }

  /**
   * Render all recorded metrics in the Prometheus text exposition format.
   *
   * Series are grouped by metric name so each name gets exactly one
   * `# HELP` / `# TYPE` header — the exposition format forbids repeating
   * headers per label set, which the previous per-series loop did.
   */
  public getPrometheusMetrics(): string {
    let output = "";

    // Group counter series by metric name: name -> [labelStr, value][].
    const counterGroups = new Map<string, Array<[string, number]>>();
    for (const [key, value] of this.counters.entries()) {
      const [name, labelStr] = this.parseKey(key);
      const group = counterGroups.get(name) ?? [];
      group.push([labelStr, value]);
      counterGroups.set(name, group);
    }
    for (const [name, series] of counterGroups.entries()) {
      output += `# HELP ${name} Total count of ${name}\n`;
      output += `# TYPE ${name} counter\n`;
      for (const [labelStr, value] of series) {
        output += `${name}${labelStr} ${value}\n`;
      }
      output += "\n";
    }

    // Group histogram series the same way.
    type HistState = { sum: number; count: number; buckets: Record<number, number> };
    const histGroups = new Map<string, Array<[string, HistState]>>();
    for (const [key, hist] of this.histograms.entries()) {
      const [name, labelStr] = this.parseKey(key);
      const group = histGroups.get(name) ?? [];
      group.push([labelStr, hist]);
      histGroups.set(name, group);
    }
    for (const [name, series] of histGroups.entries()) {
      output += `# HELP ${name} Latency histogram for ${name}\n`;
      output += `# TYPE ${name} histogram\n`;
      for (const [labelStr, hist] of series) {
        const sortedBuckets = Object.keys(hist.buckets)
          .map(Number)
          .sort((a, b) => a - b);
        // Strip the surrounding braces to get a `k="v",` prefix that `le` can follow.
        const labelsBase = labelStr.length > 2 ? labelStr.slice(1, -1) + "," : "";

        for (const b of sortedBuckets) {
          output += `${name}_bucket{${labelsBase}le="${b === Infinity ? "+Inf" : b}"} ${hist.buckets[b]}\n`;
        }
        // The +Inf bucket is cumulative over all observations by definition.
        output += `${name}_bucket{${labelsBase}le="+Inf"} ${hist.count}\n`;
        output += `${name}_sum${labelStr} ${hist.sum}\n`;
        output += `${name}_count${labelStr} ${hist.count}\n`;
      }
      output += "\n";
    }

    return output;
  }

  /**
   * Escape a label value per the exposition format: backslash, double
   * quote, and line feed must be backslash-escaped.
   */
  private escapeLabelValue(value: string): string {
    return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
  }

  /** Serialize a series key as `name{k="v",...}`, or just `name` with no labels. */
  private formatKey(name: string, labels: Record<string, string>): string {
    const labelPairs = Object.entries(labels)
      .map(([k, v]) => `${k}="${this.escapeLabelValue(v)}"`)
      .join(",");
    return labelPairs ? `${name}{${labelPairs}}` : name;
  }

  /** Split a series key back into [name, "{...}" label string (possibly empty)]. */
  private parseKey(key: string): [string, string] {
    const braceIdx = key.indexOf("{");
    if (braceIdx === -1) return [key, ""];
    return [key.slice(0, braceIdx), key.slice(braceIdx)];
  }
}

/** Process-wide shared registry instance. */
export const metrics = new MetricsRegistry();

/**
 * Measure the wall-clock duration of `fn` and record it into the shared
 * registry as `<name>_latency_seconds` (histogram) and `<name>_total`
 * (counter), tagged with `status="success"` or `status="error"`.
 *
 * The wrapped promise's result or rejection is passed through unchanged.
 * When metrics are disabled, `fn` is invoked with zero overhead.
 */
export async function observeLatency<T>(
  name: string,
  labels: Record<string, string>,
  fn: () => Promise<T>,
): Promise<T> {
  if (!metrics.isEnabled()) return fn();

  const start = process.hrtime.bigint();

  // Shared between the success and error paths so the two metric names and
  // the duration math stay in one place (previously duplicated verbatim).
  const record = (status: "success" | "error"): void => {
    const durationSec = Number(process.hrtime.bigint() - start) / 1e9;
    metrics.observeHistogram(`${name}_latency_seconds`, durationSec, { ...labels, status });
    metrics.incrementCounter(`${name}_total`, { ...labels, status });
  };

  try {
    const result = await fn();
    record("success");
    return result;
  } catch (error) {
    record("error");
    throw error;
  }
}
57 changes: 57 additions & 0 deletions test/metrics.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

// Unit tests for the metrics registry, run with node's built-in test runner.
// These require the TypeScript build to have produced nemoclaw/dist first —
// they exercise the compiled JS, not the sources.
const test = require("node:test");
const assert = require("node:assert");
const path = require("node:path");

// Load the compiled metrics module
const metricsPath = path.resolve(__dirname, "../nemoclaw/dist/observability/metrics.js");
const { metrics, observeLatency } = require(metricsPath);

// Enable metrics for testing. Setting the flag after require() works because
// isEnabled() re-reads process.env on every call rather than caching it.
process.env.NEMOCLAW_METRICS_ENABLED = "true";

// Counter round-trip: one increment shows up in the Prometheus text output
// with its label set intact.
test("MetricsRegistry stores and exports counters", () => {
  metrics.incrementCounter("test_counter", { foo: "bar" });
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /# TYPE test_counter counter/);
  assert.match(output, /test_counter\{foo="bar"\} 1/);
});

// Histogram round-trip: a 0.5s observation lands in the le="0.5" bucket and
// is reflected in _sum and _count.
test("MetricsRegistry stores and exports histograms", () => {
  metrics.observeHistogram("test_hist", 0.5, { abc: "123" });
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /# TYPE test_hist histogram/);
  assert.match(output, /test_hist_bucket\{abc="123",le="0\.5"\} 1/);
  assert.match(output, /test_hist_sum\{abc="123"\} 0\.5/);
  assert.match(output, /test_hist_count\{abc="123"\} 1/);
});

// observeLatency on a resolving promise: result passes through and both the
// counter and latency histogram get a status="success" sample.
test("observeLatency tracks success metrics", async () => {
  const result = await observeLatency("test_op", { op: "success" }, async () => {
    return "done";
  });

  assert.strictEqual(result, "done");
  const output = metrics.getPrometheusMetrics();

  assert.match(output, /test_op_total\{op="success",status="success"\} 1/);
  assert.match(output, /test_op_latency_seconds_count\{op="success",status="success"\} 1/);
});

// observeLatency on a rejecting promise: the error is rethrown and recorded
// with status="error".
test("observeLatency tracks error metrics", async () => {
  // NOTE(review): if observeLatency unexpectedly resolves, the catch block is
  // skipped and this assertion never runs — the final regex checks would then
  // fail, but assert.rejects() would express the intent more directly.
  try {
    await observeLatency("test_op_err", { op: "fail" }, async () => {
      throw new Error("oops");
    });
  } catch (err) {
    assert.strictEqual(err.message, "oops");
  }

  const output = metrics.getPrometheusMetrics();
  assert.match(output, /test_op_err_total\{op="fail",status="error"\} 1/);
  assert.match(output, /test_op_err_latency_seconds_count\{op="fail",status="error"\} 1/);
});