From a5141eb79f090c1fa75fceee25ce262a51907bd0 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Mon, 18 May 2026 19:45:00 -0700 Subject: [PATCH 01/34] feat: add Copilot CLI plugin asset slice - plugin/.plugin/plugin.json: Copilot manifest with name/version/skills/mcpServers/hooks refs - plugin/.mcp.copilot.json: MCP server config with type:local, npx, env passthrough, tools:[*] - plugin/hooks/hooks.copilot.json: Copilot hooks (version:1) with 11 supported events and PreToolUse matcher - test/copilot-plugin.test.ts: 11 tests covering manifest, MCP config, and hooks validation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugin/.mcp.copilot.json | 14 +++ plugin/.plugin/plugin.json | 15 +++ plugin/hooks/hooks.copilot.json | 72 ++++++++++++ test/copilot-plugin.test.ts | 200 ++++++++++++++++++++++++++++++++ 4 files changed, 301 insertions(+) create mode 100644 plugin/.mcp.copilot.json create mode 100644 plugin/.plugin/plugin.json create mode 100644 plugin/hooks/hooks.copilot.json create mode 100644 test/copilot-plugin.test.ts diff --git a/plugin/.mcp.copilot.json b/plugin/.mcp.copilot.json new file mode 100644 index 00000000..01d03f7d --- /dev/null +++ b/plugin/.mcp.copilot.json @@ -0,0 +1,14 @@ +{ + "mcpServers": { + "agentmemory": { + "type": "local", + "command": "npx", + "args": ["-y", "@agentmemory/mcp"], + "env": { + "AGENTMEMORY_URL": "${AGENTMEMORY_URL}", + "AGENTMEMORY_SECRET": "${AGENTMEMORY_SECRET}" + }, + "tools": ["*"] + } + } +} diff --git a/plugin/.plugin/plugin.json b/plugin/.plugin/plugin.json new file mode 100644 index 00000000..424b6882 --- /dev/null +++ b/plugin/.plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "agentmemory", + "version": "0.9.20", + "description": "Persistent memory for AI coding agents -- captures tool usage, compresses via LLM, injects context into future sessions. 
12 hooks, 53 MCP tools, 4 skills, real-time viewer.", + "author": { + "name": "Rohit Ghumare", + "url": "https://github.com/rohitg00" + }, + "license": "Apache-2.0", + "homepage": "https://github.com/rohitg00/agentmemory", + "repository": "https://github.com/rohitg00/agentmemory", + "skills": "../skills/", + "mcpServers": "../.mcp.copilot.json", + "hooks": "../hooks/hooks.copilot.json" +} diff --git a/plugin/hooks/hooks.copilot.json b/plugin/hooks/hooks.copilot.json new file mode 100644 index 00000000..e3baa795 --- /dev/null +++ b/plugin/hooks/hooks.copilot.json @@ -0,0 +1,72 @@ +{ + "version": 1, + "hooks": { + "SessionStart": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/session-start.mjs" + } + ], + "UserPromptSubmit": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/prompt-submit.mjs" + } + ], + "PreToolUse": [ + { + "type": "command", + "matcher": "Edit|Write|Read|Glob|Grep", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/pre-tool-use.mjs" + } + ], + "PostToolUse": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/post-tool-use.mjs" + } + ], + "PostToolUseFailure": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/post-tool-failure.mjs" + } + ], + "PreCompact": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/pre-compact.mjs" + } + ], + "Stop": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/stop.mjs" + } + ], + "SessionEnd": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/session-end.mjs" + } + ], + "SubagentStart": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/subagent-start.mjs" + } + ], + "SubagentStop": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/subagent-stop.mjs" + } + ], + "Notification": [ + { + "type": "command", + "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/notification.mjs" + } + ] + 
} +} diff --git a/test/copilot-plugin.test.ts b/test/copilot-plugin.test.ts new file mode 100644 index 00000000..f1e39c96 --- /dev/null +++ b/test/copilot-plugin.test.ts @@ -0,0 +1,200 @@ +import { describe, expect, it } from "vitest"; +import { readFileSync, existsSync } from "node:fs"; +import { join, resolve } from "node:path"; + +const repoRoot = resolve(__dirname, ".."); +const pluginRoot = join(repoRoot, "plugin"); + +function readJson(path: string): T { + return JSON.parse(readFileSync(path, "utf-8")) as T; +} + +const SUPPORTED_COPILOT_EVENTS = new Set([ + "SessionStart", + "UserPromptSubmit", + "PreToolUse", + "PostToolUse", + "PostToolUseFailure", + "PreCompact", + "Stop", + "SessionEnd", + "SubagentStart", + "SubagentStop", + "Notification", +]); + +const REQUIRED_MINIMUM_EVENTS = [ + "SessionStart", + "UserPromptSubmit", + "PreToolUse", + "PostToolUse", + "Stop", +]; + +const KNOWN_SKILL_DIRS = [ + "recall", + "remember", + "session-history", + "forget", + "handoff", + "recap", + "commit-context", + "commit-history", +]; + +describe("Copilot plugin manifest (plugin/.plugin/plugin.json)", () => { + it("manifest exists with kebab-case name, version, and required fields", () => { + const manifestPath = join(pluginRoot, ".plugin/plugin.json"); + expect(existsSync(manifestPath)).toBe(true); + const manifest = readJson<{ + name: string; + version: string; + description?: string; + skills?: string; + mcpServers?: string; + hooks?: string; + }>(manifestPath); + expect(manifest.name).toBe("agentmemory"); + expect(manifest.name).toMatch(/^[a-z][a-z0-9-]*$/); + expect(manifest.version).toMatch(/^\d+\.\d+\.\d+/); + expect(manifest.skills).toBeDefined(); + expect(manifest.mcpServers).toBeDefined(); + expect(manifest.hooks).toBeDefined(); + }); + + it("manifest version matches main package.json", () => { + const pkgVer = readJson<{ version: string }>(join(repoRoot, "package.json")).version; + const pluginVer = readJson<{ version: string }>( + join(pluginRoot, 
".plugin/plugin.json"), + ).version; + expect(pluginVer).toBe(pkgVer); + }); + + it("all referenced manifest paths resolve to existing files / directories", () => { + const manifest = readJson<{ skills: string; mcpServers: string; hooks: string }>( + join(pluginRoot, ".plugin/plugin.json"), + ); + const manifestDir = join(pluginRoot, ".plugin"); + expect(existsSync(resolve(manifestDir, manifest.skills))).toBe(true); + expect(existsSync(resolve(manifestDir, manifest.mcpServers))).toBe(true); + expect(existsSync(resolve(manifestDir, manifest.hooks))).toBe(true); + }); + + it("skills path resolves and contains all known skill directories", () => { + const manifest = readJson<{ skills: string }>(join(pluginRoot, ".plugin/plugin.json")); + const manifestDir = join(pluginRoot, ".plugin"); + const skillsPath = resolve(manifestDir, manifest.skills); + for (const skill of KNOWN_SKILL_DIRS) { + expect( + existsSync(join(skillsPath, skill)), + `missing skill directory: ${skill}`, + ).toBe(true); + } + }); +}); + +describe("Copilot MCP config (.mcp.copilot.json)", () => { + it("file exists with expected shape", () => { + const mcpPath = join(pluginRoot, ".mcp.copilot.json"); + expect(existsSync(mcpPath)).toBe(true); + const config = readJson<{ + mcpServers: { + agentmemory: { + type: string; + command: string; + args: string[]; + env: Record; + tools: string[]; + }; + }; + }>(mcpPath); + const server = config.mcpServers.agentmemory; + expect(server.type).toBe("local"); + expect(server.command).toBe("npx"); + expect(server.args).toEqual(["-y", "@agentmemory/mcp"]); + expect(server.env["AGENTMEMORY_URL"]).toBe("${AGENTMEMORY_URL}"); + expect(server.env["AGENTMEMORY_SECRET"]).toBe("${AGENTMEMORY_SECRET}"); + expect(server.tools).toContain("*"); + }); +}); + +describe("Copilot hooks config (hooks/hooks.copilot.json)", () => { + type HookEntry = { + type: string; + command?: string; + bash?: string; + powershell?: string; + matcher?: string; + }; + + function loadHooks() { + return 
readJson<{ version: number; hooks: Record }>( + join(pluginRoot, "hooks/hooks.copilot.json"), + ); + } + + it("has top-level version === 1 and hooks object", () => { + const config = loadHooks(); + expect(config.version).toBe(1); + expect(config.hooks).toBeDefined(); + expect(typeof config.hooks).toBe("object"); + }); + + it("contains only supported Copilot event names", () => { + const config = loadHooks(); + for (const event of Object.keys(config.hooks)) { + expect( + SUPPORTED_COPILOT_EVENTS.has(event), + `unsupported event "${event}" in hooks.copilot.json`, + ).toBe(true); + } + }); + + it("contains all required minimum events", () => { + const config = loadHooks(); + const events = Object.keys(config.hooks); + for (const event of REQUIRED_MINIMUM_EVENTS) { + expect(events, `missing required event: ${event}`).toContain(event); + } + }); + + it("PreToolUse entry has the correct matcher", () => { + const config = loadHooks(); + const preToolEntries = config.hooks["PreToolUse"]; + expect(preToolEntries).toBeDefined(); + const withMatcher = preToolEntries.find((e) => e.matcher === "Edit|Write|Read|Glob|Grep"); + expect(withMatcher, "PreToolUse must have matcher Edit|Write|Read|Glob|Grep").toBeDefined(); + }); + + it("every handler has type === 'command' and exactly one of command/bash/powershell", () => { + const config = loadHooks(); + for (const [event, entries] of Object.entries(config.hooks)) { + for (const handler of entries) { + expect(handler.type, `${event} handler type`).toBe("command"); + const commandFields = [handler.command, handler.bash, handler.powershell].filter( + (v) => v !== undefined, + ); + expect( + commandFields.length, + `${event} handler must have exactly one of command/bash/powershell`, + ).toBe(1); + } + } + }); + + it("every referenced script exists on disk", () => { + const config = loadHooks(); + const scriptRefs = new Set(); + for (const entries of Object.values(config.hooks)) { + for (const handler of entries) { + const cmd = 
handler.command ?? handler.bash ?? handler.powershell ?? ""; + const match = cmd.match(/\$\{(?:COPILOT_PLUGIN_ROOT|CLAUDE_PLUGIN_ROOT)\}\/(scripts\/[^\s]+)/); + if (match) scriptRefs.add(match[1]); + } + } + expect(scriptRefs.size).toBeGreaterThan(0); + for (const rel of scriptRefs) { + expect(existsSync(join(pluginRoot, rel)), `missing hook script: ${rel}`).toBe(true); + } + }); +}); From 913fb5119f1daaeac4568a9cf8ca1b2bdf595974 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Mon, 18 May 2026 19:48:46 -0700 Subject: [PATCH 02/34] Add Copilot CLI connect support Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AGENTS.md | 2 + README.md | 23 ++++- src/cli.ts | 5 +- src/cli/connect/copilot-cli.ts | 91 ++++++++++++++++++ src/cli/connect/index.ts | 2 + src/cli/connect/util.ts | 11 +++ test/cli-connect.test.ts | 169 ++++++++++++++++++++++++++++++++- 7 files changed, 296 insertions(+), 7 deletions(-) create mode 100644 src/cli/connect/copilot-cli.ts diff --git a/AGENTS.md b/AGENTS.md index ebcf3584..15ef8ac2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -19,6 +19,7 @@ agentmemory is a persistent memory system for AI coding agents, built on iii-eng 5. `test/mcp-standalone.test.ts` — tool count assertion 6. `README.md` — tool counts (search for "MCP tools") 7. `plugin/.claude-plugin/plugin.json` — tool count in description +8. Copilot plugin manifest/config (when present) — tool count or MCP exposure **When adding REST endpoints, you MUST update:** 1. `src/triggers/api.ts` — endpoint registration @@ -32,6 +33,7 @@ agentmemory is a persistent memory system for AI coding agents, built on iii-eng 4. `src/functions/export-import.ts` — supportedVersions set 5. `test/export-import.test.ts` — version assertion 6. `plugin/.claude-plugin/plugin.json` — version field +7. Copilot plugin manifest/config (when present) — version field **When adding new KV scopes:** 1. 
`src/state/schema.ts` — add to the KV object diff --git a/README.md b/README.md index abfbb6e0..a06261ba 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Your coding agent remembers everything. No more re-explaining. Built on iii engine
- Persistent memory for Claude Code, Cursor, Gemini CLI, Codex CLI, Hermes, OpenClaw, pi, OpenCode, and any MCP client. + Persistent memory for Claude Code, GitHub Copilot CLI, Cursor, Gemini CLI, Codex CLI, Hermes, OpenClaw, pi, OpenCode, and any MCP client.

@@ -75,7 +75,7 @@ npm install -g @agentmemory/agentmemory # once — bare `agentmemory` on PATH agentmemory # start the memory server on :3111 agentmemory demo # seed sample sessions + prove recall -agentmemory connect claude-code # wire your agent (also: codex, cursor, gemini-cli, ...) +agentmemory connect claude-code # wire your agent (also: copilot-cli, codex, cursor, gemini-cli, ...) ``` Or via `npx` (no install): @@ -107,6 +107,11 @@ agentmemory works with any agent that supports hooks, MCP, or REST API. All agen native plugin + 6 hooks + MCP +GitHub Copilot CLI
+GitHub Copilot CLI
+MCP + plugin hooks/skills + + OpenClaw
OpenClaw
native plugin + MCP @@ -418,6 +423,18 @@ The Codex plugin ships from the same `plugin/` directory as the Claude Code plug Codex's hook engine injects `CLAUDE_PLUGIN_ROOT` into hook subprocesses (per [`codex-rs/hooks/src/engine/discovery.rs`](https://github.com/openai/codex/blob/main/codex-rs/hooks/src/engine/discovery.rs)), so the same hook scripts work across both hosts without duplication. Subagent / SessionEnd / Notification / TaskCompleted / PostToolUseFailure events are Claude-Code-only and are not registered for Codex. +### GitHub Copilot CLI + +```bash +# MCP-only wiring +agentmemory connect copilot-cli + +# Full hooks/skills plugin from the GitHub subdir +copilot plugin install rohitg00/agentmemory:plugin +``` + +`agentmemory connect copilot-cli` merges `mcpServers.agentmemory` into `~/.copilot/mcp-config.json` and preserves existing servers. Copilot picks up the MCP server on next launch or after `/mcp`. Install the plugin as well when you want the full hook/skill experience. +

OpenClaw (paste this prompt) @@ -489,6 +506,8 @@ The agentmemory entry is the **same MCP server block** across every host that us | **Cline / Roo Code / Kilo Code** | Cline MCP settings (Settings UI → MCP Servers → Edit) | Same `mcpServers` block. | | **Windsurf** | `~/.codeium/windsurf/mcp_config.json` | Same `mcpServers` block. | | **Gemini CLI** | `~/.gemini/settings.json` | `gemini mcp add agentmemory npx -y @agentmemory/mcp --scope user` (auto-merges). | +| **GitHub Copilot CLI (MCP only)** | `~/.copilot/mcp-config.json` | `agentmemory connect copilot-cli` merges `mcpServers.agentmemory`; Copilot picks it up on next launch or `/mcp`. | +| **GitHub Copilot CLI (full plugin)** | Copilot plugin install | `copilot plugin install rohitg00/agentmemory:plugin` for the plugin from the GitHub subdir. | | **OpenClaw** | OpenClaw MCP config | Same `mcpServers` block, or use the deeper [memory plugin](integrations/openclaw/). | | **Codex CLI (MCP only)** | `.codex/config.toml` | TOML shape: `codex mcp add agentmemory -- npx -y @agentmemory/mcp`, or add `[mcp_servers.agentmemory]` manually. | | **Codex CLI (full plugin)** | Codex plugin marketplace | `codex plugin marketplace add rohitg00/agentmemory` then `codex plugin install agentmemory`. Registers MCP + 6 lifecycle hooks (SessionStart, UserPromptSubmit, PreToolUse, PostToolUse, PreCompact, Stop) + 4 skills. | diff --git a/src/cli.ts b/src/cli.ts index 5eca18ce..82535abb 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -117,8 +117,9 @@ Usage: agentmemory [command] [options] Commands: (default) Start agentmemory worker init Copy bundled .env.example to ~/.agentmemory/.env if absent - connect [agent] Wire agentmemory into an installed agent (claude-code, codex, - cursor, gemini-cli, openclaw, hermes, pi, openhuman). + connect [agent] Wire agentmemory into an installed agent (claude-code, + copilot-cli, codex, cursor, gemini-cli, openclaw, + hermes, pi, openhuman). No arg = interactive picker. --all wires every detected agent. 
--dry-run shows what would change. --force re-installs. status Show connection status, memory count, flags, and health diff --git a/src/cli/connect/copilot-cli.ts b/src/cli/connect/copilot-cli.ts new file mode 100644 index 00000000..245ffc9b --- /dev/null +++ b/src/cli/connect/copilot-cli.ts @@ -0,0 +1,91 @@ +import { existsSync, mkdirSync } from "node:fs"; +import { homedir } from "node:os"; +import { dirname, join } from "node:path"; +import * as p from "@clack/prompts"; +import type { ConnectAdapter, ConnectOptions, ConnectResult } from "./types.js"; +import { + AGENTMEMORY_COPILOT_MCP_BLOCK, + backupFile, + logAlreadyWired, + logBackup, + logInstalled, + readJsonSafe, + writeJsonAtomic, +} from "./util.js"; + +const COPILOT_DIR = join(homedir(), ".copilot"); +const COPILOT_MCP_JSON = join(COPILOT_DIR, "mcp-config.json"); + +type CopilotMcpEntry = typeof AGENTMEMORY_COPILOT_MCP_BLOCK; +type CopilotConfig = { + mcpServers?: Record; + [key: string]: unknown; +}; + +function entryMatches(entry: unknown): boolean { + if (!entry || typeof entry !== "object") return false; + return JSON.stringify(entry) === JSON.stringify(AGENTMEMORY_COPILOT_MCP_BLOCK); +} + +export const adapter: ConnectAdapter = { + name: "copilot-cli", + displayName: "GitHub Copilot CLI", + docs: "https://github.com/rohitg00/agentmemory#github-copilot-cli", + protocolNote: + "→ Using MCP. Install the plugin too for full hooks/skills coverage.", + + detect(): boolean { + return existsSync(COPILOT_DIR); + }, + + async install(opts: ConnectOptions): Promise { + const existing = readJsonSafe(COPILOT_MCP_JSON); + const next: CopilotConfig = existing ? { ...existing } : {}; + const servers: Record = { + ...((next.mcpServers as Record) ?? 
{}), + }; + + const alreadyHas = entryMatches(servers["agentmemory"]); + if (alreadyHas && !opts.force) { + logAlreadyWired("GitHub Copilot CLI", COPILOT_MCP_JSON); + return { kind: "already-wired", mutatedPath: COPILOT_MCP_JSON }; + } + + if (opts.dryRun) { + p.log.info( + `[dry-run] Would ${alreadyHas ? "overwrite" : "add"} mcpServers.agentmemory in ${COPILOT_MCP_JSON}`, + ); + return { kind: "installed", mutatedPath: COPILOT_MCP_JSON }; + } + + let backupPath: string | undefined; + if (existsSync(COPILOT_MCP_JSON)) { + backupPath = backupFile(COPILOT_MCP_JSON, "copilot-cli"); + logBackup(backupPath); + } else { + mkdirSync(dirname(COPILOT_MCP_JSON), { recursive: true }); + } + + servers["agentmemory"] = AGENTMEMORY_COPILOT_MCP_BLOCK; + next.mcpServers = servers; + writeJsonAtomic(COPILOT_MCP_JSON, next); + + const verify = readJsonSafe(COPILOT_MCP_JSON); + if (!entryMatches(verify?.mcpServers?.["agentmemory"])) { + p.log.error( + `Verification failed: ${COPILOT_MCP_JSON} did not contain mcpServers.agentmemory after write.`, + ); + return { kind: "skipped", reason: "verification-failed" }; + } + + logInstalled("GitHub Copilot CLI", COPILOT_MCP_JSON); + p.log.info( + "Copilot picks up MCP servers on next launch or after `/mcp`. 
Install the plugin too for full hooks/skills.", + ); + return { + kind: "installed", + mutatedPath: COPILOT_MCP_JSON, + ...(backupPath !== undefined && { backupPath }), + }; + }, +}; diff --git a/src/cli/connect/index.ts b/src/cli/connect/index.ts index 17aedf8f..1f1e38c9 100644 --- a/src/cli/connect/index.ts +++ b/src/cli/connect/index.ts @@ -2,6 +2,7 @@ import { platform } from "node:os"; import * as p from "@clack/prompts"; import type { ConnectAdapter, ConnectOptions, ConnectResult } from "./types.js"; import { adapter as claudeCode } from "./claude-code.js"; +import { adapter as copilotCli } from "./copilot-cli.js"; import { adapter as codex } from "./codex.js"; import { adapter as cursor } from "./cursor.js"; import { adapter as geminiCli } from "./gemini-cli.js"; @@ -12,6 +13,7 @@ import { adapter as pi } from "./pi.js"; export const ADAPTERS: readonly ConnectAdapter[] = [ claudeCode, + copilotCli, codex, cursor, geminiCli, diff --git a/src/cli/connect/util.ts b/src/cli/connect/util.ts index 6d5f61ac..bff7e1a5 100644 --- a/src/cli/connect/util.ts +++ b/src/cli/connect/util.ts @@ -26,6 +26,17 @@ export const AGENTMEMORY_MCP_BLOCK = { }, }; +export const AGENTMEMORY_COPILOT_MCP_BLOCK = { + type: "local" as const, + command: "npx", + args: ["-y", "@agentmemory/mcp"], + env: { + AGENTMEMORY_URL: "${AGENTMEMORY_URL}", + AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", + }, + tools: ["*"], +}; + export function backupsDir(): string { return join(homedir(), ".agentmemory", "backups"); } diff --git a/test/cli-connect.test.ts b/test/cli-connect.test.ts index 99174dac..8443f06f 100644 --- a/test/cli-connect.test.ts +++ b/test/cli-connect.test.ts @@ -29,10 +29,11 @@ describe("agentmemory connect — dispatcher", () => { expect(resolveAdapter("")).toBeNull(); }); - it("ships exactly the 8 agents specified by the spec", () => { + it("ships exactly the 9 agents specified by the spec", () => { expect(knownAgents().sort()).toEqual( [ "claude-code", + "copilot-cli", "codex", 
"cursor", "gemini-cli", @@ -42,7 +43,7 @@ describe("agentmemory connect — dispatcher", () => { "pi", ].sort(), ); - expect(ADAPTERS.length).toBe(8); + expect(ADAPTERS.length).toBe(9); }); it("every adapter exposes detect() and install()", () => { @@ -175,7 +176,169 @@ describe("agentmemory connect — claude-code adapter (mock filesystem)", () => if (result.kind === "installed") { expect(result.backupPath).toBeDefined(); expect(existsSync(result.backupPath!)).toBe(true); - expect(result.backupPath!).toContain(".agentmemory/backups"); + expect(result.backupPath!).toContain(join(".agentmemory", "backups")); + } + }); +}); + +describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => { + let tmpHome: string; + let originalHome: string | undefined; + let originalUserprofile: string | undefined; + + beforeEach(() => { + tmpHome = mkdtempSync(join(tmpdir(), "am-connect-")); + originalHome = process.env["HOME"]; + originalUserprofile = process.env["USERPROFILE"]; + process.env["HOME"] = tmpHome; + process.env["USERPROFILE"] = tmpHome; + vi.resetModules(); + }); + + afterEach(() => { + if (originalHome !== undefined) process.env["HOME"] = originalHome; + else delete process.env["HOME"]; + if (originalUserprofile !== undefined) + process.env["USERPROFILE"] = originalUserprofile; + else delete process.env["USERPROFILE"]; + rmSync(tmpHome, { recursive: true, force: true }); + vi.resetModules(); + }); + + async function loadAdapter(): Promise { + const mod = await import("../src/cli/connect/copilot-cli.js?t=" + Date.now()); + return (mod as { adapter: ConnectAdapter }).adapter; + } + + it("detect() returns false when ~/.copilot doesn't exist", async () => { + const a = await loadAdapter(); + expect(a.detect()).toBe(false); + }); + + it("install() writes mcpServers.agentmemory into ~/.copilot/mcp-config.json and is idempotent", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + + const a = await loadAdapter(); + 
expect(a.detect()).toBe(true); + + const first = await a.install({ dryRun: false, force: false }); + expect(first.kind).toBe("installed"); + + const config = JSON.parse( + readFileSync(join(tmpHome, ".copilot", "mcp-config.json"), "utf-8"), + ); + expect(config.mcpServers.agentmemory).toEqual({ + type: "local", + command: "npx", + args: ["-y", "@agentmemory/mcp"], + env: { + AGENTMEMORY_URL: "${AGENTMEMORY_URL}", + AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", + }, + tools: ["*"], + }); + + const second = await a.install({ dryRun: false, force: false }); + expect(second.kind).toBe("already-wired"); + }); + + it("install() preserves unrelated top-level keys and mcpServers entries", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + writeFileSync( + join(tmpHome, ".copilot", "mcp-config.json"), + JSON.stringify({ + otherTopLevel: { keep: true }, + mcpServers: { other: { type: "local", command: "other" } }, + }), + ); + + const a = await loadAdapter(); + const result = await a.install({ dryRun: false, force: false }); + expect(result.kind).toBe("installed"); + + const config = JSON.parse( + readFileSync(join(tmpHome, ".copilot", "mcp-config.json"), "utf-8"), + ); + expect(config.otherTopLevel).toEqual({ keep: true }); + expect(config.mcpServers.other).toEqual({ type: "local", command: "other" }); + expect(config.mcpServers.agentmemory.command).toBe("npx"); + }); + + it("install() writes env passthrough block for AGENTMEMORY_URL + AGENTMEMORY_SECRET", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + + const a = await loadAdapter(); + const result = await a.install({ dryRun: false, force: false }); + expect(result.kind).toBe("installed"); + + const config = JSON.parse( + readFileSync(join(tmpHome, ".copilot", "mcp-config.json"), "utf-8"), + ); + const entry = config.mcpServers.agentmemory; + expect(entry.env.AGENTMEMORY_URL).toBe("${AGENTMEMORY_URL}"); + 
expect(entry.env.AGENTMEMORY_SECRET).toBe("${AGENTMEMORY_SECRET}"); + }); + + it("install() with --force rewrites even when already wired", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + writeFileSync( + join(tmpHome, ".copilot", "mcp-config.json"), + JSON.stringify({ + mcpServers: { + agentmemory: { + type: "local", + command: "npx", + args: ["-y", "@agentmemory/mcp"], + env: { + AGENTMEMORY_URL: "${AGENTMEMORY_URL}", + AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", + }, + tools: ["memory_save"], + }, + }, + }), + ); + + const a = await loadAdapter(); + const result = await a.install({ dryRun: false, force: true }); + expect(result.kind).toBe("installed"); + + const config = JSON.parse( + readFileSync(join(tmpHome, ".copilot", "mcp-config.json"), "utf-8"), + ); + expect(config.mcpServers.agentmemory.tools).toEqual(["*"]); + }); + + it("install() with --dry-run does not mutate the file", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + const before = JSON.stringify({ mcpServers: {} }); + writeFileSync(join(tmpHome, ".copilot", "mcp-config.json"), before); + + const a = await loadAdapter(); + const result = await a.install({ dryRun: true, force: false }); + expect(result.kind).toBe("installed"); + + const after = readFileSync( + join(tmpHome, ".copilot", "mcp-config.json"), + "utf-8", + ); + expect(after).toBe(before); + }); + + it("install() creates a backup file when config pre-exists", async () => { + require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); + writeFileSync( + join(tmpHome, ".copilot", "mcp-config.json"), + JSON.stringify({ mcpServers: {} }), + ); + + const a = await loadAdapter(); + const result = await a.install({ dryRun: false, force: false }); + expect(result.kind).toBe("installed"); + if (result.kind === "installed") { + expect(result.backupPath).toBeDefined(); + expect(existsSync(result.backupPath!)).toBe(true); + 
expect(result.backupPath!).toContain(join(".agentmemory", "backups")); } }); }); From 134f8b199f337d9ed31eb89a63626fd7d462aef8 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Mon, 18 May 2026 20:32:32 -0700 Subject: [PATCH 03/34] Add GitHub Copilot CLI support Adds Copilot CLI support through a root plugin manifest, Copilot-specific MCP and hook configuration, and a connect adapter for MCP-only setup. Includes Windows-safe Copilot MCP command generation, COPILOT_HOME handling, Copilot hook payload normalization, generated hook scripts, and targeted tests for plugin shape, hook execution, and connect behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AGENTS.md | 4 +- README.md | 2 +- plugin/hooks/hooks.copilot.json | 24 ++-- plugin/{.plugin => }/plugin.json | 6 +- plugin/scripts/notification.mjs | 7 +- plugin/scripts/post-tool-failure.mjs | 13 +- plugin/scripts/post-tool-use.mjs | 19 ++- plugin/scripts/pre-compact.mjs | 2 +- plugin/scripts/pre-tool-use.mjs | 25 ++-- plugin/scripts/prompt-submit.mjs | 4 +- plugin/scripts/session-end.mjs | 2 +- plugin/scripts/session-start.mjs | 2 +- plugin/scripts/stop.mjs | 2 +- plugin/scripts/subagent-start.mjs | 8 +- plugin/scripts/subagent-stop.mjs | 8 +- src/cli/connect/copilot-cli.ts | 2 +- src/cli/connect/index.ts | 6 +- src/cli/connect/util.ts | 14 +- src/hooks/notification.ts | 7 +- src/hooks/post-tool-failure.ts | 21 +-- src/hooks/post-tool-use.ts | 20 ++- src/hooks/pre-compact.ts | 2 +- src/hooks/pre-tool-use.ts | 18 ++- src/hooks/prompt-submit.ts | 4 +- src/hooks/session-end.ts | 4 +- src/hooks/session-start.ts | 3 +- src/hooks/stop.ts | 4 +- src/hooks/subagent-start.ts | 8 +- src/hooks/subagent-stop.ts | 8 +- test/cli-connect.test.ts | 47 ++++++- test/copilot-plugin.test.ts | 203 +++++++++++++++++++++++---- 31 files changed, 372 insertions(+), 127 deletions(-) rename plugin/{.plugin => }/plugin.json (81%) diff --git a/AGENTS.md b/AGENTS.md index 15ef8ac2..24e74245 100644 --- a/AGENTS.md 
+++ b/AGENTS.md @@ -19,7 +19,7 @@ agentmemory is a persistent memory system for AI coding agents, built on iii-eng 5. `test/mcp-standalone.test.ts` — tool count assertion 6. `README.md` — tool counts (search for "MCP tools") 7. `plugin/.claude-plugin/plugin.json` — tool count in description -8. Copilot plugin manifest/config (when present) — tool count or MCP exposure +8. `plugin/plugin.json` and `plugin/.mcp.copilot.json` (when present) — tool count or MCP exposure **When adding REST endpoints, you MUST update:** 1. `src/triggers/api.ts` — endpoint registration @@ -33,7 +33,7 @@ agentmemory is a persistent memory system for AI coding agents, built on iii-eng 4. `src/functions/export-import.ts` — supportedVersions set 5. `test/export-import.test.ts` — version assertion 6. `plugin/.claude-plugin/plugin.json` — version field -7. Copilot plugin manifest/config (when present) — version field +7. `plugin/plugin.json` (when present) — version field **When adding new KV scopes:** 1. `src/state/schema.ts` — add to the KV object diff --git a/README.md b/README.md index a06261ba..c462746d 100644 --- a/README.md +++ b/README.md @@ -433,7 +433,7 @@ agentmemory connect copilot-cli copilot plugin install rohitg00/agentmemory:plugin ``` -`agentmemory connect copilot-cli` merges `mcpServers.agentmemory` into `~/.copilot/mcp-config.json` and preserves existing servers. Copilot picks up the MCP server on next launch or after `/mcp`. Install the plugin as well when you want the full hook/skill experience. +`agentmemory connect copilot-cli` merges `mcpServers.agentmemory` into `~/.copilot/mcp-config.json` (or `$COPILOT_HOME/mcp-config.json` when `COPILOT_HOME` is set) and preserves existing servers. This adapter is Windows-safe even though other `connect` adapters still require manual Windows setup. Copilot picks up the MCP server on next launch or after `/mcp`. Install the plugin as well when you want the full hook/skill experience.
OpenClaw (paste this prompt) diff --git a/plugin/hooks/hooks.copilot.json b/plugin/hooks/hooks.copilot.json index e3baa795..62bd077a 100644 --- a/plugin/hooks/hooks.copilot.json +++ b/plugin/hooks/hooks.copilot.json @@ -1,68 +1,68 @@ { "version": 1, "hooks": { - "SessionStart": [ + "sessionStart": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/session-start.mjs" } ], - "UserPromptSubmit": [ + "userPromptSubmitted": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/prompt-submit.mjs" } ], - "PreToolUse": [ + "preToolUse": [ { "type": "command", - "matcher": "Edit|Write|Read|Glob|Grep", + "matcher": "edit|create|view|glob|grep", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/pre-tool-use.mjs" } ], - "PostToolUse": [ + "postToolUse": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/post-tool-use.mjs" } ], - "PostToolUseFailure": [ + "postToolUseFailure": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/post-tool-failure.mjs" } ], - "PreCompact": [ + "preCompact": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/pre-compact.mjs" } ], - "Stop": [ + "agentStop": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/stop.mjs" } ], - "SessionEnd": [ + "sessionEnd": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/session-end.mjs" } ], - "SubagentStart": [ + "subagentStart": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/subagent-start.mjs" } ], - "SubagentStop": [ + "subagentStop": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/subagent-stop.mjs" } ], - "Notification": [ + "notification": [ { "type": "command", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/notification.mjs" diff --git a/plugin/.plugin/plugin.json b/plugin/plugin.json similarity index 81% rename from plugin/.plugin/plugin.json rename to plugin/plugin.json index 424b6882..8de92acd 100644 --- 
a/plugin/.plugin/plugin.json +++ b/plugin/plugin.json @@ -9,7 +9,7 @@ "license": "Apache-2.0", "homepage": "https://github.com/rohitg00/agentmemory", "repository": "https://github.com/rohitg00/agentmemory", - "skills": "../skills/", - "mcpServers": "../.mcp.copilot.json", - "hooks": "../hooks/hooks.copilot.json" + "skills": "skills/", + "mcpServers": ".mcp.copilot.json", + "hooks": "hooks/hooks.copilot.json" } diff --git a/plugin/scripts/notification.mjs b/plugin/scripts/notification.mjs index a318848d..98049a7a 100755 --- a/plugin/scripts/notification.mjs +++ b/plugin/scripts/notification.mjs @@ -22,8 +22,9 @@ async function main() { return; } if (isSdkChildContext(data)) return; - if (data.notification_type !== "permission_prompt") return; - const sessionId = data.session_id || "unknown"; + const notificationType = data.notification_type ?? data.notificationType; + if (notificationType !== "permission_prompt") return; + const sessionId = data.session_id || data.sessionId || "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", @@ -35,7 +36,7 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), data: { - notification_type: data.notification_type, + notification_type: notificationType, title: data.title, message: data.message } diff --git a/plugin/scripts/post-tool-failure.mjs b/plugin/scripts/post-tool-failure.mjs index 3a593f3a..902a0930 100755 --- a/plugin/scripts/post-tool-failure.mjs +++ b/plugin/scripts/post-tool-failure.mjs @@ -22,8 +22,11 @@ async function main() { return; } if (isSdkChildContext(data)) return; - if (data.is_interrupt) return; - const sessionId = data.session_id || "unknown"; + if (data.is_interrupt || data.isInterrupt) return; + const sessionId = data.session_id || data.sessionId || "unknown"; + const toolName = data.tool_name ?? data.toolName; + const toolInput = data.tool_input ?? data.toolArgs; + const error = data.error ?? 
data.errorMessage; try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", @@ -35,9 +38,9 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), data: { - tool_name: data.tool_name, - tool_input: typeof data.tool_input === "string" ? data.tool_input.slice(0, 4e3) : JSON.stringify(data.tool_input ?? "").slice(0, 4e3), - error: typeof data.error === "string" ? data.error.slice(0, 4e3) : JSON.stringify(data.error ?? "").slice(0, 4e3) + tool_name: toolName, + tool_input: typeof toolInput === "string" ? toolInput.slice(0, 4e3) : JSON.stringify(toolInput ?? "").slice(0, 4e3), + error: typeof error === "string" ? error.slice(0, 4e3) : JSON.stringify(error ?? "").slice(0, 4e3) } }), signal: AbortSignal.timeout(3e3) diff --git a/plugin/scripts/post-tool-use.mjs b/plugin/scripts/post-tool-use.mjs index 5ebec645..b1c30d41 100755 --- a/plugin/scripts/post-tool-use.mjs +++ b/plugin/scripts/post-tool-use.mjs @@ -22,8 +22,10 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; - const { imageData, cleanOutput } = extractImageData(data.tool_output); + const sessionId = data.session_id || data.sessionId || "unknown"; + const toolName = data.tool_name ?? data.toolName; + const toolInput = data.tool_input ?? data.toolArgs; + const { imageData, cleanOutput } = extractImageData(toolOutput(data)); try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", @@ -35,8 +37,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), data: { - tool_name: data.tool_name, - tool_input: data.tool_input, + tool_name: toolName, + tool_input: toolInput, tool_output: truncate(cleanOutput, 8e3), ...imageData ? 
{ image_data: imageData } : {} } @@ -45,6 +47,15 @@ async function main() { }); } catch {} } +function toolOutput(data) { + if (data.tool_output !== void 0) return data.tool_output; + const result = data.tool_result ?? data.toolResult; + if (typeof result === "object" && result !== null) { + const obj = result; + return obj.text_result_for_llm ?? obj.textResultForLlm ?? result; + } + return result; +} function isBase64Image(val) { return typeof val === "string" && (val.startsWith("data:image/") || val.startsWith("iVBORw0KGgo") || val.startsWith("/9j/")); } diff --git a/plugin/scripts/pre-compact.mjs b/plugin/scripts/pre-compact.mjs index bff9e7fa..b68bf025 100755 --- a/plugin/scripts/pre-compact.mjs +++ b/plugin/scripts/pre-compact.mjs @@ -22,7 +22,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; const project = data.cwd || process.cwd(); if (process.env["CLAUDE_MEMORY_BRIDGE"] === "true") try { await fetch(`${REST_URL}/agentmemory/claude-bridge/sync`, { diff --git a/plugin/scripts/pre-tool-use.mjs b/plugin/scripts/pre-tool-use.mjs index 561b6b0d..cab44fb8 100755 --- a/plugin/scripts/pre-tool-use.mjs +++ b/plugin/scripts/pre-tool-use.mjs @@ -24,18 +24,21 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const toolName = data.tool_name; + const toolName = typeof data.tool_name === "string" ? data.tool_name : data.toolName; if (!toolName) return; + const normalizedToolName = toolName.toLowerCase(); if (![ - "Edit", - "Write", - "Read", - "Glob", - "Grep" - ].includes(toolName)) return; - const toolInput = data.tool_input || {}; + "edit", + "write", + "create", + "read", + "view", + "glob", + "grep" + ].includes(normalizedToolName)) return; + const toolInput = data.tool_input || data.toolArgs || {}; const files = []; - const fileKeys = toolName === "Grep" ? 
["path", "file"] : [ + const fileKeys = normalizedToolName === "grep" ? ["path", "file"] : [ "file_path", "path", "file", @@ -47,11 +50,11 @@ async function main() { } if (files.length === 0) return; const terms = []; - if (toolName === "Grep" || toolName === "Glob") { + if (normalizedToolName === "grep" || normalizedToolName === "glob") { const pattern = toolInput["pattern"]; if (typeof pattern === "string" && pattern.length > 0) terms.push(pattern); } - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; try { const res = await fetch(`${REST_URL}/agentmemory/enrich`, { method: "POST", diff --git a/plugin/scripts/prompt-submit.mjs b/plugin/scripts/prompt-submit.mjs index 18aa040a..a8a61192 100755 --- a/plugin/scripts/prompt-submit.mjs +++ b/plugin/scripts/prompt-submit.mjs @@ -22,7 +22,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", @@ -33,7 +33,7 @@ async function main() { project: data.cwd || process.cwd(), cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), - data: { prompt: data.prompt } + data: { prompt: data.prompt ?? 
data.userPrompt } }), signal: AbortSignal.timeout(3e3) }); diff --git a/plugin/scripts/session-end.mjs b/plugin/scripts/session-end.mjs index 8e1de092..7707e357 100755 --- a/plugin/scripts/session-end.mjs +++ b/plugin/scripts/session-end.mjs @@ -22,7 +22,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; try { await fetch(`${REST_URL}/agentmemory/session/end`, { method: "POST", diff --git a/plugin/scripts/session-start.mjs b/plugin/scripts/session-start.mjs index 9e573e24..f1ec1be6 100755 --- a/plugin/scripts/session-start.mjs +++ b/plugin/scripts/session-start.mjs @@ -25,7 +25,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || `ses_${Date.now().toString(36)}`; + const sessionId = data.session_id || data.sessionId || `ses_${Date.now().toString(36)}`; const project = data.cwd || process.cwd(); const url = `${REST_URL}/agentmemory/session/start`; const init = { diff --git a/plugin/scripts/stop.mjs b/plugin/scripts/stop.mjs index e0ffa350..3fe5cb36 100755 --- a/plugin/scripts/stop.mjs +++ b/plugin/scripts/stop.mjs @@ -22,7 +22,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; try { await fetch(`${REST_URL}/agentmemory/summarize`, { method: "POST", diff --git a/plugin/scripts/subagent-start.mjs b/plugin/scripts/subagent-start.mjs index db143459..b9872d8b 100755 --- a/plugin/scripts/subagent-start.mjs +++ b/plugin/scripts/subagent-start.mjs @@ -23,7 +23,9 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; + const agentId = data.agent_id ?? 
data.agentName; + const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", headers: authHeaders(), @@ -34,8 +36,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), data: { - agent_id: data.agent_id, - agent_type: data.agent_type + agent_id: agentId, + agent_type: agentType } }), signal: AbortSignal.timeout(TIMEOUT_MS) diff --git a/plugin/scripts/subagent-stop.mjs b/plugin/scripts/subagent-stop.mjs index 7ec66a7d..a04b068f 100755 --- a/plugin/scripts/subagent-stop.mjs +++ b/plugin/scripts/subagent-stop.mjs @@ -22,7 +22,9 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const sessionId = data.session_id || "unknown"; + const sessionId = data.session_id || data.sessionId || "unknown"; + const agentId = data.agent_id ?? data.agentName; + const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; const lastMsg = typeof data.last_assistant_message === "string" ? 
data.last_assistant_message.slice(0, 4e3) : ""; try { await fetch(`${REST_URL}/agentmemory/observe`, { @@ -35,8 +37,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: (/* @__PURE__ */ new Date()).toISOString(), data: { - agent_id: data.agent_id, - agent_type: data.agent_type, + agent_id: agentId, + agent_type: agentType, last_message: lastMsg } }), diff --git a/src/cli/connect/copilot-cli.ts b/src/cli/connect/copilot-cli.ts index 245ffc9b..8cce5a54 100644 --- a/src/cli/connect/copilot-cli.ts +++ b/src/cli/connect/copilot-cli.ts @@ -13,7 +13,7 @@ import { writeJsonAtomic, } from "./util.js"; -const COPILOT_DIR = join(homedir(), ".copilot"); +const COPILOT_DIR = process.env["COPILOT_HOME"] || join(homedir(), ".copilot"); const COPILOT_MCP_JSON = join(COPILOT_DIR, "mcp-config.json"); type CopilotMcpEntry = typeof AGENTMEMORY_COPILOT_MCP_BLOCK; diff --git a/src/cli/connect/index.ts b/src/cli/connect/index.ts index 1f1e38c9..e9db9fc5 100644 --- a/src/cli/connect/index.ts +++ b/src/cli/connect/index.ts @@ -76,7 +76,10 @@ export async function runAdapter( } export async function runConnect(args: string[]): Promise { - if (platform() === "win32") { + const { dryRun, force, all, positional } = parseFlags(args); + const allowWindowsAdapter = + positional.length === 1 && positional[0]?.toLowerCase() === "copilot-cli"; + if (platform() === "win32" && !allowWindowsAdapter) { p.intro("agentmemory connect"); p.log.warn( "Windows: automated `connect` is not supported yet. 
See https://github.com/rohitg00/agentmemory#other-agents for manual install steps.", @@ -85,7 +88,6 @@ export async function runConnect(args: string[]): Promise { return; } - const { dryRun, force, all, positional } = parseFlags(args); const opts: ConnectOptions = { dryRun, force }; p.intro("agentmemory connect"); diff --git a/src/cli/connect/util.ts b/src/cli/connect/util.ts index bff7e1a5..8902e3ef 100644 --- a/src/cli/connect/util.ts +++ b/src/cli/connect/util.ts @@ -26,10 +26,20 @@ export const AGENTMEMORY_MCP_BLOCK = { }, }; +const COPILOT_MCP_COMMAND = + process.platform === "win32" + ? { + command: process.env["ComSpec"] || process.env["COMSPEC"] || "cmd.exe", + args: ["/d", "/s", "/c", "npx", "-y", "@agentmemory/mcp"], + } + : { + command: "npx", + args: ["-y", "@agentmemory/mcp"], + }; + export const AGENTMEMORY_COPILOT_MCP_BLOCK = { type: "local" as const, - command: "npx", - args: ["-y", "@agentmemory/mcp"], + ...COPILOT_MCP_COMMAND, env: { AGENTMEMORY_URL: "${AGENTMEMORY_URL}", AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", diff --git a/src/hooks/notification.ts b/src/hooks/notification.ts index 6c4b7b81..42ef594c 100644 --- a/src/hooks/notification.ts +++ b/src/hooks/notification.ts @@ -29,9 +29,10 @@ async function main() { } if (isSdkChildContext(data)) return; - if (data.notification_type !== "permission_prompt") return; + const notificationType = data.notification_type ?? 
data.notificationType; + if (notificationType !== "permission_prompt") return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { @@ -44,7 +45,7 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), data: { - notification_type: data.notification_type, + notification_type: notificationType, title: data.title, message: data.message, }, diff --git a/src/hooks/post-tool-failure.ts b/src/hooks/post-tool-failure.ts index 337aebdd..7fa71d05 100644 --- a/src/hooks/post-tool-failure.ts +++ b/src/hooks/post-tool-failure.ts @@ -29,9 +29,12 @@ async function main() { } if (isSdkChildContext(data)) return; - if (data.is_interrupt) return; + if (data.is_interrupt || data.isInterrupt) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + const toolName = data.tool_name ?? data.toolName; + const toolInput = data.tool_input ?? data.toolArgs; + const error = data.error ?? data.errorMessage; try { await fetch(`${REST_URL}/agentmemory/observe`, { @@ -44,15 +47,15 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), data: { - tool_name: data.tool_name, + tool_name: toolName, tool_input: - typeof data.tool_input === "string" - ? data.tool_input.slice(0, 4000) - : JSON.stringify(data.tool_input ?? "").slice(0, 4000), + typeof toolInput === "string" + ? toolInput.slice(0, 4000) + : JSON.stringify(toolInput ?? "").slice(0, 4000), error: - typeof data.error === "string" - ? data.error.slice(0, 4000) - : JSON.stringify(data.error ?? "").slice(0, 4000), + typeof error === "string" + ? error.slice(0, 4000) + : JSON.stringify(error ?? 
"").slice(0, 4000), }, }), signal: AbortSignal.timeout(3000), diff --git a/src/hooks/post-tool-use.ts b/src/hooks/post-tool-use.ts index 65afc8b1..512ef844 100644 --- a/src/hooks/post-tool-use.ts +++ b/src/hooks/post-tool-use.ts @@ -30,9 +30,11 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + const toolName = data.tool_name ?? data.toolName; + const toolInput = data.tool_input ?? data.toolArgs; - const { imageData, cleanOutput } = extractImageData(data.tool_output); + const { imageData, cleanOutput } = extractImageData(toolOutput(data)); try { await fetch(`${REST_URL}/agentmemory/observe`, { @@ -45,8 +47,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), data: { - tool_name: data.tool_name, - tool_input: data.tool_input, + tool_name: toolName, + tool_input: toolInput, tool_output: truncate(cleanOutput, 8000), ...(imageData ? { image_data: imageData } : {}), }, @@ -57,6 +59,16 @@ async function main() { } } +function toolOutput(data: Record): unknown { + if (data.tool_output !== undefined) return data.tool_output; + const result = data.tool_result ?? data.toolResult; + if (typeof result === "object" && result !== null) { + const obj = result as Record; + return obj.text_result_for_llm ?? obj.textResultForLlm ?? 
result; + } + return result; +} + function isBase64Image(val: unknown): val is string { return typeof val === "string" && ( val.startsWith("data:image/") || diff --git a/src/hooks/pre-compact.ts b/src/hooks/pre-compact.ts index ea13ebec..77fb7a57 100644 --- a/src/hooks/pre-compact.ts +++ b/src/hooks/pre-compact.ts @@ -30,7 +30,7 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; const project = (data.cwd as string) || process.cwd(); if (process.env["CLAUDE_MEMORY_BRIDGE"] === "true") { diff --git a/src/hooks/pre-tool-use.ts b/src/hooks/pre-tool-use.ts index 61f6c443..6fbf34b8 100644 --- a/src/hooks/pre-tool-use.ts +++ b/src/hooks/pre-tool-use.ts @@ -50,16 +50,20 @@ async function main() { if (isSdkChildContext(data)) return; - const toolName = data.tool_name as string; + const toolName = + typeof data.tool_name === "string" + ? data.tool_name + : (data.toolName as string); if (!toolName) return; - const fileTools = ["Edit", "Write", "Read", "Glob", "Grep"]; - if (!fileTools.includes(toolName)) return; + const normalizedToolName = toolName.toLowerCase(); + const fileTools = ["edit", "write", "create", "read", "view", "glob", "grep"]; + if (!fileTools.includes(normalizedToolName)) return; - const toolInput = (data.tool_input || {}) as Record; + const toolInput = (data.tool_input || data.toolArgs || {}) as Record; const files: string[] = []; const fileKeys = - toolName === "Grep" + normalizedToolName === "grep" ? 
["path", "file"] : ["file_path", "path", "file", "pattern"]; for (const key of fileKeys) { @@ -69,14 +73,14 @@ async function main() { if (files.length === 0) return; const terms: string[] = []; - if (toolName === "Grep" || toolName === "Glob") { + if (normalizedToolName === "grep" || normalizedToolName === "glob") { const pattern = toolInput["pattern"]; if (typeof pattern === "string" && pattern.length > 0) { terms.push(pattern); } } - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; try { const res = await fetch(`${REST_URL}/agentmemory/enrich`, { diff --git a/src/hooks/prompt-submit.ts b/src/hooks/prompt-submit.ts index 971b11be..10265a77 100644 --- a/src/hooks/prompt-submit.ts +++ b/src/hooks/prompt-submit.ts @@ -30,7 +30,7 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { @@ -42,7 +42,7 @@ async function main() { project: data.cwd || process.cwd(), cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), - data: { prompt: data.prompt }, + data: { prompt: data.prompt ?? 
data.userPrompt }, }), signal: AbortSignal.timeout(3000), }); diff --git a/src/hooks/session-end.ts b/src/hooks/session-end.ts index 31bef22e..7efa550e 100644 --- a/src/hooks/session-end.ts +++ b/src/hooks/session-end.ts @@ -30,7 +30,7 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; try { await fetch(`${REST_URL}/agentmemory/session/end`, { @@ -76,4 +76,4 @@ async function main() { } } -main(); \ No newline at end of file +main(); diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts index a6cefe41..444edc32 100644 --- a/src/hooks/session-start.ts +++ b/src/hooks/session-start.ts @@ -49,7 +49,8 @@ async function main() { if (isSdkChildContext(data)) return; const sessionId = - (data.session_id as string) || `ses_${Date.now().toString(36)}`; + ((data.session_id || data.sessionId) as string) || + `ses_${Date.now().toString(36)}`; const project = (data.cwd as string) || process.cwd(); const url = `${REST_URL}/agentmemory/session/start`; diff --git a/src/hooks/stop.ts b/src/hooks/stop.ts index 1f2f5b8a..18ca371d 100644 --- a/src/hooks/stop.ts +++ b/src/hooks/stop.ts @@ -37,7 +37,7 @@ async function main() { return; } - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; try { await fetch(`${REST_URL}/agentmemory/summarize`, { @@ -51,4 +51,4 @@ async function main() { } } -main(); \ No newline at end of file +main(); diff --git a/src/hooks/subagent-start.ts b/src/hooks/subagent-start.ts index 3f730adb..f3f560bc 100644 --- a/src/hooks/subagent-start.ts +++ b/src/hooks/subagent-start.ts @@ -38,7 +38,9 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + 
const agentId = data.agent_id ?? data.agentName; + const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", @@ -50,8 +52,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), data: { - agent_id: data.agent_id, - agent_type: data.agent_type, + agent_id: agentId, + agent_type: agentType, }, }), signal: AbortSignal.timeout(TIMEOUT_MS), diff --git a/src/hooks/subagent-stop.ts b/src/hooks/subagent-stop.ts index c555746e..1f41e5a4 100644 --- a/src/hooks/subagent-stop.ts +++ b/src/hooks/subagent-stop.ts @@ -30,7 +30,9 @@ async function main() { if (isSdkChildContext(data)) return; - const sessionId = (data.session_id as string) || "unknown"; + const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + const agentId = data.agent_id ?? data.agentName; + const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; const lastMsg = typeof data.last_assistant_message === "string" ? data.last_assistant_message.slice(0, 4000) @@ -47,8 +49,8 @@ async function main() { cwd: data.cwd || process.cwd(), timestamp: new Date().toISOString(), data: { - agent_id: data.agent_id, - agent_type: data.agent_type, + agent_id: agentId, + agent_type: agentType, last_message: lastMsg, }, }), diff --git a/test/cli-connect.test.ts b/test/cli-connect.test.ts index 8443f06f..fbb8c2b5 100644 --- a/test/cli-connect.test.ts +++ b/test/cli-connect.test.ts @@ -10,6 +10,17 @@ import { } from "../src/cli/connect/index.js"; import type { ConnectAdapter } from "../src/cli/connect/types.js"; +const EXPECTED_COPILOT_MCP_COMMAND = + process.platform === "win32" + ? 
{ + command: process.env["ComSpec"] || process.env["COMSPEC"] || "cmd.exe", + args: ["/d", "/s", "/c", "npx", "-y", "@agentmemory/mcp"], + } + : { + command: "npx", + args: ["-y", "@agentmemory/mcp"], + }; + describe("agentmemory connect — dispatcher", () => { it("resolves every known agent by lowercase name", () => { for (const name of knownAgents()) { @@ -185,13 +196,17 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => let tmpHome: string; let originalHome: string | undefined; let originalUserprofile: string | undefined; + let originalCopilotHome: string | undefined; + let importCounter = 0; beforeEach(() => { tmpHome = mkdtempSync(join(tmpdir(), "am-connect-")); originalHome = process.env["HOME"]; originalUserprofile = process.env["USERPROFILE"]; + originalCopilotHome = process.env["COPILOT_HOME"]; process.env["HOME"] = tmpHome; process.env["USERPROFILE"] = tmpHome; + delete process.env["COPILOT_HOME"]; vi.resetModules(); }); @@ -201,12 +216,17 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => if (originalUserprofile !== undefined) process.env["USERPROFILE"] = originalUserprofile; else delete process.env["USERPROFILE"]; + if (originalCopilotHome !== undefined) + process.env["COPILOT_HOME"] = originalCopilotHome; + else delete process.env["COPILOT_HOME"]; rmSync(tmpHome, { recursive: true, force: true }); vi.resetModules(); }); async function loadAdapter(): Promise { - const mod = await import("../src/cli/connect/copilot-cli.js?t=" + Date.now()); + const mod = await import( + "../src/cli/connect/copilot-cli.js?t=" + Date.now() + "-" + importCounter++ + ); return (mod as { adapter: ConnectAdapter }).adapter; } @@ -229,8 +249,7 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => ); expect(config.mcpServers.agentmemory).toEqual({ type: "local", - command: "npx", - args: ["-y", "@agentmemory/mcp"], + ...EXPECTED_COPILOT_MCP_COMMAND, env: { AGENTMEMORY_URL: "${AGENTMEMORY_URL}", 
AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", @@ -242,6 +261,21 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => expect(second.kind).toBe("already-wired"); }); + it("honors COPILOT_HOME when locating mcp-config.json", async () => { + const customCopilotHome = join(tmpHome, "custom-copilot-home"); + process.env["COPILOT_HOME"] = customCopilotHome; + require("node:fs").mkdirSync(customCopilotHome, { recursive: true }); + + const a = await loadAdapter(); + expect(a.detect()).toBe(true); + + const result = await a.install({ dryRun: false, force: false }); + expect(result.kind).toBe("installed"); + expect(result.mutatedPath).toBe(join(customCopilotHome, "mcp-config.json")); + expect(existsSync(join(customCopilotHome, "mcp-config.json"))).toBe(true); + expect(existsSync(join(tmpHome, ".copilot", "mcp-config.json"))).toBe(false); + }); + it("install() preserves unrelated top-level keys and mcpServers entries", async () => { require("node:fs").mkdirSync(join(tmpHome, ".copilot"), { recursive: true }); writeFileSync( @@ -261,7 +295,9 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => ); expect(config.otherTopLevel).toEqual({ keep: true }); expect(config.mcpServers.other).toEqual({ type: "local", command: "other" }); - expect(config.mcpServers.agentmemory.command).toBe("npx"); + expect(config.mcpServers.agentmemory.command).toBe( + EXPECTED_COPILOT_MCP_COMMAND.command, + ); }); it("install() writes env passthrough block for AGENTMEMORY_URL + AGENTMEMORY_SECRET", async () => { @@ -287,8 +323,7 @@ describe("agentmemory connect — copilot-cli adapter (mock filesystem)", () => mcpServers: { agentmemory: { type: "local", - command: "npx", - args: ["-y", "@agentmemory/mcp"], + ...EXPECTED_COPILOT_MCP_COMMAND, env: { AGENTMEMORY_URL: "${AGENTMEMORY_URL}", AGENTMEMORY_SECRET: "${AGENTMEMORY_SECRET}", diff --git a/test/copilot-plugin.test.ts b/test/copilot-plugin.test.ts index f1e39c96..dc734e1e 100644 --- 
a/test/copilot-plugin.test.ts +++ b/test/copilot-plugin.test.ts @@ -1,6 +1,8 @@ import { describe, expect, it } from "vitest"; import { readFileSync, existsSync } from "node:fs"; import { join, resolve } from "node:path"; +import { createServer } from "node:http"; +import { spawn } from "node:child_process"; const repoRoot = resolve(__dirname, ".."); const pluginRoot = join(repoRoot, "plugin"); @@ -10,25 +12,25 @@ function readJson(path: string): T { } const SUPPORTED_COPILOT_EVENTS = new Set([ - "SessionStart", - "UserPromptSubmit", - "PreToolUse", - "PostToolUse", - "PostToolUseFailure", - "PreCompact", - "Stop", - "SessionEnd", - "SubagentStart", - "SubagentStop", - "Notification", + "sessionStart", + "userPromptSubmitted", + "preToolUse", + "postToolUse", + "postToolUseFailure", + "preCompact", + "agentStop", + "sessionEnd", + "subagentStart", + "subagentStop", + "notification", ]); const REQUIRED_MINIMUM_EVENTS = [ - "SessionStart", - "UserPromptSubmit", - "PreToolUse", - "PostToolUse", - "Stop", + "sessionStart", + "userPromptSubmitted", + "preToolUse", + "postToolUse", + "agentStop", ]; const KNOWN_SKILL_DIRS = [ @@ -42,9 +44,9 @@ const KNOWN_SKILL_DIRS = [ "commit-history", ]; -describe("Copilot plugin manifest (plugin/.plugin/plugin.json)", () => { +describe("Copilot plugin manifest (plugin/plugin.json)", () => { it("manifest exists with kebab-case name, version, and required fields", () => { - const manifestPath = join(pluginRoot, ".plugin/plugin.json"); + const manifestPath = join(pluginRoot, "plugin.json"); expect(existsSync(manifestPath)).toBe(true); const manifest = readJson<{ name: string; @@ -65,24 +67,24 @@ describe("Copilot plugin manifest (plugin/.plugin/plugin.json)", () => { it("manifest version matches main package.json", () => { const pkgVer = readJson<{ version: string }>(join(repoRoot, "package.json")).version; const pluginVer = readJson<{ version: string }>( - join(pluginRoot, ".plugin/plugin.json"), + join(pluginRoot, "plugin.json"), 
).version; expect(pluginVer).toBe(pkgVer); }); it("all referenced manifest paths resolve to existing files / directories", () => { const manifest = readJson<{ skills: string; mcpServers: string; hooks: string }>( - join(pluginRoot, ".plugin/plugin.json"), + join(pluginRoot, "plugin.json"), ); - const manifestDir = join(pluginRoot, ".plugin"); + const manifestDir = pluginRoot; expect(existsSync(resolve(manifestDir, manifest.skills))).toBe(true); expect(existsSync(resolve(manifestDir, manifest.mcpServers))).toBe(true); expect(existsSync(resolve(manifestDir, manifest.hooks))).toBe(true); }); it("skills path resolves and contains all known skill directories", () => { - const manifest = readJson<{ skills: string }>(join(pluginRoot, ".plugin/plugin.json")); - const manifestDir = join(pluginRoot, ".plugin"); + const manifest = readJson<{ skills: string }>(join(pluginRoot, "plugin.json")); + const manifestDir = pluginRoot; const skillsPath = resolve(manifestDir, manifest.skills); for (const skill of KNOWN_SKILL_DIRS) { expect( @@ -160,10 +162,10 @@ describe("Copilot hooks config (hooks/hooks.copilot.json)", () => { it("PreToolUse entry has the correct matcher", () => { const config = loadHooks(); - const preToolEntries = config.hooks["PreToolUse"]; + const preToolEntries = config.hooks["preToolUse"]; expect(preToolEntries).toBeDefined(); - const withMatcher = preToolEntries.find((e) => e.matcher === "Edit|Write|Read|Glob|Grep"); - expect(withMatcher, "PreToolUse must have matcher Edit|Write|Read|Glob|Grep").toBeDefined(); + const withMatcher = preToolEntries.find((e) => e.matcher === "edit|create|view|glob|grep"); + expect(withMatcher, "PreToolUse must have matcher edit|create|view|glob|grep").toBeDefined(); }); it("every handler has type === 'command' and exactly one of command/bash/powershell", () => { @@ -198,3 +200,152 @@ describe("Copilot hooks config (hooks/hooks.copilot.json)", () => { } }); }); + +describe("Copilot hook scripts", () => { + type ObservedRequest = { 
path: string; body: Record }; + + async function runHook( + script: string, + payload: Record, + env: Record = {}, + ): Promise<{ requests: ObservedRequest[]; stdout: string }> { + const requests: ObservedRequest[] = []; + const server = createServer((req, res) => { + let raw = ""; + req.on("data", (chunk) => { + raw += chunk; + }); + req.on("end", () => { + requests.push({ + path: req.url ?? "", + body: raw ? (JSON.parse(raw) as Record) : {}, + }); + res.writeHead(200, { "Content-Type": "application/json" }); + res.end(JSON.stringify({ context: "remembered context" })); + }); + }); + + await new Promise((resolveServer) => { + server.listen(0, "127.0.0.1", resolveServer); + }); + + const address = server.address(); + if (!address || typeof address === "string") { + server.close(); + throw new Error("test server did not bind to a TCP port"); + } + + try { + const child = spawn(process.execPath, [join(pluginRoot, script)], { + env: { + ...process.env, + AGENTMEMORY_URL: `http://127.0.0.1:${address.port}`, + AGENTMEMORY_SECRET: "", + ...env, + }, + stdio: ["pipe", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + child.stdout.on("data", (chunk) => { + stdout += chunk; + }); + child.stderr.on("data", (chunk) => { + stderr += chunk; + }); + child.stdin.end(JSON.stringify(payload)); + + const exitCode = await new Promise((resolveExit, reject) => { + const timeout = setTimeout(() => { + child.kill(); + reject(new Error(`hook ${script} timed out`)); + }, 5000); + child.on("error", reject); + child.on("close", (code) => { + clearTimeout(timeout); + resolveExit(code); + }); + }); + + expect(exitCode, stderr).toBe(0); + return { requests, stdout }; + } finally { + await new Promise((resolveClose) => { + server.close(() => resolveClose()); + }); + } + } + + it("session-start accepts Copilot camelCase sessionId", async () => { + const result = await runHook( + "scripts/session-start.mjs", + { sessionId: "copilot-session", cwd: "C:\\repo" }, + { 
AGENTMEMORY_INJECT_CONTEXT: "true" }, + ); + + expect(result.stdout).toBe("remembered context"); + expect(result.requests[0]?.path).toBe("/agentmemory/session/start"); + expect(result.requests[0]?.body).toMatchObject({ + sessionId: "copilot-session", + project: "C:\\repo", + cwd: "C:\\repo", + }); + }); + + it("prompt-submit accepts Copilot camelCase prompt payload", async () => { + const result = await runHook("scripts/prompt-submit.mjs", { + sessionId: "copilot-session", + cwd: "C:\\repo", + userPrompt: "remember this prompt", + }); + + expect(result.requests[0]?.path).toBe("/agentmemory/observe"); + expect(result.requests[0]?.body).toMatchObject({ + hookType: "prompt_submit", + sessionId: "copilot-session", + data: { prompt: "remember this prompt" }, + }); + }); + + it("post-tool-failure accepts Copilot camelCase tool and error payloads", async () => { + const result = await runHook("scripts/post-tool-failure.mjs", { + sessionId: "copilot-session", + cwd: "C:\\repo", + toolName: "edit", + toolArgs: { filePath: "src/index.ts" }, + errorMessage: "failed", + }); + + expect(result.requests[0]?.path).toBe("/agentmemory/observe"); + expect(result.requests[0]?.body).toMatchObject({ + hookType: "post_tool_failure", + sessionId: "copilot-session", + data: { + tool_name: "edit", + tool_input: JSON.stringify({ filePath: "src/index.ts" }), + error: "failed", + }, + }); + }); + + it("notification accepts Copilot camelCase notificationType", async () => { + const result = await runHook("scripts/notification.mjs", { + sessionId: "copilot-session", + cwd: "C:\\repo", + notificationType: "permission_prompt", + title: "Tool approval", + message: "Approve edit", + }); + + expect(result.requests[0]?.path).toBe("/agentmemory/observe"); + expect(result.requests[0]?.body).toMatchObject({ + hookType: "notification", + sessionId: "copilot-session", + data: { + notification_type: "permission_prompt", + title: "Tool approval", + message: "Approve edit", + }, + }); + }); +}); From 
46b0e86f3eebc4c19974840cbba9949d8fa61f0a Mon Sep 17 00:00:00 2001 From: Ross Story Date: Tue, 19 May 2026 02:04:50 -0700 Subject: [PATCH 04/34] Harden Copilot hook handling Addresses upstream AI review suggestions by aligning the Copilot preToolUse matcher with the hook allowlist, narrowing hook payload fields at runtime, normalizing subagent fallbacks, and tightening hook config validation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugin/hooks/hooks.copilot.json | 2 +- plugin/scripts/notification.mjs | 3 ++- plugin/scripts/pre-tool-use.mjs | 5 +++-- plugin/scripts/subagent-start.mjs | 4 ++-- plugin/scripts/subagent-stop.mjs | 4 ++-- src/hooks/notification.ts | 6 +++++- src/hooks/pre-tool-use.ts | 12 ++++++++++-- src/hooks/subagent-start.ts | 4 ++-- src/hooks/subagent-stop.ts | 4 ++-- test/copilot-plugin.test.ts | 11 ++++++++--- 10 files changed, 37 insertions(+), 18 deletions(-) diff --git a/plugin/hooks/hooks.copilot.json b/plugin/hooks/hooks.copilot.json index 62bd077a..b7d09f8b 100644 --- a/plugin/hooks/hooks.copilot.json +++ b/plugin/hooks/hooks.copilot.json @@ -16,7 +16,7 @@ "preToolUse": [ { "type": "command", - "matcher": "edit|create|view|glob|grep", + "matcher": "edit|write|create|read|view|glob|grep", "command": "node ${COPILOT_PLUGIN_ROOT}/scripts/pre-tool-use.mjs" } ], diff --git a/plugin/scripts/notification.mjs b/plugin/scripts/notification.mjs index 98049a7a..8ba2c9b0 100755 --- a/plugin/scripts/notification.mjs +++ b/plugin/scripts/notification.mjs @@ -24,7 +24,8 @@ async function main() { if (isSdkChildContext(data)) return; const notificationType = data.notification_type ?? data.notificationType; if (notificationType !== "permission_prompt") return; - const sessionId = data.session_id || data.sessionId || "unknown"; + const rawSessionId = data.session_id ?? data.sessionId; + const sessionId = typeof rawSessionId === "string" && rawSessionId.length > 0 ? 
rawSessionId : "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", diff --git a/plugin/scripts/pre-tool-use.mjs b/plugin/scripts/pre-tool-use.mjs index cab44fb8..3d9dd986 100755 --- a/plugin/scripts/pre-tool-use.mjs +++ b/plugin/scripts/pre-tool-use.mjs @@ -24,7 +24,7 @@ async function main() { return; } if (isSdkChildContext(data)) return; - const toolName = typeof data.tool_name === "string" ? data.tool_name : data.toolName; + const toolName = typeof data.tool_name === "string" ? data.tool_name : typeof data.toolName === "string" ? data.toolName : void 0; if (!toolName) return; const normalizedToolName = toolName.toLowerCase(); if (![ @@ -36,7 +36,8 @@ async function main() { "glob", "grep" ].includes(normalizedToolName)) return; - const toolInput = data.tool_input || data.toolArgs || {}; + const rawToolInput = data.tool_input ?? data.toolArgs; + const toolInput = typeof rawToolInput === "object" && rawToolInput !== null && !Array.isArray(rawToolInput) ? rawToolInput : {}; const files = []; const fileKeys = normalizedToolName === "grep" ? ["path", "file"] : [ "file_path", diff --git a/plugin/scripts/subagent-start.mjs b/plugin/scripts/subagent-start.mjs index b9872d8b..c0d0b5eb 100755 --- a/plugin/scripts/subagent-start.mjs +++ b/plugin/scripts/subagent-start.mjs @@ -24,8 +24,8 @@ async function main() { } if (isSdkChildContext(data)) return; const sessionId = data.session_id || data.sessionId || "unknown"; - const agentId = data.agent_id ?? data.agentName; - const agentType = data.agent_type ?? data.agentDisplayName ?? 
data.agentName; + const agentId = data.agent_id || data.agentName; + const agentType = data.agent_type || data.agentDisplayName || data.agentName; fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", headers: authHeaders(), diff --git a/plugin/scripts/subagent-stop.mjs b/plugin/scripts/subagent-stop.mjs index a04b068f..8765756d 100755 --- a/plugin/scripts/subagent-stop.mjs +++ b/plugin/scripts/subagent-stop.mjs @@ -23,8 +23,8 @@ async function main() { } if (isSdkChildContext(data)) return; const sessionId = data.session_id || data.sessionId || "unknown"; - const agentId = data.agent_id ?? data.agentName; - const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; + const agentId = data.agent_id || data.agentName; + const agentType = data.agent_type || data.agentDisplayName || data.agentName; const lastMsg = typeof data.last_assistant_message === "string" ? data.last_assistant_message.slice(0, 4e3) : ""; try { await fetch(`${REST_URL}/agentmemory/observe`, { diff --git a/src/hooks/notification.ts b/src/hooks/notification.ts index 42ef594c..51347d50 100644 --- a/src/hooks/notification.ts +++ b/src/hooks/notification.ts @@ -32,7 +32,11 @@ async function main() { const notificationType = data.notification_type ?? data.notificationType; if (notificationType !== "permission_prompt") return; - const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + const rawSessionId = data.session_id ?? data.sessionId; + const sessionId = + typeof rawSessionId === "string" && rawSessionId.length > 0 + ? rawSessionId + : "unknown"; try { await fetch(`${REST_URL}/agentmemory/observe`, { diff --git a/src/hooks/pre-tool-use.ts b/src/hooks/pre-tool-use.ts index 6fbf34b8..63061b3d 100644 --- a/src/hooks/pre-tool-use.ts +++ b/src/hooks/pre-tool-use.ts @@ -53,14 +53,22 @@ async function main() { const toolName = typeof data.tool_name === "string" ? data.tool_name - : (data.toolName as string); + : typeof data.toolName === "string" + ? 
data.toolName + : undefined; if (!toolName) return; const normalizedToolName = toolName.toLowerCase(); const fileTools = ["edit", "write", "create", "read", "view", "glob", "grep"]; if (!fileTools.includes(normalizedToolName)) return; - const toolInput = (data.tool_input || data.toolArgs || {}) as Record; + const rawToolInput = data.tool_input ?? data.toolArgs; + const toolInput = + typeof rawToolInput === "object" && + rawToolInput !== null && + !Array.isArray(rawToolInput) + ? (rawToolInput as Record) + : {}; const files: string[] = []; const fileKeys = normalizedToolName === "grep" diff --git a/src/hooks/subagent-start.ts b/src/hooks/subagent-start.ts index f3f560bc..3463da0b 100644 --- a/src/hooks/subagent-start.ts +++ b/src/hooks/subagent-start.ts @@ -39,8 +39,8 @@ async function main() { if (isSdkChildContext(data)) return; const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; - const agentId = data.agent_id ?? data.agentName; - const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; + const agentId = data.agent_id || data.agentName; + const agentType = data.agent_type || data.agentDisplayName || data.agentName; fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", diff --git a/src/hooks/subagent-stop.ts b/src/hooks/subagent-stop.ts index 1f41e5a4..90b99fd6 100644 --- a/src/hooks/subagent-stop.ts +++ b/src/hooks/subagent-stop.ts @@ -31,8 +31,8 @@ async function main() { if (isSdkChildContext(data)) return; const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; - const agentId = data.agent_id ?? data.agentName; - const agentType = data.agent_type ?? data.agentDisplayName ?? data.agentName; + const agentId = data.agent_id || data.agentName; + const agentType = data.agent_type || data.agentDisplayName || data.agentName; const lastMsg = typeof data.last_assistant_message === "string" ? 
data.last_assistant_message.slice(0, 4000) diff --git a/test/copilot-plugin.test.ts b/test/copilot-plugin.test.ts index dc734e1e..4956d2ac 100644 --- a/test/copilot-plugin.test.ts +++ b/test/copilot-plugin.test.ts @@ -164,8 +164,13 @@ describe("Copilot hooks config (hooks/hooks.copilot.json)", () => { const config = loadHooks(); const preToolEntries = config.hooks["preToolUse"]; expect(preToolEntries).toBeDefined(); - const withMatcher = preToolEntries.find((e) => e.matcher === "edit|create|view|glob|grep"); - expect(withMatcher, "PreToolUse must have matcher edit|create|view|glob|grep").toBeDefined(); + const withMatcher = preToolEntries.find( + (e) => e.matcher === "edit|write|create|read|view|glob|grep", + ); + expect( + withMatcher, + "PreToolUse must have matcher edit|write|create|read|view|glob|grep", + ).toBeDefined(); }); it("every handler has type === 'command' and exactly one of command/bash/powershell", () => { @@ -174,7 +179,7 @@ describe("Copilot hooks config (hooks/hooks.copilot.json)", () => { for (const handler of entries) { expect(handler.type, `${event} handler type`).toBe("command"); const commandFields = [handler.command, handler.bash, handler.powershell].filter( - (v) => v !== undefined, + (v): v is string => typeof v === "string" && v.trim().length > 0, ); expect( commandFields.length, From 71f4c6b3b3c008be851077cba45f1968b3f9ee4a Mon Sep 17 00:00:00 2001 From: Ross Story Date: Tue, 19 May 2026 02:25:57 -0700 Subject: [PATCH 05/34] Add Copilot to first-run onboarding Includes GitHub Copilot CLI in the first-run agent picker and adds a regression test so the Copilot setup path remains discoverable. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/cli/onboarding.ts | 5 +++-- test/onboarding.test.ts | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 test/onboarding.test.ts diff --git a/src/cli/onboarding.ts b/src/cli/onboarding.ts index 92b23d62..50fe9d47 100644 --- a/src/cli/onboarding.ts +++ b/src/cli/onboarding.ts @@ -36,6 +36,7 @@ const __dirname = dirname(fileURLToPath(import.meta.url)); // where they overlap; the rest fall back to the generic `◇`. const NATIVE_AGENTS: { value: string; label: string; glyph: string }[] = [ { value: "claude-code", label: "Claude Code", glyph: "⟁" }, + { value: "copilot-cli", label: "GitHub Copilot CLI", glyph: "◈" }, { value: "codex", label: "Codex", glyph: "◎" }, { value: "openhuman", label: "OpenHuman", glyph: "◇" }, { value: "openclaw", label: "OpenClaw", glyph: "◇" }, @@ -67,7 +68,7 @@ const PROVIDERS: { value: string; label: string; envKey: string | null }[] = [ { value: "skip", label: "Skip — BM25-only mode (no LLM key)", envKey: null }, ]; -function buildAgentOptions(): { value: string; label: string; hint?: string }[] { +export function buildAgentOptions(): { value: string; label: string; hint?: string }[] { return [ ...NATIVE_AGENTS.map((a) => ({ value: a.value, @@ -166,7 +167,7 @@ export async function runOnboarding(): Promise { [ "━ how this works ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━", "All selected agents share the same memory at :3111.", - "A memory saved by Claude Code is visible to Codex + Cursor instantly.", + "A memory saved by Claude Code is visible to Copilot + Codex + Cursor instantly.", "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━", ].join("\n"), ); diff --git a/test/onboarding.test.ts b/test/onboarding.test.ts new file mode 100644 index 00000000..a706dc06 --- /dev/null +++ b/test/onboarding.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, it } from "vitest"; + +import { buildAgentOptions } from 
"../src/cli/onboarding.js"; + +describe("first-run onboarding", () => { + it("offers GitHub Copilot CLI as a native setup target", () => { + const options = buildAgentOptions(); + expect(options).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + value: "copilot-cli", + label: expect.stringContaining("GitHub Copilot CLI"), + hint: "native plugin", + }), + ]), + ); + }); +}); From 11a0200334b2a44b135123e31fae8f6880621507 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Tue, 19 May 2026 02:38:50 -0700 Subject: [PATCH 06/34] Default onboarding to Copilot inside Copilot CLI Detect Copilot CLI environment markers during first-run setup so pressing Enter wires the current agent instead of the historical Claude Code default. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/cli/onboarding.ts | 11 ++++++++++- test/onboarding.test.ts | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/cli/onboarding.ts b/src/cli/onboarding.ts index 50fe9d47..76c446de 100644 --- a/src/cli/onboarding.ts +++ b/src/cli/onboarding.ts @@ -83,6 +83,15 @@ export function buildAgentOptions(): { value: string; label: string; hint?: stri ]; } +export function getInitialAgentValues( + env: Record = process.env, +): string[] { + if (env["COPILOT_CLI"] === "1" || env["COPILOT_AGENT_SESSION_ID"]) { + return ["copilot-cli"]; + } + return ["claude-code"]; +} + // Mirror src/cli.ts findEnvExample so onboarding ships the same .env // skeleton whether called directly or via `agentmemory init`. We // duplicate (rather than import) so the onboarding module doesn't @@ -154,7 +163,7 @@ export async function runOnboarding(): Promise { message: "Which agents will use agentmemory? (space to toggle, enter to confirm)", options: buildAgentOptions(), required: false, - initialValues: ["claude-code"], + initialValues: getInitialAgentValues(), }); if (p.isCancel(agentsPicked)) { p.cancel("Setup cancelled. 
Re-run any time with: agentmemory --reset"); diff --git a/test/onboarding.test.ts b/test/onboarding.test.ts index a706dc06..053085b8 100644 --- a/test/onboarding.test.ts +++ b/test/onboarding.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; -import { buildAgentOptions } from "../src/cli/onboarding.js"; +import { buildAgentOptions, getInitialAgentValues } from "../src/cli/onboarding.js"; describe("first-run onboarding", () => { it("offers GitHub Copilot CLI as a native setup target", () => { @@ -15,4 +15,13 @@ describe("first-run onboarding", () => { ]), ); }); + + it("selects GitHub Copilot CLI by default when running inside Copilot CLI", () => { + expect(getInitialAgentValues({ COPILOT_CLI: "1" })).toEqual(["copilot-cli"]); + expect(getInitialAgentValues({ COPILOT_AGENT_SESSION_ID: "session" })).toEqual(["copilot-cli"]); + }); + + it("keeps Claude Code as the default outside known agent environments", () => { + expect(getInitialAgentValues({})).toEqual(["claude-code"]); + }); }); From c21bb069ecc46ff372ceb1f523ba5450b28b6f40 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Tue, 19 May 2026 03:17:32 -0700 Subject: [PATCH 07/34] Support framed stdio MCP transport Accept Content-Length framed JSON-RPC messages in addition to the existing newline-delimited transport so Copilot CLI can initialize the standalone MCP server. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/mcp/transport.ts | 126 ++++++++++++++++++++++++++++++++++--- test/mcp-transport.test.ts | 46 ++++++++++++++ 2 files changed, 163 insertions(+), 9 deletions(-) diff --git a/src/mcp/transport.ts b/src/mcp/transport.ts index 766e6472..759ed019 100644 --- a/src/mcp/transport.ts +++ b/src/mcp/transport.ts @@ -1,5 +1,3 @@ -import { createInterface } from "node:readline"; - export interface JsonRpcRequest { jsonrpc: "2.0"; id?: string | number; @@ -19,6 +17,11 @@ export type RequestHandler = ( params: Record, ) => Promise; +export interface StdioMessageParser { + push: (chunk: Buffer | string) => void; + isFramed: () => boolean; +} + // JSON-RPC 2.0 notifications are messages without an `id` field. The spec // (and the MCP transport contract) requires the server to NOT send a // response for notifications. Some clients tolerate spurious responses; @@ -130,26 +133,131 @@ export async function processLine( } } +function findHeaderEnd(buffer: Buffer): { headerEnd: number; bodyStart: number } | null { + const crlf = buffer.indexOf("\r\n\r\n"); + const lf = buffer.indexOf("\n\n"); + if (crlf === -1 && lf === -1) return null; + if (crlf !== -1 && (lf === -1 || crlf <= lf)) { + return { headerEnd: crlf, bodyStart: crlf + 4 }; + } + return { headerEnd: lf, bodyStart: lf + 2 }; +} + +function parseContentLength(header: string): number | null { + for (const line of header.split(/\r?\n/)) { + const match = line.match(/^content-length:\s*(\d+)\s*$/i); + if (match) return Number(match[1]); + } + return null; +} + +export function formatResponse( + response: JsonRpcResponse, + framed: boolean, +): string | Buffer[] { + const body = JSON.stringify(response); + if (!framed) return `${body}\n`; + const bytes = Buffer.from(body, "utf8"); + return [Buffer.from(`Content-Length: ${bytes.length}\r\n\r\n`, "ascii"), bytes]; +} + +export function createMessageParser( + onMessage: (message: string) => void, + writeErr: 
(msg: string) => void = (msg) => process.stderr.write(msg), +): StdioMessageParser { + let buffer = Buffer.alloc(0); + let framed = false; + + function processBuffer(): void { + while (buffer.length > 0) { + if (buffer[0] === 10 || buffer[0] === 13) { + buffer = buffer.subarray(1); + continue; + } + + const preview = buffer.toString("ascii", 0, Math.min(buffer.length, 32)); + if (/^content-length:/i.test(preview)) { + const header = findHeaderEnd(buffer); + if (!header) return; + + const headerText = buffer.subarray(0, header.headerEnd).toString("ascii"); + const contentLength = parseContentLength(headerText); + if (contentLength === null) { + writeErr("[mcp-transport] missing Content-Length header\n"); + buffer = buffer.subarray(header.bodyStart); + continue; + } + + const messageEnd = header.bodyStart + contentLength; + if (buffer.length < messageEnd) return; + + framed = true; + const message = buffer.subarray(header.bodyStart, messageEnd).toString("utf8"); + buffer = buffer.subarray(messageEnd); + onMessage(message); + continue; + } + + const newline = buffer.indexOf(10); + if (newline === -1) return; + const line = buffer + .subarray(0, newline) + .toString("utf8") + .replace(/\r$/, ""); + buffer = buffer.subarray(newline + 1); + onMessage(line); + } + } + + return { + push(chunk) { + const bytes = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk, "utf8"); + buffer = Buffer.concat([buffer, bytes]); + processBuffer(); + }, + isFramed() { + return framed; + }, + }; +} + export function createStdioTransport(handler: RequestHandler): { start: () => void; stop: () => void; } { - let rl: ReturnType | null = null; + let parser: StdioMessageParser | null = null; + let queue = Promise.resolve(); const writeResponse = (response: JsonRpcResponse) => { - process.stdout.write(JSON.stringify(response) + "\n"); + const formatted = formatResponse(response, parser?.isFramed() ?? 
false); + if (typeof formatted === "string") { + process.stdout.write(formatted); + return; + } + for (const chunk of formatted) { + process.stdout.write(chunk); + } }; - const onLine = (line: string) => processLine(line, handler, writeResponse); + const onData = (chunk: Buffer) => parser?.push(chunk); return { start() { - rl = createInterface({ input: process.stdin }); - rl.on("line", onLine); + parser = createMessageParser((message) => { + queue = queue.then(() => processLine(message, handler, writeResponse)); + void queue.catch((err) => { + process.stderr.write( + `[mcp-transport] request processing failed: ${ + err instanceof Error ? err.message : String(err) + }\n`, + ); + }); + }); + process.stdin.on("data", onData); }, stop() { - rl?.close(); - rl = null; + process.stdin.off("data", onData); + parser = null; }, }; } diff --git a/test/mcp-transport.test.ts b/test/mcp-transport.test.ts index bb8627dc..006ecc9e 100644 --- a/test/mcp-transport.test.ts +++ b/test/mcp-transport.test.ts @@ -1,5 +1,7 @@ import { describe, it, expect, vi } from "vitest"; import { + createMessageParser, + formatResponse, processLine, type JsonRpcResponse, type RequestHandler, @@ -227,3 +229,47 @@ describe("processLine — id type validation (JSON-RPC §4)", () => { expect(c.out[0].result).toEqual({ method: "ping" }); }); }); + +describe("stdio framing", () => { + it("parses Content-Length framed MCP messages split across chunks", () => { + const messages: string[] = []; + const parser = createMessageParser((message) => messages.push(message)); + const body = JSON.stringify({ jsonrpc: "2.0", id: 1, method: "initialize" }); + const framed = `Content-Length: ${Buffer.byteLength(body, "utf8")}\r\n\r\n${body}`; + + parser.push(framed.slice(0, 12)); + parser.push(framed.slice(12)); + + expect(messages).toEqual([body]); + expect(parser.isFramed()).toBe(true); + }); + + it("parses newline-delimited JSON for existing clients", () => { + const messages: string[] = []; + const parser = 
createMessageParser((message) => messages.push(message)); + const first = JSON.stringify({ jsonrpc: "2.0", id: 1, method: "tools/list" }); + const second = JSON.stringify({ jsonrpc: "2.0", method: "notifications/initialized" }); + + parser.push(`${first}\n${second}\n`); + + expect(messages).toEqual([first, second]); + expect(parser.isFramed()).toBe(false); + }); + + it("formats responses with Content-Length framing when requested", () => { + const response: JsonRpcResponse = { + jsonrpc: "2.0", + id: 1, + result: { ok: true }, + }; + const formatted = formatResponse(response, true); + + expect(Array.isArray(formatted)).toBe(true); + if (!Array.isArray(formatted)) throw new Error("expected framed response"); + const header = formatted[0].toString("ascii"); + const body = formatted[1].toString("utf8"); + + expect(header).toBe(`Content-Length: ${Buffer.byteLength(body, "utf8")}\r\n\r\n`); + expect(JSON.parse(body)).toEqual(response); + }); +}); From 3a24c326b8cff5596ede9b5f266217cfa3af4b5a Mon Sep 17 00:00:00 2001 From: Jz Date: Tue, 19 May 2026 18:49:07 +0800 Subject: [PATCH 08/34] fix(viewer): prevent IME composition interruption in search inputs (#517) The viewer's five search inputs (graph, memories, lessons, actions, crystals) destroy and recreate their input DOM via innerHTML on every keystroke, which interrupts active IME composition sessions and makes non-Latin input (Chinese, Japanese, Korean) unusable. Additionally, the viewer's CSP includes script-src-attr 'none', which silently blocks the inline oninput=/onchange= handlers on the lessons, actions, and crystals panels. Those three search/filter controls have been non-functional under the strict CSP. This patch: 1. Adds a bindImeSafeSearch helper that guards on both an explicit compositionstart/compositionend flag and event.isComposing. compositionend triggers an immediate commit and sets a justCommitted one-shot flag to suppress the redundant trailing input event that browsers dispatch after compositionend. 
2. Adds captureSearchFocus/restoreSearchFocus helpers to preserve focus and cursor position across innerHTML rebuilds, so multi-word IME input doesn't require clicking back into the search box after each commit. 3. Migrates all five search inputs to addEventListener via the new helpers, removing the CSP-blocked inline handlers on lessons, actions, and crystals. The actions panel's status filter '; + html += ''; html += '' + items.length + ' lessons'; html += ''; @@ -2882,7 +2913,11 @@

agentmemory

html += ''; } + var __focus = captureSearchFocus(['lessons-search']); el.innerHTML = html; + var __ls = document.getElementById('lessons-search'); + if (__ls) bindImeSafeSearch(__ls, 200, function(v){ state.lessons.search = v; renderLessons(); }); + restoreSearchFocus(__focus); } async function loadActions() { @@ -2912,8 +2947,8 @@

agentmemory

} var html = '
'; - html += ''; - html += ''; + html += ''; + html += ''; html += '' + items.length + ' crystals'; html += '
'; @@ -3060,7 +3101,11 @@

agentmemory

}); } + var __focus = captureSearchFocus(['crystals-search']); el.innerHTML = html; + var __cs = document.getElementById('crystals-search'); + if (__cs) bindImeSafeSearch(__cs, 200, function(v){ state.crystals.search = v; renderCrystals(); }); + restoreSearchFocus(__focus); } async function loadAudit() { From a74c2880d5ecaa259d08f71e5910d02cfb4d2185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EB=AF=BC=EC=9E=AC?= Date: Tue, 19 May 2026 20:57:47 +0900 Subject: [PATCH 09/34] fix(config): honor env file drop-stale-index flag (#461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 이민재 <19909783+honor2030@users.noreply.github.com> --- src/config.ts | 4 ++++ src/index.ts | 4 ++-- test/env-loader.test.ts | 7 +++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/config.ts b/src/config.ts index 4a416ed1..eed5725e 100644 --- a/src/config.ts +++ b/src/config.ts @@ -159,6 +159,10 @@ export function getEnvVar(key: string): string | undefined { return getMergedEnv()[key]; } +export function isDropStaleIndexEnabled(): boolean { + return getMergedEnv()["AGENTMEMORY_DROP_STALE_INDEX"] === "true"; +} + export function detectLlmProviderKind(): "llm" | "noop" { const env = getMergedEnv(); if ( diff --git a/src/index.ts b/src/index.ts index b9b9e84d..630475c2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -11,6 +11,7 @@ import { isAutoCompressEnabled, isConsolidationEnabled, isContextInjectionEnabled, + isDropStaleIndexEnabled, } from "./config.js"; import { createProvider, @@ -376,8 +377,7 @@ async function main() { .map((m) => `${m.obsId} (dim=${m.dim})`) .join(", "); const distinct = Array.from(seenDimensions).sort((a, b) => a - b).join(", "); - const dropStale = - process.env["AGENTMEMORY_DROP_STALE_INDEX"] === "true"; + const dropStale = isDropStaleIndexEnabled(); if (dropStale) { console.warn( `[agentmemory] Persisted vector index has ${mismatches.length} of ` + diff --git 
a/test/env-loader.test.ts b/test/env-loader.test.ts index 9c6f2955..17ff6a8e 100644 --- a/test/env-loader.test.ts +++ b/test/env-loader.test.ts @@ -25,6 +25,7 @@ describe("loadEnvFile", () => { process.env["HOME"] = sandboxHome; process.env["USERPROFILE"] = sandboxHome; delete process.env["AGENTMEMORY_AUTO_COMPRESS"]; + delete process.env["AGENTMEMORY_DROP_STALE_INDEX"]; delete process.env["CONSOLIDATION_ENABLED"]; delete process.env["GRAPH_EXTRACTION_ENABLED"]; delete process.env["TOKEN"]; @@ -82,4 +83,10 @@ describe("loadEnvFile", () => { const cfg = await freshConfig(); expect(cfg.getEnvVar("TOKEN")).toBe("abc"); }); + + it("reads AGENTMEMORY_DROP_STALE_INDEX from the env file", async () => { + writeEnv("AGENTMEMORY_DROP_STALE_INDEX=true"); + const cfg = await freshConfig(); + expect(cfg.isDropStaleIndexEnabled()).toBe(true); + }); }); From 48bf700f62e66c20ee3dc085971d9228eee62ab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EB=AF=BC=EC=9E=AC?= Date: Tue, 19 May 2026 20:59:22 +0900 Subject: [PATCH 10/34] fix(hooks): quote plugin script paths (#487) Co-authored-by: honor2030 <19909783+honor2030@users.noreply.github.com> --- plugin/hooks/hooks.codex.json | 12 ++++++------ plugin/hooks/hooks.json | 24 ++++++++++++------------ test/codex-plugin.test.ts | 27 ++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/plugin/hooks/hooks.codex.json b/plugin/hooks/hooks.codex.json index 73e43c66..d2c3a3b6 100644 --- a/plugin/hooks/hooks.codex.json +++ b/plugin/hooks/hooks.codex.json @@ -5,7 +5,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/session-start.mjs", + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/session-start.mjs\"", "statusMessage": "agentmemory: loading session context" } ] @@ -16,7 +16,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/prompt-submit.mjs", + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/prompt-submit.mjs\"", "statusMessage": 
"agentmemory: recalling relevant memories" } ] @@ -28,7 +28,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/pre-tool-use.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/pre-tool-use.mjs\"" } ] } @@ -38,7 +38,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use.mjs\"" } ] } @@ -48,7 +48,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/pre-compact.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/pre-compact.mjs\"" } ] } @@ -58,7 +58,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/stop.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/stop.mjs\"" } ] } diff --git a/plugin/hooks/hooks.json b/plugin/hooks/hooks.json index d60d664a..a13c9973 100644 --- a/plugin/hooks/hooks.json +++ b/plugin/hooks/hooks.json @@ -5,7 +5,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/session-start.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/session-start.mjs\"" } ] } @@ -15,7 +15,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/prompt-submit.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/prompt-submit.mjs\"" } ] } @@ -26,7 +26,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/pre-tool-use.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/pre-tool-use.mjs\"" } ] } @@ -36,7 +36,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use.mjs\"" } ] } @@ -46,7 +46,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-failure.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-failure.mjs\"" } ] } @@ -56,7 +56,7 @@ "hooks": [ { "type": "command", - 
"command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/pre-compact.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/pre-compact.mjs\"" } ] } @@ -66,7 +66,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/subagent-start.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/subagent-start.mjs\"" } ] } @@ -76,7 +76,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/subagent-stop.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/subagent-stop.mjs\"" } ] } @@ -86,7 +86,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/notification.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/notification.mjs\"" } ] } @@ -96,7 +96,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/task-completed.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/task-completed.mjs\"" } ] } @@ -106,7 +106,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/stop.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/stop.mjs\"" } ] } @@ -116,7 +116,7 @@ "hooks": [ { "type": "command", - "command": "node ${CLAUDE_PLUGIN_ROOT}/scripts/session-end.mjs" + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/session-end.mjs\"" } ] } diff --git a/test/codex-plugin.test.ts b/test/codex-plugin.test.ts index bb380876..bbbd88db 100644 --- a/test/codex-plugin.test.ts +++ b/test/codex-plugin.test.ts @@ -9,6 +9,29 @@ function readJson(path: string): T { return JSON.parse(readFileSync(path, "utf-8")) as T; } +type HookHandler = { type: string; command: string }; +type HookEntry = { hooks: HookHandler[] }; + +function hookCommands(path: string): string[] { + const manifest = readJson<{ hooks: Record }>(path); + return Object.values(manifest.hooks).flatMap((entries) => + entries.flatMap((entry) => entry.hooks.map((handler) => handler.command)), + ); +} + +describe("Plugin hook manifests", () => { + it("quote plugin script paths 
so roots with spaces stay intact", () => { + for (const manifest of ["hooks.json", "hooks.codex.json"]) { + const commands = hookCommands(join(pluginRoot, "hooks", manifest)); + expect(commands.length, `${manifest} should contain hook commands`).toBeGreaterThan(0); + + for (const command of commands) { + expect(command).toMatch(/^node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/[^\s"]+\.mjs"$/); + } + } + }); +}); + describe("Codex plugin manifest (developers.openai.com/codex/plugins)", () => { it("ships .codex-plugin/plugin.json with kebab-case name + version + references", () => { const manifestPath = join(pluginRoot, ".codex-plugin/plugin.json"); @@ -72,8 +95,6 @@ describe("Codex plugin manifest (developers.openai.com/codex/plugins)", () => { }); it("hook command scripts referenced in hooks.codex.json exist on disk", () => { - type HookHandler = { type: string; command: string }; - type HookEntry = { hooks: HookHandler[] }; const hooks = readJson<{ hooks: Record<string, HookEntry[]> }>( join(pluginRoot, "hooks/hooks.codex.json"), ); @@ -81,7 +102,7 @@ describe("Codex plugin manifest (developers.openai.com/codex/plugins)", () => { for (const entries of Object.values(hooks.hooks)) { for (const entry of entries) { for (const handler of entry.hooks) { - const match = handler.command.match(/\$\{CLAUDE_PLUGIN_ROOT\}\/(scripts\/[^\s]+)/); + const match = handler.command.match(/\$\{CLAUDE_PLUGIN_ROOT\}\/(scripts\/[^\s"]+)/); if (match) scriptRefs.add(match[1]); } } From c2f231fe8bcf9b1fa296ad5ee81267eec94de768 Mon Sep 17 00:00:00 2001 From: Serhii Zghama <20826225+serhiizghama@users.noreply.github.com> Date: Tue, 19 May 2026 19:01:21 +0700 Subject: [PATCH 11/34] fix(mcp): memory_recall hits the right endpoint and forwards format/token_budget (#507) (#516) * fix(mcp): route memory_recall to /agentmemory/search and forward format/token_budget memory_recall and memory_smart_search were sharing the smart-search endpoint, which always returns compact mode and silently drops the format and token_budget
parameters that the tool schema advertises. Split the cases so memory_recall hits /agentmemory/search (which honors format) while memory_smart_search keeps its own endpoint. Default format to "full" for memory_recall so the documented behavior matches the wire call. Signed-off-by: serhiizghama * test(mcp): cover memory_recall endpoint, format forwarding, and defaults Two new proxy tests for issue #507: one asserts memory_recall calls POST /agentmemory/search with the format and token_budget fields, and never falls through to smart-search; the other pins the default format to "full" when the caller omits it. Signed-off-by: serhiizghama --------- Signed-off-by: serhiizghama --- src/mcp/standalone.ts | 32 ++++++++++++++++-- test/mcp-standalone-proxy.test.ts | 55 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/mcp/standalone.ts b/src/mcp/standalone.ts index 86678a76..1413cbf8 100644 --- a/src/mcp/standalone.ts +++ b/src/mcp/standalone.ts @@ -89,6 +89,8 @@ interface Validated { files?: string[]; query?: string; limit?: number; + format?: string; + tokenBudget?: number; memoryIds?: string[]; reason?: string; } @@ -118,6 +120,17 @@ function validate(toolName: string, args: Record): Validated { } v.query = query.trim(); v.limit = parseLimit(args["limit"]); + const fmt = args["format"]; + if (typeof fmt === "string" && fmt.trim()) { + v.format = fmt.trim().toLowerCase(); + } + const budget = args["token_budget"]; + if (typeof budget === "number" && Number.isFinite(budget) && budget > 0) { + v.tokenBudget = Math.floor(budget); + } else if (typeof budget === "string" && budget.trim()) { + const n = Number(budget); + if (Number.isFinite(n) && n > 0) v.tokenBudget = Math.floor(n); + } return v; } case "memory_sessions": { @@ -159,11 +172,26 @@ async function handleProxy( }); return textResponse(result); } - case "memory_recall": + case "memory_recall": { + const body: Record = { + query: v.query, + limit: v.limit, + format: 
v.format ?? "full", + }; + if (v.tokenBudget != null) body["token_budget"] = v.tokenBudget; + const result = await handle.call("/agentmemory/search", { + method: "POST", + body: JSON.stringify(body), + }); + return textResponse(result, true); + } case "memory_smart_search": { + const body: Record = { query: v.query, limit: v.limit }; + if (v.format != null) body["format"] = v.format; + if (v.tokenBudget != null) body["token_budget"] = v.tokenBudget; const result = await handle.call("/agentmemory/smart-search", { method: "POST", - body: JSON.stringify({ query: v.query, limit: v.limit }), + body: JSON.stringify(body), }); return textResponse(result, true); } diff --git a/test/mcp-standalone-proxy.test.ts b/test/mcp-standalone-proxy.test.ts index 0d93b227..dc08a024 100644 --- a/test/mcp-standalone-proxy.test.ts +++ b/test/mcp-standalone-proxy.test.ts @@ -75,6 +75,61 @@ describe("@agentmemory/mcp standalone — server proxy (issue #159)", () => { expect(body.results[0].id).toBe("m1"); }); + it("proxies memory_recall to POST /agentmemory/search and forwards format/token_budget (#507)", async () => { + const calls: Array<{ url: string; body?: unknown }> = []; + installFetch((url, init) => { + if (url.endsWith("/agentmemory/livez")) return new Response("ok", { status: 200 }); + const body = init?.body ? 
JSON.parse(init.body as string) : undefined; + calls.push({ url, body }); + if (url.endsWith("/agentmemory/search")) { + return new Response( + JSON.stringify({ + mode: "full", + facts: [{ id: "m1" }], + narrative: "n", + concepts: ["c"], + files: ["f"], + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + } + return new Response("not found", { status: 404 }); + }); + const res = await handleToolCall("memory_recall", { + query: "auth bug", + limit: 5, + format: "full", + token_budget: 800, + }); + const body = JSON.parse(res.content[0].text); + expect(body.mode).toBe("full"); + expect(body.facts[0].id).toBe("m1"); + const searchCall = calls.find((c) => c.url.endsWith("/agentmemory/search")); + expect(searchCall).toBeDefined(); + expect(searchCall?.body).toEqual({ + query: "auth bug", + limit: 5, + format: "full", + token_budget: 800, + }); + expect(calls.find((c) => c.url.endsWith("/agentmemory/smart-search"))).toBeUndefined(); + }); + + it("memory_recall defaults format to 'full' when omitted (#507)", async () => { + let recallBody: Record | undefined; + installFetch((url, init) => { + if (url.endsWith("/agentmemory/livez")) return new Response("ok", { status: 200 }); + if (url.endsWith("/agentmemory/search")) { + recallBody = init?.body ? 
JSON.parse(init.body as string) : undefined; + return new Response(JSON.stringify({ mode: "full", facts: [] }), { status: 200 }); + } + return new Response("not found", { status: 404 }); + }); + await handleToolCall("memory_recall", { query: "x" }); + expect(recallBody?.["format"]).toBe("full"); + expect(recallBody).not.toHaveProperty("token_budget"); + }); + it("proxies memory_governance_delete to the DELETE REST endpoint", async () => { const calls: Array<{ url: string; method: string; body?: unknown }> = []; installFetch((url, init) => { From 2a027d8952ee8c4656fc622f47e57ccfbfebf911 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 18:23:28 +0100 Subject: [PATCH 12/34] revert: drop --next workaround for iii-console installer (upstream #1660 shipped) (#546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.19 (#460 / commit bb259ac) routed the first-run iii-console install through `bash -s -- --next` to dodge the upstream tag-prefix bug at iii-hq/iii#1652. Upstream PR iii-hq/iii#1660 fixed the bug on 2026-05-19 — installer's jq filter now accepts both `iii/v...` and bare `v...` tags, and `-v X.Y.Z` falls back gracefully. `install.iii.dev/console/main/install.sh` is a thin proxy serving `raw.githubusercontent.com/iii-hq/iii/main/console/install.sh` with a 5-minute CDN cache — verified byte-for-byte that the live URL already serves the post-#1660 fix. No iii release tag needed. Switch agentmemory back to the canonical bare invocation: curl -fsSL https://install.iii.dev/console/main/install.sh | sh Drops the workaround comment block (10 lines) explaining the prior detour. v0.9.19/v0.9.20 users on the `--next` path will still resolve a valid release (next-release lookup also handles `iii/v...-next.*` correctly post-#1660), so this isn't a forced upgrade. 1038/1038 tests pass. 
--- src/cli.ts | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 5eca18ce..27885a95 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -497,17 +497,8 @@ function detectIiiConsole(): IiiConsoleState { return { kind: "missing" }; } -// install.iii.dev/console/main/install.sh has a bug in its release-tag -// filter that rejects every stable release for iii-hq/iii: the jq -// predicate uses `startswith("v")` while the actual tags are -// `iii/v0.12.0` (slash-prefixed). The `--next` path uses a regex -// without the startswith constraint and therefore works today, -// installing the most recent prerelease (e.g. iii/v0.14.0-next.1). -// -// Pass `--next` until the upstream fix lands (iii-hq/iii#1652). -// Switch back to the bare invocation once the script is patched. const III_CONSOLE_INSTALL_CMD = - "curl -fsSL https://install.iii.dev/console/main/install.sh | bash -s -- --next"; + "curl -fsSL https://install.iii.dev/console/main/install.sh | sh"; async function ensureIiiConsole(): Promise { const state = detectIiiConsole(); From b4259229a6163f3ca92721dc64efe1c13ffa7f22 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 18:28:55 +0100 Subject: [PATCH 13/34] feat(repo): add Sponsor button + GH Packages mirror for sidebar surface (#545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(repo): add Sponsor button + GH Packages mirror for sidebar surface Three additions that make the repo page surface clearer + give users a single place to fund the project: 1. `.github/FUNDING.yml` — `github: [rohitg00]` renders the "Sponsor" button at the top of the repo + the Sponsor widget in the right sidebar. Requires GitHub Sponsors to be enabled at github.com/sponsors/accounts on the rohitg00 profile before the link resolves (currently 404s — enable before merging this PR). 2. 
`.github/workflows/publish.yml` — new `publish-github-packages` job runs after the existing public-npm publish completes. Republishes the main package as `@rohitg00/agentmemory` to `npm.pkg.github.com`. The repo's right-sidebar "Packages" widget only surfaces packages on GitHub Packages, not packages on the public npm registry, so this is what makes the sidebar widget non-empty. Public npm remains the canonical install source; GH Packages is purely a discovery surface. - Uses built-in GITHUB_TOKEN, no new secrets needed. - Rewrites package.json `name` + `publishConfig` in-runner via a small node one-liner, publishes, then restores the original so main isn't permanently scope-changed. - Skip-on-already-published guard mirrors the existing public publish steps. - Marked `|| echo "non-fatal"` so a GH Packages hiccup never blocks the canonical npm release. - `permissions: packages: write` added at workflow level. 3. README badge row — added `npm downloads`, `GitHub Packages mirror`, and `Sponsor rohitg00 on GitHub Sponsors` badges alongside the existing `npm version` / `CI` / `License` / `Stars` row. The sponsor badge is the same link the FUNDING.yml sidebar widget uses; surfacing it in-README means readers who don't notice the sidebar still see it. Out of scope (asked, declined): - Docker Hub / ghcr.io publish workflow. Not in this PR. * ci(publish): scope write perms per-job + persist-credentials false Inline review on #545 flagged that the workflow-level permissions block granted `id-token: write` + `packages: write` to every job, including ones that don't need them. Tightened to least-privilege: - Workflow-level: only `contents: read`. - `publish` job: adds `id-token: write` (required for `npm publish --provenance` to mint a Sigstore OIDC token). The GH Packages job doesn't inherit this. - `publish-github-packages` job: adds `packages: write` (required to push to npm.pkg.github.com). The public-npm publish job doesn't inherit this. 
Both `actions/checkout@v6` calls also pick up `persist-credentials: false`. The publish steps never push back to the repo, so the GITHUB_TOKEN doesn't need to land in `.git/config` after checkout. Same posture both jobs. Skipped from the same review pass: - **Pin actions to commit SHAs.** Industry rule but introduces real maintenance friction — Renovate/Dependabot don't auto-bump SHA-pinned actions to new minors, so SHA pinning trades easy semver tracking for stale-action drift. We stay on `@v6` major-tag pins (GitHub publishes those via verified moving refs). - **Disable setup-node cache.** `actions/setup-node@v6` defaults to cache-off (the `cache:` input is opt-in). `package-manager-cache` only auto-enables when `package.json` has a `packageManager` field — agentmemory's doesn't (verified via `grep`). The fix is a no-op on this workflow. --- .github/FUNDING.yml | 5 +++ .github/workflows/publish.yml | 63 ++++++++++++++++++++++++++++++++++- README.md | 3 ++ 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..4fbf3e44 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,5 @@ +# GitHub renders this as the "Sponsor" button at the top of the repo +# and the Sponsor widget in the right sidebar. Enable GitHub Sponsors +# on the profile at github.com/sponsors/accounts before merging so the +# resulting link doesn't 404. +github: [rohitg00] diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 62dc8925..7162c585 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,15 +10,30 @@ on: required: false default: "agentmemory,mcp,fs-watcher" +# Workflow-level permissions stay minimal — only `contents: read` +# is required to check out the repo. 
Write scopes (`id-token: write` +# for npm provenance, `packages: write` for the GH Packages mirror) +# are granted per-job below so neither job inherits a scope it +# doesn't need. permissions: contents: read - id-token: write jobs: publish: runs-on: ubuntu-latest + # `id-token: write` is required for npm publish --provenance to + # mint a Sigstore OIDC token on this run. Scoped to this job only + # so the GH Packages mirror job doesn't inherit it. + permissions: + contents: read + id-token: write steps: - uses: actions/checkout@v6 + with: + # Don't persist the GITHUB_TOKEN to .git/config — the + # publish steps don't push back to the repo, so the token + # only needs to live in memory for this checkout. + persist-credentials: false - uses: actions/setup-node@v6 with: @@ -112,3 +127,49 @@ jobs: done echo "ERROR: fs-watcher never propagated after 2 minutes" >&2 exit 1 + + publish-github-packages: + # Mirror the same artifacts to GitHub Packages so the repo's + # right-sidebar "Packages" widget surfaces them. Scoped to + # @rohitg00 since GH Packages requires the org/user to match the + # repo owner. Public npm registry remains the canonical install + # source; this is purely a discovery surface on the repo page. + needs: publish + runs-on: ubuntu-latest + # `packages: write` only here — the public-npm publish job + # doesn't need it. `contents: read` for the checkout. + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v6 + with: + # Same posture as the publish job — no push back to the + # repo, so don't persist the token to `.git/config`. 
+ persist-credentials: false + + - uses: actions/setup-node@v6 + with: + node-version: 22 + registry-url: https://npm.pkg.github.com + scope: "@rohitg00" + + - run: npm install --package-lock-only --legacy-peer-deps --no-audit --no-fund + - run: npm ci --legacy-peer-deps --no-audit --no-fund + - run: npm run build + + - name: Publish @rohitg00/agentmemory to GitHub Packages + run: | + VERSION=$(node -p "require('./package.json').version") + # Save original package.json, rewrite scope for GH Packages, + # publish, restore. Avoids a permanent scope change in main. + cp package.json package.json.bak + node -e "const p=require('./package.json');p.name='@rohitg00/agentmemory';p.publishConfig={registry:'https://npm.pkg.github.com'};require('fs').writeFileSync('package.json',JSON.stringify(p,null,2));" + if npm view "@rohitg00/agentmemory@$VERSION" version --registry=https://npm.pkg.github.com >/dev/null 2>&1; then + echo "GH Packages version already published, skipping" + else + npm publish --access public || echo "GH Packages publish failed (non-fatal)" + fi + mv package.json.bak package.json + env: + NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/README.md b/README.md index ef840011..21cd623f 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,12 @@

npm version + npm downloads + GitHub Packages mirror CI License Stars + Sponsor rohitg00 on GitHub Sponsors

From 6ed47c155a7641d1b660cd717f1566aab65c9ebe Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 18:45:04 +0100 Subject: [PATCH 14/34] fix(funding): tighten FUNDING.yml to canonical single-line form (#547) Sponsor button still missing from the repo page despite #545 merging. The committed FUNDING.yml started with 4 lines of `#` comments before the canonical `github: [rohitg00]` directive. GitHub's FUNDING parser documents only the canonical key-value form; leading comments shouldn't break it but some users have reported indexer lag when the file starts with non-data lines. Strip to the bare single-line form to match the documented schema and remove any ambiguity. Sponsor profile is enabled (github.com/sponsors/rohitg00 returns 200 + 'Sponsor @rohitg00' button), so the only remaining gap is GitHub's side-bar indexing. Tightening the file forces a re-parse. --- .github/FUNDING.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 4fbf3e44..a2f5e0c5 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,5 +1 @@ -# GitHub renders this as the "Sponsor" button at the top of the repo -# and the Sponsor widget in the right sidebar. Enable GitHub Sponsors -# on the profile at github.com/sponsors/accounts before merging so the -# resulting link doesn't 404. github: [rohitg00] From 632fa3531d14f3892278a0ee9d8df13abe1c8bbb Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 18:58:55 +0100 Subject: [PATCH 15/34] revert: drop GH Packages mirror, keep single canonical install path (#548) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverting the GH Packages publish from #545. 
GH Packages is a separate registry from npmjs.com — anyone installing `@rohitg00/agentmemory` from `npm.pkg.github.com` needs to point their registry there and authenticate, which is friction users don't have on the canonical `@agentmemory/agentmemory` install from public npm. The right-sidebar Packages widget on the repo page was the only motivation for the mirror. Acceptable to leave it empty — the single canonical install path is the better DX. - Drop `publish-github-packages` job from `.github/workflows/publish.yml` - Drop `packages: write` perm wording from the workflow comment block - Remove "GitHub Packages mirror" badge from README Manual follow-up (post-merge): delete the already-published `@rohitg00/agentmemory@0.9.20` from GH Packages registry via github.com/users/rohitg00/packages/npm/agentmemory/settings → Delete. --- .github/workflows/publish.yml | 55 ++--------------------------------- README.md | 1 - 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7162c585..00003399 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -11,19 +11,14 @@ on: default: "agentmemory,mcp,fs-watcher" # Workflow-level permissions stay minimal — only `contents: read` -# is required to check out the repo. Write scopes (`id-token: write` -# for npm provenance, `packages: write` for the GH Packages mirror) -# are granted per-job below so neither job inherits a scope it -# doesn't need. +# is required to check out the repo. `id-token: write` is granted on +# the publish job for npm's --provenance Sigstore OIDC mint. permissions: contents: read jobs: publish: runs-on: ubuntu-latest - # `id-token: write` is required for npm publish --provenance to - # mint a Sigstore OIDC token on this run. Scoped to this job only - # so the GH Packages mirror job doesn't inherit it. 
permissions: contents: read id-token: write @@ -127,49 +122,3 @@ jobs: done echo "ERROR: fs-watcher never propagated after 2 minutes" >&2 exit 1 - - publish-github-packages: - # Mirror the same artifacts to GitHub Packages so the repo's - # right-sidebar "Packages" widget surfaces them. Scoped to - # @rohitg00 since GH Packages requires the org/user to match the - # repo owner. Public npm registry remains the canonical install - # source; this is purely a discovery surface on the repo page. - needs: publish - runs-on: ubuntu-latest - # `packages: write` only here — the public-npm publish job - # doesn't need it. `contents: read` for the checkout. - permissions: - contents: read - packages: write - steps: - - uses: actions/checkout@v6 - with: - # Same posture as the publish job — no push back to the - # repo, so don't persist the token to `.git/config`. - persist-credentials: false - - - uses: actions/setup-node@v6 - with: - node-version: 22 - registry-url: https://npm.pkg.github.com - scope: "@rohitg00" - - - run: npm install --package-lock-only --legacy-peer-deps --no-audit --no-fund - - run: npm ci --legacy-peer-deps --no-audit --no-fund - - run: npm run build - - - name: Publish @rohitg00/agentmemory to GitHub Packages - run: | - VERSION=$(node -p "require('./package.json').version") - # Save original package.json, rewrite scope for GH Packages, - # publish, restore. Avoids a permanent scope change in main. 
- cp package.json package.json.bak - node -e "const p=require('./package.json');p.name='@rohitg00/agentmemory';p.publishConfig={registry:'https://npm.pkg.github.com'};require('fs').writeFileSync('package.json',JSON.stringify(p,null,2));" - if npm view "@rohitg00/agentmemory@$VERSION" version --registry=https://npm.pkg.github.com >/dev/null 2>&1; then - echo "GH Packages version already published, skipping" - else - npm publish --access public || echo "GH Packages publish failed (non-fatal)" - fi - mv package.json.bak package.json - env: - NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/README.md b/README.md index 21cd623f..11d24da7 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,6 @@

npm version npm downloads - GitHub Packages mirror CI License Stars From 564c24bdb10016fcf422a84136cc7e11551413fc Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 19:05:06 +0100 Subject: [PATCH 16/34] =?UTF-8?q?docs(readme):=20drop=20sponsor=20badge=20?= =?UTF-8?q?=E2=80=94=20sidebar=20widget=20covers=20it=20(#549)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub auto-renders the "Sponsor this project" widget in the right sidebar from .github/FUNDING.yml (Sponsor button + heart icon + "Learn more about GitHub Sponsors" link). The README badge was redundant noise on the top badge row. Sidebar widget is the canonical surface — one path, one click. --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 11d24da7..cfa87bc4 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,6 @@ CI License Stars - Sponsor rohitg00 on GitHub Sponsors

From db9f000dcac11b1b703f99add3edfce4d966317c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EB=AF=BC=EC=9E=AC?= Date: Wed, 20 May 2026 03:18:48 +0900 Subject: [PATCH 17/34] fix(cli): skip onboarding prompts without a tty (#491) Co-authored-by: honor2030 <19909783+honor2030@users.noreply.github.com> --- src/cli/onboarding.ts | 24 ++++++++++ test/cli-onboarding.test.ts | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 test/cli-onboarding.test.ts diff --git a/src/cli/onboarding.ts b/src/cli/onboarding.ts index 92b23d62..48bbf4cc 100644 --- a/src/cli/onboarding.ts +++ b/src/cli/onboarding.ts @@ -137,7 +137,31 @@ export interface OnboardingResult { provider: string | null; } +function shouldSkipInteractiveOnboarding(): boolean { + const ci = process.env["CI"]; + return ( + process.stdin.isTTY !== true || + process.stdout.isTTY !== true || + (ci !== undefined && ci !== "" && ci !== "0" && ci.toLowerCase() !== "false") + ); +} + +function writeDefaultOnboardingPrefs(): OnboardingResult { + writePrefs({ + lastAgent: null, + lastAgents: [], + lastProvider: null, + skipSplash: true, + firstRunAt: new Date().toISOString(), + }); + return { agents: [], provider: null }; +} + export async function runOnboarding(): Promise<OnboardingResult> { + if (shouldSkipInteractiveOnboarding()) { + return writeDefaultOnboardingPrefs(); + } + p.note( [ "Welcome to agentmemory.", diff --git a/test/cli-onboarding.test.ts b/test/cli-onboarding.test.ts new file mode 100644 index 00000000..9779a7e9 --- /dev/null +++ b/test/cli-onboarding.test.ts @@ -0,0 +1,94 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +const prompts = vi.hoisted(() => ({ + note: vi.fn(), + multiselect: vi.fn(async () => { + throw new Error("interactive multiselect should not run in non-TTY onboarding"); + }), + select:
vi.fn(async () => { + throw new Error("interactive select should not run in non-TTY onboarding"); + }), + confirm: vi.fn(async () => true), + isCancel: vi.fn(() => false), + cancel: vi.fn(), + log: { + warn: vi.fn(), + step: vi.fn(), + error: vi.fn(), + }, +})); + +vi.mock("@clack/prompts", () => prompts); +vi.mock("../src/cli/connect/index.js", () => ({ + resolveAdapter: vi.fn(), + runAdapter: vi.fn(), +})); + +const ORIGINAL_HOME = process.env["HOME"]; +const ORIGINAL_USERPROFILE = process.env["USERPROFILE"]; +const stdinTtyDescriptor = Object.getOwnPropertyDescriptor(process.stdin, "isTTY"); +const stdoutTtyDescriptor = Object.getOwnPropertyDescriptor(process.stdout, "isTTY"); + +let sandboxHome: string; + +function setTTY(value: boolean): void { + Object.defineProperty(process.stdin, "isTTY", { value, configurable: true }); + Object.defineProperty(process.stdout, "isTTY", { value, configurable: true }); +} + +function restoreTTY(): void { + if (stdinTtyDescriptor) Object.defineProperty(process.stdin, "isTTY", stdinTtyDescriptor); + else delete (process.stdin as NodeJS.ReadStream & { isTTY?: boolean }).isTTY; + if (stdoutTtyDescriptor) Object.defineProperty(process.stdout, "isTTY", stdoutTtyDescriptor); + else delete (process.stdout as NodeJS.WriteStream & { isTTY?: boolean }).isTTY; +} + +async function freshOnboarding() { + vi.resetModules(); + return await import("../src/cli/onboarding.js"); +} + +describe("cli onboarding", () => { + beforeEach(() => { + sandboxHome = mkdtempSync(join(tmpdir(), "agentmemory-onboarding-")); + process.env["HOME"] = sandboxHome; + process.env["USERPROFILE"] = sandboxHome; + setTTY(false); + vi.clearAllMocks(); + }); + + afterEach(() => { + restoreTTY(); + if (ORIGINAL_HOME === undefined) delete process.env["HOME"]; + else process.env["HOME"] = ORIGINAL_HOME; + if (ORIGINAL_USERPROFILE === undefined) delete process.env["USERPROFILE"]; + else process.env["USERPROFILE"] = ORIGINAL_USERPROFILE; + rmSync(sandboxHome, { recursive: 
true, force: true }); + }); + + it("does not prompt and records default preferences when onboarding runs without a TTY", async () => { + const { runOnboarding } = await freshOnboarding(); + + const result = await runOnboarding(); + + expect(result).toEqual({ agents: [], provider: null }); + expect(prompts.multiselect).not.toHaveBeenCalled(); + expect(prompts.select).not.toHaveBeenCalled(); + expect(prompts.confirm).not.toHaveBeenCalled(); + + const preferencesPath = join(sandboxHome, ".agentmemory", "preferences.json"); + expect(existsSync(preferencesPath)).toBe(true); + const preferences = JSON.parse(readFileSync(preferencesPath, "utf-8")); + expect(preferences).toMatchObject({ + schemaVersion: 1, + lastAgent: null, + lastAgents: [], + lastProvider: null, + skipSplash: true, + }); + expect(typeof preferences.firstRunAt).toBe("string"); + }); +}); From c1c2c3a94492a197711f4ede79ecc3e88f539270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EB=AF=BC=EC=9E=AC?= Date: Wed, 20 May 2026 03:19:24 +0900 Subject: [PATCH 18/34] fix(hermes): declare all plugin hooks (#486) * fix(hermes): declare all plugin hooks * test(hermes): compare manifest hooks to provider --------- Co-authored-by: honor2030 <19909783+honor2030@users.noreply.github.com> --- integrations/hermes/plugin.yaml | 3 ++ test/hermes-plugin.test.ts | 64 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 test/hermes-plugin.test.ts diff --git a/integrations/hermes/plugin.yaml b/integrations/hermes/plugin.yaml index b4f32151..9ea5cb98 100644 --- a/integrations/hermes/plugin.yaml +++ b/integrations/hermes/plugin.yaml @@ -4,6 +4,9 @@ description: "Persistent cross-session memory for Hermes Agent via agentmemory. 
author: "Rohit Ghumare" homepage: "https://github.com/rohitg00/agentmemory" hooks: + - prefetch + - sync_turn - on_session_end - on_pre_compress - on_memory_write + - system_prompt_block diff --git a/test/hermes-plugin.test.ts b/test/hermes-plugin.test.ts new file mode 100644 index 00000000..f13f06f3 --- /dev/null +++ b/test/hermes-plugin.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from "vitest"; +import { readFileSync } from "node:fs"; + +const expectedHermesHooks = [ + "prefetch", + "sync_turn", + "on_session_end", + "on_pre_compress", + "on_memory_write", + "system_prompt_block", +]; + +function readHermesPluginHooks(): string[] { + const manifest = readFileSync("integrations/hermes/plugin.yaml", "utf8"); + const hooks: string[] = []; + let inHooks = false; + + for (const line of manifest.split(/\r?\n/)) { + if (line.trim() === "hooks:") { + inHooks = true; + continue; + } + if (!inHooks) continue; + if (line.trim() === "") continue; + if (!line.startsWith(" ")) break; + + const match = line.match(/^\s*-\s*([A-Za-z_][A-Za-z0-9_]*)\s*$/); + if (match) hooks.push(match[1]); + } + + return hooks; +} + +function isHermesLifecycleHook(methodName: string): boolean { + return ( + methodName === "prefetch" || + methodName === "sync_turn" || + methodName === "system_prompt_block" || + methodName.startsWith("on_") + ); +} + +function readAgentMemoryProviderHookMethods(): string[] { + const source = readFileSync("integrations/hermes/__init__.py", "utf8"); + const methods: string[] = []; + const providerMethodPattern = /^ def ([a-z_][a-z0-9_]*)\(/gm; + + for (const match of source.matchAll(providerMethodPattern)) { + const methodName = match[1]; + if (isHermesLifecycleHook(methodName)) methods.push(methodName); + } + + return methods; +} + +describe("Hermes plugin manifest", () => { + it("declares every implemented lifecycle hook", () => { + const declaredHooks = readHermesPluginHooks(); + const implementedHooks = readAgentMemoryProviderHookMethods(); + + 
expect([...declaredHooks].sort()).toEqual([...implementedHooks].sort()); + expect(declaredHooks).toEqual(expectedHermesHooks); + }); +}); From c6a1fec2f183606adda217ba155175413aa56e41 Mon Sep 17 00:00:00 2001 From: efenex Date: Tue, 19 May 2026 20:20:18 +0200 Subject: [PATCH 19/34] fix(boot): make rebuildIndex non-blocking so viewer + later boot steps run (#500) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mem::observe's boot flow had this sequence in main(): 1. registerSearchFunction / registerContextFunction / ... (sync — completes immediately) 2. restore persisted vector index from disk 3. await rebuildIndex(kv) ← blocks here 4. bootLog "Ready" / "REST API" / "MCP surface" 5. startViewerServer(...) 6. setInterval auto-forget / lesson decay / consolidation rebuildIndex iterates every observation across every session and AWAITS an embedding-provider call per record. On a large corpus + a rate-limited embedding endpoint (e.g. 100 RPM), step 3 takes hours to days. Everything that runs AFTER it — including startViewerServer — is silently delayed for the same duration. Symptoms in the wild: - http://localhost:3113/ unreachable (no listening socket on the viewer port) even on a freshly-started server - `agentmemory doctor` reports "viewer-unreachable" - log floods with `vector-index add: embed failed — skipping {429: ...}` from the still-running rebuild burning rate-limit budget - no error message — the worker stays alive serving HTTP because sdk.registerFunction had already completed synchronously in step 1 Fix: detach rebuildIndex with `void` + .then/.catch instead of awaiting. The index lazily fills in over time, search degrades gracefully (BM25 keeps working immediately, vector results fill in as the embed queue drains), and the viewer comes up in seconds. Repro on the operator side: 1. import a sizeable jsonl corpus (`mem::replay::import-jsonl`) 2. clear the persisted vector index so rebuildIndex runs on next boot 3. 
restart agentmemory with EMBEDDING_PROVIDER pointed at a rate-limited endpoint (any OpenAI-compat with low RPM) 4. observe: REST API responds on :3111, but :3113 is never bound, and the doctor's "viewer-unreachable" check fires until the rebuild finishes (hours-to-days for a 300+ session corpus) The 5-second non-fix workaround was a hard kill + restart; that just re-entered the same hang. No tests added — main() isn't unit-tested today and wiring up a fake slow rebuildIndex + asserting the post-rebuild boot lines run early would need the full worker mock harness. The change is one line and the failure mode is dramatic; visual review + integration smoke covers the regression risk. --- src/index.ts | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/index.ts b/src/index.ts index 630475c2..704d4809 100644 --- a/src/index.ts +++ b/src/index.ts @@ -412,16 +412,24 @@ async function main() { const needsRebuild = bm25Index.size === 0; if (needsRebuild) { - const indexCount = await rebuildIndex(kv).catch((err) => { - console.warn(`[agentmemory] Failed to rebuild search index:`, err); - return 0; - }); - if (indexCount > 0) { - bootLog( - `Search index rebuilt: ${indexCount} entries`, - ); - indexPersistence.scheduleSave(); - } + // Fire-and-forget. rebuildIndex iterates every observation across + // every session and AWAITS an embedding-provider call per record. + // On a large corpus + rate-limited embedding endpoint that can + // take HOURS; awaiting it here blocks every subsequent boot step + // (including startViewerServer below, leaving the viewer port + // unbound for the duration). The index lazily fills in over time + // and search degrades gracefully — partial coverage > no viewer + // for hours. Errors still surface via the inner .catch. 
+ void rebuildIndex(kv) + .then((indexCount) => { + if (indexCount > 0) { + bootLog(`Search index rebuilt: ${indexCount} entries`); + indexPersistence.scheduleSave(); + } + }) + .catch((err) => { + console.warn(`[agentmemory] Failed to rebuild search index:`, err); + }); } else { // Backfill memories into BM25 for users upgrading from <0.9.5: prior // versions of mem::remember never indexed memories, so the persisted From 6c2a689daaadecac1364e2d3a6fd77f6116c53b8 Mon Sep 17 00:00:00 2001 From: efenex Date: Tue, 19 May 2026 20:27:33 +0200 Subject: [PATCH 20/34] =?UTF-8?q?fix(rebuild):=20batch=20embed=20calls=20i?= =?UTF-8?q?n=20rebuildIndex=20(25h=20=E2=86=92=203h=20on=20large=20corpora?= =?UTF-8?q?)=20(#504)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(rebuild): batch embed calls in rebuildIndex (25h → 3h on large corpora) rebuildIndex called `await vectorIndexAddGuarded(...)` per memory and per observation. Each call is one HTTP round-trip to the embedding provider for a single input. On a 500k-observation imported corpus against an embedding endpoint with even modest latency, that's serial 100-200ms per call = 14-28 hours of wallclock. The new non-blocking rebuild path (#500) made this no longer block boot, but the rebuild itself still takes the same wallclock. Add `vectorIndexAddBatchGuarded()` next to the existing per-item helper, accepting an array of items and calling `provider.embedBatch()` once. For batchable endpoints (vLLM, Triton, OpenAI's `/v1/embeddings` all accept an `input` array), latency for N items is roughly the latency of a single embed because network + GPU setup amortize. Refactor `rebuildIndex` to accumulate items into a buffer and flush every REBUILD_EMBED_BATCH_SIZE (default 32). BM25 add stays per-item-synchronous; only the vector path is batched. 
Validated against a vLLM Qwen3-Embedding-8B endpoint: - single embed: 175ms - batch-of-32: 737ms (= 23ms/item amortized, ~7.6× speedup) - projected backfill time for 500k obs: 25h → 3h Per-item failure shape is preserved: - whole-batch network/provider error → all skipped, single warn line (vs N warns previously when the same error hit every item) - per-item dimension mismatch → that item skipped, others continue - rebuildIndex return value unchanged (count of attempted items) Override knob: - REBUILD_EMBED_BATCH_SIZE (default 32) — set lower for endpoints with small per-request input limits, higher for endpoints that prefer larger batches. Set to 1 to fall back to the per-item path. 39/39 existing tests in search-index/vector-index/remember-bm25-index pass unchanged. Related: #500 (non-blocking rebuildIndex), #503 (separate embedding base URL). * fix(rebuild): per-item vi.add try/catch to preserve soft-fail Restores the pre-batch soft-fail behavior — a single failing vi.add() no longer aborts the entire rebuild batch. Failures are logged and counted toward fail, just like dimension mismatches above. --- src/functions/search.ts | 145 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 13 deletions(-) diff --git a/src/functions/search.ts b/src/functions/search.ts index 74af9ff1..b4444b48 100644 --- a/src/functions/search.ts +++ b/src/functions/search.ts @@ -86,6 +86,99 @@ export async function vectorIndexAddGuarded( } } +// Batched variant: calls EmbeddingProvider.embedBatch ONCE for the whole +// batch, then writes each resulting vector. Use this for bulk paths +// (rebuildIndex, future bulk-add APIs) where per-item serial awaits +// dominate wallclock. A batch of N has roughly the latency of a single +// embed (network + GPU setup amortized), so backfilling a 500k-obs +// corpus drops from days to hours on a per-batch endpoint like vLLM. 
+// +// Per-item failure shape: +// - whole-batch network/provider error → all skipped, single warn line +// - per-item dimension mismatch → that item skipped, others continue +export async function vectorIndexAddBatchGuarded( + items: Array<{ + id: string + sessionId: string + text: string + context: { kind: "memory" | "observation" | "synthetic"; logId: string } + }>, +): Promise<{ ok: number; fail: number }> { + const vi = vectorIndex + const ep = currentEmbeddingProvider + if (!vi || !ep || items.length === 0) return { ok: 0, fail: 0 } + + let embeddings: Float32Array[] + try { + embeddings = await ep.embedBatch(items.map((i) => clipEmbedInput(i.text))) + } catch (err) { + logger.warn("vector-index add batch: embed failed — skipping batch", { + batchSize: items.length, + provider: ep.name, + error: err instanceof Error ? err.message : String(err), + }) + return { ok: 0, fail: items.length } + } + + if (embeddings.length !== items.length) { + logger.warn( + "vector-index add batch: provider returned wrong length — skipping batch", + { + batchSize: items.length, + returned: embeddings.length, + provider: ep.name, + }, + ) + return { ok: 0, fail: items.length } + } + + let ok = 0 + let fail = 0 + for (let i = 0; i < items.length; i++) { + const item = items[i] + const embedding = embeddings[i] + if (embedding.length !== ep.dimensions) { + logger.warn("vector-index add batch: dimension mismatch — skipping item", { + kind: item.context.kind, + id: item.context.logId, + provider: ep.name, + expected: ep.dimensions, + received: embedding.length, + }) + fail++ + continue + } + try { + vi.add(item.id, item.sessionId, embedding) + ok++ + } catch (err) { + logger.warn("vector-index add batch: index write failed — skipping item", { + kind: item.context.kind, + id: item.context.logId, + error: err instanceof Error ? err.message : String(err), + }) + fail++ + } + } + return { ok, fail } +} + +// Embed-batch size for rebuild. 
Each item is one /v1/embeddings call's +// `input` array element; the provider sees the whole batch as one HTTP +// round-trip. 32 fits comfortably under typical per-request token budgets +// (32 × ~110 tok/item ≈ 3.5k tokens) and gets close to per-call +// throughput for GPU-backed endpoints (vLLM, Triton, etc.). Override via +// REBUILD_EMBED_BATCH_SIZE for endpoints that prefer smaller/larger +// batches. Set to 1 to fall back to the legacy per-item path. +const DEFAULT_REBUILD_EMBED_BATCH = 32 + +function getRebuildEmbedBatchSize(): number { + const raw = process.env.REBUILD_EMBED_BATCH_SIZE + if (!raw) return DEFAULT_REBUILD_EMBED_BATCH + const n = parseInt(raw, 10) + return Number.isFinite(n) && n > 0 ? n : DEFAULT_REBUILD_EMBED_BATCH +} + export async function rebuildIndex(kv: StateKV): Promise<number> { const idx = getSearchIndex() idx.clear() @@ -96,8 +189,28 @@ export async function rebuildIndex(kv: StateKV): Promise<number> { // repopulation loops run, so BM25 and vector stay in sync. vectorIndex?.clear() + const batchSize = getRebuildEmbedBatchSize() + // Accumulator for the batched embed flush. BM25 add is synchronous and + // doesn't need batching — only the vector path benefits. + type EmbedJob = { + id: string + sessionId: string + text: string + context: { kind: "memory" | "observation" | "synthetic"; logId: string } + } + const pending: EmbedJob[] = [] let count = 0 + const flush = async (): Promise<void> => { + if (pending.length === 0) return + await vectorIndexAddBatchGuarded(pending) + pending.length = 0 + } + const enqueue = async (job: EmbedJob): Promise<void> => { + pending.push(job) + if (pending.length >= batchSize) await flush() + } + // Memories live in their own KV scope outside per-session observation // scopes, so they need a separate walk.
Without this, mem::remember // entries vanish from BM25 on every restart even after the live-write @@ -108,12 +221,12 @@ export async function rebuildIndex(kv: StateKV): Promise<number> { if (memory.isLatest === false) continue if (!memory.title || !memory.content) continue idx.add(memoryToObservation(memory)) - await vectorIndexAddGuarded( - memory.id, - memory.sessionIds[0] ?? 'memory', - memory.title + ' ' + memory.content, - { kind: "memory", logId: memory.id }, - ) + await enqueue({ + id: memory.id, + sessionId: memory.sessionIds[0] ?? 'memory', + text: memory.title + ' ' + memory.content, + context: { kind: "memory", logId: memory.id }, + }) count++ } } catch (err) { @@ -123,7 +236,10 @@ export async function rebuildIndex(kv: StateKV): Promise<number> { } const sessions = await kv.list(KV.sessions) - if (!sessions.length) return count + if (!sessions.length) { + await flush() + return count + } const obsPerSession: CompressedObservation[][] = [] const failedSessions: string[] = [] @@ -148,16 +264,19 @@ export async function rebuildIndex(kv: StateKV): Promise<number> { for (const obs of observations) { if (obs.title && obs.narrative) { idx.add(obs) - await vectorIndexAddGuarded( - obs.id, - obs.sessionId, - obs.title + ' ' + obs.narrative, - { kind: "observation", logId: obs.id }, - ) + await enqueue({ + id: obs.id, + sessionId: obs.sessionId, + text: obs.title + ' ' + obs.narrative, + context: { kind: "observation", logId: obs.id }, + }) count++ } } } + + // Drain the last partial batch.
+ await flush() return count } From e68d4ebe9b5a503d4b8731bc4fde61aed4b35a9e Mon Sep 17 00:00:00 2001 From: efenex Date: Tue, 19 May 2026 20:29:06 +0200 Subject: [PATCH 21/34] fix(summarize): chunk large sessions to fit LLM context window (#472) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(summarize): chunk large sessions to fit LLM context window JSONL-imported sessions can have far more observations than the 500-cap MAX_OBS_PER_SESSION that constrains native sessions. mem::summarize previously built one prompt containing every observation and shipped it as a single LLM call, which exceeded the provider's context window for sessions >~7,000 observations and returned an unhelpful 400 from upstream — silently leaving large bulk-imported sessions out of the semantic tier. Approach: map-reduce inside mem::summarize. - Sessions ≤ SUMMARIZE_CHUNK_SIZE (default 400) take the legacy single-call path with no overhead - Larger sessions are split into chunks, each summarized with the existing per-session prompt in parallel batches of SUMMARIZE_CHUNK_CONCURRENCY (default 6), and partial summaries merged via a new REDUCE_SYSTEM prompt - Per-chunk retry-once on transient parse / provider errors - Persistently-failing chunks are skipped (not propagated) so a flaky chunk doesn't waste 30+ already-completed LLM calls on the same session - Bail with too_many_chunks_skipped only if >50% of chunks fail Companion operator tool: scripts/backfill-imported-sessions.sh walks jsonl-imported sessions and POSTs mem::summarize per session, with project / agent / obs-count filters, cost estimation, and per-failure payload dumping for debugging provider rejections. 
Validated locally against a real corpus: - 5,392-obs session (14 chunks, c=6): 39s - 10,704-obs session (27 chunks, c=6): 34s - 105,966-obs session (265 chunks, c=50): handler completes server-side and persists - 52-session bulk backfill → 25 new semantic facts + 6 new reflect insights produced by consolidate-pipeline Known limit: iii-engine has a hardcoded 180s function-invocation timeout. Sessions large enough that chunked summarize wallclock exceeds that will return a timeout/500 to the HTTP client even though the handler completes and persists server-side. High-RPM providers (Novita / DeepInfra / DeepSeek typically allow 100+ concurrent) can raise SUMMARIZE_CHUNK_CONCURRENCY to push the cliff well past any realistic session size. True fix is an async-job pattern; left as follow-up. - src/prompts/summary.ts: add REDUCE_SYSTEM + buildReducePrompt - src/functions/summarize.ts: chunking, retry, skip, parallelism - test/summarize.test.ts: 9 cases covering single-call path, chunking, env-override, retry-then-success, persistent skip, too-many-skipped bail, provider error after retry, concurrency - .env.example: document SUMMARIZE_CHUNK_SIZE / _CONCURRENCY - .gitignore: agentmemory-debug/, data-*/ (operator artefacts) - scripts/backfill-imported-sessions.sh: bulk-import backfill tool 9/9 new tests pass; existing tests untouched. * fix(summarize): address CodeRabbit review on #472 Four nits flagged by the automated reviewer, all worth fixing: - scripts/backfill: add curl --connect-timeout + --max-time profiles (META_CURL_OPTS vs WORK_CURL_OPTS). Metadata reads fail fast and retry on transient blips; LLM-backed work calls get a wide 30-min cap and no retry (retrying a half-finished LLM job double-spends). - scripts/backfill: sanitize sessionId before joining with DEBUG_DIR in dump_failure() (otherwise a session id containing `/` or `..` could escape the debug dir). UUIDs in practice, but the server doesn't enforce that. 
- scripts/backfill: switch the observations query to `--get --data-urlencode "sessionId=$id"` so special characters can't corrupt the query string. - scripts/backfill: guard `jq` on summarize + consolidate responses with `jq -e . >/dev/null 2>&1` first. iii's HTTP layer occasionally returns non-JSON (HTML 5xx, empty body on timeout). Without the guard, `set -e` aborts the whole backfill loop on a single bad response — now it logs `invalid_json_response` and moves on. - test/summarize.test.ts: fix `vi.mock("./audit.js", ...)` path to `"../src/functions/audit.js"`. The old path resolved to `test/audit.js` (nonexistent), so the mock was a silent no-op. Tests passed anyway because `safeAudit` writes to a mocked KV. 9/9 tests still pass; backfill dry-run still resolves the corpus cleanly. --- .env.example | 2 + .gitignore | 2 + scripts/backfill-imported-sessions.sh | 259 ++++++++++++++++ src/functions/summarize.ts | 182 ++++++++++- src/prompts/summary.ts | 49 +++ test/summarize.test.ts | 417 ++++++++++++++++++++++++++ 6 files changed, 906 insertions(+), 5 deletions(-) create mode 100755 scripts/backfill-imported-sessions.sh create mode 100644 test/summarize.test.ts diff --git a/.env.example b/.env.example index f1c207c1..77ca0f3a 100644 --- a/.env.example +++ b/.env.example @@ -98,6 +98,8 @@ # AGENTMEMORY_GRAPH_WEIGHT=0.2 # Graph traversal bonus on smart-search ranking # TOKEN_BUDGET=2000 # Max tokens injected via mem::context per session # MAX_OBS_PER_SESSION=500 # Per-session observation cap before consolidation kicks in +# SUMMARIZE_CHUNK_SIZE=400 # When mem::summarize sees a session larger than this, it chunks observations and map-reduces (chunk-summarize → reduce-merge) to stay within the LLM's context window. Default 400 ≈ 50k tokens per chunk at ~110 tok/obs. Native sessions are capped by MAX_OBS_PER_SESSION; chunking primarily matters for bulk-imported jsonl sessions, which bypass that cap. +# SUMMARIZE_CHUNK_CONCURRENCY=6 # Parallel chunk LLM calls during chunked summarize.
Default 6 fits ~100-chunk sessions under iii's 180s function-invocation timeout at typical ~8s/call. High-throughput providers (Novita, DeepInfra, DeepSeek) commonly allow 100+ concurrent — bump this for very large imported sessions. # ----------------------------------------------------------------------------- # 5. Behaviour flags diff --git a/.gitignore b/.gitignore index 9a9260b8..585d0f49 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ dist/ plugin/scripts/*.map plugin/scripts/*.d.mts data/ +data-*/ +agentmemory-debug/ .gstack/ # Lock files — never commit (see feedback_no_lockfiles memory) diff --git a/scripts/backfill-imported-sessions.sh b/scripts/backfill-imported-sessions.sh new file mode 100755 index 00000000..a247a57e --- /dev/null +++ b/scripts/backfill-imported-sessions.sh @@ -0,0 +1,259 @@ +#!/usr/bin/env bash +# Backfill memory artifacts for sessions imported via `agentmemory import-jsonl`. +# +# The import path only persists Session + Observation rows (via synthetic, +# zero-LLM compression) and the deterministic crystal/lesson derivation. +# It does NOT call mem::summarize, so the semantic/procedural/reflect tiers +# of the consolidation pipeline have nothing to roll up. +# +# This script walks every session tagged `jsonl-import` and: +# 1. POSTs /agentmemory/summarize per session (LLM call) +# 2. POSTs /agentmemory/consolidate-pipeline once at the end +# +# Graph extraction (/agentmemory/graph/extract) is intentionally skipped — +# its API takes a per-observation payload, which is cost-prohibitive for +# bulk imports. `reflect` falls back to a no-graph clustering mode. 
+# +# Usage: +# scripts/backfill-imported-sessions.sh --dry-run +# scripts/backfill-imported-sessions.sh --limit 5 +# scripts/backfill-imported-sessions.sh # process all + +set -euo pipefail + +URL="${AGENTMEMORY_URL:-http://localhost:3111}" +DRY_RUN=0 +LIMIT=0 # 0 = no limit +ONLY_TAG="jsonl-import" +SKIP_CONSOLIDATE=0 +SKIP_AGENTS=0 # drop sessions whose project starts with "agent-" +MAX_OBS=0 # 0 = no cap; skip sessions with more observations than this +DEBUG_ON_ERROR=0 # on failure, dump session metadata + obs to DEBUG_DIR +DEBUG_DIR="${AGENTMEMORY_DEBUG_DIR:-./agentmemory-debug}" +PROJECT_PATTERN="" # jq test() regex against .project; "" means no filter + +# Cost-estimate knobs (defaults tuned for DeepSeek V4 Flash on DeepInfra: +# $0.14 / 1M input, $0.28 / 1M output). Override via env if needed. +COST_IN_PER_1M="${AGENTMEMORY_COST_IN_PER_1M:-0.14}" +COST_OUT_PER_1M="${AGENTMEMORY_COST_OUT_PER_1M:-0.28}" +# Rough token weight per compressed observation, derived from inspecting +# real synthetic-compression payloads in the kv store (mostly 100-300 tok, +# heavy-tailed). Override if your sessions are unusually verbose. +TOKENS_PER_OBS="${AGENTMEMORY_TOKENS_PER_OBS:-200}" +# Reserved per-call output budget (XML summary is small). 
+TOKENS_OUT_PER_SESSION="${AGENTMEMORY_TOKENS_OUT_PER_SESSION:-500}" + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) DRY_RUN=1; shift ;; + --limit) LIMIT="${2:?--limit needs a number}"; shift 2 ;; + --tag) ONLY_TAG="${2:?--tag needs a value (use empty string for all)}"; shift 2 ;; + --skip-consolidate) SKIP_CONSOLIDATE=1; shift ;; + --skip-agents) SKIP_AGENTS=1; shift ;; + --max-obs) MAX_OBS="${2:?--max-obs needs a number}"; shift 2 ;; + --debug-on-error) DEBUG_ON_ERROR=1; shift ;; + --project-pattern) PROJECT_PATTERN="${2:?--project-pattern needs a regex}"; shift 2 ;; + -h|--help) + sed -n '2,28p' "$0" + exit 0 ;; + *) echo "unknown flag: $1" >&2; exit 2 ;; + esac +done + +for bin in curl jq; do + command -v "$bin" >/dev/null || { echo "missing dependency: $bin" >&2; exit 1; } +done + +# Curl timeout profiles. Metadata reads (livez, sessions list, observations +# pull for debug dumps) should fail fast and retry transient blips. The LLM +# work calls (summarize, consolidate) intentionally have no --retry and a +# wide --max-time: each call can legitimately take minutes for chunked +# summarize on large sessions, and retrying a half-finished LLM job is +# expensive both in dollars and in duplicated server-side work. +META_CURL_OPTS=(--connect-timeout 10 --max-time 30 --retry 2 --retry-delay 1) +WORK_CURL_OPTS=(--connect-timeout 10 --max-time 1800) + +echo "agentmemory backfill — server: $URL" +[[ "$DRY_RUN" == 1 ]] && echo "DRY RUN: no POSTs will be made." + +# --- liveness --- +if ! 
curl -fsS "${META_CURL_OPTS[@]}" "$URL/agentmemory/livez" >/dev/null; then + echo "server not reachable at $URL (try: npx @agentmemory/agentmemory)" >&2 + exit 1 +fi + +# --- collect session ids --- +sessions_json="$(curl -fsS "${META_CURL_OPTS[@]}" "$URL/agentmemory/sessions")" +filter='.sessions[] | select(.status=="completed")' +if [[ -n "$ONLY_TAG" ]]; then + filter+=" | select((.tags // []) | index(\"$ONLY_TAG\"))" +fi +if [[ "$SKIP_AGENTS" == 1 ]]; then + filter+=' | select((.project // "") | startswith("agent-") | not)' +fi +if [[ -n "$PROJECT_PATTERN" ]]; then + # jq's test() applies a regex against the project string. + filter+=" | select((.project // \"\") | test(\"$PROJECT_PATTERN\"))" +fi +if [[ "$MAX_OBS" -gt 0 ]]; then + filter+=" | select((.observationCount // 0) <= $MAX_OBS)" +fi +filter+=' | "\(.id)\t\(.observationCount // 0)\t\(.project // "")"' + +rows=() +while IFS= read -r line; do + rows+=("$line") +done < <(echo "$sessions_json" | jq -r "$filter") +total="${#rows[@]}" + +if [[ "$total" -eq 0 ]]; then + echo "no sessions matched (tag='$ONLY_TAG'); nothing to do." + exit 0 +fi + +if [[ "$LIMIT" -gt 0 && "$LIMIT" -lt "$total" ]]; then + rows=("${rows[@]:0:$LIMIT}") +fi + +echo "matched $total session(s); will process ${#rows[@]}." 
+total_obs=0 +for row in "${rows[@]}"; do + obs="$(cut -f2 <<<"$row")" + total_obs=$(( total_obs + obs )) +done +est_in=$(( total_obs * TOKENS_PER_OBS + ${#rows[@]} * 500 )) +est_out=$(( ${#rows[@]} * TOKENS_OUT_PER_SESSION )) +est_cost="$(awk -v i="$est_in" -v o="$est_out" -v ci="$COST_IN_PER_1M" -v co="$COST_OUT_PER_1M" \ + 'BEGIN { printf "%.2f", (i*ci + o*co) / 1000000 }')" + +echo "≈ ${#rows[@]} summarize LLM calls (one per session, covering $total_obs observations)" +printf '≈ %d input tok + %d output tok → $%s (rates: in=$%s/1M out=$%s/1M, %s tok/obs)\n' \ + "$est_in" "$est_out" "$est_cost" "$COST_IN_PER_1M" "$COST_OUT_PER_1M" "$TOKENS_PER_OBS" +echo + +if [[ "$DRY_RUN" == 1 ]]; then + printf '%-40s %10s %s\n' "session" "obs" "project" + for row in "${rows[@]}"; do + id="$(cut -f1 <<<"$row")" + obs="$(cut -f2 <<<"$row")" + proj="$(cut -f3 <<<"$row")" + printf '%-40s %10s %s\n' "$id" "$obs" "$proj" + done + echo + echo "(dry run) next steps if you re-run without --dry-run:" + echo " for each session above: POST $URL/agentmemory/summarize {sessionId}" + if [[ "$SKIP_CONSOLIDATE" == 0 ]]; then + echo " then: POST $URL/agentmemory/consolidate-pipeline {}" + fi + exit 0 +fi + +# --- summarize loop --- +if [[ "$DEBUG_ON_ERROR" == 1 ]]; then + mkdir -p "$DEBUG_DIR" + echo "debug mode: failed calls will dump to $DEBUG_DIR/" + echo +fi + +dump_failure() { + local id="$1" obs="$2" resp="$3" + # Replace anything outside [A-Za-z0-9._-] with `_` before joining with + # DEBUG_DIR. Session IDs from the API are UUIDs in practice, but the + # server doesn't enforce that — a hostile or buggy id containing `/` or + # `..` would otherwise escape the debug directory. + local safe_id + safe_id="$(printf '%s' "$id" | tr -c 'A-Za-z0-9._-' '_')" + local file="$DEBUG_DIR/${safe_id}.json" + # Pull the raw observations (what would have gone into the prompt) so the + # operator can reconstruct the upstream payload locally. 
We also compute + # narrative size stats so size-related rejections are immediately visible. + # Stream observations through stdin (avoids exec-arg overflow on + # multi-thousand-obs sessions — macOS argv ceiling is ~256k). + # `--get --data-urlencode` percent-encodes the session id so special + # characters can't corrupt the query string. + curl -fsS "${META_CURL_OPTS[@]}" --get \ + --data-urlencode "sessionId=$id" \ + "$URL/agentmemory/observations" \ + | jq \ + --arg id "$id" \ + --argjson obsCount "$obs" \ + --arg url "$URL/agentmemory/summarize" \ + --argjson response "$resp" \ + '. as $root + | .observations as $obs + | { + sessionId: $id, + observationCount: $obsCount, + request: { url: $url, method: "POST", body: { sessionId: $id } }, + response: $response, + observations: $obs, + stats: { + totalNarrativeBytes: ($obs | map(.narrative // "" | length) | add // 0), + maxNarrativeBytes: ($obs | map(.narrative // "" | length) | max // 0), + titleHistogram: ($obs | group_by(.title) | map({title: .[0].title, count: length}) | sort_by(-.count)) + } + }' >"$file" + echo " → $file" +} + +ok=0; skipped=0; failed=0 +i=0 +for row in "${rows[@]}"; do + i=$(( i + 1 )) + id="$(cut -f1 <<<"$row")" + obs="$(cut -f2 <<<"$row")" + + body="$(jq -nc --arg id "$id" '{sessionId:$id}')" + resp="$(curl -sS "${WORK_CURL_OPTS[@]}" -X POST "$URL/agentmemory/summarize" \ + -H 'content-type: application/json' --data "$body" || echo '{"success":false,"error":"curl_failed"}')" + # iii's HTTP layer occasionally returns non-JSON (HTML 5xx, empty body + # on timeout, etc.). Validate before parsing so `set -e` doesn't abort + # the whole backfill loop on a single bad response. + if jq -e . 
>/dev/null 2>&1 <<<"$resp"; then + status="$(jq -r '.success // false' <<<"$resp")" + err="$(jq -r '.error // ""' <<<"$resp")" + title="$(jq -r '.summary.title // ""' <<<"$resp")" + else + status="false" + err="invalid_json_response" + title="" + fi + + if [[ "$status" == "true" ]]; then + ok=$(( ok + 1 )) + printf '[%3d/%3d] OK %s obs=%-5s %s\n' "$i" "${#rows[@]}" "$id" "$obs" "$title" + elif [[ "$err" == "no_observations" || "$err" == "no_provider" ]]; then + skipped=$(( skipped + 1 )) + printf '[%3d/%3d] SKIP %s obs=%-5s %s\n' "$i" "${#rows[@]}" "$id" "$obs" "$err" + else + failed=$(( failed + 1 )) + printf '[%3d/%3d] FAIL %s obs=%-5s %s\n' "$i" "${#rows[@]}" "$id" "$obs" "$err" + [[ "$DEBUG_ON_ERROR" == 1 ]] && dump_failure "$id" "$obs" "$resp" + fi +done + +echo +echo "summarize: ok=$ok skipped=$skipped failed=$failed" + +# --- consolidate --- +if [[ "$SKIP_CONSOLIDATE" == 1 ]]; then + echo "skipping consolidate-pipeline (--skip-consolidate)" + exit 0 +fi + +if [[ "$ok" -eq 0 ]]; then + echo "no summaries produced; skipping consolidate-pipeline." + exit 0 +fi + +echo +echo "running consolidate-pipeline …" +resp="$(curl -sS "${WORK_CURL_OPTS[@]}" -X POST "$URL/agentmemory/consolidate-pipeline" \ + -H 'content-type: application/json' --data '{}' || echo '{"success":false,"error":"curl_failed"}')" +if jq -e . >/dev/null 2>&1 <<<"$resp"; then + echo "$resp" | jq . 
+else + echo "consolidate-pipeline returned non-JSON (likely a timeout or upstream error):" + printf '%s\n' "$resp" | head -c 500 + echo +fi diff --git a/src/functions/summarize.ts b/src/functions/summarize.ts index 140e0e12..80b29a09 100644 --- a/src/functions/summarize.ts +++ b/src/functions/summarize.ts @@ -7,7 +7,12 @@ import type { } from "../types.js"; import { KV } from "../state/schema.js"; import { StateKV } from "../state/kv.js"; -import { SUMMARY_SYSTEM, buildSummaryPrompt } from "../prompts/summary.js"; +import { + SUMMARY_SYSTEM, + buildSummaryPrompt, + REDUCE_SYSTEM, + buildReducePrompt, +} from "../prompts/summary.js"; import { getXmlTag, getXmlChildren } from "../prompts/xml.js"; import { SummaryOutputSchema } from "../eval/schemas.js"; import { validateOutput } from "../eval/validator.js"; @@ -16,6 +21,169 @@ import type { MetricsStore } from "../eval/metrics-store.js"; import { safeAudit } from "./audit.js"; import { logger } from "../logger.js"; +// Per-chunk observation budget when a session is too large to fit in one +// LLM call. Default ≈ 50k input tokens per chunk at ~110 tok/obs — fits +// comfortably in 128k-window models. Override via SUMMARIZE_CHUNK_SIZE. +const CHUNK_SIZE_DEFAULT = 400; +// Concurrent in-flight chunk calls. 6 keeps a 100-chunk session under +// iii's 180s function-invocation timeout at ~8s/call while staying +// inside generous-but-not-unlimited provider rate limits (well below +// OpenAI free tier's 500 RPM). High-throughput providers +// (Novita / DeepInfra / DeepSeek) typically allow 100+ concurrent — set +// SUMMARIZE_CHUNK_CONCURRENCY higher to cover ~1000+ chunk sessions. +const CHUNK_CONCURRENCY_DEFAULT = 6; +// Bail on the merged summary if more than this fraction of chunks fail +// to parse — a half-blind narrative is worse than a clean error. 
+const MAX_SKIP_RATIO = 0.5; + +function getChunkSize(): number { + const raw = process.env.SUMMARIZE_CHUNK_SIZE; + if (!raw) return CHUNK_SIZE_DEFAULT; + const n = parseInt(raw, 10); + return Number.isFinite(n) && n > 0 ? n : CHUNK_SIZE_DEFAULT; +} + +function getChunkConcurrency(): number { + const raw = process.env.SUMMARIZE_CHUNK_CONCURRENCY; + if (!raw) return CHUNK_CONCURRENCY_DEFAULT; + const n = parseInt(raw, 10); + return Number.isFinite(n) && n > 0 ? n : CHUNK_CONCURRENCY_DEFAULT; +} + +// One chunk call with retry-once. Returns null when both attempts fail — +// whether by parse failure, provider 4xx (content rejected by upstream +// filters), or transient network/5xx errors that didn't recover on retry. +// All failure modes are equivalent at this layer: the chunk is unusable, +// skip it and let the caller decide via the skip-ratio bailout whether +// the overall summary is still trustworthy. Errors that affect every +// chunk (auth, model down) will trip the bailout naturally. +async function summarizeChunkWithRetry( + provider: MemoryProvider, + chunk: CompressedObservation[], + sessionId: string, + project: string, + idx: number, + total: number, +): Promise<SessionSummary | null> { + for (let attempt = 1; attempt <= 2; attempt++) { + try { + const xml = await provider.summarize( + SUMMARY_SYSTEM, + buildSummaryPrompt(chunk), + ); + const parsed = parseSummaryXml(xml, sessionId, project, chunk.length); + if (parsed) return parsed; + logger.warn("Summarize chunk parse failed", { + sessionId, + chunk: `${idx + 1}/${total}`, + attempt, + }); + } catch (err) { + logger.warn("Summarize chunk LLM call failed", { + sessionId, + chunk: `${idx + 1}/${total}`, + attempt, + error: err instanceof Error ? err.message : String(err), + }); + } + } + return null; +} + +// Returns the final summary XML string. For sessions ≤ chunk size, this is +// a single LLM call (legacy behavior).
For larger sessions, observations +// are split into chunks processed in parallel batches, each chunk retried +// once on parse failure, persistently-bad chunks skipped, and remaining +// partials merged via a reduce call. +async function produceSummaryXml( + provider: MemoryProvider, + compressed: CompressedObservation[], + sessionId: string, + project: string, +): Promise<{ + response: string; + mode: "single" | "chunked"; + chunks: number; + skipped?: number; +}> { + const chunkSize = getChunkSize(); + if (compressed.length <= chunkSize) { + const response = await provider.summarize( + SUMMARY_SYSTEM, + buildSummaryPrompt(compressed), + ); + return { response, mode: "single", chunks: 1 }; + } + + const chunks: CompressedObservation[][] = []; + for (let i = 0; i < compressed.length; i += chunkSize) { + chunks.push(compressed.slice(i, i + chunkSize)); + } + const concurrency = getChunkConcurrency(); + logger.info("Summarize chunking session", { + sessionId, + chunks: chunks.length, + chunkSize, + concurrency, + totalObservations: compressed.length, + }); + + // Sparse array preserves chunk → index mapping after parallel resolution, + // so the reduce step sees partials in chronological order even when some + // were skipped. 
+ const partialByIdx: Array = new Array(chunks.length).fill(null); + for (let batchStart = 0; batchStart < chunks.length; batchStart += concurrency) { + const batch = chunks.slice(batchStart, batchStart + concurrency); + await Promise.all( + batch.map(async (chunk, j) => { + const idx = batchStart + j; + partialByIdx[idx] = await summarizeChunkWithRetry( + provider, + chunk, + sessionId, + project, + idx, + chunks.length, + ); + }), + ); + } + + const skipped = partialByIdx.filter((p) => p === null).length; + const partials = partialByIdx.filter((p): p is SessionSummary => p !== null); + + if (skipped > Math.floor(chunks.length * MAX_SKIP_RATIO)) { + throw new Error( + `too_many_chunks_skipped: ${skipped}/${chunks.length} chunks failed to parse after retry`, + ); + } + if (skipped > 0) { + logger.warn("Summarize chunks partially skipped", { + sessionId, + skipped, + total: chunks.length, + }); + } + + const reduceInput = partials.map((p) => { + const originalIdx = partialByIdx.indexOf(p); + return { + title: p.title, + narrative: p.narrative, + keyDecisions: p.keyDecisions, + filesModified: p.filesModified, + concepts: p.concepts, + obsRangeStart: originalIdx * chunkSize + 1, + obsRangeEnd: Math.min((originalIdx + 1) * chunkSize, compressed.length), + }; + }); + const response = await provider.summarize( + REDUCE_SYSTEM, + buildReducePrompt(reduceInput), + ); + return { response, mode: "chunked", chunks: chunks.length, skipped }; +} + function parseSummaryXml( xml: string, sessionId: string, @@ -85,8 +253,12 @@ export function registerSummarizeFunction( } try { - const prompt = buildSummaryPrompt(compressed); - const response = await provider.summarize(SUMMARY_SYSTEM, prompt); + const { response, mode, chunks } = await produceSummaryXml( + provider, + compressed, + sessionId, + session.project, + ); if (!response || !response.trim()) { const latencyMs = Date.now() - startMs; if (metricsStore) { @@ -95,8 +267,8 @@ export function registerSummarizeFunction( 
logger.warn("Empty provider response on summarize", { sessionId, provider: provider.name, - promptBytes: prompt.length, - systemBytes: SUMMARY_SYSTEM.length, + mode, + chunks, observationCount: compressed.length, }); return { success: false, error: "empty_provider_response" }; diff --git a/src/prompts/summary.ts b/src/prompts/summary.ts index f01b28b8..bd040212 100644 --- a/src/prompts/summary.ts +++ b/src/prompts/summary.ts @@ -36,3 +36,52 @@ export function buildSummaryPrompt(observations: Array<{ }) return `Session observations (${observations.length} total):\n\n${lines.join('\n\n---\n\n')}` } + +export const REDUCE_SYSTEM = `You are merging multiple partial summaries of the SAME coding session into one final session summary. The partials are chronological chunks of one continuous session — not separate sessions. + +Output EXACTLY this XML format with no additional text: + +

+ Short session title (max 100 chars) + 3-5 sentence narrative covering the whole session + + Key technical decision made + + + path/to/modified/file + + + key concept from session + + + +Rules: +- Synthesize a single narrative that reflects the whole arc, not a chunk-by-chunk recap +- Preserve every distinct decision across chunks +- Union (deduplicate) all files and concepts +- Title should capture the session's overall outcome` + +export function buildReducePrompt(partials: Array<{ + title: string + narrative: string + keyDecisions: string[] + filesModified: string[] + concepts: string[] + obsRangeStart: number + obsRangeEnd: number +}>): string { + const sections = partials.map((p, i) => { + const decisions = p.keyDecisions.map((d) => ` - ${d}`).join('\n') + const files = p.filesModified.map((f) => ` - ${f}`).join('\n') + const concepts = p.concepts.join(', ') + return `[Chunk ${i + 1} of ${partials.length} — obs ${p.obsRangeStart}-${p.obsRangeEnd}] +Title: ${p.title} +Narrative: ${p.narrative} +Decisions: +${decisions} +Files: +${files} +Concepts: ${concepts}` + }) + return `Partial summaries (${partials.length} chunks of one session, chronological):\n\n${sections.join('\n\n---\n\n')}` +} diff --git a/test/summarize.test.ts b/test/summarize.test.ts new file mode 100644 index 00000000..03aa1926 --- /dev/null +++ b/test/summarize.test.ts @@ -0,0 +1,417 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; + +vi.mock("../src/logger.js", () => ({ + logger: { info: vi.fn(), warn: vi.fn(), error: vi.fn() }, +})); + +vi.mock("../src/state/schema.js", () => ({ + KV: { + sessions: "sessions", + summaries: "summaries", + observations: (sessionId: string) => `obs:${sessionId}`, + audit: "audit", + }, +})); + +vi.mock("../src/eval/schemas.js", () => ({ + SummaryOutputSchema: {}, +})); + +vi.mock("../src/eval/validator.js", () => ({ + validateOutput: () => ({ valid: true, result: { errors: [] } }), +})); + +vi.mock("../src/eval/quality.js", () => ({ 
+ scoreSummary: () => 100, +})); + +vi.mock("../src/functions/audit.js", () => ({ + safeAudit: vi.fn(), +})); + +import { registerSummarizeFunction } from "../src/functions/summarize.js"; +import type { + CompressedObservation, + Session, + MemoryProvider, +} from "../src/types.js"; + +function mockKV() { + const store = new Map>(); + return { + store, + get: async (scope: string, key: string): Promise => + (store.get(scope)?.get(key) as T) ?? null, + set: async (scope: string, key: string, data: T): Promise => { + if (!store.has(scope)) store.set(scope, new Map()); + store.get(scope)!.set(key, data); + return data; + }, + delete: async (scope: string, key: string): Promise => { + store.get(scope)?.delete(key); + }, + list: async (scope: string): Promise => { + const entries = store.get(scope); + return entries ? (Array.from(entries.values()) as T[]) : []; + }, + }; +} + +function mockSdk() { + const functions = new Map(); + return { + functions, + registerFunction: (id: string, handler: Function) => { + functions.set(id, handler); + }, + registerTrigger: () => {}, + trigger: async () => ({}), + }; +} + +function makeObs(i: number, sessionId: string): CompressedObservation { + return { + id: `obs_${i}`, + sessionId, + timestamp: new Date().toISOString(), + type: "conversation", + title: `obs ${i}`, + facts: [`fact ${i}`], + narrative: `narrative for obs ${i}`, + concepts: [], + files: [`src/file_${i}.ts`], + importance: 5, + }; +} + +function makeProvider(responses: string[]): MemoryProvider & { + calls: Array<{ system: string; user: string }>; +} { + const calls: Array<{ system: string; user: string }> = []; + let i = 0; + return { + name: "test", + calls, + compress: async () => "", + summarize: async (system: string, user: string) => { + calls.push({ system, user }); + const r = responses[i] ?? 
responses[responses.length - 1]; + i += 1; + return r; + }, + }; +} + +function summaryXml(opts: { + title: string; + narrative?: string; + decisions?: string[]; + files?: string[]; + concepts?: string[]; +}): string { + const d = (opts.decisions ?? []).map((x) => `${x}`).join(""); + const f = (opts.files ?? []).map((x) => `${x}`).join(""); + const c = (opts.concepts ?? []).map((x) => `${x}`).join(""); + return ` +${opts.title} +${opts.narrative ?? "narrative"} +${d} +${f} +${c} +`; +} + +async function setupHandler(opts: { + sessionId: string; + obsCount: number; + provider: MemoryProvider; +}) { + const sdk = mockSdk(); + const kv = mockKV(); + const session: Session = { + id: opts.sessionId, + project: "test-project", + cwd: "/tmp", + startedAt: new Date().toISOString(), + status: "completed", + observationCount: opts.obsCount, + }; + await kv.set("sessions", opts.sessionId, session); + for (let i = 0; i < opts.obsCount; i++) { + const o = makeObs(i, opts.sessionId); + await kv.set(`obs:${opts.sessionId}`, o.id, o); + } + registerSummarizeFunction(sdk as any, kv as any, opts.provider); + const handler = sdk.functions.get("mem::summarize")!; + return { handler, kv }; +} + +describe("mem::summarize chunking", () => { + const ORIGINAL_ENV = { ...process.env }; + + beforeEach(() => { + delete process.env.SUMMARIZE_CHUNK_SIZE; + delete process.env.SUMMARIZE_CHUNK_CONCURRENCY; + }); + + afterEach(() => { + process.env = { ...ORIGINAL_ENV }; + }); + + it("small session takes the single-call path (no chunking, no reduce)", async () => { + const provider = makeProvider([ + summaryXml({ + title: "Small session", + decisions: ["decision A"], + files: ["src/a.ts"], + concepts: ["concept-a"], + }), + ]); + const { handler, kv } = await setupHandler({ + sessionId: "ses_small", + obsCount: 10, + provider, + }); + + const result: any = await handler({ sessionId: "ses_small" }); + + expect(result.success).toBe(true); + expect(provider.calls).toHaveLength(1); + 
expect(provider.calls[0].user).toContain("Session observations (10 total)"); + const stored: any = await kv.get("summaries", "ses_small"); + expect(stored?.title).toBe("Small session"); + }); + + it("large session map-reduces: N chunk calls + 1 reduce call", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; // serial keeps call ordering deterministic + const provider = makeProvider([ + summaryXml({ title: "Chunk 1", decisions: ["dA"], files: ["src/a.ts"], concepts: ["ca"] }), + summaryXml({ title: "Chunk 2", decisions: ["dB"], files: ["src/b.ts"], concepts: ["cb"] }), + summaryXml({ title: "Chunk 3", decisions: ["dC"], files: ["src/c.ts"], concepts: ["cc"] }), + summaryXml({ + title: "Merged", + decisions: ["dA", "dB", "dC"], + files: ["src/a.ts", "src/b.ts", "src/c.ts"], + concepts: ["ca", "cb", "cc"], + }), + ]); + const { handler, kv } = await setupHandler({ + sessionId: "ses_large", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_large" }); + + expect(result.success).toBe(true); + expect(provider.calls).toHaveLength(4); + // First three are chunk calls (use the summary system prompt). + expect(provider.calls[0].system).toContain("session summarizer"); + expect(provider.calls[2].system).toContain("session summarizer"); + // Last is the reduce call (uses the merge system prompt). + expect(provider.calls[3].system).toContain("merging multiple partial summaries"); + expect(provider.calls[3].user).toContain("Chunk 1 of 3"); + expect(provider.calls[3].user).toContain("Chunk 3 of 3"); + + const stored: any = await kv.get("summaries", "ses_large"); + expect(stored?.title).toBe("Merged"); + // observationCount on the persisted summary should reflect the full session, + // not just the final chunk. 
+ expect(stored?.observationCount).toBe(250); + expect(stored?.keyDecisions).toEqual(["dA", "dB", "dC"]); + }); + + it("SUMMARIZE_CHUNK_SIZE env override is respected", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "50"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + const provider = makeProvider([ + summaryXml({ title: "chunk" }), + summaryXml({ title: "chunk" }), + summaryXml({ title: "chunk" }), + summaryXml({ title: "chunk" }), + summaryXml({ title: "merged" }), + ]); + const { handler } = await setupHandler({ + sessionId: "ses_env", + obsCount: 175, + provider, + }); + + const result: any = await handler({ sessionId: "ses_env" }); + + expect(result.success).toBe(true); + // 175 obs ÷ 50 = 4 chunks (last chunk has 25) + 1 reduce = 5 calls. + expect(provider.calls).toHaveLength(5); + }); + + it("flaky chunk: parse fails once, retried, then succeeds — no skip", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + const provider = makeProvider([ + summaryXml({ title: "ok1" }), + "", // chunk 2 attempt 1: parse-fail + summaryXml({ title: "ok2" }), // chunk 2 attempt 2 (retry): success + summaryXml({ title: "ok3" }), + summaryXml({ title: "merged" }), + ]); + const { handler, kv } = await setupHandler({ + sessionId: "ses_flaky", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_flaky" }); + + expect(result.success).toBe(true); + // 3 chunks × 1 attempt + 1 retry on chunk 2 + 1 reduce = 5 calls. 
+ expect(provider.calls).toHaveLength(5); + const stored: any = await kv.get("summaries", "ses_flaky"); + expect(stored?.title).toBe("merged"); + }); + + it("persistently-broken chunk is skipped, reduce still runs on remaining partials", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + const provider = makeProvider([ + summaryXml({ title: "ok1" }), + "", "", // chunk 2: both attempts parse-fail + summaryXml({ title: "ok3" }), + summaryXml({ title: "merged-with-skip" }), + ]); + const { handler, kv } = await setupHandler({ + sessionId: "ses_skip", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_skip" }); + + expect(result.success).toBe(true); + // 1 ok + (1 + 1 retry skip) + 1 ok + 1 reduce = 5 calls. + expect(provider.calls).toHaveLength(5); + // Reduce input should mention only 2 of 3 chunks (chunk 2 skipped) — + // but the chunk indices in the reduce labels should reflect chunk 1 and 3, + // preserving chronological boundaries. + const reduceCall = provider.calls[4]; + expect(reduceCall.user).toContain("Chunk 1 of 2"); + expect(reduceCall.user).toContain("Chunk 2 of 2"); + expect(reduceCall.user).toContain("obs 1-100"); // first surviving chunk + expect(reduceCall.user).toContain("obs 201-250"); // third surviving chunk (was idx 2, range 201-250) + const stored: any = await kv.get("summaries", "ses_skip"); + expect(stored?.title).toBe("merged-with-skip"); + }); + + it("too many skipped chunks bails out with a clear error", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + // 3 chunks, 2 fully broken → >50% skipped → bail. 
+ const provider = makeProvider([ + summaryXml({ title: "ok1" }), + "", "", + "", "", + ]); + const { handler } = await setupHandler({ + sessionId: "ses_too_broken", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_too_broken" }); + + expect(result.success).toBe(false); + expect(result.error).toMatch(/too_many_chunks_skipped: 2\/3/); + }); + + it("provider error on one chunk after retry is skipped, not propagated", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + let i = 0; + const provider: MemoryProvider & { calls: any[] } = { + name: "test", + calls: [], + compress: async () => "", + summarize: async (system: string, user: string) => { + (provider as any).calls.push({ system, user }); + i += 1; + if (i === 1) return summaryXml({ title: "ok1" }); + // chunk 2: both attempts throw (e.g. provider 400) + if (i === 2 || i === 3) throw new Error("OpenAI API error (400): content rejected"); + if (i === 4) return summaryXml({ title: "ok3" }); + return summaryXml({ title: "merged-with-skip" }); + }, + }; + const { handler, kv } = await setupHandler({ + sessionId: "ses_net", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_net" }); + + expect(result.success).toBe(true); + // 1 ok + 2 fail + 1 ok + 1 reduce = 5 calls. + expect((provider as any).calls.length).toBe(5); + const stored: any = await kv.get("summaries", "ses_net"); + expect(stored?.title).toBe("merged-with-skip"); + }); + + it("every chunk failing on provider error trips too_many_chunks_skipped", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "1"; + // 3 chunks, all chunk calls throw → 3/3 skipped → bail. 
+ const provider: MemoryProvider & { calls: any[] } = { + name: "test", + calls: [], + compress: async () => "", + summarize: async (system: string, user: string) => { + (provider as any).calls.push({ system, user }); + throw new Error("OpenAI API error (400): invalid request"); + }, + }; + const { handler } = await setupHandler({ + sessionId: "ses_all_400", + obsCount: 250, + provider, + }); + + const result: any = await handler({ sessionId: "ses_all_400" }); + + expect(result.success).toBe(false); + expect(result.error).toMatch(/too_many_chunks_skipped: 3\/3/); + }); + + it("chunks run in parallel batches according to SUMMARIZE_CHUNK_CONCURRENCY", async () => { + process.env.SUMMARIZE_CHUNK_SIZE = "100"; + process.env.SUMMARIZE_CHUNK_CONCURRENCY = "2"; + let inflight = 0; + let maxInflight = 0; + const provider: MemoryProvider & { calls: any[] } = { + name: "test", + calls: [], + compress: async () => "", + summarize: async (system: string, user: string) => { + (provider as any).calls.push({ system, user }); + inflight += 1; + maxInflight = Math.max(maxInflight, inflight); + // Yield to event loop so siblings can also enter before we resolve. + await new Promise((r) => setTimeout(r, 5)); + inflight -= 1; + if (system.includes("merging")) return summaryXml({ title: "merged" }); + return summaryXml({ title: "ok" }); + }, + }; + const { handler } = await setupHandler({ + sessionId: "ses_par", + obsCount: 400, // 4 chunks at chunkSize=100 + provider, + }); + + const result: any = await handler({ sessionId: "ses_par" }); + + expect(result.success).toBe(true); + // 4 chunks at concurrency 2 → max 2 in flight at once during the chunk phase. + // Reduce is a single call so doesn't bump it. 
+ expect(maxInflight).toBe(2); + }); +}); From cb2e0131dc14cce15c2cde949f340ae664975fc3 Mon Sep 17 00:00:00 2001 From: efenex Date: Tue, 19 May 2026 20:29:46 +0200 Subject: [PATCH 22/34] fix(visibility): surface lessons in smart-search + tally per-store in diagnose (#473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(visibility): surface lessons in smart-search + tally per-store in diagnose Two related UX gaps in the memory layer's reflection surfaces. A consumer that calls `memory_lesson_save` and gets `success:true` reasonably expects to find the lesson via `memory_smart_search` ("did my save land?") and to see it counted in `memory_diagnose` ("what's in the store?"). Neither was true: lessons live in their own KV store (`KV.lessons`), and both diagnostic surfaces only looked at `KV.observations` / `KV.memories`. A 4,350-lesson store could read as "memories: 0" on diagnose and return zero hits on smart_search — the trust-shock that prompted this fix. A) mem::smart-search: also return lessons in the compact response. - New optional `project` and `includeLessons` (default true) params. - Delegates lesson scoring to the existing mem::lesson-recall via sdk.trigger, so confidence + recency weighting stays consistent with mem::lesson-recall (no duplicate scoring logic). - Lessons come back in a separate `lessons` field on the response, not merged into `results`. Existing consumers reading `results` are unaffected; new consumers can read `result.lessons` too. - Content truncated to 240 chars in compact mode (full content remains available via mem::lesson-recall directly). - Lesson-recall failures are soft: log + return empty lessons, observation results still flow through. B) mem::diagnose: add per-store tally categories for lessons, summaries, semantic, procedural, crystals, insights. 
Mirrors the existing `memories` pattern: count + light consistency check (confidence range for scored memories; non-empty title/narrative/ steps for the rest). Each new category is in ALL_CATEGORIES so `--categories lessons` filtering works as expected. The empty-system pass count goes from 8 to 14 (8 original + 6 new stores). Test updated accordingly. - src/types.ts: add CompactLessonResult - src/functions/smart-search.ts: lesson recall + merge (single-call path unchanged, expand mode unchanged) - src/functions/diagnostics.ts: six new category blocks before mesh - test/smart-search.test.ts: 6 new cases (lesson inclusion, content preview truncation, includeLessons=false opt-out, project filter passthrough, soft-fail on recall error / non-success response) - test/diagnostics.test.ts: 7 new pass/warn cases for each new category + filter check; empty-system pass count bumped 8→14 43/43 tests pass. * fix(diagnostics): defensive guards on new validators (CodeRabbit #473 review) CodeRabbit flagged two patterns in the per-store validators added in the parent commit: 1. .trim() on .title / .narrative was unconditional — a corrupted row with title=null or title=42 would throw, abort the whole diagnose run, and silently skip every later category. Add typeof guards. 2. confidence range checks were `< 0 || > 1` which silently passes NaN and Infinity (NaN < 0 is false, NaN > 1 is false → "healthy"). Add Number.isFinite(...) prefix so corrupted scored rows surface as warnings instead. Applied across all 6 new validators: lesson confidence, summary title, semantic confidence, crystal narrative, insight confidence. Tests added in test/diagnostics.test.ts under "defensive row-shape handling": NaN confidence on a lesson, null summary title (verifies diagnose still completes and later categories still execute), undefined crystal narrative, Infinity / NaN on insight + semantic. 34/34 tests pass. 
--- src/functions/diagnostics.ts | 192 ++++++++++++++++++++++++++++ src/functions/smart-search.ts | 67 +++++++++- src/types.ts | 10 ++ test/diagnostics.test.ts | 230 +++++++++++++++++++++++++++++++++- test/smart-search.test.ts | 98 +++++++++++++++ 5 files changed, 593 insertions(+), 4 deletions(-) diff --git a/src/functions/diagnostics.ts b/src/functions/diagnostics.ts index 42f822cb..a63d7959 100644 --- a/src/functions/diagnostics.ts +++ b/src/functions/diagnostics.ts @@ -7,8 +7,14 @@ import type { Action, ActionEdge, DiagnosticCheck, + Insight, Lease, + Lesson, Checkpoint, + Crystal, + ProceduralMemory, + SemanticMemory, + SessionSummary, Signal, Sentinel, Sketch, @@ -25,6 +31,12 @@ const ALL_CATEGORIES = [ "signals", "sessions", "memories", + "lessons", + "summaries", + "semantic", + "procedural", + "crystals", + "insights", "mesh", ]; @@ -354,6 +366,186 @@ export function registerDiagnosticsFunction(sdk: ISdk, kv: StateKV): void { } } + if (categories.includes("lessons")) { + // Counts only live lessons (deleted=true rows are tombstoned). + // Catches bad confidence values that would silently break recall + // scoring (memory_lesson_recall multiplies by confidence). + const lessons = await kv.list(KV.lessons); + const live = lessons.filter((l) => !l.deleted); + let lessonIssues = 0; + for (const l of live) { + // Number.isFinite rejects NaN / Infinity / non-numbers; a + // corrupted row passing those would silently survive the < / > + // range check (e.g. NaN < 0 is false, NaN > 1 is false, so the + // bad row would be "healthy") and skew memory_lesson_recall's + // scoring downstream. Surface as warning. 
+ if ( + !Number.isFinite(l.confidence) || + l.confidence < 0 || + l.confidence > 1 + ) { + checks.push({ + name: `lesson-bad-confidence:${l.id}`, + category: "lessons", + status: "warn", + message: `Lesson ${l.id} has confidence ${l.confidence} (expected finite number in 0..1)`, + fixable: false, + }); + lessonIssues++; + } + } + if (lessonIssues === 0) { + checks.push({ + name: "lessons-ok", + category: "lessons", + status: "pass", + message: `All ${live.length} lessons are healthy (${lessons.length - live.length} tombstoned)`, + fixable: false, + }); + } + } + + if (categories.includes("summaries")) { + const summaries = await kv.list(KV.summaries); + let summaryIssues = 0; + for (const s of summaries) { + // typeof guard before .trim() — a corrupted row with title=null + // or title=42 would otherwise throw and abort the whole diagnose + // run before later categories get checked. + if (typeof s.title !== "string" || s.title.trim().length === 0) { + checks.push({ + name: `summary-missing-title:${s.sessionId}`, + category: "summaries", + status: "warn", + message: `Summary for session ${s.sessionId} has no title`, + fixable: false, + }); + summaryIssues++; + } + } + if (summaryIssues === 0) { + checks.push({ + name: "summaries-ok", + category: "summaries", + status: "pass", + message: `All ${summaries.length} session summaries are consistent`, + fixable: false, + }); + } + } + + if (categories.includes("semantic")) { + const semantic = await kv.list(KV.semantic); + let semanticIssues = 0; + for (const s of semantic) { + if ( + !Number.isFinite(s.confidence) || + s.confidence < 0 || + s.confidence > 1 + ) { + checks.push({ + name: `semantic-bad-confidence:${s.id}`, + category: "semantic", + status: "warn", + message: `Semantic fact ${s.id} has confidence ${s.confidence} (expected finite number in 0..1)`, + fixable: false, + }); + semanticIssues++; + } + } + if (semanticIssues === 0) { + checks.push({ + name: "semantic-ok", + category: "semantic", + status: 
"pass", + message: `All ${semantic.length} semantic memories are consistent`, + fixable: false, + }); + } + } + + if (categories.includes("procedural")) { + const procedural = await kv.list(KV.procedural); + let proceduralIssues = 0; + for (const p of procedural) { + if (!Array.isArray(p.steps) || p.steps.length === 0) { + checks.push({ + name: `procedural-empty-steps:${p.id}`, + category: "procedural", + status: "warn", + message: `Procedural memory "${p.name}" (${p.id}) has no steps`, + fixable: false, + }); + proceduralIssues++; + } + } + if (proceduralIssues === 0) { + checks.push({ + name: "procedural-ok", + category: "procedural", + status: "pass", + message: `All ${procedural.length} procedural memories are consistent`, + fixable: false, + }); + } + } + + if (categories.includes("crystals")) { + const crystals = await kv.list(KV.crystals); + let crystalIssues = 0; + for (const c of crystals) { + if (typeof c.narrative !== "string" || c.narrative.trim().length === 0) { + checks.push({ + name: `crystal-empty-narrative:${c.id}`, + category: "crystals", + status: "warn", + message: `Crystal ${c.id} has empty narrative`, + fixable: false, + }); + crystalIssues++; + } + } + if (crystalIssues === 0) { + checks.push({ + name: "crystals-ok", + category: "crystals", + status: "pass", + message: `All ${crystals.length} crystals are consistent`, + fixable: false, + }); + } + } + + if (categories.includes("insights")) { + const insights = await kv.list(KV.insights); + let insightIssues = 0; + for (const i of insights) { + if ( + !Number.isFinite(i.confidence) || + i.confidence < 0 || + i.confidence > 1 + ) { + checks.push({ + name: `insight-bad-confidence:${i.id}`, + category: "insights", + status: "warn", + message: `Insight ${i.id} has confidence ${i.confidence} (expected finite number in 0..1)`, + fixable: false, + }); + insightIssues++; + } + } + if (insightIssues === 0) { + checks.push({ + name: "insights-ok", + category: "insights", + status: "pass", + message: 
`All ${insights.length} insights are consistent`, + fixable: false, + }); + } + } + if (categories.includes("mesh")) { const peers = await kv.list(KV.mesh); let meshIssues = 0; diff --git a/src/functions/smart-search.ts b/src/functions/smart-search.ts index fdeed273..c80b1f87 100644 --- a/src/functions/smart-search.ts +++ b/src/functions/smart-search.ts @@ -1,24 +1,32 @@ import type { ISdk } from "iii-sdk"; import type { + CompactLessonResult, CompactSearchResult, CompressedObservation, HybridSearchResult, + Lesson, } from "../types.js"; import { KV } from "../state/schema.js"; import { StateKV } from "../state/kv.js"; import { recordAccessBatch } from "./access-tracker.js"; import { logger } from "../logger.js"; +// Compact mode trims each lesson's content for at-a-glance display. The +// full content is fetched via memory_lesson_recall when the caller needs it. +const LESSON_CONTENT_PREVIEW_CHARS = 240; + export function registerSmartSearchFunction( sdk: ISdk, kv: StateKV, searchFn: (query: string, limit: number) => Promise, ): void { - sdk.registerFunction("mem::smart-search", + sdk.registerFunction("mem::smart-search", async (data: { query?: string; expandIds?: Array; limit?: number; + project?: string; + includeLessons?: boolean; }) => { if (data.expandIds && data.expandIds.length > 0) { @@ -68,7 +76,21 @@ export function registerSmartSearchFunction( } const limit = Math.max(1, Math.min(data.limit ?? 20, 100)); - const hybridResults = await searchFn(data.query, limit); + // Cap lesson results at a smaller number than observations: lessons + // are denser (curated insights) so 10 is usually plenty for a recall. + const lessonLimit = Math.min(limit, 10); + const includeLessons = data.includeLessons !== false; + + // Run observation hybrid-search and lesson recall in parallel so the + // extra lesson lookup adds no wallclock when the underlying calls + // can overlap. 
Lesson recall is best-effort: if mem::lesson-recall + // fails or returns unexpected shape, log + fall back to empty. + const [hybridResults, lessons] = await Promise.all([ + searchFn(data.query, limit), + includeLessons + ? recallLessons(sdk, data.query, lessonLimit, data.project) + : Promise.resolve([]), + ]); const compact: CompactSearchResult[] = hybridResults.map((r) => ({ obsId: r.observation.id, @@ -87,12 +109,51 @@ export function registerSmartSearchFunction( logger.info("Smart search compact", { query: data.query, results: compact.length, + lessons: lessons.length, }); - return { mode: "compact", results: compact }; + const response: { + mode: "compact"; + results: CompactSearchResult[]; + lessons?: CompactLessonResult[]; + } = { mode: "compact", results: compact }; + if (includeLessons) response.lessons = lessons; + return response; }, ); } +async function recallLessons( + sdk: ISdk, + query: string, + limit: number, + project?: string, +): Promise { + try { + const result = (await sdk.trigger({ + function_id: "mem::lesson-recall", + payload: { query, limit, project }, + })) as { success?: boolean; lessons?: Array }; + if (!result?.success || !Array.isArray(result.lessons)) return []; + return result.lessons.map((l) => ({ + lessonId: l.id, + content: + l.content.length > LESSON_CONTENT_PREVIEW_CHARS + ? l.content.slice(0, LESSON_CONTENT_PREVIEW_CHARS) + "…" + : l.content, + confidence: l.confidence, + score: l.score ?? l.confidence, + createdAt: l.createdAt, + project: l.project, + tags: l.tags ?? [], + })); + } catch (err) { + logger.warn("Smart search: mem::lesson-recall failed; returning empty lesson list", { + error: err instanceof Error ? 
err.message : String(err), + }); + return []; + } +} + async function findObservation( kv: StateKV, obsId: string, diff --git a/src/types.ts b/src/types.ts index bc38a058..87e274d1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -266,6 +266,16 @@ export interface CompactSearchResult { timestamp: string; } +export interface CompactLessonResult { + lessonId: string; + content: string; + confidence: number; + score: number; + createdAt: string; + project?: string; + tags: string[]; +} + export interface TimelineEntry { observation: CompressedObservation; sessionId: string; diff --git a/test/diagnostics.test.ts b/test/diagnostics.test.ts index d2dc706e..053e1c40 100644 --- a/test/diagnostics.test.ts +++ b/test/diagnostics.test.ts @@ -195,7 +195,10 @@ describe("Diagnostics Functions", () => { }; expect(result.success).toBe(true); - expect(result.summary.pass).toBe(8); + // 14 = 8 original (actions, leases, sentinels, sketches, signals, + // sessions, memories, mesh) + 6 added in #lesson-visibility + // (lessons, summaries, semantic, procedural, crystals, insights). 
+ expect(result.summary.pass).toBe(14); expect(result.summary.warn).toBe(0); expect(result.summary.fail).toBe(0); expect(result.summary.fixable).toBe(0); @@ -636,4 +639,229 @@ describe("Diagnostics Functions", () => { expect(unchanged!.status).toBe("blocked"); }); }); + + describe("per-store tally categories (#lesson-visibility)", () => { + it("lessons category: passes with valid live lessons + ignores tombstoned", async () => { + await kv.set(KV.lessons, "lsn_live", { + id: "lsn_live", content: "x", context: "", confidence: 0.8, + reinforcements: 0, source: "manual", sourceIds: [], tags: [], + createdAt: "", updatedAt: "", decayRate: 0.05, + }); + await kv.set(KV.lessons, "lsn_tomb", { + id: "lsn_tomb", content: "x", context: "", confidence: 0.5, + reinforcements: 0, source: "manual", sourceIds: [], tags: [], + createdAt: "", updatedAt: "", decayRate: 0.05, deleted: true, + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["lessons"], + })) as { checks: DiagnosticCheck[] }; + + const ok = result.checks.find((c) => c.name === "lessons-ok"); + expect(ok?.status).toBe("pass"); + expect(ok?.message).toMatch(/All 1 lessons.*1 tombstoned/); + }); + + it("lessons category: warns on out-of-range confidence", async () => { + await kv.set(KV.lessons, "lsn_bad", { + id: "lsn_bad", content: "x", context: "", confidence: 1.5, + reinforcements: 0, source: "manual", sourceIds: [], tags: [], + createdAt: "", updatedAt: "", decayRate: 0.05, + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["lessons"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("lesson-bad-confidence:")); + expect(warn?.status).toBe("warn"); + }); + + it("summaries category: warns on missing title", async () => { + await kv.set(KV.summaries, "ses_1", { + sessionId: "ses_1", project: "p", createdAt: "", title: "", + narrative: "n", keyDecisions: [], filesModified: [], concepts: [], + observationCount: 1, + 
}); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["summaries"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("summary-missing-title:")); + expect(warn?.status).toBe("warn"); + }); + + it("procedural category: warns on empty steps", async () => { + await kv.set(KV.procedural, "proc_1", { + id: "proc_1", name: "noop", steps: [], triggerCondition: "x", + frequency: 1, sourceSessionIds: [], strength: 0.5, + createdAt: "", updatedAt: "", + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["procedural"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("procedural-empty-steps:")); + expect(warn?.status).toBe("warn"); + }); + + it("crystals category: warns on empty narrative", async () => { + await kv.set(KV.crystals, "cry_1", { + id: "cry_1", narrative: "", keyOutcomes: [], filesAffected: [], + lessons: [], sourceActionIds: [], createdAt: "", + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["crystals"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("crystal-empty-narrative:")); + expect(warn?.status).toBe("warn"); + }); + + it("insights category: warns on out-of-range confidence", async () => { + await kv.set(KV.insights, "ins_bad", { + id: "ins_bad", title: "t", content: "c", confidence: -0.1, + reinforcements: 0, sourceConceptCluster: [], sourceMemoryIds: [], + sourceLessonIds: [], sourceCrystalIds: [], tags: [], + createdAt: "", updatedAt: "", decayRate: 0.05, + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["insights"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("insight-bad-confidence:")); + expect(warn?.status).toBe("warn"); + }); + + it("semantic category: warns on out-of-range confidence", async () => { + await kv.set(KV.semantic, 
"sem_bad", { + id: "sem_bad", fact: "f", confidence: 2.0, sourceSessionIds: [], + sourceMemoryIds: [], accessCount: 0, lastAccessedAt: "", + strength: 0, createdAt: "", updatedAt: "", + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["semantic"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("semantic-bad-confidence:")); + expect(warn?.status).toBe("warn"); + }); + + it("categories filter accepts new categories and skips others", async () => { + const result = (await sdk.trigger("mem::diagnose", { + categories: ["lessons", "summaries"], + })) as { checks: DiagnosticCheck[] }; + + expect(result.checks.every((c) => c.category === "lessons" || c.category === "summaries")).toBe(true); + expect(result.checks.some((c) => c.category === "lessons")).toBe(true); + expect(result.checks.some((c) => c.category === "summaries")).toBe(true); + }); + + describe("defensive row-shape handling (CodeRabbit #473 review)", () => { + it("NaN/Infinity confidence on a lesson is flagged as warn, not silently passed", async () => { + await kv.set(KV.lessons, "lsn_nan", { + id: "lsn_nan", content: "x", context: "", confidence: NaN, + reinforcements: 0, source: "manual", sourceIds: [], tags: [], + createdAt: "", updatedAt: "", decayRate: 0.05, + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["lessons"], + })) as { checks: DiagnosticCheck[] }; + + const warn = result.checks.find((c) => c.name.startsWith("lesson-bad-confidence:")); + expect(warn?.status).toBe("warn"); + }); + + it("non-string summary title doesn't throw — surfaces as warn", async () => { + await kv.set(KV.summaries, "ses_bad_title", { + sessionId: "ses_bad_title", + project: "p", + createdAt: "", + title: null as unknown as string, // simulate corrupted row + narrative: "n", + keyDecisions: [], + filesModified: [], + concepts: [], + observationCount: 1, + }); + + // The bug to guard against: the old code called 
.trim() unconditionally, + // which throws on null/number, which aborts the whole diagnose run and + // any later category check never executes. Verify diagnose completes + // AND surfaces the bad row. + const result = (await sdk.trigger("mem::diagnose", { + categories: ["summaries", "lessons"], + })) as { checks: DiagnosticCheck[]; success?: boolean }; + + expect(result.success).toBe(true); + const warn = result.checks.find((c) => c.name.startsWith("summary-missing-title:")); + expect(warn?.status).toBe("warn"); + // Later category still ran: + expect(result.checks.some((c) => c.category === "lessons")).toBe(true); + }); + + it("non-string crystal narrative doesn't throw — surfaces as warn", async () => { + await kv.set(KV.crystals, "cry_bad", { + id: "cry_bad", + narrative: undefined as unknown as string, + keyOutcomes: [], + filesAffected: [], + lessons: [], + sourceActionIds: [], + createdAt: "", + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["crystals"], + })) as { checks: DiagnosticCheck[]; success?: boolean }; + + expect(result.success).toBe(true); + const warn = result.checks.find((c) => c.name.startsWith("crystal-empty-narrative:")); + expect(warn?.status).toBe("warn"); + }); + + it("Infinity confidence on insight + semantic both flagged", async () => { + await kv.set(KV.insights, "ins_inf", { + id: "ins_inf", + title: "t", + content: "c", + confidence: Infinity, + reinforcements: 0, + sourceConceptCluster: [], + sourceMemoryIds: [], + sourceLessonIds: [], + sourceCrystalIds: [], + tags: [], + createdAt: "", + updatedAt: "", + decayRate: 0.05, + }); + await kv.set(KV.semantic, "sem_nan", { + id: "sem_nan", + fact: "f", + confidence: NaN, + sourceSessionIds: [], + sourceMemoryIds: [], + accessCount: 0, + lastAccessedAt: "", + strength: 0, + createdAt: "", + updatedAt: "", + }); + + const result = (await sdk.trigger("mem::diagnose", { + categories: ["insights", "semantic"], + })) as { checks: DiagnosticCheck[] }; + + 
expect(result.checks.find((c) => c.name === "insight-bad-confidence:ins_inf")?.status).toBe("warn"); + expect(result.checks.find((c) => c.name === "semantic-bad-confidence:sem_nan")?.status).toBe("warn"); + }); + }); + }); }); diff --git a/test/smart-search.test.ts b/test/smart-search.test.ts index 4f22d1a9..9d0c94e0 100644 --- a/test/smart-search.test.ts +++ b/test/smart-search.test.ts @@ -193,4 +193,102 @@ describe("Smart Search Function", () => { } | null; expect(log?.count).toBe(1); }); + + describe("lesson inclusion (#lesson-visibility)", () => { + it("compact mode returns lessons array alongside observation results", async () => { + sdk.registerFunction("mem::lesson-recall", async (payload: any) => ({ + success: true, + lessons: [ + { id: "lsn_a", content: "always rebase before push", confidence: 0.9, createdAt: "2026-04-01T00:00:00Z", project: "p", tags: ["git"], score: 0.81 }, + { id: "lsn_b", content: "never force-push to main", confidence: 0.95, createdAt: "2026-04-02T00:00:00Z", project: "p", tags: ["git"], score: 0.76 }, + ], + })); + + const result = (await sdk.trigger("mem::smart-search", { + query: "rebase", + })) as { mode: string; results: CompactSearchResult[]; lessons?: any[] }; + + expect(result.mode).toBe("compact"); + expect(result.results.length).toBe(2); // observations unchanged + expect(result.lessons).toBeDefined(); + expect(result.lessons!.length).toBe(2); + expect(result.lessons![0]).toMatchObject({ + lessonId: "lsn_a", + confidence: 0.9, + score: 0.81, + }); + expect(result.lessons![0].tags).toEqual(["git"]); + }); + + it("compact mode truncates long lesson content for preview", async () => { + const long = "x".repeat(500); + sdk.registerFunction("mem::lesson-recall", async () => ({ + success: true, + lessons: [{ id: "lsn_long", content: long, confidence: 0.5, createdAt: "", tags: [], score: 0.4 }], + })); + + const result = (await sdk.trigger("mem::smart-search", { + query: "x", + })) as { lessons: any[] }; + + 
expect(result.lessons[0].content.length).toBeLessThan(long.length); + expect(result.lessons[0].content).toMatch(/…$/); + }); + + it("includeLessons:false omits the lessons array entirely", async () => { + // No lesson-recall handler registered — would throw if invoked. + const result = (await sdk.trigger("mem::smart-search", { + query: "auth", + includeLessons: false, + })) as { mode: string; results: CompactSearchResult[]; lessons?: unknown }; + + expect(result.results.length).toBe(2); + expect(result.lessons).toBeUndefined(); + }); + + it("forwards project filter to mem::lesson-recall", async () => { + let receivedPayload: any = null; + sdk.registerFunction("mem::lesson-recall", async (payload: any) => { + receivedPayload = payload; + return { success: true, lessons: [] }; + }); + + await sdk.trigger("mem::smart-search", { + query: "rebase", + project: "gitops-assistant", + }); + + expect(receivedPayload).toMatchObject({ + query: "rebase", + project: "gitops-assistant", + }); + }); + + it("tolerates mem::lesson-recall failure: returns empty lessons, observations unchanged", async () => { + sdk.registerFunction("mem::lesson-recall", async () => { + throw new Error("lessons store unavailable"); + }); + + const result = (await sdk.trigger("mem::smart-search", { + query: "auth", + })) as { results: CompactSearchResult[]; lessons: any[] }; + + expect(result.results.length).toBe(2); + expect(result.lessons).toEqual([]); + }); + + it("tolerates non-success lesson-recall response shape", async () => { + sdk.registerFunction("mem::lesson-recall", async () => ({ + success: false, + error: "query is required", + })); + + const result = (await sdk.trigger("mem::smart-search", { + query: "auth", + })) as { results: CompactSearchResult[]; lessons: any[] }; + + expect(result.results.length).toBe(2); + expect(result.lessons).toEqual([]); + }); + }); }); From 1838f4d74c3a0accdd3764e7a8ec155cc140b831 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare 
<48523873+rohitg00@users.noreply.github.com> Date: Tue, 19 May 2026 19:37:19 +0100 Subject: [PATCH 23/34] chore(release): v0.9.21 (#551) Quality + integration wave. Bundles 11 PRs since v0.9.20: Contributor feature: - #237 OpenCode plugin with 22 auto-capture hooks (@cl0ckt0wer) Bug fixes (9): - #516 memory_recall endpoint + format/token_budget (@serhiizghama, closes #507/#440) - #461 env-file AGENTMEMORY_DROP_STALE_INDEX flag honored (@honor2030, closes #456) - #487 Windows hook path quoting (@honor2030, closes #477) - #517 viewer IME composition guard (@jonathanzhan1975) - #472 chunk large sessions for LLM context window (@efenex) - #473 surface lessons in smart-search + diagnose tally (@efenex) - #486 declare all Hermes plugin hooks (@honor2030) - #500 rebuildIndex non-blocking on boot (@efenex) - #504 batched embed in rebuildIndex (25h -> 3h) (@efenex) - #491 cli skip onboarding without tty (@honor2030) Upstream-installer revert: - #546 drop --next workaround now that iii-hq/iii#1660 shipped 1067/1067 tests pass across 95 files. --- CHANGELOG.md | 42 +++++++++++++++++++++++++++++++ package.json | 2 +- packages/mcp/package.json | 2 +- plugin/.claude-plugin/plugin.json | 2 +- plugin/.codex-plugin/plugin.json | 2 +- src/functions/export-import.ts | 2 +- src/types.ts | 2 +- src/version.ts | 2 +- test/export-import.test.ts | 2 +- 9 files changed, 50 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c73c185..0188e05a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,48 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [Unreleased] +## [0.9.21] — 2026-05-19 + +Quality + integration wave. Headline: native OpenCode plugin with full Claude Code hook parity ([#237](https://github.com/rohitg00/agentmemory/pull/237) by [@cl0ckt0wer](https://github.com/cl0ckt0wer)). 
Ten more PRs alongside: `memory_recall` returning the wrong shape, env-file `AGENTMEMORY_DROP_STALE_INDEX` silently ignored, hook scripts crashing on Windows usernames with spaces, viewer search inputs interrupting CJK IME composition, large sessions silently failing at the LLM context limit, lessons invisible to smart-search, Hermes plugin manifest missing hooks, cli onboarding crashing in non-TTY contexts, rebuildIndex blocking boot on large corpora, 25h embed-loop bottleneck during rebuild, and the v0.9.19 iii-console installer workaround can come out now that upstream is fixed. + +### Added + +- **OpenCode plugin with 22 auto-capture hooks** ([PR #237](https://github.com/rohitg00/agentmemory/pull/237) by [@cl0ckt0wer](https://github.com/cl0ckt0wer), closes [#236](https://github.com/rohitg00/agentmemory/issues/236) + [#244](https://github.com/rohitg00/agentmemory/issues/244)). Complete OpenCode plugin in `plugin/opencode/` matching Claude Code hook parity. Covers session lifecycle (8 hooks), messages (3), tool lifecycle (2), part tracking, permissions, task tracking, plus a two-layer enrichment pipeline (memory context on first turn, file enrichment on subsequent turns) and two slash commands (`/recall`, `/remember`). Full gap analysis in `plugin/opencode/README.md`. + +### Fixed + +- **`memory_recall` endpoint + format/token_budget forwarding** ([PR #516](https://github.com/rohitg00/agentmemory/pull/516) by [@serhiizghama](https://github.com/serhiizghama), closes [#507](https://github.com/rohitg00/agentmemory/issues/507) + [#440](https://github.com/rohitg00/agentmemory/issues/440)). MCP `memory_recall` always returned compact mode and dropped `format` + `token_budget` params. Two root causes fixed: standalone shim routed through `/agentmemory/smart-search` instead of `/agentmemory/search`, and the local-fallback path didn't read either param. Now routes correctly, forwards both params end-to-end, defaults `format` to `"full"` matching the MCP schema. 
+ +- **env-file `AGENTMEMORY_DROP_STALE_INDEX` flag now honored** ([PR #461](https://github.com/rohitg00/agentmemory/pull/461) by [@honor2030](https://github.com/honor2030), closes [#456](https://github.com/rohitg00/agentmemory/issues/456)). Setting the flag in `~/.agentmemory/.env` was silently ignored because the boot path read `process.env` directly. New `isDropStaleIndexEnabled()` helper reads merged env. Combined with [#455](https://github.com/rohitg00/agentmemory/issues/455) + [#469](https://github.com/rohitg00/agentmemory/issues/469) reports, this is the unblock path for the stale-index server-crash recovery loop. + +- **Windows hook scripts quote plugin paths correctly** ([PR #487](https://github.com/rohitg00/agentmemory/pull/487) by [@honor2030](https://github.com/honor2030), closes [#477](https://github.com/rohitg00/agentmemory/issues/477)). Hook command strings referenced `${CLAUDE_PLUGIN_ROOT}/scripts/*.mjs` without quotes — Windows users with spaces in their username had every hook crash. Quotes added + regression test. + +- **Viewer search inputs honor IME composition** ([PR #517](https://github.com/rohitg00/agentmemory/pull/517) by [@jonathanzhan1975](https://github.com/jonathanzhan1975)). CJK users typing in the viewer's search inputs hit mid-character interruption — every keystroke fired the `oninput=` re-render handler, breaking IME composition mid-syllable. New `bindImeSafeSearch` helper defers re-render until `compositionend`. + +- **Chunk large sessions to fit LLM context window** ([PR #472](https://github.com/rohitg00/agentmemory/pull/472) by [@efenex](https://github.com/efenex)). Sessions with >7000 observations silently failed at the LLM provider's context limit — the consolidation pipeline silently skipped the session. New chunking splits oversized sessions across multiple compress calls + restitches the narrative via a `REDUCE_SYSTEM` prompt. Legacy single-call path preserved when obs count is under the chunk size. 
Backfill script under `scripts/` for users hitting the pre-fix bug. + +- **Surface lessons in smart-search + diagnose tally** ([PR #473](https://github.com/rohitg00/agentmemory/pull/473) by [@efenex](https://github.com/efenex)). Closes the lesson round-trip with [#458](https://github.com/rohitg00/agentmemory/pull/458) (lessons auto-injected into `mem::context`): lessons are now also returned alongside hybrid search results in a separate `lessons` field on `smart-search`, and the `diagnose` health surface tallies per-store counts so the trust-shock pattern (save succeeds, recall empty, diagnose says 0) goes away. + +- **Declare all Hermes plugin hooks** ([PR #486](https://github.com/rohitg00/agentmemory/pull/486) by [@honor2030](https://github.com/honor2030)). The Hermes `plugin.yaml` manifest only declared 3 of the 6 implemented hooks. All 6 now declared (`prefetch`, `sync_turn`, `on_session_end`, `on_pre_compress`, `on_memory_write`, `system_prompt_block`). + +- **`rebuildIndex` non-blocking on boot** ([PR #500](https://github.com/rohitg00/agentmemory/pull/500) by [@efenex](https://github.com/efenex)). Boot path previously `await`-ed `rebuildIndex(kv)`, so the viewer + later boot steps stalled — on large corpora this was 25h+ of blocked startup. Replaced with `void rebuildIndex(kv).then(...).catch(...)` so the rebuild runs in the background. + +- **Batched embed calls in `rebuildIndex` (25h → 3h on large corpora)** ([PR #504](https://github.com/rohitg00/agentmemory/pull/504) by [@efenex](https://github.com/efenex)). The rebuild loop made one embed call per observation, paying full HTTP RTT per item. New `vectorIndexAddBatchGuarded` helper batches embeds (default 32, configurable via `REBUILD_EMBED_BATCH_SIZE`) and try/catches per-item failures. Measured 25h → 3h on a 250k-observation corpus. + +- **CLI skips onboarding prompts without a tty** ([PR #491](https://github.com/rohitg00/agentmemory/pull/491) by [@honor2030](https://github.com/honor2030)). 
Onboarding prompts crashed in non-interactive contexts (CI, `docker run -d`, piped input). New guard short-circuits with sensible defaults when stdin/stdout aren't TTYs or `CI=1`. + +### Changed + +- **Drop iii-console installer `--next` workaround** ([PR #546](https://github.com/rohitg00/agentmemory/pull/546)). v0.9.19 routed first-run iii-console install through `bash -s -- --next` to dodge an upstream tag-prefix bug at [iii-hq/iii#1652](https://github.com/iii-hq/iii/issues/1652). Upstream [iii-hq/iii#1660](https://github.com/iii-hq/iii/pull/1660) shipped 2026-05-19; `install.iii.dev/console/main/install.sh` is a CDN proxy serving upstream main HEAD so the fix is live without an iii release tag. Reverted to canonical bare `curl ... | sh`. + +### Infrastructure + +- 95 test files (was 92), **1067 tests pass** (was 1038) on `chore(release): v0.9.21`. +- Bundles 11 PRs: 1 contributor feature + 9 bug fixes across MCP / hooks / viewer / summarize / lessons / Hermes / rebuildIndex / CLI + 1 upstream-installer revert. +- New contributors landing first PRs this release: [@cl0ckt0wer](https://github.com/cl0ckt0wer), [@serhiizghama](https://github.com/serhiizghama), [@jonathanzhan1975](https://github.com/jonathanzhan1975). + +[0.9.21]: https://github.com/rohitg00/agentmemory/compare/v0.9.20...v0.9.21 + ## [0.9.20] — 2026-05-18 Hotfix: revert the Codex Stop → session-end chain shipped in v0.9.19. 
diff --git a/package.json b/package.json index 820fc8f7..e775c305 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@agentmemory/agentmemory", - "version": "0.9.20", + "version": "0.9.21", "description": "Persistent memory for AI coding agents, powered by iii-engine's three primitives", "type": "module", "main": "dist/index.mjs", diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 403295dd..96da3ae4 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -1,6 +1,6 @@ { "name": "@agentmemory/mcp", - "version": "0.9.20", + "version": "0.9.21", "description": "Standalone MCP server for agentmemory — thin shim that re-exposes @agentmemory/agentmemory's MCP entrypoint", "type": "module", "bin": { diff --git a/plugin/.claude-plugin/plugin.json b/plugin/.claude-plugin/plugin.json index a18860e4..e53f8088 100644 --- a/plugin/.claude-plugin/plugin.json +++ b/plugin/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "agentmemory", - "version": "0.9.20", + "version": "0.9.21", "description": "Persistent memory for AI coding agents -- captures tool usage, compresses via LLM, injects context into future sessions. 12 hooks, 51 MCP tools, 4 skills, real-time viewer.", "author": { "name": "Rohit Ghumare", diff --git a/plugin/.codex-plugin/plugin.json b/plugin/.codex-plugin/plugin.json index f8d676f6..0a7cc173 100644 --- a/plugin/.codex-plugin/plugin.json +++ b/plugin/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "agentmemory", - "version": "0.9.20", + "version": "0.9.21", "description": "Persistent memory for AI coding agents -- captures tool usage, compresses via LLM, injects context into future sessions. 
6 hooks, 51 MCP tools, 4 skills, real-time viewer.", "author": { "name": "Rohit Ghumare", diff --git a/src/functions/export-import.ts b/src/functions/export-import.ts index 674b14da..4c997630 100644 --- a/src/functions/export-import.ts +++ b/src/functions/export-import.ts @@ -176,7 +176,7 @@ export function registerExportImportFunction(sdk: ISdk, kv: StateKV): void { const strategy = data.strategy || "merge"; const importData = data.exportData; - const supportedVersions = new Set(["0.3.0", "0.4.0", "0.5.0", "0.6.0", "0.6.1", "0.7.0", "0.7.2", "0.7.3", "0.7.4", "0.7.5", "0.7.6", "0.7.7", "0.7.9", "0.8.0", "0.8.1", "0.8.2", "0.8.3", "0.8.4", "0.8.5", "0.8.6", "0.8.7", "0.8.8", "0.8.9", "0.8.10", "0.8.11", "0.8.12", "0.8.13", "0.9.0", "0.9.1", "0.9.2", "0.9.3", "0.9.4", "0.9.5", "0.9.6", "0.9.7", "0.9.8", "0.9.9", "0.9.10", "0.9.11", "0.9.12", "0.9.13", "0.9.14", "0.9.15", "0.9.16", "0.9.17", "0.9.18", "0.9.19", "0.9.20"]); + const supportedVersions = new Set(["0.3.0", "0.4.0", "0.5.0", "0.6.0", "0.6.1", "0.7.0", "0.7.2", "0.7.3", "0.7.4", "0.7.5", "0.7.6", "0.7.7", "0.7.9", "0.8.0", "0.8.1", "0.8.2", "0.8.3", "0.8.4", "0.8.5", "0.8.6", "0.8.7", "0.8.8", "0.8.9", "0.8.10", "0.8.11", "0.8.12", "0.8.13", "0.9.0", "0.9.1", "0.9.2", "0.9.3", "0.9.4", "0.9.5", "0.9.6", "0.9.7", "0.9.8", "0.9.9", "0.9.10", "0.9.11", "0.9.12", "0.9.13", "0.9.14", "0.9.15", "0.9.16", "0.9.17", "0.9.18", "0.9.19", "0.9.20", "0.9.21"]); if (!supportedVersions.has(importData.version)) { return { success: false, diff --git a/src/types.ts b/src/types.ts index 87e274d1..72e347b3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -303,7 +303,7 @@ export interface ExportPagination { } export interface ExportData { - version: "0.3.0" | "0.4.0" | "0.5.0" | "0.6.0" | "0.6.1" | "0.7.0" | "0.7.2" | "0.7.3" | "0.7.4" | "0.7.5" | "0.7.6" | "0.7.7" | "0.7.9" | "0.8.0" | "0.8.1" | "0.8.2" | "0.8.3" | "0.8.4" | "0.8.5" | "0.8.6" | "0.8.7" | "0.8.8" | "0.8.9" | "0.8.10" | "0.8.11" | "0.8.12" | "0.8.13" | "0.9.0" | 
"0.9.1" | "0.9.2" | "0.9.3" | "0.9.4" | "0.9.5" | "0.9.6" | "0.9.7" | "0.9.8" | "0.9.9" | "0.9.10" | "0.9.11" | "0.9.12" | "0.9.13" | "0.9.14" | "0.9.15" | "0.9.16" | "0.9.17" | "0.9.18" | "0.9.19" | "0.9.20"; + version: "0.3.0" | "0.4.0" | "0.5.0" | "0.6.0" | "0.6.1" | "0.7.0" | "0.7.2" | "0.7.3" | "0.7.4" | "0.7.5" | "0.7.6" | "0.7.7" | "0.7.9" | "0.8.0" | "0.8.1" | "0.8.2" | "0.8.3" | "0.8.4" | "0.8.5" | "0.8.6" | "0.8.7" | "0.8.8" | "0.8.9" | "0.8.10" | "0.8.11" | "0.8.12" | "0.8.13" | "0.9.0" | "0.9.1" | "0.9.2" | "0.9.3" | "0.9.4" | "0.9.5" | "0.9.6" | "0.9.7" | "0.9.8" | "0.9.9" | "0.9.10" | "0.9.11" | "0.9.12" | "0.9.13" | "0.9.14" | "0.9.15" | "0.9.16" | "0.9.17" | "0.9.18" | "0.9.19" | "0.9.20" | "0.9.21"; exportedAt: string; sessions: Session[]; observations: Record; diff --git a/src/version.ts b/src/version.ts index 35bfcbb0..8a1b6acf 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1 +1 @@ -export const VERSION = "0.9.20"; +export const VERSION = "0.9.21"; diff --git a/test/export-import.test.ts b/test/export-import.test.ts index 4426ce8e..373d2518 100644 --- a/test/export-import.test.ts +++ b/test/export-import.test.ts @@ -119,7 +119,7 @@ describe("Export/Import Functions", () => { it("export produces valid ExportData structure", async () => { const result = (await sdk.trigger("mem::export", {})) as ExportData; - expect(result.version).toBe("0.9.20"); + expect(result.version).toBe("0.9.21"); expect(result.exportedAt).toBeDefined(); expect(result.sessions.length).toBe(1); expect(result.sessions[0].id).toBe("ses_1"); From e9dc710e5623106363cf2735beaff901eb1d5a46 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Wed, 20 May 2026 11:10:46 +0100 Subject: [PATCH 24/34] ci: cross-platform matrix + paths-ignore + concurrency (#556) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: cross-platform matrix + paths-ignore + concurrency 1. 
**OS matrix** — Linux + Windows + macOS, both Node 20 + 22. 6 cells, ~3min each, ~18min wall time. Direct test against the class of bug #487 caught: hooks crashing on Windows usernames with spaces. Pre-merge Linux-only CI meant that bug landed in main + a release. fail-fast: false so a flake on one cell doesn't mask whether the same failure reproduces elsewhere. 2. **paths-ignore** — skip CI runs on README / CHANGELOG / docs / website / assets / .md / .mdx pushes. ~half the runner minutes back on doc-only churn. Source / config / workflow changes always run. 3. **concurrency + cancel-in-progress** — PR force-pushes cancel in-flight runs instead of piling them up. Push to main protected (concurrency group still scoped to ref, no cancel for main pushes). Plus minor hardening: persist-credentials: false on the checkout step so the GITHUB_TOKEN doesn't land in .git/config. What was NOT lifted (rationale per plan): - Per-package reusable workflows (Rust/Python/Homebrew — non-TS). - License-header check (no per-file Apache banners in agentmemory). - CLA bot (defer until external PR volume justifies friction). - tsc --noEmit lint job (codebase has ~10 pre-existing type errors tsdown skips; gating CI on those would block every PR until fixed; tracked as separate cleanup). - Smoke test (`agentmemory demo + livez`) — defer to its own PR with its own validation cycle. - Codecov badge — defer until baseline is set. * ci(windows): force bash shell so build script's POSIX idioms work Windows runners default to cmd.exe for npm run scripts; the build script uses POSIX idioms (`cp ... 2>/dev/null || true`, `mkdir -p`) that cmd doesn't parse. ubuntu + macos already use bash by default so this is a Windows-only behaviour change. Alternative: rewrite the build script in Node. Bigger lift, not minimal.
* ci(windows): point npm script-shell at git-bash before build `shell: bash` on the step only sets the shell for the step's own runner; `npm run` still spawns its inner script via npm's `script-shell` config, which defaults to cmd.exe on Windows. Configure npm to use Git-Bash (preinstalled on GitHub-hosted Windows runners) so `npm run build` and `npm run test` execute the build script the same way ubuntu + macos do. Step is gated on `runner.os == 'Windows'` so it's a no-op on the other matrix cells. * ci: drop windows-latest from matrix (obsidian-export hardcoded POSIX paths) Windows runners fail on test/obsidian-export.test.ts because the test + src hardcode `/tmp/...` POSIX paths that don't resolve on the D:\ drive Windows uses. Fixing it cleanly requires reworking src/functions/obsidian-export.ts to use os.tmpdir() + path.join, which is a separate scope. Drop windows from the matrix for now. Ship ubuntu + macos coverage (real darwin/linux divergence catch) and file a follow-up to make obsidian-export cross-platform so Windows can be added back. * test(fs-watcher): bump waits to 1500ms + describe retry for macos fsevents flake --- .github/workflows/ci.yml | 45 +++++++++++++++++++++++++++++++++++++++- test/fs-watcher.test.ts | 10 ++++----- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 41c99434..b9671280 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,19 +1,62 @@ name: CI +# `paths-ignore` keeps doc-only / website / README / CHANGELOG churn from +# burning runner minutes. Source / config / workflow changes always run. +# `workflow_dispatch` gives a manual re-run button for flake debugging. 
on: push: branches: [main] + paths-ignore: + - "README.md" + - "CHANGELOG.md" + - "AGENTS.md" + - "ROADMAP.md" + - "website/**" + - "docs/**" + - "assets/**" + - "deploy/**/README.md" + - "**/*.md" + - "**/*.mdx" pull_request: branches: [main] + paths-ignore: + - "README.md" + - "CHANGELOG.md" + - "AGENTS.md" + - "ROADMAP.md" + - "website/**" + - "docs/**" + - "assets/**" + - "deploy/**/README.md" + - "**/*.md" + - "**/*.mdx" + workflow_dispatch: + +# Cancel in-flight PR runs when a force-push lands. Keep push runs to +# protect against partial state on main. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: test: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: + # Don't bail the whole matrix on one cell's failure — we want to + # see whether the same failure reproduces across OSes (e.g. + # whether a flake is platform-specific or universal). + fail-fast: false matrix: + # Windows held back: test/obsidian-export.test.ts has hardcoded + # POSIX paths (`/tmp/...`) that fail on D:\ drive runners. + # src/functions/obsidian-export.ts needs os.tmpdir() + path.join + # rework before Windows can be added back. Tracked as follow-up. 
+ os: [ubuntu-latest, macos-latest] node-version: [20, 22] steps: - uses: actions/checkout@v6 + with: + persist-credentials: false - uses: actions/setup-node@v6 with: node-version: ${{ matrix.node-version }} diff --git a/test/fs-watcher.test.ts b/test/fs-watcher.test.ts index 76212b06..48c1b094 100644 --- a/test/fs-watcher.test.ts +++ b/test/fs-watcher.test.ts @@ -12,7 +12,7 @@ function wait(ms: number): Promise { return new Promise((r) => setTimeout(r, ms)); } -describe("FilesystemWatcher", () => { +describe("FilesystemWatcher", { retry: 2 }, () => { let root: string; const originalFetch = globalThis.fetch; let captured: Array<{ url: string; body: unknown; headers: Record }>; @@ -49,7 +49,7 @@ describe("FilesystemWatcher", () => { w.start(); try { writeFileSync(join(root, "notes.md"), "hello world\n"); - await wait(800); + await wait(1500); expect(captured.length).toBeGreaterThanOrEqual(1); const obs = captured[captured.length - 1]; expect(obs.url).toBe("http://localhost:3111/agentmemory/observe"); @@ -87,7 +87,7 @@ describe("FilesystemWatcher", () => { w.start(); try { unlinkSync(join(root, "old.md")); - await wait(800); + await wait(1500); const deletes = captured.filter( (c) => (c.body as { data: { changeKind: string } }).data?.changeKind === "file_delete", ); @@ -116,7 +116,7 @@ describe("FilesystemWatcher", () => { w.start(); try { writeFileSync(join(root, "node_modules", "ignored.js"), "x"); - await wait(800); + await wait(1500); const matches = captured.filter((c) => (c.body as { data: { files: string[] } }).data?.files?.some((f) => f.includes("ignored.js")), ); @@ -136,7 +136,7 @@ describe("FilesystemWatcher", () => { w.start(); try { writeFileSync(join(root, "secret.md"), "bearer test\n"); - await wait(800); + await wait(1500); expect(captured.length).toBeGreaterThanOrEqual(1); const headers = captured[captured.length - 1].headers as Record; expect(headers.authorization).toBe("Bearer shhh"); From 7fb72f40108516e80979a4ebd142deab447d7aa9 Mon Sep 17 
00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Wed, 20 May 2026 14:11:52 +0100 Subject: [PATCH 25/34] feat(eval): pluggable benchmark harness with in-house coding-agent corpus (#562) * feat(eval): pluggable benchmark harness with in-house coding-agent corpus Adds eval/ tree (outside files field so npm tarball stays thin) with Adapter interface, three reference adapters (grep / vector / agentmemory-hybrid), two benchmarks (LongMemEval _s public, coding-agent-life-v1 in-house 15 sessions), scoring (P@K, R@K, hit, top-gold-rank), NDJSON output, sandbox script. coding-agent-life-v1 published scorecard at docs/benchmarks/2026-05-20-coding-agent-life-v1.md: agentmemory-hybrid R@5=0.967 P@5=0.578 (100% hit) vs grep R@5=0.967 P@5=0.267. 2.2x better precision on identical input, sandbox-reproducible. Adapter contract: init(sessions, config) -> State; query(q, state, k) -> RankedDoc[] npm scripts: npm run eval:coding-life (no download, no API key for grep) npm run eval:longmemeval (needs OPENAI key + 278MB download) eval/scripts/sandbox.sh boots clean agentmemory + iii-engine on ports 3411/3412 with isolated data dir; tears down on exit. README headline updated. 1072/1072 tests pass + 5 new eval tests. 
* fix(eval): address review findings on benchmark harness - agentmemory adapter: prefer row.sessionId before observationToSession lookup - vector adapter: validate embedBatch response (length, indexes, non-empty rows) - coding-life: positive-int guard on --k; wrap query loop in try/finally so teardown runs - longmemeval: positive-int guards on --k/--limit/--stratify; per-question try/finally - load: throw on haystack_session_ids vs haystack_sessions length mismatch - score: P@K denominator is k (requested cutoff) not topK.length - sandbox.sh: guard rm -rf with non-empty + /tmp/ prefix check - README: drop unsafe rm "$(which iii)"; instruct ~/.local/bin + PATH instead; add language tag to repo-layout fenced block - sessions.json: fix "two-phase" -> "three-phase" wording mismatch --- .gitignore | 7 + README.md | 11 ++ .../2026-05-20-coding-agent-life-v1.md | 76 +++++++++++ docs/benchmarks/TEMPLATE.md | 54 ++++++++ eval/README.md | 111 +++++++++++++++ eval/data/coding-agent-life-v1/queries.json | 107 +++++++++++++++ eval/data/coding-agent-life-v1/sessions.json | 77 +++++++++++ eval/runner/adapters/agentmemory.ts | 93 +++++++++++++ eval/runner/adapters/grep.ts | 36 +++++ eval/runner/adapters/vector.ts | 108 +++++++++++++++ eval/runner/coding-life.ts | 101 ++++++++++++++ eval/runner/load.ts | 54 ++++++++ eval/runner/longmemeval.ts | 126 ++++++++++++++++++ eval/runner/score.ts | 78 +++++++++++ eval/runner/types.ts | 38 ++++++ eval/scripts/sandbox.sh | 117 ++++++++++++++++ package.json | 4 +- test/eval-adapters.test.ts | 92 +++++++++++++ 18 files changed, 1289 insertions(+), 1 deletion(-) create mode 100644 docs/benchmarks/2026-05-20-coding-agent-life-v1.md create mode 100644 docs/benchmarks/TEMPLATE.md create mode 100644 eval/README.md create mode 100644 eval/data/coding-agent-life-v1/queries.json create mode 100644 eval/data/coding-agent-life-v1/sessions.json create mode 100644 eval/runner/adapters/agentmemory.ts create mode 100644 eval/runner/adapters/grep.ts create 
mode 100644 eval/runner/adapters/vector.ts create mode 100644 eval/runner/coding-life.ts create mode 100644 eval/runner/load.ts create mode 100644 eval/runner/longmemeval.ts create mode 100644 eval/runner/score.ts create mode 100644 eval/runner/types.ts create mode 100755 eval/scripts/sandbox.sh create mode 100644 test/eval-adapters.test.ts diff --git a/.gitignore b/.gitignore index 585d0f49..ba6af995 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ dist/ plugin/scripts/*.map plugin/scripts/*.d.mts data/ +!eval/data/ +!eval/data/** data-*/ agentmemory-debug/ .gstack/ @@ -22,3 +24,8 @@ package-lock.json pnpm-lock.yaml yarn.lock integrations/hermes/__pycache__/ + +# Eval reports (transient; published scorecards live in docs/benchmarks/) +eval/reports/ +# LongMemEval download is 278MB; fetched on demand +eval/data/longmemeval/ diff --git a/README.md b/README.md index cfa87bc4..96e804d4 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,15 @@ npx @agentmemory/agentmemory ### Retrieval Accuracy +**coding-agent-life-v1** (in-house corpus, sandbox-reproducible) + +| Adapter | P@5 | R@5 | Top-5 hit rate | p50 latency | +|---|---|---|---|---| +| **agentmemory hybrid** | **0.578** | **0.967** | **15 / 15** | 14 ms | +| grep baseline | 0.267 | 0.967 | 15 / 15 | 0 ms | + +100% top-5 hit rate. **2.2×** better precision than the grep baseline on identical input. Full per-type breakdown: [`docs/benchmarks/2026-05-20-coding-agent-life-v1.md`](docs/benchmarks/2026-05-20-coding-agent-life-v1.md). + **LongMemEval-S** (ICLR 2025, 500 questions) | System | R@5 | R@10 | MRR | @@ -233,6 +242,8 @@ npx @agentmemory/agentmemory > Embedding model: `all-MiniLM-L6-v2` (local, free, no API key). Full reports: [`benchmark/LONGMEMEVAL.md`](benchmark/LONGMEMEVAL.md), [`benchmark/QUALITY.md`](benchmark/QUALITY.md), [`benchmark/SCALE.md`](benchmark/SCALE.md). 
Competitor comparison: [`benchmark/COMPARISON.md`](benchmark/COMPARISON.md) — agentmemory vs mem0, Letta, Khoj, claude-mem, Hippo. +**Reproduce locally:** [`eval/README.md`](eval/README.md) — adapter-pluggable harness for LongMemEval `_s` (public 500-Q) + `coding-agent-life-v1` (in-house 15-session corpus). Grep / vector / agentmemory adapters score side-by-side, NDJSON output, published scorecards land in [`docs/benchmarks/`](docs/benchmarks/). + ---

vs Competitors

diff --git a/docs/benchmarks/2026-05-20-coding-agent-life-v1.md b/docs/benchmarks/2026-05-20-coding-agent-life-v1.md new file mode 100644 index 00000000..f280b27d --- /dev/null +++ b/docs/benchmarks/2026-05-20-coding-agent-life-v1.md @@ -0,0 +1,76 @@ +# 2026-05-20 — coding-agent-life-v1 (v0.9.21) + +**Commit:** `e9dc710` +**Bench:** coding-agent-life-v1 (15 sessions, 15 queries) +**N:** 15 +**K:** 5 +**Hardware:** macOS 15 (Apple Silicon) +**agentmemory:** v0.9.21 +**iii-engine:** v0.11.2 +**Embedding provider:** local default +**Sandbox:** isolated data dir at `/tmp/agentmemory-eval-sandbox/`, ports 3411/3412 + +## Headline + +`agentmemory-hybrid` hits **100% top-5 hit rate**, R@5 = **0.967**, P@5 = **0.578**. + +Same corpus, grep baseline: R@5 = 0.967, P@5 = 0.267 — same recall, but **2.2× worse precision**. Hybrid's top-5 is majority gold (≈58%); grep's top-5 is nearly three-quarters noise (≈27% gold). + +## Per-adapter + +| Adapter | P@5 | R@5 | Hit rate | p50 latency | +|---|---|---|---|---| +| grep (tokenized substring) | 0.267 | 0.967 | 15 / 15 | 0 ms | +| `agentmemory-hybrid` | **0.578** | **0.967** | **15 / 15** | 14 ms | + +`agentmemory-hybrid` runs through the production smart-search endpoint (`POST /agentmemory/smart-search`) so it exercises the full BM25 + embedding + reranker stack.
+ +## Per-question-type + +P@5, grep vs `agentmemory-hybrid`: + +| Type | grep | hybrid | hybrid lift | +|---|---|---|---| +| single-session-bug | 0.20 | 0.33 | 1.7× | +| single-session-infra (n=2) | 0.20 | 0.50 | 2.5× | +| single-session-refactor | 0.20 | 0.50 | 2.5× | +| single-session-feature | 0.50 | 0.50 | tie | +| single-session-test | 0.20 | 0.33 | 1.7× | +| single-session-perf | 0.20 | 0.50 | 2.5× | +| single-session-api | 0.20 | 0.50 | 2.5× | +| single-session-db | 0.20 | 0.50 | 2.5× | +| single-session-release | 0.20 | 0.33 | 1.7× | +| multi-session-causal | 0.40 | 0.40 | tie | +| preference (n=2) | 0.20 | 0.42 | 2.1× | +| multi-session-review | 0.40 | 0.67 | 1.7× | +| temporal (R@5 = 0.50 grep / 1.00 hybrid) | 0.50 | 0.67 | 1.3× | + +Temporal queries (`What was shipped on April 8th 2026?`) need both gold sessions to score full recall. grep finds 1/2; hybrid finds 2/2. + +## Methodology + +- 15 fictional Claude Code sessions across a 10-day stretch of a Rust CLI project (`shipctl`) — bug fixes, refactors, infra, perf, schema migrations, preferences, post-mortem +- 15 hand-graded queries with `goldSessionIds[]` covering single-session, multi-session causal, multi-session review, preference, temporal +- Each session ingested via `POST /agentmemory/remember` with `type=eval-session` and `concepts=[session_id]` +- Each query hits `POST /agentmemory/smart-search` with `limit=50`; dedupe by session ID; truncate to K=5 +- No LLM in the retrieval loop +- Sandbox: clean `~/.agentmemory` via `HOME` override + alt ports (3411/3412) so no cross-contamination from a user's real store + +## Reproduce + +```sh +git checkout e9dc710 +npm install --legacy-peer-deps +npm run build + +source eval/scripts/sandbox.sh +npm run eval:coding-life -- --adapters grep,agentmemory +``` + +Outputs land in `eval/reports/coding-life/`: `scores.ndjson` (per-query rows) and `summary.json` (per-adapter and per-type aggregates). 
+ +## Notes + +- The single-session-feature tie (`Which PR introduced helm chart support?`) is interesting: query says `PR introduced helm chart` and gold session has `helm chart` literally — grep keeps pace on lexical exactness; hybrid matches it but doesn't outperform. +- The corpus is intentionally small for fast iteration. Hardening targets: paraphrased queries, synonym substitution, in-corpus distractors with shared keywords, longer multi-session chains. +- Vector adapter not measured here — requires `OPENAI_API_KEY`; will be added in a follow-up scorecard alongside LongMemEval `_s`. diff --git a/docs/benchmarks/TEMPLATE.md b/docs/benchmarks/TEMPLATE.md new file mode 100644 index 00000000..b830e24e --- /dev/null +++ b/docs/benchmarks/TEMPLATE.md @@ -0,0 +1,54 @@ +# <YYYY-MM-DD> — <bench> (<version>) + +**Commit:** `<sha>` +**Bench:** LongMemEval `_s` / coding-agent-life-v1 / ... +**N:** 500 / 15 / ... +**K:** 5 +**Hardware:** macos-15 / ubuntu-22.04 / ... +**OpenAI model:** text-embedding-3-small +**Anthropic model:** N/A (no LLM in retrieval loop) + +## Headline + +agentmemory-hybrid: **R@5 = XX.XX%**, P@5 = XX.XX%, p50 latency = XXms + +Beats grep baseline by +X.Xpt R@5, vector by +X.Xpt R@5. + +## Per-adapter + +| Adapter | P@5 | R@5 | Hit rate | p50 latency | +|---|---|---|---|---| +| grep | | | | | +| vector | | | | | +| agentmemory-hybrid | | | | | + +## Per-question-type + +| Type | grep R@5 | vector R@5 | agentmemory R@5 | +|---|---|---|---| +| single-session-bug | | | | +| single-session-refactor | | | | +| preference | | | | +| multi-session-causal | | | | +| temporal | | | | + +## Methodology + +- Sessions ingested via `POST /agentmemory/remember` with `type=eval-session` +- Queries hit `POST /agentmemory/smart-search` with `limit=max(k*10, 50)` +- No LLM in retrieval loop. Direct rank from hybrid scoring.
+- Ranked results are deduplicated by sessionId before truncating to K +- Latency measured as init+query for LongMemEval (per-question fresh state), query-only for coding-life (shared state) + +## Reproduce + +```sh +git checkout <sha> +npm install --legacy-peer-deps +OPENAI_API_KEY=sk-... AGENTMEMORY_BASE_URL=http://localhost:3111 \ + npm run eval:longmemeval -- --stratify 10 +``` + +## Notes + + diff --git a/eval/README.md b/eval/README.md new file mode 100644 index 00000000..7f295367 --- /dev/null +++ b/eval/README.md @@ -0,0 +1,111 @@ +# agentmemory-evals + +Public benchmarks for agentmemory's hybrid memory stack (BM25 + embeddings + consolidation + graph). + +Two families, both reproducible: + +- **LongMemEval** — public 500-question retrieval benchmark over multi-session chat +- **coding-agent-life-v1** — in-house corpus of 15 fictional Claude Code sessions for a Rust CLI project (`shipctl`), with 15 hand-graded queries covering bug fixes, refactors, preferences, and multi-session causal reasoning + +## Adapters + +| Adapter | Backend | API key needed | +|---|---|---| +| `grep` | Tokenized substring match | none | +| `vector` | OpenAI `text-embedding-3-small` + cosine | `OPENAI_API_KEY` | +| `agentmemory` | Running agentmemory server, smart-search endpoint | none (auth optional via `AGENTMEMORY_SECRET`) | + +## Sandbox first + +Running the `agentmemory` adapter against your real `~/.agentmemory` directory pollutes the eval with pre-existing memories AND pollutes your real store with eval test data. Always sandbox. + +`eval/scripts/sandbox.sh` spins up a clean agentmemory + iii-engine on ports 3411/3412 with state in `/tmp/agentmemory-eval-sandbox/`, exports `AGENTMEMORY_BASE_URL`, and tears down on exit. + +```sh +source eval/scripts/sandbox.sh +npm run eval:coding-life -- --adapters grep,agentmemory +``` + +Requires iii v0.11.2 on PATH (agentmemory pin).
If you already have a different version installed, install the pinned build into `~/.local/bin` and make sure that directory comes first on `PATH`: + +```sh +mkdir -p ~/.local/bin +curl -fsSL https://github.com/iii-hq/iii/releases/download/iii/v0.11.2/iii-aarch64-apple-darwin.tar.gz | tar -xz -C ~/.local/bin +export PATH="$HOME/.local/bin:$PATH" # add to ~/.zshrc or ~/.bashrc for persistence +``` + +## Quickstart + +### coding-agent-life-v1 (in-house, no download) + +```sh +# grep baseline, no sandbox needed +npm run eval:coding-life -- --adapters grep + +# add agentmemory + vector (sandbox + OpenAI key) +source eval/scripts/sandbox.sh +OPENAI_API_KEY=sk-... npm run eval:coding-life -- --adapters grep,vector,agentmemory +``` + +### LongMemEval `_s` (public, 278MB download) + +```sh +mkdir -p ~/datasets/longmemeval +curl -Lo ~/datasets/longmemeval/longmemeval_s.json \ + https://huggingface.co/datasets/xiaowu0162/longmemeval/resolve/main/longmemeval_s + +source eval/scripts/sandbox.sh + +# Stratified sample of 10 per type (fast iteration, ~$0.20 OpenAI cost) +OPENAI_API_KEY=sk-... LONGMEMEVAL_PATH=~/datasets/longmemeval/longmemeval_s.json \ + npm run eval:longmemeval -- --stratify 10 + +# Full 500 questions × 3 adapters (~$2 OpenAI cost) +OPENAI_API_KEY=sk-... 
LONGMEMEVAL_PATH=~/datasets/longmemeval/longmemeval_s.json \ + npm run eval:longmemeval +``` + +## Repo layout + +```text +eval/ +├── README.md +├── runner/ +│ ├── types.ts Adapter, Question, RankedDoc, ScoreRow +│ ├── score.ts P@K, R@K, aggregation +│ ├── load.ts LongMemEval JSON → Question[] +│ ├── adapters/ +│ │ ├── grep.ts tokenized substring baseline +│ │ ├── vector.ts OpenAI embeddings + cosine +│ │ └── agentmemory.ts POST /agentmemory/{remember,smart-search} +│ ├── longmemeval.ts public benchmark runner +│ └── coding-life.ts in-house benchmark runner +└── data/ + └── coding-agent-life-v1/ + ├── sessions.json 15 fictional sessions (~6KB) + └── queries.json 15 queries with gold session IDs +``` + +Reports land in `eval/reports/<bench>/` (gitignored): `scores.ndjson` + `summary.json`. + +Published scorecards land in `docs/benchmarks/YYYY-MM-DD-<bench>.md`. + +## Writing a new adapter + +1. Implement `Adapter` from `eval/runner/types.ts`: + ```ts + import type { Adapter } from "../types.js"; + export const myAdapter: Adapter = { + name: "my-adapter", + async init(sessions, config) { /* index */ return state; }, + async query(q, state, k) { /* search */ return ranked; }, + }; + ``` +2. Register in `eval/runner/{longmemeval,coding-life}.ts` `ADAPTERS` map. +3. Run against `coding-agent-life-v1` to sanity-check before committing OpenAI spend on LongMemEval. + +## Why a benchmark for agentmemory + +agentmemory ships BM25 + embeddings + consolidation + graph retrieval. Numbers from those layers should be measured against grep/vector baselines so the value of each layer is provable. + +The in-house corpus is small on purpose (15 sessions) — covers single-session, multi-session, preference, and temporal question types without taking 15 minutes to run. LongMemEval gives the public-comparison axis.
diff --git a/eval/data/coding-agent-life-v1/queries.json b/eval/data/coding-agent-life-v1/queries.json new file mode 100644 index 00000000..5603e8a0 --- /dev/null +++ b/eval/data/coding-agent-life-v1/queries.json @@ -0,0 +1,107 @@ +[ + { + "id": "q-001", + "type": "single-session-bug", + "question": "Where did we land the auth env var precedence fix?", + "answer": "PR #11 with SHIPCTL_TOKEN > SHIP_TOKEN > SC_TOKEN precedence", + "goldSessionIds": ["sess-001"] + }, + { + "id": "q-002", + "type": "single-session-infra", + "question": "What was the multi-arch Docker fix?", + "answer": "Added --platform=$BUILDPLATFORM and BUILDX_PLATFORMS for amd64+arm64", + "goldSessionIds": ["sess-002"] + }, + { + "id": "q-003", + "type": "single-session-refactor", + "question": "Where did we consolidate the retry logic?", + "answer": "src/retry.rs with exponential backoff base=200ms cap=30s full jitter", + "goldSessionIds": ["sess-003"] + }, + { + "id": "q-004", + "type": "single-session-feature", + "question": "Which PR introduced helm chart support?", + "answer": "PR #14", + "goldSessionIds": ["sess-004"] + }, + { + "id": "q-005", + "type": "single-session-test", + "question": "Which test was flaky on macos and how was it fixed?", + "answer": "fs-watcher emits_changekind_file_delete; bumped wait to 1500ms + retry: 2", + "goldSessionIds": ["sess-005"] + }, + { + "id": "q-006", + "type": "single-session-perf", + "question": "How did we fix the memory leak?", + "answer": "Replaced unbounded HashMap with LruCache cap=10k in src/cache.rs (PR #16)", + "goldSessionIds": ["sess-006"] + }, + { + "id": "q-007", + "type": "single-session-api", + "question": "How did we handle the github API rate limit?", + "answer": "Conditional requests with If-None-Match etag and 304 caching via http-cache", + "goldSessionIds": ["sess-007"] + }, + { + "id": "q-008", + "type": "single-session-db", + "question": "What was the schema migration approach for run_history?", + "answer": "Three-phase: nullable 
column + dual-write, backfill + flip reads, drop old column", + "goldSessionIds": ["sess-008"] + }, + { + "id": "q-009", + "type": "single-session-infra", + "question": "How is the docs site deployed?", + "answer": "GitHub Actions docs.yml workflow + mdbook build + Cloudflare Pages on shipctl.dev", + "goldSessionIds": ["sess-009"] + }, + { + "id": "q-010", + "type": "single-session-release", + "question": "Which PR set up the cross-platform release pipeline?", + "answer": "PR #19 with cross-rs for linux and native macos/windows builds", + "goldSessionIds": ["sess-010"] + }, + { + "id": "q-011", + "type": "multi-session-causal", + "question": "What was the root cause of the staging incident, and where was it fixed?", + "answer": "SHIPCTL_TOKEN unset caused fallback to bad SC_TOKEN; fixed in PR #11 (sess-001) with precedence test; documented in post-mortem (sess-014)", + "goldSessionIds": ["sess-001", "sess-014"] + }, + { + "id": "q-012", + "type": "preference", + "question": "Which async runtime does the team prefer for new code?", + "answer": "async-std (decided in arch review, not tokio)", + "goldSessionIds": ["sess-015"] + }, + { + "id": "q-013", + "type": "preference", + "question": "What are the user's formatting preferences?", + "answer": "2-space indent in yaml, tab in rust, rust-analyzer over rls, cargo clippy before commit", + "goldSessionIds": ["sess-012"] + }, + { + "id": "q-014", + "type": "multi-session-review", + "question": "What was the race condition Aria fixed in the schema migration?", + "answer": "Dual-write step had race when two writers hit same row; Aria added per-row advisory lock", + "goldSessionIds": ["sess-008", "sess-013"] + }, + { + "id": "q-015", + "type": "temporal", + "question": "What was shipped on April 8th 2026?", + "answer": "Auth env-var precedence fix in PR #11, then started docker arm64 work", + "goldSessionIds": ["sess-001", "sess-011"] + } +] diff --git a/eval/data/coding-agent-life-v1/sessions.json 
b/eval/data/coding-agent-life-v1/sessions.json new file mode 100644 index 00000000..71134134 --- /dev/null +++ b/eval/data/coding-agent-life-v1/sessions.json @@ -0,0 +1,77 @@ +[ + { + "id": "sess-001", + "timestamp": "2026-04-08T09:14:00Z", + "content": "[user] auth check broken on staging when SHIPCTL_TOKEN unset, falling back to wrong env var\n[assistant] looked at src/auth.rs:42. precedence is SHIPCTL_TOKEN > SHIP_TOKEN > legacy SC_TOKEN. legacy path didn't validate. fixing precedence + adding length check.\n[user] yeah ship it\n[assistant] PR #11 opened with fix + regression test. covers all three env names." + }, + { + "id": "sess-002", + "timestamp": "2026-04-08T14:30:00Z", + "content": "[user] docker build failing for arm64 on M2 runner\n[assistant] missing target in Dockerfile FROM line. need --platform=$BUILDPLATFORM on builder stage. updated Dockerfile + Makefile to set BUILDX_PLATFORMS=linux/amd64,linux/arm64. tested locally on apple silicon, builds clean." + }, + { + "id": "sess-003", + "timestamp": "2026-04-09T10:02:00Z", + "content": "[user] retry logic is duplicated across handlers. consolidate.\n[assistant] extracted retry_with_backoff into src/retry.rs. exponential backoff base=200ms cap=30s jitter=full. all 4 callsites now use it. cuts ~80 lines." + }, + { + "id": "sess-004", + "timestamp": "2026-04-10T11:45:00Z", + "content": "[user] add helm chart support\n[assistant] new subcommand `shipctl helm install`. wraps helm CLI, threads through --namespace and --values. PR #14 has the changes + 12 integration tests. Aria reviewed and approved." + }, + { + "id": "sess-005", + "timestamp": "2026-04-11T16:20:00Z", + "content": "[user] test flake on macos-22 runner. emits_changekind_file_delete keeps failing\n[assistant] fsevents propagation slower on apple silicon, the 800ms wait isn't enough. bumped to 1500ms + describe-level retry: 2. all 15 fs-watcher tests pass now." 
+ }, + { + "id": "sess-006", + "timestamp": "2026-04-12T09:50:00Z", + "content": "[user] memory leak in long-running daemon. RSS grows 5MB/min.\n[assistant] traced to unbounded HashMap in src/cache.rs. swapped for LruCache with 10k cap. RSS stable at 180MB after 6h soak test. PR #16." + }, + { + "id": "sess-007", + "timestamp": "2026-04-13T13:15:00Z", + "content": "[user] github API rate-limiting us when listing PRs. 5000/hr isn't enough during ci\n[assistant] added conditional requests with If-None-Match etag. cache 304 responses. drops actual API calls by ~70%. used http-cache crate." + }, + { + "id": "sess-008", + "timestamp": "2026-04-15T10:30:00Z", + "content": "[user] schema migration for run_history table needs zero-downtime\n[assistant] three-phase migration. phase 1: add nullable new column + dual-write. phase 2: backfill + flip reads. phase 3: drop old column. avoids ALTER TABLE locks. ran on staging, took 4s for 2M rows." + }, + { + "id": "sess-009", + "timestamp": "2026-04-16T15:40:00Z", + "content": "[user] docs site is stale. set up auto-deploy from main\n[assistant] added .github/workflows/docs.yml. mdbook build + cloudflare pages deploy. domain shipctl.dev wired up. preview deploys on PRs." + }, + { + "id": "sess-010", + "timestamp": "2026-04-17T11:00:00Z", + "content": "[user] release pipeline. cut v0.3.0 with binaries for linux/macos/windows\n[assistant] new .github/workflows/release.yml. cross-rs for static linux, native for macos + windows. uploads to gh release + crates.io publish. tag-triggered. PR #19 has the workflow." + }, + { + "id": "sess-011", + "timestamp": "2026-04-08T17:00:00Z", + "content": "[user] standup notes: shipped #11 auth fix this morning. starting on docker arm64 next." + }, + { + "id": "sess-012", + "timestamp": "2026-04-10T18:30:00Z", + "content": "[user] preferences: I like rust analyzer over rls. always run cargo clippy before commit. prefer 2-space indent in yaml but tab in rust." 
+ }, + { + "id": "sess-013", + "timestamp": "2026-04-15T19:00:00Z", + "content": "[assistant] reviewed Aria's PR #18 (schema migration). flagged race condition in dual-write step when two writers hit same row. Aria added per-row advisory lock. lgtm now." + }, + { + "id": "sess-014", + "timestamp": "2026-04-16T20:10:00Z", + "content": "[user] post-mortem from prod incident last week: SHIPCTL_TOKEN was unset in staging, fell back to bad SC_TOKEN which had wrong perms. delivery delayed 40min. action items: (1) precedence test (done in #11), (2) startup validation, (3) alert on auth fallback." + }, + { + "id": "sess-015", + "timestamp": "2026-04-17T16:45:00Z", + "content": "[user] preferences: stick to async-std not tokio for new code. team agreed in arch review." + } +] diff --git a/eval/runner/adapters/agentmemory.ts b/eval/runner/adapters/agentmemory.ts new file mode 100644 index 00000000..38028a7d --- /dev/null +++ b/eval/runner/adapters/agentmemory.ts @@ -0,0 +1,93 @@ +import type { Adapter, RankedDoc, Session } from "../types.js"; + +interface AgentMemoryState { + baseUrl: string; + secret?: string; + sessions: Session[]; + observationToSession: Map; +} + +interface RememberResponse { + memory?: { id?: string }; + observationId?: string; + id?: string; + observation?: { id?: string }; +} + +interface SmartSearchResponse { + results?: Array<{ + obsId?: string; + id?: string; + observationId?: string; + sessionId?: string; + score?: number; + content?: string; + }>; + observations?: Array<{ + obsId?: string; + id?: string; + sessionId?: string; + score?: number; + content?: string; + }>; +} + +function authHeaders(secret?: string): Record { + const h: Record = { "Content-Type": "application/json" }; + if (secret) h.Authorization = `Bearer ${secret}`; + return h; +} + +export const agentmemoryAdapter: Adapter = { + name: "agentmemory-hybrid", + async init(sessions, config) { + const baseUrl = (config?.baseUrl as string) ?? process.env.AGENTMEMORY_BASE_URL ?? 
"http://localhost:3111"; + const secret = (config?.secret as string) ?? process.env.AGENTMEMORY_SECRET; + const observationToSession = new Map(); + for (const s of sessions) { + const res = await fetch(`${baseUrl}/agentmemory/remember`, { + method: "POST", + headers: authHeaders(secret), + body: JSON.stringify({ + content: s.content, + type: "eval-session", + concepts: [s.id], + }), + }); + if (!res.ok) { + throw new Error(`remember failed for ${s.id}: ${res.status} ${await res.text()}`); + } + const body = (await res.json()) as RememberResponse; + const obsId = + body.memory?.id ?? body.observationId ?? body.id ?? body.observation?.id; + if (obsId) observationToSession.set(obsId, s.id); + } + return { baseUrl, secret, sessions, observationToSession }; + }, + async query(q, state, k) { + const res = await fetch(`${state.baseUrl}/agentmemory/smart-search`, { + method: "POST", + headers: authHeaders(state.secret), + body: JSON.stringify({ query: q, limit: Math.max(k * 10, 50) }), + }); + if (!res.ok) { + throw new Error(`smart-search failed: ${res.status} ${await res.text()}`); + } + const body = (await res.json()) as SmartSearchResponse; + const rows = body.results ?? body.observations ?? []; + const ranked: RankedDoc[] = []; + const seen = new Set(); + for (const row of rows) { + let sessionId = row.sessionId; + if (!sessionId) { + const memId = row.obsId ?? row.id ?? row.observationId; + sessionId = memId ? state.observationToSession.get(memId) : undefined; + } + if (!sessionId || seen.has(sessionId)) continue; + seen.add(sessionId); + ranked.push({ sessionId, score: row.score ?? 
0 }); + if (ranked.length >= k) break; + } + return ranked; + }, +}; diff --git a/eval/runner/adapters/grep.ts b/eval/runner/adapters/grep.ts new file mode 100644 index 00000000..28b18ea6 --- /dev/null +++ b/eval/runner/adapters/grep.ts @@ -0,0 +1,36 @@ +import type { Adapter, RankedDoc, Session } from "../types.js"; + +interface GrepState { + sessions: Session[]; +} + +function tokenize(s: string): string[] { + return s + .toLowerCase() + .replace(/[^a-z0-9_]+/g, " ") + .split(/\s+/) + .filter((t) => t.length > 2); +} + +export const grepAdapter: Adapter = { + name: "grep", + async init(sessions) { + return { sessions }; + }, + async query(q, state, k) { + const terms = tokenize(q); + const scored: RankedDoc[] = []; + for (const s of state.sessions) { + const body = s.content.toLowerCase(); + let hits = 0; + for (const t of terms) { + if (body.includes(t)) hits += 1; + } + if (hits > 0) { + scored.push({ sessionId: s.id, score: hits }); + } + } + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, k); + }, +}; diff --git a/eval/runner/adapters/vector.ts b/eval/runner/adapters/vector.ts new file mode 100644 index 00000000..c40e414d --- /dev/null +++ b/eval/runner/adapters/vector.ts @@ -0,0 +1,108 @@ +import type { Adapter, RankedDoc, Session } from "../types.js"; + +interface VectorState { + sessions: Session[]; + embeddings: Float32Array[]; +} + +const OPENAI_URL = "https://api.openai.com/v1/embeddings"; +const MODEL = "text-embedding-3-small"; +const DIM = 1536; + +async function embed(text: string, apiKey: string): Promise { + const res = await fetch(OPENAI_URL, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify({ input: text, model: MODEL }), + }); + if (!res.ok) { + throw new Error(`OpenAI embed failed: ${res.status} ${await res.text()}`); + } + const data = (await res.json()) as { data: Array<{ embedding: number[] }> }; + return 
Float32Array.from(data.data[0].embedding); +} + +async function embedBatch(texts: string[], apiKey: string): Promise { + const res = await fetch(OPENAI_URL, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify({ input: texts, model: MODEL }), + }); + if (!res.ok) { + throw new Error(`OpenAI batch embed failed: ${res.status} ${await res.text()}`); + } + const data = (await res.json()) as { data: Array<{ embedding: number[]; index: number }> }; + if (!Array.isArray(data.data) || data.data.length !== texts.length) { + throw new Error( + `OpenAI batch embed: expected ${texts.length} embeddings, got ${data.data?.length ?? 0}`, + ); + } + const out = new Array(texts.length); + for (const row of data.data) { + if ( + !Number.isInteger(row.index) || + row.index < 0 || + row.index >= texts.length || + out[row.index] !== undefined + ) { + throw new Error(`OpenAI batch embed: invalid or duplicate index ${row.index}`); + } + if (!Array.isArray(row.embedding) || row.embedding.length === 0) { + throw new Error(`OpenAI batch embed: empty embedding at index ${row.index}`); + } + out[row.index] = Float32Array.from(row.embedding); + } + return out; +} + +function cosine(a: Float32Array, b: Float32Array): number { + let dot = 0; + let na = 0; + let nb = 0; + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + const denom = Math.sqrt(na) * Math.sqrt(nb); + return denom === 0 ? 
0 : dot / denom; +} + +export const vectorAdapter: Adapter = { + name: "vector", + async init(sessions) { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) throw new Error("OPENAI_API_KEY required for vector adapter"); + const embeddings: Float32Array[] = new Array(sessions.length); + const BATCH = 50; + for (let i = 0; i < sessions.length; i += BATCH) { + const batch = sessions.slice(i, i + BATCH); + const vecs = await embedBatch( + batch.map((s) => s.content.slice(0, 8000)), + apiKey, + ); + for (let j = 0; j < vecs.length; j++) embeddings[i + j] = vecs[j]; + } + if (embeddings.length > 0 && embeddings[0].length !== DIM) { + throw new Error(`unexpected embedding dim: ${embeddings[0].length}`); + } + return { sessions, embeddings }; + }, + async query(q, state, k) { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) throw new Error("OPENAI_API_KEY required for vector adapter"); + const qvec = await embed(q, apiKey); + const scored: RankedDoc[] = state.sessions.map((s, i) => ({ + sessionId: s.id, + score: cosine(qvec, state.embeddings[i]), + })); + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, k); + }, +}; diff --git a/eval/runner/coding-life.ts b/eval/runner/coding-life.ts new file mode 100644 index 00000000..753ca87f --- /dev/null +++ b/eval/runner/coding-life.ts @@ -0,0 +1,101 @@ +import { readFileSync, existsSync, mkdirSync, writeFileSync, appendFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { parseArgs } from "node:util"; +import { agentmemoryAdapter } from "./adapters/agentmemory.js"; +import { grepAdapter } from "./adapters/grep.js"; +import { vectorAdapter } from "./adapters/vector.js"; +import { aggregate, scoreQuestion } from "./score.js"; +import type { Adapter, Question, ScoreRow, Session } from "./types.js"; + +const ADAPTERS: Record = { + grep: grepAdapter as unknown as Adapter, + vector: vectorAdapter as unknown as Adapter, + agentmemory: agentmemoryAdapter as unknown as Adapter, +}; + 
+interface CliOptions { + data: string; + adapters: string; + k: string; + out: string; +} + +function parse(): CliOptions { + const { values } = parseArgs({ + options: { + data: { type: "string", default: "eval/data/coding-agent-life-v1" }, + adapters: { type: "string", default: "grep,vector,agentmemory" }, + k: { type: "string", default: "5" }, + out: { type: "string", default: "eval/reports/coding-life" }, + }, + }); + return values as unknown as CliOptions; +} + +async function main(): Promise { + const opts = parse(); + const k = Number(opts.k); + if (!Number.isInteger(k) || k <= 0) { + console.error(`--k must be a positive integer, got: ${opts.k}`); + process.exit(2); + } + const sessions = JSON.parse( + readFileSync(resolve(opts.data, "sessions.json"), "utf8"), + ) as Session[]; + const queriesRaw = JSON.parse( + readFileSync(resolve(opts.data, "queries.json"), "utf8"), + ) as Array>; + const questions: Question[] = queriesRaw.map((q) => ({ ...q, haystack: sessions })); + const adapterNames = opts.adapters.split(",").map((s) => s.trim()).filter(Boolean); + for (const a of adapterNames) { + if (!ADAPTERS[a]) { + console.error(`unknown adapter: ${a}. 
options: ${Object.keys(ADAPTERS).join(",")}`); + process.exit(2); + } + } + console.log( + `loaded ${sessions.length} sessions, ${questions.length} queries, adapters: ${adapterNames.join(",")}, k=${k}`, + ); + + const outDir = resolve(opts.out); + mkdirSync(outDir, { recursive: true }); + const ndjsonPath = `${outDir}/scores.ndjson`; + if (existsSync(ndjsonPath)) writeFileSync(ndjsonPath, ""); + + const rows: ScoreRow[] = []; + for (const adapterName of adapterNames) { + const adapter = ADAPTERS[adapterName]; + console.log(`\n== ${adapter.name} ==`); + const state = await adapter.init(sessions); + try { + for (const q of questions) { + const t0 = performance.now(); + const ranked = await adapter.query(q.question, state, k); + const latencyMs = performance.now() - t0; + const row = scoreQuestion(q, ranked, k, adapter.name, latencyMs); + rows.push(row); + appendFileSync(ndjsonPath, JSON.stringify(row) + "\n"); + const mark = row.hit ? "+" : "-"; + console.log( + ` ${mark} ${q.id} [${q.type}] R@${k}=${row.recallAtK.toFixed(2)} (${Math.round(latencyMs)}ms)`, + ); + } + } finally { + if (adapter.teardown) await adapter.teardown(state); + } + } + + const agg = aggregate(rows); + writeFileSync(`${outDir}/summary.json`, JSON.stringify(agg, null, 2)); + console.log("\n=== Summary ==="); + for (const [adapter, stats] of Object.entries(agg.byAdapter)) { + console.log( + ` ${adapter.padEnd(22)} P@${k}=${stats.p.toFixed(3)} R@${k}=${stats.r.toFixed(3)} hit=${stats.hit}/${stats.n} p50=${Math.round(stats.latencyP50)}ms`, + ); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/eval/runner/load.ts b/eval/runner/load.ts new file mode 100644 index 00000000..aece2452 --- /dev/null +++ b/eval/runner/load.ts @@ -0,0 +1,54 @@ +import { readFileSync } from "node:fs"; +import type { Question, Session } from "./types.js"; + +interface LongMemEvalRaw { + question_id: string; + question_type: string; + question: string; + answer?: string; + 
answer_session_ids: string[]; + haystack_session_ids: string[]; + haystack_sessions: Array>; +} + +function flattenSession(turns: Array<{ role: string; content: string }>): string { + return turns.map((t) => `[${t.role}] ${t.content}`).join("\n\n"); +} + +export function loadLongMemEval(path: string, limit?: number): Question[] { + const raw = JSON.parse(readFileSync(path, "utf8")) as LongMemEvalRaw[]; + const slice = typeof limit === "number" ? raw.slice(0, limit) : raw; + const questions: Question[] = []; + for (const r of slice) { + if (r.haystack_session_ids.length !== r.haystack_sessions.length) { + throw new Error( + `LongMemEval row ${r.question_id}: haystack_session_ids (${r.haystack_session_ids.length}) and haystack_sessions (${r.haystack_sessions.length}) length mismatch`, + ); + } + const haystack: Session[] = r.haystack_session_ids.map((id, i) => ({ + id, + content: flattenSession(r.haystack_sessions[i]), + })); + questions.push({ + id: r.question_id, + type: r.question_type, + question: r.question, + answer: r.answer, + goldSessionIds: r.answer_session_ids, + haystack, + }); + } + return questions; +} + +export function stratifySample(questions: Question[], perType: number): Question[] { + const buckets: Record = {}; + for (const q of questions) { + (buckets[q.type] ??= []).push(q); + } + const out: Question[] = []; + for (const type of Object.keys(buckets).sort()) { + out.push(...buckets[type].slice(0, perType)); + } + return out; +} diff --git a/eval/runner/longmemeval.ts b/eval/runner/longmemeval.ts new file mode 100644 index 00000000..a906fa21 --- /dev/null +++ b/eval/runner/longmemeval.ts @@ -0,0 +1,126 @@ +import { existsSync, mkdirSync, writeFileSync, appendFileSync } from "node:fs"; +import { dirname, resolve } from "node:path"; +import { parseArgs } from "node:util"; +import { agentmemoryAdapter } from "./adapters/agentmemory.js"; +import { grepAdapter } from "./adapters/grep.js"; +import { vectorAdapter } from "./adapters/vector.js"; +import 
{ loadLongMemEval, stratifySample } from "./load.js"; +import { aggregate, scoreQuestion } from "./score.js"; +import type { Adapter, ScoreRow } from "./types.js"; + +const ADAPTERS: Record = { + grep: grepAdapter as unknown as Adapter, + vector: vectorAdapter as unknown as Adapter, + agentmemory: agentmemoryAdapter as unknown as Adapter, +}; + +interface CliOptions { + data: string; + adapters: string; + k: string; + limit?: string; + stratify?: string; + out: string; +} + +function parse(): CliOptions { + const { values } = parseArgs({ + options: { + data: { type: "string", default: process.env.LONGMEMEVAL_PATH ?? "" }, + adapters: { type: "string", default: "grep,vector,agentmemory" }, + k: { type: "string", default: "5" }, + limit: { type: "string" }, + stratify: { type: "string" }, + out: { type: "string", default: "eval/reports/longmemeval" }, + }, + }); + return values as unknown as CliOptions; +} + +async function main(): Promise { + const opts = parse(); + if (!opts.data) { + console.error("--data required (or LONGMEMEVAL_PATH env)"); + process.exit(2); + } + const k = Number(opts.k); + if (!Number.isInteger(k) || k <= 0) { + console.error(`--k must be a positive integer, got: ${opts.k}`); + process.exit(2); + } + let limit: number | undefined; + if (opts.limit !== undefined) { + limit = Number(opts.limit); + if (!Number.isInteger(limit) || limit <= 0) { + console.error(`--limit must be a positive integer, got: ${opts.limit}`); + process.exit(2); + } + } + let perType: number | undefined; + if (opts.stratify !== undefined) { + perType = Number(opts.stratify); + if (!Number.isInteger(perType) || perType <= 0) { + console.error(`--stratify must be a positive integer, got: ${opts.stratify}`); + process.exit(2); + } + } + const adapterNames = opts.adapters.split(",").map((s) => s.trim()).filter(Boolean); + for (const a of adapterNames) { + if (!ADAPTERS[a]) { + console.error(`unknown adapter: ${a}. 
options: ${Object.keys(ADAPTERS).join(",")}`); + process.exit(2); + } + } + let questions = loadLongMemEval(resolve(opts.data), limit); + if (perType) questions = stratifySample(questions, perType); + console.log( + `loaded ${questions.length} questions, adapters: ${adapterNames.join(",")}, k=${k}`, + ); + + const outDir = resolve(opts.out); + mkdirSync(outDir, { recursive: true }); + const ndjsonPath = `${outDir}/scores.ndjson`; + if (existsSync(ndjsonPath)) writeFileSync(ndjsonPath, ""); + mkdirSync(dirname(ndjsonPath), { recursive: true }); + + const rows: ScoreRow[] = []; + for (const adapterName of adapterNames) { + const adapter = ADAPTERS[adapterName]; + console.log(`\n== ${adapter.name} ==`); + for (const q of questions) { + const t0 = performance.now(); + const state = await adapter.init(q.haystack); + try { + const ranked = await adapter.query(q.question, state, k); + const latencyMs = performance.now() - t0; + const row = scoreQuestion(q, ranked, k, adapter.name, latencyMs); + rows.push(row); + appendFileSync(ndjsonPath, JSON.stringify(row) + "\n"); + const mark = row.hit ? 
"+" : "-"; + console.log( + ` ${mark} ${q.id} [${q.type}] R@${k}=${row.recallAtK.toFixed(2)} (${Math.round(latencyMs)}ms)`, + ); + } finally { + if (adapter.teardown) await adapter.teardown(state); + } + } + } + + const agg = aggregate(rows); + const summaryPath = `${outDir}/summary.json`; + writeFileSync(summaryPath, JSON.stringify(agg, null, 2)); + + console.log("\n=== Summary ==="); + for (const [adapter, stats] of Object.entries(agg.byAdapter)) { + console.log( + ` ${adapter.padEnd(22)} P@${k}=${stats.p.toFixed(3)} R@${k}=${stats.r.toFixed(3)} hit=${stats.hit}/${stats.n} p50=${Math.round(stats.latencyP50)}ms`, + ); + } + console.log(`\nwrote ${ndjsonPath}`); + console.log(`wrote ${summaryPath}`); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/eval/runner/score.ts b/eval/runner/score.ts new file mode 100644 index 00000000..b21d30ca --- /dev/null +++ b/eval/runner/score.ts @@ -0,0 +1,78 @@ +import type { Question, RankedDoc, ScoreRow } from "./types.js"; + +export function scoreQuestion( + q: Question, + ranked: RankedDoc[], + k: number, + adapter: string, + latencyMs: number, +): ScoreRow { + const topK = ranked.slice(0, k).map((r) => r.sessionId); + const gold = new Set(q.goldSessionIds); + const hits = topK.filter((id) => gold.has(id)).length; + const precisionAtK = k > 0 ? hits / k : 0; + const recallAtK = gold.size === 0 ? 
0 : hits / gold.size; + const hit = hits > 0; + let topGoldRank: number | null = null; + for (let i = 0; i < ranked.length; i++) { + if (gold.has(ranked[i].sessionId)) { + topGoldRank = i + 1; + break; + } + } + return { + questionId: q.id, + questionType: q.type, + adapter, + k, + precisionAtK, + recallAtK, + hit, + topGoldRank, + latencyMs, + }; +} + +export function aggregate(rows: ScoreRow[]): { + byAdapter: Record; + byType: Record>; +} { + const byAdapter: Record< + string, + { p: number; r: number; hit: number; n: number; latencyP50: number } + > = {}; + const latencies: Record = {}; + for (const r of rows) { + const a = (byAdapter[r.adapter] ??= { p: 0, r: 0, hit: 0, n: 0, latencyP50: 0 }); + a.p += r.precisionAtK; + a.r += r.recallAtK; + a.hit += r.hit ? 1 : 0; + a.n += 1; + (latencies[r.adapter] ??= []).push(r.latencyMs); + } + for (const adapter of Object.keys(byAdapter)) { + const a = byAdapter[adapter]; + a.p = a.p / a.n; + a.r = a.r / a.n; + const sorted = latencies[adapter].slice().sort((x, y) => x - y); + a.latencyP50 = sorted[Math.floor(sorted.length / 2)] ?? 0; + } + const byType: Record> = + {}; + for (const r of rows) { + const t = (byType[r.questionType] ??= {}); + const a = (t[r.adapter] ??= { p: 0, r: 0, hit: 0, n: 0 }); + a.p += r.precisionAtK; + a.r += r.recallAtK; + a.hit += r.hit ? 
1 : 0; + a.n += 1; + } + for (const t of Object.keys(byType)) { + for (const adapter of Object.keys(byType[t])) { + const a = byType[t][adapter]; + a.p = a.p / a.n; + a.r = a.r / a.n; + } + } + return { byAdapter, byType }; +} diff --git a/eval/runner/types.ts b/eval/runner/types.ts new file mode 100644 index 00000000..e72a6408 --- /dev/null +++ b/eval/runner/types.ts @@ -0,0 +1,38 @@ +export interface Session { + id: string; + timestamp?: string; + content: string; +} + +export interface Question { + id: string; + type: string; + question: string; + answer?: string; + goldSessionIds: string[]; + haystack: Session[]; +} + +export interface RankedDoc { + sessionId: string; + score: number; +} + +export interface Adapter { + name: string; + init(sessions: Session[], config?: Record): Promise; + query(q: string, state: State, k: number): Promise; + teardown?(state: State): Promise; +} + +export interface ScoreRow { + questionId: string; + questionType: string; + adapter: string; + k: number; + precisionAtK: number; + recallAtK: number; + hit: boolean; + topGoldRank: number | null; + latencyMs: number; +} diff --git a/eval/scripts/sandbox.sh b/eval/scripts/sandbox.sh new file mode 100755 index 00000000..5d402330 --- /dev/null +++ b/eval/scripts/sandbox.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# Boot a sandboxed agentmemory + iii-engine on alt ports with a clean data dir, +# so eval runs aren't polluted by (and don't pollute) your real ~/.agentmemory. +# Source it: `source eval/scripts/sandbox.sh` then run eval scripts; +# the sandbox is torn down on EXIT. + +set -euo pipefail + +SANDBOX_ROOT="${SANDBOX_ROOT:-/tmp/agentmemory-eval-sandbox}" +SANDBOX_PORT="${SANDBOX_PORT:-3411}" +SANDBOX_STREAM_PORT="${SANDBOX_STREAM_PORT:-3412}" + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +if ! command -v iii >/dev/null 2>&1; then + echo "iii binary not on PATH. 
Install pinned version:" + echo " curl -fsSL https://github.com/iii-hq/iii/releases/download/iii/v0.11.2/iii-aarch64-apple-darwin.tar.gz | tar -xz -C ~/.local/bin" + exit 1 +fi + +iii_ver=$(iii --version 2>&1 | head -1) +if [[ "$iii_ver" != "0.11.2" ]]; then + echo "warning: iii version on PATH is $iii_ver; agentmemory pins 0.11.2" +fi + +if [[ ! -f "$REPO_ROOT/dist/index.mjs" ]]; then + echo "dist/ missing. Run: npm run build" >&2 + exit 1 +fi + +if [[ -z "${SANDBOX_ROOT:-}" || "$SANDBOX_ROOT" == "/" || "$SANDBOX_ROOT" != /tmp/* ]]; then + echo "refusing to wipe SANDBOX_ROOT='$SANDBOX_ROOT' — must be non-empty and under /tmp/" >&2 + exit 1 +fi +rm -rf "$SANDBOX_ROOT" +mkdir -p "$SANDBOX_ROOT/data" "$SANDBOX_ROOT/.agentmemory" + +cat > "$SANDBOX_ROOT/iii-config.yaml" < "$SANDBOX_ROOT/iii.log" 2>&1 & +SANDBOX_PID=$! + +cleanup() { + echo "tearing down sandbox (pid $SANDBOX_PID)" + kill "$SANDBOX_PID" 2>/dev/null || true + sleep 1 + kill -9 "$SANDBOX_PID" 2>/dev/null || true +} +trap cleanup EXIT + +# wait for livez +for i in $(seq 1 30); do + if curl -sS --max-time 1 "http://localhost:$SANDBOX_PORT/agentmemory/livez" 2>/dev/null | grep -q '"status":"ok"'; then + export AGENTMEMORY_BASE_URL="http://localhost:$SANDBOX_PORT" + echo "sandbox ready: $AGENTMEMORY_BASE_URL" + echo " state: $SANDBOX_ROOT/data/" + echo " logs: $SANDBOX_ROOT/iii.log" + return 0 2>/dev/null || exit 0 + fi + sleep 1 +done + +echo "sandbox failed to come up within 30s. 
last log lines:" >&2 +tail -10 "$SANDBOX_ROOT/iii.log" >&2 +exit 1 diff --git a/package.json b/package.json index e775c305..ae7b92ed 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,9 @@ "test:watch": "vitest --exclude test/integration.test.ts", "test:integration": "vitest run test/integration.test.ts", "test:all": "vitest run", - "bench:load": "node --import tsx benchmark/load-100k.ts" + "bench:load": "node --import tsx benchmark/load-100k.ts", + "eval:longmemeval": "tsx eval/runner/longmemeval.ts", + "eval:coding-life": "tsx eval/runner/coding-life.ts" }, "keywords": [ "ai", diff --git a/test/eval-adapters.test.ts b/test/eval-adapters.test.ts new file mode 100644 index 00000000..90f914f5 --- /dev/null +++ b/test/eval-adapters.test.ts @@ -0,0 +1,92 @@ +import { describe, it, expect } from "vitest"; +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { grepAdapter } from "../eval/runner/adapters/grep.js"; +import { aggregate, scoreQuestion } from "../eval/runner/score.js"; +import type { Question, Session } from "../eval/runner/types.js"; + +const DATA_DIR = resolve(__dirname, "..", "eval", "data", "coding-agent-life-v1"); +const sessions = JSON.parse(readFileSync(`${DATA_DIR}/sessions.json`, "utf8")) as Session[]; +const queries = JSON.parse(readFileSync(`${DATA_DIR}/queries.json`, "utf8")) as Array< + Omit +>; + +describe("eval scaffold", () => { + it("coding-agent-life-v1 corpus is well-formed", () => { + expect(sessions.length).toBeGreaterThan(0); + expect(queries.length).toBeGreaterThan(0); + const sessionIds = new Set(sessions.map((s) => s.id)); + for (const q of queries) { + expect(q.goldSessionIds.length).toBeGreaterThan(0); + for (const id of q.goldSessionIds) { + expect(sessionIds.has(id)).toBe(true); + } + } + }); + + it("grep adapter ranks gold session in top-5 for most queries", async () => { + const state = await grepAdapter.init(sessions); + let hits = 0; + for (const q of queries) { + const ranked = await 
grepAdapter.query(q.question, state, 5); + const topIds = new Set(ranked.map((r) => r.sessionId)); + if (q.goldSessionIds.some((id) => topIds.has(id))) hits += 1; + } + expect(hits / queries.length).toBeGreaterThan(0.5); + }); + + it("scoreQuestion computes P@K, R@K, hit, topGoldRank", () => { + const q: Question = { + id: "test", + type: "single-session", + question: "?", + goldSessionIds: ["a", "b"], + haystack: [], + }; + const ranked = [ + { sessionId: "x", score: 0.9 }, + { sessionId: "a", score: 0.7 }, + { sessionId: "y", score: 0.5 }, + { sessionId: "b", score: 0.3 }, + ]; + const row = scoreQuestion(q, ranked, 5, "test", 12); + expect(row.hit).toBe(true); + expect(row.recallAtK).toBe(1); + expect(row.precisionAtK).toBeCloseTo(2 / 5); + expect(row.topGoldRank).toBe(2); + }); + + it("scoreQuestion handles miss", () => { + const q: Question = { + id: "test", + type: "x", + question: "?", + goldSessionIds: ["a"], + haystack: [], + }; + const ranked = [ + { sessionId: "x", score: 1 }, + { sessionId: "y", score: 0.5 }, + ]; + const row = scoreQuestion(q, ranked, 5, "test", 5); + expect(row.hit).toBe(false); + expect(row.recallAtK).toBe(0); + expect(row.topGoldRank).toBeNull(); + }); + + it("aggregate computes per-adapter and per-type means", () => { + const q: Question = { + id: "1", + type: "t1", + question: "?", + goldSessionIds: ["a"], + haystack: [], + }; + const row1 = scoreQuestion(q, [{ sessionId: "a", score: 1 }], 5, "grep", 10); + const row2 = scoreQuestion(q, [{ sessionId: "x", score: 1 }], 5, "grep", 20); + const agg = aggregate([row1, row2]); + expect(agg.byAdapter.grep.hit).toBe(1); + expect(agg.byAdapter.grep.n).toBe(2); + expect(agg.byType.t1.grep.n).toBe(2); + }); +}); From e371be9d98e1541024ece013f9b97310a3ab0dc8 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Wed, 20 May 2026 15:26:08 +0100 Subject: [PATCH 26/34] fix(codex): --with-hooks workaround for openai/codex#16430 (closes #509) (#564) Codex 
Desktop currently does not dispatch plugin-local hooks.json even though both CodexHooks and PluginHooks feature flags are stable + default-enabled in codex-rs/features/src/lib.rs (openai/codex#16430). MCP tools still work; lifecycle observations are silently missing. Adds `agentmemory connect codex --with-hooks` which mirrors the bundled hooks.codex.json into the user-scope ~/.codex/hooks.json: - Resolves ${CLAUDE_PLUGIN_ROOT} to the absolute bundled plugin/ path (user-scope hooks don't get plugin-root injection) - Idempotent merge: previous agentmemory entries are stripped on reinstall via the resolved scripts/ path prefix; unrelated user hooks are preserved untouched - Preserves matcher fields from the bundled manifest so PreToolUse routing still works - findPluginRoot walks up from import.meta.url to locate the plugin/ dir; works for both dist/cli.mjs (bundled) and src/ (dev) layouts - Dry-run path previews both TOML and hooks.json changes Closes #509. --- README.md | 14 +++- src/cli/connect/codex-hooks.ts | 100 ++++++++++++++++++++++ src/cli/connect/codex.ts | 68 ++++++++++++++- src/cli/connect/index.ts | 9 +- src/cli/connect/types.ts | 7 ++ test/codex-connect-hooks.test.ts | 137 +++++++++++++++++++++++++++++++ 6 files changed, 330 insertions(+), 5 deletions(-) create mode 100644 src/cli/connect/codex-hooks.ts create mode 100644 test/codex-connect-hooks.test.ts diff --git a/README.md b/README.md index 96e804d4..840a75c4 100644 --- a/README.md +++ b/README.md @@ -430,6 +430,18 @@ The Codex plugin ships from the same `plugin/` directory as the Claude Code plug Codex's hook engine injects `CLAUDE_PLUGIN_ROOT` into hook subprocesses (per [`codex-rs/hooks/src/engine/discovery.rs`](https://github.com/openai/codex/blob/main/codex-rs/hooks/src/engine/discovery.rs)), so the same hook scripts work across both hosts without duplication. Subagent / SessionEnd / Notification / TaskCompleted / PostToolUseFailure events are Claude-Code-only and are not registered for Codex. 
+#### Codex Desktop: plugin hooks currently silent (workaround available) + +`CodexHooks` and `PluginHooks` are both stable + default-enabled in [`codex-rs/features/src/lib.rs`](https://github.com/openai/codex/blob/main/codex-rs/features/src/lib.rs), but Codex Desktop builds currently do not dispatch plugin-local `hooks.json` ([openai/codex#16430](https://github.com/openai/codex/issues/16430)). MCP tools still work; only the lifecycle observations are missing. + +Until upstream lands the fix, mirror the same hook commands into the global `~/.codex/hooks.json`: + +```bash +agentmemory connect codex --with-hooks +``` + +This adds an idempotent block to `~/.codex/hooks.json` referencing absolute paths to the bundled scripts (no `${CLAUDE_PLUGIN_ROOT}` expansion needed at user-scope). Re-run the same command after upgrading agentmemory to refresh paths. User entries in the same file are preserved; only previous agentmemory entries are replaced. +
OpenClaw (paste this prompt) @@ -504,7 +516,7 @@ The agentmemory entry is the **same MCP server block** across every host that us | **Gemini CLI** | `~/.gemini/settings.json` | `gemini mcp add agentmemory npx -y @agentmemory/mcp --scope user` (auto-merges). | | **OpenClaw** | OpenClaw MCP config | Same `mcpServers` block, or use the deeper [memory plugin](integrations/openclaw/). | | **Codex CLI (MCP only)** | `.codex/config.toml` | TOML shape: `codex mcp add agentmemory -- npx -y @agentmemory/mcp`, or add `[mcp_servers.agentmemory]` manually. | -| **Codex CLI (full plugin)** | Codex plugin marketplace | `codex plugin marketplace add rohitg00/agentmemory` then `codex plugin install agentmemory`. Registers MCP + 6 lifecycle hooks (SessionStart, UserPromptSubmit, PreToolUse, PostToolUse, PreCompact, Stop) + 4 skills. | +| **Codex CLI (full plugin)** | Codex plugin marketplace | `codex plugin marketplace add rohitg00/agentmemory` then `codex plugin install agentmemory`. Registers MCP + 6 lifecycle hooks (SessionStart, UserPromptSubmit, PreToolUse, PostToolUse, PreCompact, Stop) + 4 skills. On Codex Desktop, also run `agentmemory connect codex --with-hooks` until [openai/codex#16430](https://github.com/openai/codex/issues/16430) lands — plugin hooks are currently silent there. | | **OpenCode (MCP only)** | `opencode.json` | Different shape — top-level `mcp` key, command as array: `{"mcp": {"agentmemory": {"type": "local", "command": ["npx", "-y", "@agentmemory/mcp"], "enabled": true}}}`. | | **OpenCode (full plugin)** | `plugin/opencode/` | 22 auto-capture hooks covering session lifecycle, messages, tools, errors. Two slash commands (`/recall`, `/remember`). Copy `plugin/opencode/` into your OpenCode workspace and add the plugin entry to `opencode.json`. See [`plugin/opencode/README.md`](plugin/opencode/README.md) for the full hook table + gap analysis. | | **pi** | `~/.pi/agent/extensions/agentmemory` | Copy [`integrations/pi`](integrations/pi/) and restart pi. 
| diff --git a/src/cli/connect/codex-hooks.ts b/src/cli/connect/codex-hooks.ts new file mode 100644 index 00000000..679ec8be --- /dev/null +++ b/src/cli/connect/codex-hooks.ts @@ -0,0 +1,100 @@ +import { existsSync, readFileSync } from "node:fs"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +/** + * Workaround for openai/codex#16430 — Codex Desktop does not dispatch + * plugin-local `hooks.json` even though both `CodexHooks` and `PluginHooks` + * feature flags are stable + default-enabled in + * `codex-rs/features/src/lib.rs`. Until upstream fixes plugin-scope + * dispatch, the same hook commands can be mirrored into the global + * `~/.codex/hooks.json`, which is loaded reliably. + * + * This module builds that mirror, with `${CLAUDE_PLUGIN_ROOT}` resolved to + * the bundled `plugin/` directory so the user-scope file does not depend + * on env-var expansion (Codex only injects `CLAUDE_PLUGIN_ROOT` for + * plugin-scope hooks). + * + * Identification on re-install: every command we write contains the + * resolved `/scripts/` prefix, so subsequent installs can + * strip our entries and re-add cleanly without touching the user's other + * hook entries. + */ + +type HookHandler = { type: string; command: string }; +type HookEntry = { matcher?: string; hooks: HookHandler[] }; +export type HookManifest = { hooks: Record }; + +/** + * Locate the bundled `plugin/` directory at runtime. Walks up from the + * module's own location looking for `plugin/scripts/` + `plugin/hooks/`, + * both shipped via the npm `files` field. Works for both `dist/cli.mjs` + * (bundled) and `src/cli/connect/codex-hooks.ts` (dev) layouts. 
+ */ +export function findPluginRoot(startUrl: string = import.meta.url): string { + const here = dirname(fileURLToPath(startUrl)); + let dir = here; + for (let i = 0; i < 12; i++) { + if ( + existsSync(join(dir, "plugin", "scripts")) && + existsSync(join(dir, "plugin", "hooks")) + ) { + return resolve(join(dir, "plugin")); + } + const parent = dirname(dir); + if (parent === dir) break; + dir = parent; + } + throw new Error( + `agentmemory: could not locate bundled plugin/ directory (searched up from ${here})`, + ); +} + +/** + * Build the merged hooks.json content. + * + * 1. Strip any entry from `existing` whose first hook command points + * under `/scripts/`. This lets us re-install idempotently + * without leaving stale references. + * 2. Append fresh entries from the bundled Codex manifest with + * `${CLAUDE_PLUGIN_ROOT}` rewritten to the absolute plugin path. + * Matcher values from the bundled manifest are preserved so PreToolUse + * event routing keeps working. + */ +export function buildMergedHooks( + existing: HookManifest | null, + pluginRoot: string, +): HookManifest { + const codexManifestPath = join(pluginRoot, "hooks", "hooks.codex.json"); + const ours = JSON.parse(readFileSync(codexManifestPath, "utf-8")) as HookManifest; + const scriptsDir = join(pluginRoot, "scripts"); + + const out: HookManifest = { hooks: {} }; + + if (existing?.hooks) { + for (const [event, entries] of Object.entries(existing.hooks)) { + const kept = entries.filter((entry) => !isAgentmemoryEntry(entry, scriptsDir)); + if (kept.length > 0) out.hooks[event] = kept; + } + } + + for (const [event, entries] of Object.entries(ours.hooks)) { + const resolvedEntries: HookEntry[] = entries.map((entry) => { + const next: HookEntry = { + hooks: entry.hooks.map((handler) => ({ + type: handler.type, + command: handler.command.replace(/\$\{CLAUDE_PLUGIN_ROOT\}/g, pluginRoot), + })), + }; + if (entry.matcher !== undefined) next.matcher = entry.matcher; + return next; + }); + out.hooks[event] 
= [...(out.hooks[event] ?? []), ...resolvedEntries]; + } + + return out; +} + +function isAgentmemoryEntry(entry: HookEntry, scriptsDir: string): boolean { + return entry.hooks.some((handler) => handler.command.includes(scriptsDir)); +} diff --git a/src/cli/connect/codex.ts b/src/cli/connect/codex.ts index 003dc99a..a87b2858 100644 --- a/src/cli/connect/codex.ts +++ b/src/cli/connect/codex.ts @@ -8,10 +8,18 @@ import { logAlreadyWired, logBackup, logInstalled, + readJsonSafe, + writeJsonAtomic, } from "./util.js"; +import { + buildMergedHooks, + findPluginRoot, + type HookManifest, +} from "./codex-hooks.js"; const CODEX_DIR = join(homedir(), ".codex"); const CODEX_TOML = join(CODEX_DIR, "config.toml"); +const CODEX_HOOKS = join(CODEX_DIR, "hooks.json"); const TOML_BLOCK = `[mcp_servers.agentmemory] command = "npx" @@ -57,7 +65,7 @@ export const adapter: ConnectAdapter = { displayName: "Codex CLI", docs: "https://github.com/rohitg00/agentmemory#codex-cli-codex-plugin-platform", protocolNote: - "→ Using MCP. Hooks are also available — see docs/codex.md.", + "→ Using MCP. Hooks ship via the Codex plugin; on Codex Desktop, also pass --with-hooks to install the global hooks.json workaround for openai/codex#16430.", detect(): boolean { return existsSync(CODEX_DIR); @@ -77,6 +85,7 @@ export const adapter: ConnectAdapter = { p.log.info( `[dry-run] Would ${wired ? "rewrite" : "append"} [mcp_servers.agentmemory] in ${CODEX_TOML}`, ); + if (opts.withHooks) installCodexHooks(opts); return { kind: "installed", mutatedPath: CODEX_TOML }; } @@ -105,6 +114,16 @@ export const adapter: ConnectAdapter = { p.log.info( "Codex picks up MCP servers on next launch. For the deeper plugin install, run: codex plugin marketplace add rohitg00/agentmemory && codex plugin install agentmemory", ); + + if (opts.withHooks) { + const hookResult = installCodexHooks(opts); + if (hookResult.kind === "skipped") { + p.log.warn( + `Codex hooks fallback skipped: ${hookResult.reason}. 
MCP wiring still applied.`, + ); + } + } + return { kind: "installed", mutatedPath: CODEX_TOML, @@ -112,3 +131,50 @@ export const adapter: ConnectAdapter = { }; }, }; + +/** + * Install the global `~/.codex/hooks.json` fallback. See + * `codex-hooks.ts` for context (openai/codex#16430). Returns a result + * describing the side effect for the caller's summary; failures here do + * not roll back the MCP wiring. + */ +function installCodexHooks(opts: ConnectOptions): ConnectResult { + let pluginRoot: string; + try { + pluginRoot = findPluginRoot(); + } catch (err) { + return { + kind: "skipped", + reason: err instanceof Error ? err.message : String(err), + }; + } + + const existing = readJsonSafe(CODEX_HOOKS); + const merged = buildMergedHooks(existing, pluginRoot); + + if (opts.dryRun) { + p.log.info( + `[dry-run] Would ${existing ? "merge" : "create"} ${CODEX_HOOKS} with ${Object.keys(merged.hooks).length} event(s)`, + ); + return { kind: "installed", mutatedPath: CODEX_HOOKS }; + } + + let backupPath: string | undefined; + if (existsSync(CODEX_HOOKS)) { + backupPath = backupFile(CODEX_HOOKS, "codex-hooks", "json"); + logBackup(backupPath); + } + + writeJsonAtomic(CODEX_HOOKS, merged); + + logInstalled("Codex hooks (workaround for openai/codex#16430)", CODEX_HOOKS); + p.log.info( + "User-scope hooks reference absolute paths under the bundled plugin/ dir. 
Re-run `agentmemory connect codex --with-hooks` after upgrading agentmemory to refresh them.", + ); + + return { + kind: "installed", + mutatedPath: CODEX_HOOKS, + ...(backupPath !== undefined && { backupPath }), + }; +} diff --git a/src/cli/connect/index.ts b/src/cli/connect/index.ts index 17aedf8f..48c23c66 100644 --- a/src/cli/connect/index.ts +++ b/src/cli/connect/index.ts @@ -34,19 +34,22 @@ function parseFlags(args: string[]): { dryRun: boolean; force: boolean; all: boolean; + withHooks: boolean; positional: string[]; } { const positional: string[] = []; let dryRun = false; let force = false; let all = false; + let withHooks = false; for (const a of args) { if (a === "--dry-run") dryRun = true; else if (a === "--force") force = true; else if (a === "--all") all = true; + else if (a === "--with-hooks") withHooks = true; else if (!a.startsWith("-")) positional.push(a); } - return { dryRun, force, all, positional }; + return { dryRun, force, all, withHooks, positional }; } export async function runAdapter( @@ -83,8 +86,8 @@ export async function runConnect(args: string[]): Promise { return; } - const { dryRun, force, all, positional } = parseFlags(args); - const opts: ConnectOptions = { dryRun, force }; + const { dryRun, force, all, withHooks, positional } = parseFlags(args); + const opts: ConnectOptions = { dryRun, force, withHooks }; p.intro("agentmemory connect"); diff --git a/src/cli/connect/types.ts b/src/cli/connect/types.ts index 4f64c867..8abd2745 100644 --- a/src/cli/connect/types.ts +++ b/src/cli/connect/types.ts @@ -1,6 +1,13 @@ export type ConnectOptions = { dryRun: boolean; force: boolean; + /** + * When true, the Codex adapter additionally writes a global + * `~/.codex/hooks.json` block referencing absolute paths to bundled hook + * scripts. Workaround for openai/codex#16430, which prevents plugin-local + * hooks from dispatching on Codex Desktop. No-op for other adapters. 
+ */ + withHooks?: boolean; }; export type ConnectAdapter = { diff --git a/test/codex-connect-hooks.test.ts b/test/codex-connect-hooks.test.ts new file mode 100644 index 00000000..75accbee --- /dev/null +++ b/test/codex-connect-hooks.test.ts @@ -0,0 +1,137 @@ +import { describe, it, expect } from "vitest"; +import { writeFileSync, readFileSync, mkdirSync, rmSync } from "node:fs"; +import { join, resolve } from "node:path"; +import { tmpdir } from "node:os"; +import { + buildMergedHooks, + findPluginRoot, + type HookManifest, +} from "../src/cli/connect/codex-hooks.js"; + +const PLUGIN_ROOT = resolve(__dirname, "..", "plugin"); + +describe("findPluginRoot", () => { + it("locates the bundled plugin/ directory from src/cli/connect/", () => { + const root = findPluginRoot(); + expect(root).toBe(PLUGIN_ROOT); + }); +}); + +describe("buildMergedHooks", () => { + it("rewrites ${CLAUDE_PLUGIN_ROOT} to absolute pluginRoot in every command", () => { + const merged = buildMergedHooks(null, PLUGIN_ROOT); + for (const entries of Object.values(merged.hooks)) { + for (const entry of entries) { + for (const handler of entry.hooks) { + expect(handler.command).not.toContain("${CLAUDE_PLUGIN_ROOT}"); + expect(handler.command).toContain(`${PLUGIN_ROOT}/scripts/`); + } + } + } + }); + + it("preserves matchers from the bundled manifest (e.g. 
PreToolUse)", () => { + const merged = buildMergedHooks(null, PLUGIN_ROOT); + const preToolUse = merged.hooks["PreToolUse"]; + expect(preToolUse).toBeDefined(); + expect(preToolUse!.length).toBeGreaterThan(0); + expect(preToolUse![0].matcher).toBe("Edit|Write|Read|Glob|Grep"); + }); + + it("includes all six expected lifecycle events", () => { + const merged = buildMergedHooks(null, PLUGIN_ROOT); + for (const event of [ + "SessionStart", + "UserPromptSubmit", + "PreToolUse", + "PostToolUse", + "PreCompact", + "Stop", + ]) { + expect(Object.keys(merged.hooks)).toContain(event); + } + }); + + it("appends to existing user hooks without dropping them", () => { + const existing: HookManifest = { + hooks: { + SessionStart: [ + { + hooks: [{ type: "command", command: "echo user-custom" }], + }, + ], + UserPromptSubmit: [ + { + hooks: [{ type: "command", command: "echo another-user-hook" }], + }, + ], + }, + }; + const merged = buildMergedHooks(existing, PLUGIN_ROOT); + const sessionStart = merged.hooks["SessionStart"]!; + const userHook = sessionStart.find((e) => + e.hooks.some((h) => h.command === "echo user-custom"), + ); + expect(userHook, "user's SessionStart hook should survive").toBeDefined(); + const ours = sessionStart.find((e) => + e.hooks.some((h) => h.command.includes(`${PLUGIN_ROOT}/scripts/session-start.mjs`)), + ); + expect(ours, "agentmemory SessionStart hook should be appended").toBeDefined(); + }); + + it("re-install strips previous agentmemory entries (idempotent by script path)", () => { + const first = buildMergedHooks(null, PLUGIN_ROOT); + const second = buildMergedHooks(first, PLUGIN_ROOT); + for (const event of Object.keys(first.hooks)) { + expect( + second.hooks[event]!.length, + `${event} should not double after second install`, + ).toBe(first.hooks[event]!.length); + } + }); + + it("re-install preserves unrelated user entries", () => { + const userEntry = { + hooks: [{ type: "command", command: "echo user-untouchable" }], + }; + const withUser: 
HookManifest = { + hooks: { + SessionStart: [userEntry], + Stop: [{ hooks: [{ type: "command", command: "echo also-user" }] }], + }, + }; + const installed = buildMergedHooks(withUser, PLUGIN_ROOT); + const reinstalled = buildMergedHooks(installed, PLUGIN_ROOT); + expect( + reinstalled.hooks["SessionStart"]!.some((e) => + e.hooks.some((h) => h.command === "echo user-untouchable"), + ), + ).toBe(true); + expect( + reinstalled.hooks["Stop"]!.some((e) => + e.hooks.some((h) => h.command === "echo also-user"), + ), + ).toBe(true); + }); + + it("handles empty existing manifest object", () => { + const merged = buildMergedHooks({ hooks: {} }, PLUGIN_ROOT); + expect(Object.keys(merged.hooks).length).toBeGreaterThan(0); + }); +}); + +describe("buildMergedHooks file round-trip", () => { + it("produces JSON that parses back to a structurally equivalent manifest", () => { + const dir = join(tmpdir(), `agentmemory-codex-hooks-${process.pid}-${Date.now()}`); + mkdirSync(dir, { recursive: true }); + const path = join(dir, "hooks.json"); + try { + const merged = buildMergedHooks(null, PLUGIN_ROOT); + writeFileSync(path, `${JSON.stringify(merged, null, 2)}\n`, "utf-8"); + const reread = JSON.parse(readFileSync(path, "utf-8")) as HookManifest; + expect(Object.keys(reread.hooks).sort()).toEqual(Object.keys(merged.hooks).sort()); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); From 93d1bdd81af8e201d54794b642309188f7c1f2c6 Mon Sep 17 00:00:00 2001 From: Rohit Ghumare <48523873+rohitg00@users.noreply.github.com> Date: Wed, 20 May 2026 16:23:32 +0100 Subject: [PATCH 27/34] fix(deps): pin iii-sdk to 0.11.2 (closes #555) (#567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(deps): pin iii-sdk to 0.11.2 to avoid routing regression in 0.11.6 iii-sdk@0.11.6 changes nested behavior so that all /agentmemory/* routes return 404 against the iii-engine, even though both packages still satisfy the previous "^0.11.2" 
semver range. npm picked up the new version on `npm install -g @agentmemory/agentmemory` after 0.9.21 shipped, silently breaking installs. Two pin sites: 1. package.json — caret -> exact "0.11.2" so npm cannot drift forward on minor releases until the upstream regression is sorted. 2. src/cli.ts — `agentmemory setup` previously ran `pnpm up iii-sdk@latest` / `npm install iii-sdk@latest`, which would re-pull 0.11.6+ even after a freshly-pinned install. Both call sites now pin to 0.11.2 with a label referencing this issue. Tests (1081) + build pass against iii-sdk@0.11.2. Closes #555. * fix(cli): drop issue ref from iii-sdk pin label Source labels should describe what the code does, not point at issues that rot as the codebase evolves. Issue context lives in the PR body. --- package.json | 2 +- src/cli.ts | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index ae7b92ed..bc245a2f 100644 --- a/package.json +++ b/package.json @@ -62,7 +62,7 @@ "@anthropic-ai/sdk": "^0.39.0", "@clack/prompts": "^1.2.0", "dotenv": "^17.4.2", - "iii-sdk": "^0.11.2", + "iii-sdk": "0.11.2", "zod": "^4.0.0" }, "optionalDependencies": { diff --git a/src/cli.ts b/src/cli.ts index 27885a95..3893e361 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1966,8 +1966,8 @@ async function runUpgrade() { label: "Refreshing dependencies (pnpm install)", }); requireSuccess(installOk, "pnpm install"); - runCommand(pnpmBin, ["up", "iii-sdk@latest"], { - label: "Upgrading iii-sdk to latest", + runCommand(pnpmBin, ["up", "iii-sdk@0.11.2"], { + label: "Pinning iii-sdk@0.11.2", optional: true, }); } else if (npmBin) { @@ -1975,8 +1975,8 @@ async function runUpgrade() { label: "Refreshing dependencies (npm install)", }); requireSuccess(installOk, "npm install"); - runCommand(npmBin, ["install", "iii-sdk@latest"], { - label: "Upgrading iii-sdk to latest", + runCommand(npmBin, ["install", "iii-sdk@0.11.2"], { + label: "Pinning iii-sdk@0.11.2", optional: true, }); } 
else { From 3cb7f9089414f673ef584b8b4bc5d8d5b7ba1970 Mon Sep 17 00:00:00 2001 From: Faraz Ahmed <38698072+faraz152@users.noreply.github.com> Date: Wed, 20 May 2026 20:33:42 +0500 Subject: [PATCH 28/34] fix: read tool_response instead of tool_output in PostToolUse hook (#561) * fix: read tool_response instead of tool_output in PostToolUse hook Claude Code's PostToolUse payload sends the field as `tool_response`, not `tool_output`. The hook was reading `data.tool_output` which is always undefined, so `cleanOutput` was undefined, the observe request contained no `tool_output` value, and mem::compress consistently failed its XML schema validation (requires narrative >= 10 chars + facts >= 1). Fix: read `data.tool_response` with `data.tool_output` as a fallback so older integrations that emit the legacy field name keep working. Fixes #539 * style: remove explanatory comment per repo guidelines --- plugin/scripts/post-tool-use.mjs | 2 +- src/hooks/post-tool-use.ts | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/plugin/scripts/post-tool-use.mjs b/plugin/scripts/post-tool-use.mjs index 5ebec645..2e021ef5 100755 --- a/plugin/scripts/post-tool-use.mjs +++ b/plugin/scripts/post-tool-use.mjs @@ -23,7 +23,7 @@ async function main() { } if (isSdkChildContext(data)) return; const sessionId = data.session_id || "unknown"; - const { imageData, cleanOutput } = extractImageData(data.tool_output); + const { imageData, cleanOutput } = extractImageData(data.tool_response ?? 
data.tool_output); try { await fetch(`${REST_URL}/agentmemory/observe`, { method: "POST", diff --git a/src/hooks/post-tool-use.ts b/src/hooks/post-tool-use.ts index 65afc8b1..c84d09d2 100644 --- a/src/hooks/post-tool-use.ts +++ b/src/hooks/post-tool-use.ts @@ -32,7 +32,9 @@ async function main() { const sessionId = (data.session_id as string) || "unknown"; - const { imageData, cleanOutput } = extractImageData(data.tool_output); + const { imageData, cleanOutput } = extractImageData( + data.tool_response ?? data.tool_output, + ); try { await fetch(`${REST_URL}/agentmemory/observe`, { From a9945144a3af867e4afeb7da5c70a42c81e7270c Mon Sep 17 00:00:00 2001 From: Ptah Date: Wed, 20 May 2026 15:34:02 +0000 Subject: [PATCH 29/34] fix(providers/openai): send explicit stream:false in chat completion body (#526) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAI API spec defines `stream` as defaulting to false when absent, so the current code (which omits it) should yield JSON. Some OpenAI-compatible proxies disagree and default to text/event-stream, which crashes the `response.json()` parser below with: Unexpected token 'd', "data: {"id"... is not valid JSON After a few of these in a row, the resilient wrapper's circuit breaker trips and all subsequent compression calls fail with `circuit_breaker_open`, silently disabling LLM-backed compression / summarisation / reflection. Reproduced upstream in decolua/9router#1260: 9Router's `handleChatCore` returns SSE unless `stream: false` is explicit. PR https://github.com/decolua/9router/pull/1272 fixes the proxy side, but sending the field explicitly here is defensive — other OpenAI-compatible endpoints (older self-hosted proxies, vLLM compat shims, …) hit the same spec gap. No behavior change for spec-compliant endpoints (openai.com, Azure OpenAI, well-behaved proxies): they already default to non-streaming when `stream` is absent, so making it explicit is a no-op there. 
Co-authored-by: Ptah-CT <221234802+Ptah-CT@users.noreply.github.com> Co-authored-by: Claude Opus 4.7 --- src/providers/openai.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/providers/openai.ts b/src/providers/openai.ts index bca2370f..88e10829 100644 --- a/src/providers/openai.ts +++ b/src/providers/openai.ts @@ -80,6 +80,13 @@ export class OpenAIProvider implements MemoryProvider { const body: Record = { model: this.model, max_tokens: this.maxTokens, + // OpenAI API spec defines `stream` as defaulting to false, so omitting + // it should yield a JSON response. Some OpenAI-compatible proxies + // (notably 9Router < 0.4.56 — see decolua/9router#1260) default to + // text/event-stream when `stream` is absent, which crashes the + // `response.json()` call below with `Unexpected token 'd', "data: {"id"...`. + // Send it explicitly so non-spec endpoints route to non-streaming too. + stream: false, messages: [ { role: "system", content: systemPrompt }, { role: "user", content: userPrompt }, From 8e313271c9394486363e3525aca79b4b58d37cc4 Mon Sep 17 00:00:00 2001 From: Kaushal Reddy <84959787+kaushalrog@users.noreply.github.com> Date: Wed, 20 May 2026 21:04:37 +0530 Subject: [PATCH 30/34] Update index.html (#542) --- src/viewer/index.html | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/viewer/index.html b/src/viewer/index.html index 8c544594..c2c200b8 100644 --- a/src/viewer/index.html +++ b/src/viewer/index.html @@ -2230,7 +2230,26 @@

agentmemory

var filtered = items.filter(function(m) { if (typeFilter && m.type !== typeFilter) return false; - if (search && !(m.title || '').toLowerCase().includes(search) && !(m.content || '').toLowerCase().includes(search)) return false; + const normalizedSearch = (search || '') + .normalize("NFKC") + .toLowerCase(); + + const normalizedTitle = (m.title || '') + .normalize("NFKC") + .toLowerCase(); + + const normalizedContent = (m.content || '') + .normalize("NFKC") + .toLowerCase(); + + if ( + search && + !normalizedTitle.includes(normalizedSearch) && + !normalizedContent.includes(normalizedSearch) + ) { + return false; + } + return true; }); From edd1ceb916c54ee842126fb08bb382613d55c843 Mon Sep 17 00:00:00 2001 From: Tanmay Shirbhayye <158763203+Tanmay-008@users.noreply.github.com> Date: Wed, 20 May 2026 21:04:57 +0530 Subject: [PATCH 31/34] fix(cli): accurately display bound viewer port on splash screen (#560) * fix(cli): accurately display bound viewer port on splash screen - Expose viewerPort and viewerSkipped state in /agentmemory/livez endpoint. - Update CLI readiness check to poll until the viewer port is bound or explicitly skipped. - Prevents misleading default port (3113) display on splash screen when the viewer falls back to another port. 
* fix(viewer): address CodeRabbitAI review --- src/cli.ts | 44 +++++++++++++++++++++++++++++++++++++++++++- src/triggers/api.ts | 5 ++++- src/viewer/server.ts | 24 ++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 3893e361..e1ea9757 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -195,9 +195,36 @@ function getBaseUrl(): string { return `http://localhost:${getRestPort()}`; } +let discoveredViewerPort: number | null = null; + +export async function discoverViewerPort(): Promise { + if (discoveredViewerPort !== null) return; + try { + const res = await fetch(`${getBaseUrl()}/agentmemory/livez`, { + signal: AbortSignal.timeout(1000), + }); + if (res.ok) { + const data = await res.json() as { viewerPort?: number | null }; + if (typeof data.viewerPort === "number") { + discoveredViewerPort = data.viewerPort; + } + } + } catch {} +} + function getViewerUrl(): string { const envUrl = process.env["AGENTMEMORY_VIEWER_URL"]; if (envUrl) return envUrl.replace(/\/+$/, ""); + + if (discoveredViewerPort !== null) { + try { + const u = new URL(getBaseUrl()); + return `${u.protocol}//${u.hostname}:${discoveredViewerPort}`; + } catch { + return `http://localhost:${discoveredViewerPort}`; + } + } + try { const u = new URL(getBaseUrl()); const vPort = @@ -257,7 +284,18 @@ async function isAgentmemoryReady(): Promise { const res = await fetch(`${getBaseUrl()}/agentmemory/livez`, { signal: AbortSignal.timeout(2000), }); - return res.ok; + if (!res.ok) return false; + try { + const data = await res.json() as { viewerPort?: number | null; viewerSkipped?: boolean }; + if (typeof data.viewerPort === "number") { + discoveredViewerPort = data.viewerPort; + return true; + } + if (data.viewerSkipped) return true; + return false; + } catch { + return false; + } } catch { return false; } @@ -1092,6 +1130,9 @@ async function runStatus() { apiFetch(base, "config/flags"), ]); + if (typeof healthRes?.viewerPort === "number") { + 
discoveredViewerPort = healthRes.viewerPort; + } const h = healthRes?.health; const status = healthRes?.status || "unknown"; const version = healthRes?.version || "?"; @@ -1251,6 +1292,7 @@ function buildDoctorEffects(): DoctorEffects { iiiBinaryVersion: (binPath: string) => iiiBinVersion(binPath), viewerReachable: async (timeoutMs = 2000) => { try { + await discoverViewerPort(); const res = await fetch(getViewerUrl(), { signal: AbortSignal.timeout(timeoutMs), }); diff --git a/src/triggers/api.ts b/src/triggers/api.ts index 083c2159..66eaadc2 100644 --- a/src/triggers/api.ts +++ b/src/triggers/api.ts @@ -9,6 +9,7 @@ import type { ResilientProvider } from "../providers/resilient.js"; import { VERSION } from "../version.js"; import { timingSafeCompare } from "../auth.js"; import { renderViewerDocument } from "../viewer/document.js"; +import { getBoundViewerPort, getViewerSkipped } from "../viewer/server.js"; import { MAX_FILES_UPPER_BOUND } from "../functions/replay.js"; import { isGraphExtractionEnabled, @@ -143,7 +144,7 @@ export function registerApiTriggers( sdk.registerFunction("api::liveness", async (): Promise => ({ status_code: 200, - body: { status: "ok", service: "agentmemory" }, + body: { status: "ok", service: "agentmemory", viewerPort: getBoundViewerPort(), viewerSkipped: getViewerSkipped() }, }), ); sdk.registerTrigger({ @@ -244,6 +245,8 @@ export function registerApiTriggers( health: health || null, functionMetrics, circuitBreaker, + viewerPort: getBoundViewerPort(), + viewerSkipped: getViewerSkipped(), }, }; }, diff --git a/src/viewer/server.ts b/src/viewer/server.ts index bd8e3c63..71598690 100644 --- a/src/viewer/server.ts +++ b/src/viewer/server.ts @@ -131,6 +131,16 @@ function readBody(req: IncomingMessage): Promise { const MAX_VIEWER_PORT_RETRIES = 10; +let boundViewerPort: number | null = null; +let viewerSkipped = false; + +export function getBoundViewerPort(): number | null { + return boundViewerPort; +} +export function getViewerSkipped(): 
boolean { + return viewerSkipped; +} + export function startViewerServer( port: number, _kv: unknown, @@ -138,6 +148,10 @@ export function startViewerServer( secret?: string, restPort?: number, ): Server { + // Reset exported runtime state for each start attempt. + boundViewerPort = null; + viewerSkipped = false; + const resolvedRestPort = restPort ?? port - 2; const requestedPort = port; // Computed lazily on first request — `port` may be 0 here (OS-assigned) @@ -227,6 +241,12 @@ export function startViewerServer( }; server.on("listening", () => { + const addr = server.address(); + boundViewerPort = + addr && typeof addr === "object" && "port" in addr + ? addr.port + : currentPort; + viewerSkipped = false; if (currentPort === requestedPort) { console.log(`[agentmemory] Viewer: http://localhost:${currentPort}`); } else { @@ -244,10 +264,14 @@ export function startViewerServer( return; } if (err.code === "EADDRINUSE") { + boundViewerPort = null; + viewerSkipped = true; console.warn( `[agentmemory] Viewer ports ${requestedPort}-${requestedPort + MAX_VIEWER_PORT_RETRIES} all in use, skipping viewer.`, ); } else { + boundViewerPort = null; + viewerSkipped = true; console.error(`[agentmemory] Viewer error:`, err.message); } }); From ec6282382be002d26bc3f5737d840f2d5c3e2f73 Mon Sep 17 00:00:00 2001 From: Aqil Aziz Date: Thu, 21 May 2026 00:02:58 +0700 Subject: [PATCH 32/34] docs: clarify env file setup (#321) Signed-off-by: aqilaziz --- README.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 840a75c4..7fc7c306 100644 --- a/README.md +++ b/README.md @@ -1059,7 +1059,7 @@ Full registry: [workers.iii.dev](https://workers.iii.dev). Every worker there co ### LLM Providers -agentmemory auto-detects from your environment. No API key needed if you have a Claude subscription. +agentmemory auto-detects from your environment. 
By default, no LLM calls are made unless you configure a provider or explicitly opt in to the Claude subscription fallback. | Provider | Config | Notes | |----------|--------|-------| @@ -1070,6 +1070,33 @@ agentmemory auto-detects from your environment. No API key needed if you have a | OpenRouter | `OPENROUTER_API_KEY` | Any model | | Claude subscription fallback | `AGENTMEMORY_ALLOW_AGENT_SDK=true` | Opt-in only. Spawns `@anthropic-ai/claude-agent-sdk` sessions — used to cause unbounded Stop-hook recursion (#149 follow-up) so it is no longer the default. | +### Config File + +Put agentmemory runtime configuration in `~/.agentmemory/.env` instead of exporting variables in every shell. If the viewer shows a setup hint like `export ANTHROPIC_API_KEY=...`, copy it into this file as `ANTHROPIC_API_KEY=...` without the `export` prefix, then restart agentmemory. + +Process environment variables still work and take precedence over values in the file. + +On Windows, the same file lives at `%USERPROFILE%\.agentmemory\.env`: + +```powershell +New-Item -ItemType Directory -Force $HOME\.agentmemory +notepad $HOME\.agentmemory\.env +``` + +To test with a Claude Code Pro/Max subscription instead of an API key, opt in explicitly: + +```env +AGENTMEMORY_ALLOW_AGENT_SDK=true +AGENTMEMORY_AUTO_COMPRESS=true +``` + +Turn on graph or consolidation features in the same file if you want them: + +```env +GRAPH_EXTRACTION_ENABLED=true +CONSOLIDATION_ENABLED=true +``` + ### Environment Variables Create `~/.agentmemory/.env`: From 0ba3af4b01da27e1450ddcd4552e0917820ea3f3 Mon Sep 17 00:00:00 2001 From: Ross Story Date: Wed, 20 May 2026 13:28:57 -0700 Subject: [PATCH 33/34] Narrow Copilot pre-tool session ids Ensures pre-tool-use only forwards string session IDs and falls back to unknown for invalid Copilot payload values, with regression coverage for the generated plugin script. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- plugin/scripts/pre-tool-use.mjs | 3 ++- src/hooks/pre-tool-use.ts | 6 +++++- test/copilot-plugin.test.ts | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/plugin/scripts/pre-tool-use.mjs b/plugin/scripts/pre-tool-use.mjs index 3d9dd986..16892fcd 100755 --- a/plugin/scripts/pre-tool-use.mjs +++ b/plugin/scripts/pre-tool-use.mjs @@ -55,7 +55,8 @@ async function main() { const pattern = toolInput["pattern"]; if (typeof pattern === "string" && pattern.length > 0) terms.push(pattern); } - const sessionId = data.session_id || data.sessionId || "unknown"; + const rawSessionId = data.session_id || data.sessionId; + const sessionId = typeof rawSessionId === "string" && rawSessionId.length > 0 ? rawSessionId : "unknown"; try { const res = await fetch(`${REST_URL}/agentmemory/enrich`, { method: "POST", diff --git a/src/hooks/pre-tool-use.ts b/src/hooks/pre-tool-use.ts index 63061b3d..eea440c8 100644 --- a/src/hooks/pre-tool-use.ts +++ b/src/hooks/pre-tool-use.ts @@ -88,7 +88,11 @@ async function main() { } } - const sessionId = ((data.session_id || data.sessionId) as string) || "unknown"; + const rawSessionId = data.session_id || data.sessionId; + const sessionId = + typeof rawSessionId === "string" && rawSessionId.length > 0 + ? 
rawSessionId + : "unknown"; try { const res = await fetch(`${REST_URL}/agentmemory/enrich`, { diff --git a/test/copilot-plugin.test.ts b/test/copilot-plugin.test.ts index 4956d2ac..e4121688 100644 --- a/test/copilot-plugin.test.ts +++ b/test/copilot-plugin.test.ts @@ -297,6 +297,27 @@ describe("Copilot hook scripts", () => { }); }); + it("pre-tool-use narrows Copilot sessionId to strings", async () => { + const result = await runHook( + "scripts/pre-tool-use.mjs", + { + sessionId: 123, + toolName: "read", + toolArgs: { path: "src/index.ts" }, + }, + { AGENTMEMORY_INJECT_CONTEXT: "true" }, + ); + + expect(result.stdout).toBe("remembered context"); + expect(result.requests[0]?.path).toBe("/agentmemory/enrich"); + expect(result.requests[0]?.body).toMatchObject({ + sessionId: "unknown", + files: ["src/index.ts"], + terms: [], + toolName: "read", + }); + }); + it("prompt-submit accepts Copilot camelCase prompt payload", async () => { const result = await runHook("scripts/prompt-submit.mjs", { sessionId: "copilot-session", From f5e67ea133b1d6260d4b64afde7ca964b7a5d7a2 Mon Sep 17 00:00:00 2001 From: kedar-1 <97901228+kedar-1@users.noreply.github.com> Date: Wed, 20 May 2026 22:38:53 +0200 Subject: [PATCH 34/34] Add sudo for global installation command (#454) * Add sudo for global installation command * Update README.md Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * Update README with EACCES retry instructions Added installation instructions for macOS/Linux users. * docs: drop backticks around package name inside bash code fence Backticks inside a ```bash fenced block are still copy-pasted literally by users, and bash interprets them as command substitution. The package name in the install line had decorative backticks that turn into a shell syntax error when pasted as-is. 
--------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Rohit Ghumare --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7fc7c306..803366fe 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,12 @@ ## Install ```bash -npm install -g @agentmemory/agentmemory # once — bare `agentmemory` on PATH -agentmemory # start the memory server on :3111 -agentmemory demo # seed sample sessions + prove recall -agentmemory connect claude-code # wire your agent (also: codex, cursor, gemini-cli, ...) +npm install -g @agentmemory/agentmemory # once — bare `agentmemory` on PATH +# If you hit EACCES on macOS/Linux system Node installs, retry with: +# sudo npm install -g @agentmemory/agentmemory +agentmemory # start the memory server on :3111 +agentmemory demo # seed sample sessions + prove recall +agentmemory connect claude-code # wire your agent (also: codex, cursor, gemini-cli, ...) ``` Or via `npx` (no install): @@ -361,6 +363,8 @@ Open `http://localhost:3113` to watch the memory build live. ```bash npm install -g @agentmemory/agentmemory +# If you hit EACCES on macOS/Linux system Node installs, retry with: +# sudo npm install -g @agentmemory/agentmemory agentmemory # start the server (same as the npx form) agentmemory stop # tear it down agentmemory remove # uninstall everything we created