diff --git a/package-lock.json b/package-lock.json index 0b47f87..6d543b2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,16 +1,18 @@ { "name": "@automagik/rlmx", - "version": "0.260330.5", + "version": "0.260330.18", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@automagik/rlmx", - "version": "0.260330.5", + "version": "0.260330.18", "license": "MIT", "dependencies": { "@mariozechner/pi-ai": "0.64.0", - "js-yaml": "^4.1.1" + "js-yaml": "^4.1.1", + "pg": "^8.20.0", + "pgserve": "^1.1.6" }, "bin": { "rlmx": "dist/src/cli.js" @@ -20,6 +22,7 @@ "@commitlint/config-conventional": "^19.0.0", "@types/js-yaml": "^4.0.9", "@types/node": "^22.0.0", + "@types/pg": "^8.11.0", "husky": "^9.0.0", "typescript": "^5.7.0" }, @@ -1051,6 +1054,74 @@ "node": ">=v18" } }, + "node_modules/@embedded-postgres/darwin-arm64": { + "version": "18.2.0-beta.16", + "resolved": "https://registry.npmjs.org/@embedded-postgres/darwin-arm64/-/darwin-arm64-18.2.0-beta.16.tgz", + "integrity": "sha512-wnswaF+uDvGeitqajJ8v8xOG4ttFrzixElwKNe2MIxBXSLWPV3xhi6tBY0Sjw8Lmiu6UG9vNLFZSjHPrIeokBg==", + "cpu": [ + "arm64" + ], + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@embedded-postgres/darwin-x64": { + "version": "18.2.0-beta.16", + "resolved": "https://registry.npmjs.org/@embedded-postgres/darwin-x64/-/darwin-x64-18.2.0-beta.16.tgz", + "integrity": "sha512-u9WtTPxRuO0uOny5IniXHSDaLmtOujwzDoExIV/jFT0Fu8SzpX7wdoPbsSPBLgyQWdr/nPA77K9QI4r6P1/fKA==", + "cpu": [ + "x64" + ], + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@embedded-postgres/linux-x64": { + "version": "18.2.0-beta.16", + "resolved": "https://registry.npmjs.org/@embedded-postgres/linux-x64/-/linux-x64-18.2.0-beta.16.tgz", + "integrity": "sha512-BIt485ioL8/AwDgw37IcdraOfRgHNDOtGM6Hh63vnNaUAG4Z0qtJd5zXS5fr2wZTEsYHyC5PC60k7zkCRZXSzg==", + "cpu": [ + "x64" + ], + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=16" + } + }, + "node_modules/@embedded-postgres/windows-x64": { + "version": "18.2.0-beta.16", + "resolved": "https://registry.npmjs.org/@embedded-postgres/windows-x64/-/windows-x64-18.2.0-beta.16.tgz", + "integrity": "sha512-Sj6GhCZrvtMwchATEtWuEmexEBWpRNMHPTUHsqPuyDrHX/XgKfpIxz2/AMHa4sp7SZ0JOHGouH8AFIVsWQrQsQ==", + "cpu": [ + "x64" + ], + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=16" + } + }, "node_modules/@google/genai": { "version": "1.46.0", "resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.46.0.tgz", @@ -1111,6 +1182,162 @@ "zod-to-json-schema": "^3.24.1" } }, + "node_modules/@oven/bun-darwin-aarch64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-darwin-aarch64/-/bun-darwin-aarch64-1.3.11.tgz", + "integrity": "sha512-/8IzqSu4/OWGRs7Fs2ROzGVwJMFTBQkgAp6sAthkBYoN7OiM4rY/CpPVs2X9w9N1W61CHSkEdNKi8HrLZKfK3g==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@oven/bun-darwin-x64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-darwin-x64/-/bun-darwin-x64-1.3.11.tgz", + "integrity": "sha512-TT7eUihnAzxM2tlZesusuC75PAOYKvUBgVU/Nm/lakZ/DpyuqhNkzUfcxSgmmK9IjVWzMmezLIGZl16XGCGJng==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + 
"os": [ + "darwin" + ] + }, + "node_modules/@oven/bun-darwin-x64-baseline": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-darwin-x64-baseline/-/bun-darwin-x64-baseline-1.3.11.tgz", + "integrity": "sha512-CYjIHWaQG7T4phfjErHr6BiXRs0K/9DqMeiohJmuYSBF+H2m56vFslOenLCguGYQL9jeiiCZBeoVCpwjxZrMgQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@oven/bun-linux-aarch64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-aarch64/-/bun-linux-aarch64-1.3.11.tgz", + "integrity": "sha512-8XMLyRNxHF4jfLajkWt+F8UDxsWbzysyxQVMZKUXwoeGvaxB0rVd07r3YbgDtG8U6khhRFM3oaGp+CQ0whwmdA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-linux-aarch64-musl": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-aarch64-musl/-/bun-linux-aarch64-musl-1.3.11.tgz", + "integrity": "sha512-jBwYCLG5Eb+PqtFrc3Wp2WMYlw1Id75gUcsdP+ApCOpf5oQhHxkFWCjZmcDoioDmEhMWAiM3wtwSrTlPg+sI6Q==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-linux-x64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-x64/-/bun-linux-x64-1.3.11.tgz", + "integrity": "sha512-z3GFCk1UBzDOOiEBHL32lVP7Edi26BhOjKb6bIc0nRyabbRiyON4++GR0zmd/H5zM5S0+UcXFgCGnD+b8avTLw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-linux-x64-baseline": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-x64-baseline/-/bun-linux-x64-baseline-1.3.11.tgz", + "integrity": "sha512-KZlf1jKtf4jai8xiQv/0XRjxVVhHnw/HtUKtLdOeQpTOQ1fQFhLoz2FGGtVRd0LVa/yiRbSz9HlWIzWlmJClng==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-linux-x64-musl": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-x64-musl/-/bun-linux-x64-musl-1.3.11.tgz", + "integrity": "sha512-ADImD4yCHNpqZu718E2chWcCaAHvua90yhmpzzV6fF4zOhwkGGbPCgUWmKyJ83uz+DXaPdYxX0ttDvtolrzx3Q==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-linux-x64-musl-baseline": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-linux-x64-musl-baseline/-/bun-linux-x64-musl-baseline-1.3.11.tgz", + "integrity": "sha512-J+qz4Al05PrNIOdj7xsWVTyx0c/gjUauG5nKV3Rrx0Q+5JO+1pPVlnfNmWbOF9pKG4f3IGad8KXJUfGMORld+Q==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oven/bun-windows-aarch64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-windows-aarch64/-/bun-windows-aarch64-1.3.11.tgz", + "integrity": "sha512-UOdkwScHRkGPz+n9ZJU7sTkTvqV7rD1SLCLaru1xH8WRsV7tDorPqNCzEN1msOIiPRK825nvAtEm9UsomO1GsA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@oven/bun-windows-x64": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/@oven/bun-windows-x64/-/bun-windows-x64-1.3.11.tgz", + "integrity": "sha512-E51tyWDP1l0CbjZYhiUxhDGPaY8Hf5YBREx0PHBff1LM1/q3qsJ6ZvRUa8YbbOO0Ax9QP6GHjD9vf3n6bXZ7QA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@oven/bun-windows-x64-baseline": { + "version": "1.3.11", + "resolved": 
"https://registry.npmjs.org/@oven/bun-windows-x64-baseline/-/bun-windows-x64-baseline-1.3.11.tgz", + "integrity": "sha512-cCsXK9AQ9Zf18QlVnbrFu2IKfr4sf2sfbErkF2jfCzyCO9Bnhl0KRx63zlN+Ni1xU7gcBLAssgcui5R400N2eA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -1864,6 +2091,18 @@ "undici-types": "~6.21.0" } }, + "node_modules/@types/pg": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.20.0.tgz", + "integrity": "sha512-bEPFOaMAHTEP1EzpvHTbmwR8UsFyHSKsRisLIHVMXnpNefSbGA1bD6CVy+qKjGSqmZqNqBDV2azOBo8TgkcVow==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "pg-protocol": "*", + "pg-types": "^2.2.0" + } + }, "node_modules/@types/retry": { "version": "0.12.0", "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", @@ -2013,6 +2252,40 @@ "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", "license": "BSD-3-Clause" }, + "node_modules/bun": { + "version": "1.3.11", + "resolved": "https://registry.npmjs.org/bun/-/bun-1.3.11.tgz", + "integrity": "sha512-AvXWYFO6j/ZQ7bhGm4X6eilq2JHsDVC90ZM32k2B7/srhC2gs3Sdki1QTbwrdRCo8o7eT+167vcB1yzOvPdbjA==", + "cpu": [ + "arm64", + "x64" + ], + "hasInstallScript": true, + "license": "MIT", + "os": [ + "darwin", + "linux", + "win32" + ], + "bin": { + "bun": "bin/bun.exe", + "bunx": "bin/bunx.exe" + }, + "optionalDependencies": { + "@oven/bun-darwin-aarch64": "1.3.11", + "@oven/bun-darwin-x64": "1.3.11", + "@oven/bun-darwin-x64-baseline": "1.3.11", + "@oven/bun-linux-aarch64": "1.3.11", + "@oven/bun-linux-aarch64-musl": "1.3.11", + "@oven/bun-linux-x64": "1.3.11", + "@oven/bun-linux-x64-baseline": "1.3.11", + "@oven/bun-linux-x64-musl": "1.3.11", + "@oven/bun-linux-x64-musl-baseline": "1.3.11", + "@oven/bun-windows-aarch64": "1.3.11", + "@oven/bun-windows-x64": "1.3.11", + "@oven/bun-windows-x64-baseline": "1.3.11" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -3162,6 +3435,113 @@ "node": ">=14.0.0" } }, + "node_modules/pg": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/pg/-/pg-8.20.0.tgz", + "integrity": "sha512-ldhMxz2r8fl/6QkXnBD3CR9/xg694oT6DZQ2s6c/RI28OjtSOpxnPrUCGOBJ46RCUxcWdx3p6kw/xnDHjKvaRA==", + "license": "MIT", + "dependencies": { + "pg-connection-string": "^2.12.0", + "pg-pool": "^3.13.0", + "pg-protocol": "^1.13.0", + "pg-types": "2.2.0", + "pgpass": "1.0.5" + }, + "engines": { + "node": ">= 16.0.0" + }, + "optionalDependencies": { + "pg-cloudflare": "^1.3.0" + }, + "peerDependencies": { + "pg-native": ">=3.0.1" + }, + "peerDependenciesMeta": { + "pg-native": { + "optional": true + } + } + }, + "node_modules/pg-cloudflare": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.3.0.tgz", + "integrity": "sha512-6lswVVSztmHiRtD6I8hw4qP/nDm1EJbKMRhf3HCYaqud7frGysPv7FYJ5noZQdhQtN2xJnimfMtvQq21pdbzyQ==", + "license": "MIT", + "optional": true + }, + "node_modules/pg-connection-string": { + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.12.0.tgz", + "integrity": "sha512-U7qg+bpswf3Cs5xLzRqbXbQl85ng0mfSV/J0nnA31MCLgvEaAo7CIhmeyrmJpOr7o+zm0rXK+hNnT5l9RHkCkQ==", + "license": "MIT" + }, + "node_modules/pg-int8": { + 
"version": "1.0.1", + "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", + "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/pg-pool": { + "version": "3.13.0", + "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.13.0.tgz", + "integrity": "sha512-gB+R+Xud1gLFuRD/QgOIgGOBE2KCQPaPwkzBBGC9oG69pHTkhQeIuejVIk3/cnDyX39av2AxomQiyPT13WKHQA==", + "license": "MIT", + "peerDependencies": { + "pg": ">=8.0" + } + }, + "node_modules/pg-protocol": { + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.13.0.tgz", + "integrity": "sha512-zzdvXfS6v89r6v7OcFCHfHlyG/wvry1ALxZo4LqgUoy7W9xhBDMaqOuMiF3qEV45VqsN6rdlcehHrfDtlCPc8w==", + "license": "MIT" + }, + "node_modules/pg-types": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", + "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "license": "MIT", + "dependencies": { + "pg-int8": "1.0.1", + "postgres-array": "~2.0.0", + "postgres-bytea": "~1.0.0", + "postgres-date": "~1.0.4", + "postgres-interval": "^1.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/pgpass": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/pgpass/-/pgpass-1.0.5.tgz", + "integrity": "sha512-FdW9r/jQZhSeohs1Z3sI1yxFQNFvMcnmfuj4WBMUTxOrAyLMaTcE1aAMBiTlbMNaXvBCQuVi0R7hd8udDSP7ug==", + "license": "MIT", + "dependencies": { + "split2": "^4.1.0" + } + }, + "node_modules/pgserve": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/pgserve/-/pgserve-1.1.6.tgz", + "integrity": "sha512-qdgiU3q/o/k8lP+IPHLiZG5uTWmvCSgqM3erdP7nmgVeQYv89DTo0nSG5XB0ccVD7yG/BppLFHjRvvXcWVgKnQ==", + "license": "MIT", + "dependencies": { + "bun": "^1.3.4" + }, + "bin": { + "pgserve": "bin/pgserve-wrapper.cjs" + }, + "optionalDependencies": { + "@embedded-postgres/darwin-arm64": "18.2.0-beta.16", + "@embedded-postgres/darwin-x64": "18.2.0-beta.16", + "@embedded-postgres/linux-x64": "18.2.0-beta.16", + "@embedded-postgres/windows-x64": "18.2.0-beta.16" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", @@ -3169,6 +3549,45 @@ "dev": true, "license": "ISC" }, + "node_modules/postgres-array": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", + "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/postgres-bytea": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.1.tgz", + "integrity": "sha512-5+5HqXnsZPE65IJZSMkZtURARZelel2oXUEO8rH83VS/hxH5vv1uHquPg5wZs8yMAfdv971IU+kcPUczi7NVBQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-date": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", + "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postgres-interval": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", + "integrity": 
"sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", + "license": "MIT", + "dependencies": { + "xtend": "^4.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/protobufjs": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", @@ -3341,7 +3760,6 @@ "version": "4.2.0", "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", "integrity": "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==", - "dev": true, "license": "ISC", "engines": { "node": ">= 10.x" @@ -3519,6 +3937,15 @@ } } }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "license": "MIT", + "engines": { + "node": ">=0.4" + } + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index d883d71..10ab02c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@automagik/rlmx", - "version": "0.260330.13", + "version": "0.260331.1", "description": "RLM algorithm CLI for coding agents — prompt externalization, Python REPL with symbolic recursion, code-driven navigation", "type": "module", "publishConfig": { @@ -17,7 +17,7 @@ "examples" ], "scripts": { - "build": "tsc", + "build": "tsc && cp src/benchmark-data.json dist/src/", "dev": "tsc --watch", "clean": "rm -rf dist", "test": "node --test dist/tests/*.test.js", @@ -27,13 +27,16 @@ }, "dependencies": { "@mariozechner/pi-ai": "0.64.0", - "js-yaml": "^4.1.1" + "js-yaml": "^4.1.1", + "pg": "^8.20.0", + "pgserve": "^1.1.6" }, "devDependencies": { "@commitlint/cli": "^19.0.0", "@commitlint/config-conventional": "^19.0.0", "@types/js-yaml": "^4.0.9", "@types/node": "^22.0.0", + "@types/pg": "^8.11.0", "husky": "^9.0.0", "typescript": "^5.7.0" }, diff --git a/python/gemini_batteries.py b/python/gemini_batteries.py index c584e22..6d42463 100644 --- a/python/gemini_batteries.py +++ b/python/gemini_batteries.py @@ -22,7 +22,7 @@ def web_search(query: str) -> str: Returns search results as text. Only available with provider: google. """ - results = llm_bridge._send_request("web_search", [query]) + results = llm_bridge.send_request("web_search", [query]) return results[0] if results else "Error: no response from web_search" @@ -31,7 +31,7 @@ def fetch_url(url: str) -> str: Returns page content as text. Only available with provider: google. """ - results = llm_bridge._send_request("fetch_url", [url]) + results = llm_bridge.send_request("fetch_url", [url]) return results[0] if results else "Error: no response from fetch_url" @@ -42,5 +42,5 @@ def generate_image(prompt: str, aspect_ratio: str = "16:9", size: str = "2K") -> Only available with provider: google. 
""" full_prompt = f"{prompt} [aspect_ratio={aspect_ratio}, size={size}]" - results = llm_bridge._send_request("generate_image", [full_prompt]) + results = llm_bridge.send_request("generate_image", [full_prompt]) return results[0] if results else "Error: no response from generate_image" diff --git a/python/llm_bridge.py b/python/llm_bridge.py index 70ee44d..2d002d0 100644 --- a/python/llm_bridge.py +++ b/python/llm_bridge.py @@ -50,6 +50,11 @@ def _send_request(request_type: str, prompts: list, model=None) -> list: return [f"Error: invalid JSON response: {e}"] * len(prompts) +def send_request(request_type: str, prompts: list, model=None) -> list: + """Public interface for sending IPC requests to Node.js parent process.""" + return _send_request(request_type, prompts, model) + + def llm_query(prompt: str, model=None) -> str: """Query the LLM with a single prompt. Returns the response string.""" results = _send_request("llm_query", [prompt], model) diff --git a/python/load_dataset.py b/python/load_dataset.py new file mode 100755 index 0000000..65450de --- /dev/null +++ b/python/load_dataset.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +"""Load Oolong Synth dataset from HuggingFace and output as JSON.""" +import json +import sys +from datasets import load_dataset + + +def main(): + samples = int(sys.argv[1]) if len(sys.argv) > 1 else 5 + idx = int(sys.argv[2]) if len(sys.argv) > 2 else -1 + + ds = load_dataset("oolongbench/oolong-synth", split="test") + + if idx >= 0: + items = [ds[idx]] + else: + items = list(ds.select(range(min(samples, len(ds))))) + + output = [] + for item in items: + # Oolong Synth uses "context_window_text" for the context field + context = item.get("context_window_text") or item.get("context", "") + answer = item.get("answer", "") + # answer can be a list — join if so + if isinstance(answer, list): + answer = ", ".join(str(a) for a in answer) + output.append({ + "id": f"oolong-{item.get('id', 'unknown')}", + "name": item.get("question", "")[:50], + "question": item["question"], + "context": context, + "expected": answer, + "category": item.get("task_group", "oolong"), + }) + + json.dump(output, sys.stdout) + + +if __name__ == "__main__": + main() diff --git a/python/pg_batteries.py b/python/pg_batteries.py new file mode 100644 index 0000000..379263e --- /dev/null +++ b/python/pg_batteries.py @@ -0,0 +1,158 @@ +""" +pg_batteries.py — PostgreSQL storage query functions for rlmx. + +Provides pg_search(), pg_slice(), pg_time(), pg_count(), pg_query() +that communicate with the Node.js PgStorage via the IPC bridge. + +Available when storage mode is active. +""" + +import json + +# IPC bridge — resolved at call time from REPL namespace globals +import llm_bridge + + +def _pg_request(request_type, params=None): + """Send a pg_* request to Node.js PgStorage and return parsed result.""" + payload = json.dumps(params) if params else "{}" + results = llm_bridge.send_request(request_type, [payload]) + if not results: + return None + raw = results[0] + if raw.startswith("Error:"): + raise RuntimeError(raw) + try: + return json.loads(raw) + except (json.JSONDecodeError, TypeError): + return raw + + +def _truncate_output(items, label="results"): + """Truncate output if > 2000 chars, showing first 5 items as stub.""" + if not isinstance(items, list): + text = str(items) + if len(text) <= 2000: + return text + return text[:2000] + "\n... 
[truncated]" + + full = json.dumps(items, indent=2) + if len(full) <= 2000: + return full + + n = len(items) + preview_items = items[:5] + lines = [f"[{n} {label}, showing first {min(5, n)}]"] + for item in preview_items: + if isinstance(item, dict): + content = item.get("content", "") + preview = content[:100] + "..." if len(content) > 100 else content + shown = {k: (preview if k == "content" else v) for k, v in item.items()} + lines.append(json.dumps(shown)) + else: + lines.append(str(item)[:100]) + lines.append("...") + if items and isinstance(items[0], dict) and "line_num" in items[0]: + first_ln = items[0]["line_num"] + lines.append(f'Use pg_slice({first_ln}, {first_ln + 10}) to see full content') + return "\n".join(lines) + + +def pg_search(pattern, limit=20): + """Full-text search in stored context. Returns matching records ranked by relevance. + + Args: + pattern: Search terms (words joined with AND) + limit: Max results (default 20) + + Returns: + List of {line_num, source, content, rank} dicts + """ + result = _pg_request("pg_search", {"pattern": pattern, "limit": limit}) + if isinstance(result, list): + return _truncate_output(result, "results") + return result + + +def pg_slice(start, end): + """Get context lines by range. Returns content for lines [start, end). + + Args: + start: Starting line number (inclusive) + end: Ending line number (exclusive) + + Returns: + String with source and content of the requested lines + """ + result = _pg_request("pg_slice", {"start": start, "end": end}) + if isinstance(result, list): + lines = [] + for r in result: + if isinstance(r, dict): + src = r.get("source", "") + content = r.get("content", "") + lines.append(f"[{src}] {content}" if src else content) + else: + lines.append(str(r)) + content = "\n".join(lines) + if len(content) > 2000: + return content[:2000] + f"\n... [truncated, {len(result)} lines total]" + return content + return result + + +def pg_sources(): + """List distinct source files in the stored context. + + Returns: + List of source file paths + """ + result = _pg_request("pg_query", { + "sql": "SELECT DISTINCT source FROM records WHERE source IS NOT NULL ORDER BY source" + }) + if isinstance(result, list): + return [r.get("source", "") if isinstance(r, dict) else str(r) for r in result] + return result + + +def pg_time(from_time, to_time): + """Filter context records by timestamp range. + + Args: + from_time: Start time (e.g. '01:00' or '2024-01-01T01:00:00') + to_time: End time + + Returns: + List of {line_num, timestamp, content} dicts + """ + result = _pg_request("pg_time", {"from": from_time, "to": to_time}) + if isinstance(result, list): + return _truncate_output(result, "results") + return result + + +def pg_count(): + """Count total records in stored context. + + Returns: + Integer count + """ + result = _pg_request("pg_count", {}) + if isinstance(result, dict): + return result.get("count", 0) + return result + + +def pg_query(sql): + """Execute raw SQL query (read-only) against the context database. 
+ + Args: + sql: SQL query string + + Returns: + List of result row dicts + """ + result = _pg_request("pg_query", {"sql": sql}) + if isinstance(result, list): + return _truncate_output(result, "rows") + return result diff --git a/src/batch.ts b/src/batch.ts index 131d330..85abb88 100644 --- a/src/batch.ts +++ b/src/batch.ts @@ -11,6 +11,7 @@ import { rlmLoop, type RLMOptions } from "./rlm.js"; import type { RlmxConfig } from "./config.js"; import type { LoadedContext } from "./context.js"; import { isGoogleProvider } from "./gemini.js"; +import { validateContextSize } from "./cache.js"; interface BatchResult { question: string; @@ -36,6 +37,8 @@ export interface BatchOptions extends Partial { parallel?: number; /** Use Gemini Batch API for 50% cost reduction. Requires provider: google. */ batchApi?: boolean; + /** When true, use pgserve storage for large context handling. */ + storageMode?: boolean; } /** @@ -96,13 +99,18 @@ export async function runBatch( break; } - // Run each question through rlmLoop with cache enabled + // Determine cache/storage mode for this batch + const useCache = options.cache ?? true; + const useStorage = options.storageMode ?? false; + + // Run each question through rlmLoop const result = await rlmLoop(question, context, config, { maxIterations: options.maxIterations, timeout: options.timeout, verbose: options.verbose, output: "text", // batch always captures text internally - cache: true, // batch always uses cache + cache: useCache, + storageMode: useStorage, }); totalCost += result.usage.totalCost; diff --git a/src/benchmark-data.json b/src/benchmark-data.json new file mode 100644 index 0000000..372362b --- /dev/null +++ b/src/benchmark-data.json @@ -0,0 +1,44 @@ +[ + { + "id": "cost-001", + "name": "API extraction", + "question": "What authentication methods does the API support?", + "context": "# Authentication API Reference\n\nThe Gateway API supports three authentication methods for securing client requests.\n\n## 1. API Key Authentication\n\nPass your API key via the `X-API-Key` header. Keys are generated in the Developer Portal under Settings > API Keys. Each key has configurable rate limits and scope restrictions. Keys can be rotated without downtime using the dual-key mechanism: generate a new key, update clients, then revoke the old key.\n\n## 2. OAuth 2.0 Bearer Tokens\n\nFor user-delegated access, the API supports OAuth 2.0 Authorization Code flow with PKCE. Request tokens from `POST /oauth/token` with grant_type=authorization_code. Access tokens expire after 3600 seconds. Refresh tokens are valid for 30 days and support automatic rotation.\n\n## 3. Mutual TLS (mTLS)\n\nFor service-to-service communication, mTLS provides certificate-based authentication. Upload your client certificate via the Admin Console. The API validates the certificate chain against the configured CA bundle. Certificate pinning is supported but optional.", + "category": "extraction" + }, + { + "id": "cost-002", + "name": "Architecture summary", + "question": "Summarize the key architectural decisions and their trade-offs.", + "context": "# System Architecture Document\n\n## Overview\n\nThe platform uses a microservices architecture deployed on Kubernetes, with an event-driven backbone for inter-service communication. This document captures the major architectural decisions made during the Q3 2025 redesign.\n\n## Decision 1: Event Sourcing for Order Management\n\nWe adopted event sourcing for the order management domain. 
Every state change is captured as an immutable event in an append-only log (Apache Kafka). The current state is derived by replaying events. Trade-off: increased storage requirements (~3x compared to state-based) and higher complexity for simple queries, but we gain complete audit trails, temporal queries, and the ability to rebuild read models.\n\n## Decision 2: CQRS for Read/Write Separation\n\nCommand and Query Responsibility Segregation separates the write model (event store) from read models (PostgreSQL materialized views). Writes go through the command bus, reads hit optimized projections. Trade-off: eventual consistency (typically <100ms lag) and operational complexity of maintaining projections, but read performance improved 15x and we can scale read/write independently.\n\n## Decision 3: Service Mesh with Istio\n\nAll inter-service communication flows through Istio sidecars. This provides mTLS between services, circuit breaking, retries with exponential backoff, and observability (distributed tracing via Jaeger). Trade-off: ~2ms latency overhead per hop and significant operational complexity, but unified security policy enforcement and deep traffic visibility.\n\n## Decision 4: GraphQL Federation\n\nThe API gateway uses Apollo Federation to compose a unified GraphQL schema from individual service schemas. Each team owns their subgraph. Trade-off: schema governance overhead and potential N+1 query issues, but teams can iterate independently and clients get a single endpoint with exactly the data they need.\n\n## Decision 5: Multi-Region Active-Active\n\nThe system operates in active-active mode across us-east-1 and eu-west-1. CockroachDB handles cross-region replication with serializable isolation. DNS-based routing directs users to the nearest region. Trade-off: significantly higher infrastructure cost (~2.3x) and complex conflict resolution for concurrent writes, but we achieve <50ms latency globally and survive full region failures.", + "category": "summarization" + }, + { + "id": "cost-003", + "name": "Migration reasoning", + "question": "Given the constraints described, should the team proceed with the database migration to PostgreSQL 16 during the holiday freeze, or wait until Q1? 
Explain your reasoning step by step.", + "context": "# Database Migration Planning: PostgreSQL 14 to 16\n\n## Current State\n- Running PostgreSQL 14.9 on AWS RDS Multi-AZ\n- 847 GB data across 23 databases\n- Peak load: 12,000 queries/second during business hours\n- End-of-life for PG 14 community support: November 2026\n\n## Migration Benefits (PG 16)\n- Logical replication improvements: parallel apply for large transactions\n- Performance: up to 30% improvement for bulk operations via SIMD JSON parsing\n- MERGE command support (SQL standard compliance)\n- pg_stat_io view for I/O monitoring\n- Security: libpq now supports Kerberos credential delegation\n\n## Holiday Freeze Context\n- Company holiday freeze: December 15 - January 5\n- All production changes require VP-level approval during freeze\n- On-call team reduced to 2 engineers (vs normal 6)\n- Customer traffic drops 40% during holidays (lower risk window)\n- Q1 has two major feature launches planned (weeks 3 and 7)\n\n## Risk Factors\n- Three extensions need compatibility verification: PostGIS 3.3, pg_partman 4.7, timescaledb 2.11\n- Application connection pooling (PgBouncer 1.19) needs config update for PG 16 protocol changes\n- ORM (Prisma 5.x) has known issue with PG 16 MERGE — workaround available but untested in production\n- Rollback requires full RDS snapshot restore (~45 minutes for 847 GB)\n- Two critical batch jobs run nightly at 02:00 UTC and cannot be interrupted\n\n## Team Assessment\n- DBA team confidence level: 7/10 (comfortable with process, concerned about extension compat)\n- Dev team readiness: 6/10 (integration tests pass, but limited PG 16 staging time)\n- Staging environment has been running PG 16 for 3 weeks with synthetic load\n- One P2 bug found in staging: connection timeout under high concurrent MERGE operations\n\n## Compliance Requirements\n- SOC 2 audit scheduled for February — auditors expect documented change management\n- GDPR data residency constraints require EU region migration to be completed within same maintenance window\n- PCI DSS requires change approval documentation with rollback evidence", + "category": "reasoning" + }, + { + "id": "cost-004", + "name": "Framework comparison", + "question": "Compare the three deployment strategies described and recommend which one is best suited for a team with limited DevOps experience.", + "context": "# Deployment Strategy Comparison\n\n## Strategy A: Blue-Green Deployment\n\nMaintain two identical production environments (blue and green). At any time, one serves live traffic while the other is idle. To deploy: push new code to idle environment, run smoke tests, then switch the load balancer. Rollback is instant — switch back to the previous environment.\n\nResource cost: 2x infrastructure at all times. Complexity: moderate — requires automated environment provisioning and LB switching. Downtime: zero during switchover. Database handling: challenging — requires backward-compatible schema migrations or database-per-environment.\n\nBest for: teams with budget for double infrastructure, need instant rollback, relatively stable database schemas.\n\n## Strategy B: Canary Deployment\n\nRoute a small percentage (1-5%) of traffic to the new version while 95-99% stays on the old version. Gradually increase the percentage while monitoring error rates, latency, and business metrics. If anomalies are detected, route all traffic back to the old version.\n\nResource cost: 1.05-1.1x infrastructure during rollout. 
Complexity: high — requires sophisticated traffic routing (service mesh or smart LB), real-time monitoring, automated rollback triggers. Downtime: zero. Database handling: both versions must work with same database, requiring backward-compatible migrations.\n\nBest for: high-traffic services where bugs in new versions could have significant blast radius, teams with strong observability.\n\n## Strategy C: Rolling Update\n\nReplace instances of the old version with the new version one at a time (or in small batches). Kubernetes does this natively with Deployment resources. At any point during rollout, both old and new versions handle traffic.\n\nResource cost: 1.1-1.25x infrastructure during rollout. Complexity: low — built into Kubernetes, minimal configuration (maxSurge, maxUnavailable). Downtime: zero if configured correctly. Database handling: same as canary — requires backward-compatible migrations.\n\nBest for: Kubernetes-native teams, services that handle mixed-version traffic well, teams wanting simplicity.", + "category": "comparison" + }, + { + "id": "cost-005", + "name": "Incident synthesis", + "question": "Synthesize the incident reports below into a unified root cause analysis and propose systemic improvements.", + "context": "# Incident Reports — Q3 2025\n\n## INC-2847: Payment Processing Outage (August 3, 14:22-15:47 UTC)\n\nSeverity: P1 | Duration: 85 minutes | Impact: 100% of payment transactions failed\n\nTimeline:\n- 14:22 — Monitoring alert: payment-service error rate >50%\n- 14:25 — On-call engineer begins investigation\n- 14:32 — Root cause identified: expired TLS certificate on payment gateway integration\n- 14:45 — Certificate renewal initiated via automated pipeline\n- 14:58 — New certificate deployed but service still failing\n- 15:12 — Discovery: PgBouncer connection pool exhausted due to retry storm from failed payments\n- 15:30 — PgBouncer restarted, connection pool drained\n- 15:47 — Full recovery confirmed\n\nRoot cause: TLS certificate auto-renewal cron job was disabled during June infrastructure migration and never re-enabled. Certificate expired after 90-day Let's Encrypt cycle.\n\nContributing factors: No monitoring on certificate expiry dates. Retry logic in payment-service uses fixed 1-second intervals (no exponential backoff), causing connection pool exhaustion.\n\n## INC-2901: Search Degradation (August 19, 09:15-10:30 UTC)\n\nSeverity: P2 | Duration: 75 minutes | Impact: Search latency increased from 200ms to 8s (p99)\n\nTimeline:\n- 09:15 — Latency alert triggered on search-service\n- 09:22 — Investigation reveals Elasticsearch cluster yellow status\n- 09:28 — One ES data node (es-data-04) unresponsive, triggering shard rebalancing\n- 09:45 — Node es-data-04 confirmed OOM-killed by Kubernetes (memory limit: 32GB, actual usage: 31.8GB)\n- 10:00 — Node restarted with increased memory limit (48GB)\n- 10:15 — Shard rebalancing complete\n- 10:30 — Latency returned to normal\n\nRoot cause: A bulk indexing job (product catalog refresh) was scheduled concurrently with peak search traffic. The indexing job consumed excessive heap memory on es-data-04 because it processed 50,000 documents per batch instead of the recommended 5,000.\n\nContributing factors: No resource isolation between indexing and search workloads. 
Bulk batch size was increased from 5,000 to 50,000 three months ago without load testing.\n\n## INC-2956: API Gateway Cascade Failure (September 2, 16:00-17:15 UTC)\n\nSeverity: P1 | Duration: 75 minutes | Impact: All API endpoints returned 503 for 60% of requests\n\nTimeline:\n- 16:00 — Marketing campaign launched, driving 3x normal traffic\n- 16:05 — User-service auto-scaling triggered (3 → 12 pods)\n- 16:08 — New user-service pods pulling container image (2.1GB) from registry\n- 16:12 — Container registry rate limit hit (DockerHub free tier: 100 pulls/6 hours)\n- 16:15 — 9 of 12 user-service pods stuck in ImagePullBackoff\n- 16:18 — Remaining 3 pods overwhelmed, latency spikes to 30s\n- 16:22 — Istio circuit breaker trips on user-service, returning 503\n- 16:25 — Cascade: order-service and cart-service depend on user-service, both start failing\n- 16:40 — Team switches to private ECR registry mirror\n- 16:55 — Images pulled from ECR, pods come up\n- 17:15 — Full recovery, circuit breakers reset\n\nRoot cause: Production Kubernetes cluster configured to pull container images from DockerHub free tier instead of private registry mirror. Combined with 2.1GB image size, auto-scaling events exhausted the rate limit.\n\nContributing factors: No pre-pulled images on nodes. Container image not optimized (2.1GB includes dev dependencies). Auto-scaling policy scales too aggressively (100% pod increase per step). No alerting on container registry rate limits.\n\n## INC-3012: Data Pipeline Delay (September 15, 03:00-11:00 UTC)\n\nSeverity: P2 | Duration: 8 hours | Impact: Analytics dashboards showed stale data (8+ hours old)\n\nTimeline:\n- 03:00 — Nightly ETL job starts, processing 2.3TB of event data\n- 03:45 — Spark executor OOM on largest partition (events from top-3 customers = 40% of data)\n- 04:00 — Spark retry policy kicks in (3 retries, same configuration)\n- 06:00 — All retries exhausted, job marked as failed\n- 08:30 — Morning shift notices stale dashboards\n- 09:00 — Engineer investigates, identifies skewed partition\n- 10:00 — Job rerun with increased executor memory (16GB → 32GB) and partition balancing\n- 11:00 — Backfill complete\n\nRoot cause: Data skew — three enterprise customers generated 40% of daily events, causing one Spark partition to exceed executor memory limits.\n\nContributing factors: No data skew monitoring. No alerting on ETL job failures (only dashboards stale alert exists). Retry policy retries with same config (insanity). No partition rebalancing strategy for skewed keys.", + "category": "synthesis" + }, + { + "id": "cost-006", + "name": "Config extraction", + "question": "Extract all environment variables and their default values from this configuration reference.", + "context": "# Service Configuration Reference\n\nThe worker service reads configuration from environment variables at startup. All variables are optional and have sensible defaults.\n\n## Server Settings\n\n`PORT` — HTTP server listen port. Default: `8080`. Must be between 1024 and 65535.\n\n`HOST` — Bind address. Default: `0.0.0.0`. Set to `127.0.0.1` for local-only access.\n\n`WORKER_CONCURRENCY` — Number of parallel worker threads. Default: `4`. Recommended: set to CPU core count.\n\n## Database\n\n`DATABASE_URL` — PostgreSQL connection string. Default: `postgresql://localhost:5432/worker_db`. Supports connection pooling parameters via query string.\n\n`DB_POOL_SIZE` — Maximum database connections. Default: `10`. 
Should not exceed `max_connections / number_of_instances`.\n\n`DB_TIMEOUT_MS` — Query timeout in milliseconds. Default: `30000`.\n\n## Cache\n\n`REDIS_URL` — Redis connection string. Default: `redis://localhost:6379/0`.\n\n`CACHE_TTL_SECONDS` — Default cache entry TTL. Default: `300`.\n\n`CACHE_PREFIX` — Key prefix for cache entries. Default: `worker:`.", + "category": "extraction" + } +] diff --git a/src/benchmark.ts b/src/benchmark.ts new file mode 100644 index 0000000..6f84830 --- /dev/null +++ b/src/benchmark.ts @@ -0,0 +1,431 @@ +/** + * Benchmark runner for rlmx — compares RLM vs direct LLM on cost/tokens/latency. + * + * Two modes: + * - cost: built-in curated dataset, measures cost savings + * - oolong: Oolong Synth from HuggingFace, measures accuracy + */ + +import { homedir } from "node:os"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import { mkdir, writeFile, readFile, stat } from "node:fs/promises"; +import { execFile } from "node:child_process"; +import { promisify } from "node:util"; + +import type { RlmxConfig } from "./config.js"; +import type { LoadedContext } from "./context.js"; +import { llmComplete, type ChatMessage } from "./llm.js"; +import { rlmLoop } from "./rlm.js"; + +const execFileAsync = promisify(execFile); + +// ─── Interfaces ────────────────────────────────────────── + +export interface BenchmarkQuestion { + id: string; + name: string; + question: string; + context: string; + category: string; + expected?: string; +} + +export interface BenchmarkRunResult { + questionId: string; + questionName: string; + direct: { + tokens_input: number; + tokens_output: number; + cost: number; + latency_ms: number; + answer: string; + }; + rlm: { + tokens_input: number; + tokens_output: number; + cost: number; + latency_ms: number; + iterations: number; + answer: string; + }; + savings: { + tokens_pct: number; + cost_pct: number; + }; +} + +export interface BenchmarkResults { + timestamp: string; + mode: "cost" | "oolong"; + model: string; + runs: BenchmarkRunResult[]; + totals: { + direct: { tokens: number; cost: number; latency_ms: number }; + rlm: { tokens: number; cost: number; latency_ms: number; avg_iterations: number }; + savings: { tokens_pct: number; cost_pct: number }; + }; +} + +// ─── Dataset Loading ───────────────────────────────────── + +async function loadBuiltinDataset(): Promise { + const thisDir = dirname(fileURLToPath(import.meta.url)); + const jsonPath = join(thisDir, "benchmark-data.json"); + const raw = await readFile(jsonPath, "utf-8"); + return JSON.parse(raw) as BenchmarkQuestion[]; +} + +// ─── Savings Calculation ───────────────────────────────── + +export function calculateSavings(directTokens: number, rlmTokens: number): number { + if (directTokens <= 0) return 0; + return ((directTokens - rlmTokens) / directTokens) * 100; +} + +export function calculateCostSavings(directCost: number, rlmCost: number): number { + if (directCost <= 0) return 0; + return ((directCost - rlmCost) / directCost) * 100; +} + +// ─── Cost Benchmark ────────────────────────────────────── + +export async function runCostBenchmark( + config: RlmxConfig, + options?: { outputFormat?: "table" | "json" } +): Promise { + const dataset = await loadBuiltinDataset(); + const runs: BenchmarkRunResult[] = []; + + for (const q of dataset) { + const ctx: LoadedContext = { + type: "string", + content: q.context, + metadata: `benchmark context for "${q.name}" (${q.context.length} chars)`, + }; + + // Direct LLM call + const directMessages: 
ChatMessage[] = [ + { + role: "user", + content: `Context:\n${q.context}\n\nQuestion: ${q.question}`, + }, + ]; + + const directStart = Date.now(); + const directResp = await llmComplete(directMessages, config.model); + const directLatency = Date.now() - directStart; + + // RLM call + const rlmStart = Date.now(); + const rlmResult = await rlmLoop(q.question, ctx, config, { + maxIterations: config.budget.maxTokens ? 5 : 10, + timeout: 120_000, + verbose: false, + output: "text", + cache: false, + }); + const rlmLatency = Date.now() - rlmStart; + + const directTotalTokens = directResp.usage.inputTokens + directResp.usage.outputTokens; + const rlmTotalTokens = rlmResult.usage.inputTokens + rlmResult.usage.outputTokens; + + runs.push({ + questionId: q.id, + questionName: q.name, + direct: { + tokens_input: directResp.usage.inputTokens, + tokens_output: directResp.usage.outputTokens, + cost: directResp.usage.totalCost, + latency_ms: directLatency, + answer: directResp.text, + }, + rlm: { + tokens_input: rlmResult.usage.inputTokens, + tokens_output: rlmResult.usage.outputTokens, + cost: rlmResult.usage.totalCost, + latency_ms: rlmLatency, + iterations: rlmResult.iterations, + answer: rlmResult.answer, + }, + savings: { + tokens_pct: calculateSavings(directTotalTokens, rlmTotalTokens), + cost_pct: calculateCostSavings(directResp.usage.totalCost, rlmResult.usage.totalCost), + }, + }); + + if (options?.outputFormat !== "json") { + process.stderr.write(` completed: ${q.name}\n`); + } + } + + const totals = aggregateTotals(runs); + + return { + timestamp: new Date().toISOString(), + mode: "cost", + model: `${config.model.provider}/${config.model.model}`, + runs, + totals, + }; +} + +// ─── Oolong Benchmark ──────────────────────────────────── + +async function ensureBenchVenv(): Promise { + const venvDir = join(homedir(), ".rlmx", ".bench-venv"); + const pythonBin = join(venvDir, "bin", "python"); + + try { + await stat(pythonBin); + return pythonBin; + } catch { + // Create venv and install datasets + process.stderr.write("rlmx benchmark: setting up Python venv for HuggingFace datasets...\n"); + await mkdir(join(homedir(), ".rlmx"), { recursive: true }); + + // Try uv first (preferred), fall back to python3 -m venv + pip + try { + await execFileAsync("uv", ["venv", venvDir]); + await execFileAsync("uv", ["pip", "install", "--python", pythonBin, "datasets"]); + } catch { + await execFileAsync("python3", ["-m", "venv", venvDir]); + await execFileAsync(join(venvDir, "bin", "pip"), ["install", "datasets"]); + } + + process.stderr.write("rlmx benchmark: Python venv ready.\n"); + return pythonBin; + } +} + +function findLoadDatasetScript(): string { + // The script is at python/load_dataset.py relative to package root + // From dist/src/benchmark.js, package root is ../../ + const thisDir = dirname(fileURLToPath(import.meta.url)); + return join(thisDir, "..", "..", "python", "load_dataset.py"); +} + +export async function runOolongBenchmark( + config: RlmxConfig, + options?: { samples?: number; idx?: number } +): Promise { + const samples = options?.samples ?? 5; + const idx = options?.idx; + + const pythonBin = await ensureBenchVenv(); + const scriptPath = findLoadDatasetScript(); + + // Load dataset via Python subprocess + const args = [scriptPath, String(samples)]; + if (idx !== undefined) { + args.push(String(idx)); + } + + process.stderr.write(`rlmx benchmark: loading Oolong Synth dataset (${idx !== undefined ? 
`idx=${idx}` : `${samples} samples`})...\n`); + const { stdout } = await execFileAsync(pythonBin, args, { maxBuffer: 50 * 1024 * 1024 }); + const dataset: BenchmarkQuestion[] = JSON.parse(stdout); + process.stderr.write(`rlmx benchmark: loaded ${dataset.length} samples.\n`); + + const runs: BenchmarkRunResult[] = []; + + for (const q of dataset) { + const ctx: LoadedContext = { + type: "string", + content: q.context, + metadata: `oolong context (${q.context.length} chars)`, + }; + + // Direct LLM call + const directMessages: ChatMessage[] = [ + { + role: "user", + content: `Context:\n${q.context}\n\nQuestion: ${q.question}`, + }, + ]; + + const directStart = Date.now(); + const directResp = await llmComplete(directMessages, config.model); + const directLatency = Date.now() - directStart; + + // RLM call + const rlmStart = Date.now(); + const rlmResult = await rlmLoop(q.question, ctx, config, { + maxIterations: 10, + timeout: 120_000, + verbose: false, + output: "text", + cache: false, + }); + const rlmLatency = Date.now() - rlmStart; + + const directTotalTokens = directResp.usage.inputTokens + directResp.usage.outputTokens; + const rlmTotalTokens = rlmResult.usage.inputTokens + rlmResult.usage.outputTokens; + + runs.push({ + questionId: q.id, + questionName: q.name, + direct: { + tokens_input: directResp.usage.inputTokens, + tokens_output: directResp.usage.outputTokens, + cost: directResp.usage.totalCost, + latency_ms: directLatency, + answer: directResp.text, + }, + rlm: { + tokens_input: rlmResult.usage.inputTokens, + tokens_output: rlmResult.usage.outputTokens, + cost: rlmResult.usage.totalCost, + latency_ms: rlmLatency, + iterations: rlmResult.iterations, + answer: rlmResult.answer, + }, + savings: { + tokens_pct: calculateSavings(directTotalTokens, rlmTotalTokens), + cost_pct: calculateCostSavings(directResp.usage.totalCost, rlmResult.usage.totalCost), + }, + }); + + process.stderr.write(` completed: ${q.name}\n`); + } + + const totals = aggregateTotals(runs); + + return { + timestamp: new Date().toISOString(), + mode: "oolong", + model: `${config.model.provider}/${config.model.model}`, + runs, + totals, + }; +} + +// ─── Aggregation ───────────────────────────────────────── + +export function aggregateTotals(runs: BenchmarkRunResult[]): BenchmarkResults["totals"] { + if (runs.length === 0) { + return { + direct: { tokens: 0, cost: 0, latency_ms: 0 }, + rlm: { tokens: 0, cost: 0, latency_ms: 0, avg_iterations: 0 }, + savings: { tokens_pct: 0, cost_pct: 0 }, + }; + } + + const directTokens = runs.reduce((sum, r) => sum + r.direct.tokens_input + r.direct.tokens_output, 0); + const directCost = runs.reduce((sum, r) => sum + r.direct.cost, 0); + const directLatency = runs.reduce((sum, r) => sum + r.direct.latency_ms, 0); + + const rlmTokens = runs.reduce((sum, r) => sum + r.rlm.tokens_input + r.rlm.tokens_output, 0); + const rlmCost = runs.reduce((sum, r) => sum + r.rlm.cost, 0); + const rlmLatency = runs.reduce((sum, r) => sum + r.rlm.latency_ms, 0); + const avgIterations = runs.reduce((sum, r) => sum + r.rlm.iterations, 0) / runs.length; + + return { + direct: { tokens: directTokens, cost: directCost, latency_ms: directLatency }, + rlm: { tokens: rlmTokens, cost: rlmCost, latency_ms: rlmLatency, avg_iterations: avgIterations }, + savings: { + tokens_pct: calculateSavings(directTokens, rlmTokens), + cost_pct: calculateCostSavings(directCost, rlmCost), + }, + }; +} + +// ─── Table Formatting ──────────────────────────────────── + +function padRight(str: string, len: number): string { + if 
(str.length >= len) return str.slice(0, len); + return str + " ".repeat(len - str.length); +} + +function padLeft(str: string, len: number): string { + if (str.length >= len) return str.slice(0, len); + return " ".repeat(len - str.length) + str; +} + +function formatTokens(n: number): string { + return n.toLocaleString("en-US"); +} + +function formatCost(n: number): string { + return `$${n.toFixed(4)}`; +} + +function formatLatency(ms: number): string { + if (ms < 1000) return `${ms}ms`; + return `${(ms / 1000).toFixed(1)}s`; +} + +function formatPct(n: number): string { + const sign = n > 0 ? "" : ""; + return `${sign}${n.toFixed(1)}%`; +} + +export function formatBenchmarkTable(results: BenchmarkResults): string { + const colW = { name: 22, mode: 10, tokens: 12, cost: 10, latency: 10, iters: 8 }; + + const lines: string[] = []; + + const modeLabel = results.mode === "cost" ? "cost comparison" : "oolong accuracy"; + lines.push(`rlmx benchmark — ${modeLabel} (RLM vs Direct LLM)`); + lines.push(""); + + // Header + const hr = "─"; + lines.push( + `┌${hr.repeat(colW.name)}┬${hr.repeat(colW.mode)}┬${hr.repeat(colW.tokens)}┬${hr.repeat(colW.cost)}┬${hr.repeat(colW.latency)}┬${hr.repeat(colW.iters)}┐` + ); + lines.push( + `│${padRight(" Question", colW.name)}│${padRight(" Mode", colW.mode)}│${padRight(" Tokens", colW.tokens)}│${padRight(" Cost", colW.cost)}│${padRight(" Latency", colW.latency)}│${padRight(" Iters", colW.iters)}│` + ); + lines.push( + `├${hr.repeat(colW.name)}┼${hr.repeat(colW.mode)}┼${hr.repeat(colW.tokens)}┼${hr.repeat(colW.cost)}┼${hr.repeat(colW.latency)}┼${hr.repeat(colW.iters)}┤` + ); + + // Rows + for (const run of results.runs) { + const directTokens = run.direct.tokens_input + run.direct.tokens_output; + const rlmTokens = run.rlm.tokens_input + run.rlm.tokens_output; + + lines.push( + `│${padRight(` ${run.questionName}`, colW.name)}│${padRight(" Direct", colW.mode)}│${padLeft(formatTokens(directTokens) + " ", colW.tokens)}│${padLeft(formatCost(run.direct.cost) + " ", colW.cost)}│${padLeft(formatLatency(run.direct.latency_ms) + " ", colW.latency)}│${padRight(" -", colW.iters)}│` + ); + lines.push( + `│${padRight("", colW.name)}│${padRight(" RLM", colW.mode)}│${padLeft(formatTokens(rlmTokens) + " ", colW.tokens)}│${padLeft(formatCost(run.rlm.cost) + " ", colW.cost)}│${padLeft(formatLatency(run.rlm.latency_ms) + " ", colW.latency)}│${padLeft(String(run.rlm.iterations) + " ", colW.iters)}│` + ); + lines.push( + `│${padRight("", colW.name)}│${padRight(" Savings", colW.mode)}│${padLeft(formatPct(run.savings.tokens_pct) + " ", colW.tokens)}│${padLeft(formatPct(run.savings.cost_pct) + " ", colW.cost)}│${padRight(" -", colW.latency)}│${padRight("", colW.iters)}│` + ); + } + + // Footer with totals + lines.push( + `├${hr.repeat(colW.name)}┼${hr.repeat(colW.mode)}┼${hr.repeat(colW.tokens)}┼${hr.repeat(colW.cost)}┼${hr.repeat(colW.latency)}┼${hr.repeat(colW.iters)}┤` + ); + lines.push( + `│${padRight(" TOTALS", colW.name)}│${padRight(" Direct", colW.mode)}│${padLeft(formatTokens(results.totals.direct.tokens) + " ", colW.tokens)}│${padLeft(formatCost(results.totals.direct.cost) + " ", colW.cost)}│${padLeft(formatLatency(results.totals.direct.latency_ms) + " ", colW.latency)}│${padRight(" -", colW.iters)}│` + ); + lines.push( + `│${padRight("", colW.name)}│${padRight(" RLM", colW.mode)}│${padLeft(formatTokens(results.totals.rlm.tokens) + " ", colW.tokens)}│${padLeft(formatCost(results.totals.rlm.cost) + " ", colW.cost)}│${padLeft(formatLatency(results.totals.rlm.latency_ms) + " ", 
colW.latency)}│${padLeft(results.totals.rlm.avg_iterations.toFixed(1) + " ", colW.iters)}│` + ); + lines.push( + `│${padRight("", colW.name)}│${padRight(" Savings", colW.mode)}│${padLeft(formatPct(results.totals.savings.tokens_pct) + " ", colW.tokens)}│${padLeft(formatPct(results.totals.savings.cost_pct) + " ", colW.cost)}│${padRight(" -", colW.latency)}│${padRight("", colW.iters)}│` + ); + lines.push( + `└${hr.repeat(colW.name)}┴${hr.repeat(colW.mode)}┴${hr.repeat(colW.tokens)}┴${hr.repeat(colW.cost)}┴${hr.repeat(colW.latency)}┴${hr.repeat(colW.iters)}┘` + ); + + return lines.join("\n"); +} + +// ─── Results Persistence ───────────────────────────────── + +export async function saveBenchmarkResults(results: BenchmarkResults): Promise { + const benchDir = join(homedir(), ".rlmx", "benchmarks"); + await mkdir(benchDir, { recursive: true }); + + const ts = results.timestamp.replace(/[:.]/g, "-").replace("T", "_").replace("Z", ""); + const filename = `benchmark-${results.mode}-${ts}.json`; + const filepath = join(benchDir, filename); + + await writeFile(filepath, JSON.stringify(results, null, 2), "utf-8"); + return filepath; +} diff --git a/src/cache.ts b/src/cache.ts index d7f7634..0fba5ce 100644 --- a/src/cache.ts +++ b/src/cache.ts @@ -12,7 +12,7 @@ import type { RlmxConfig, CacheConfig } from "./config.js"; import type { LoadedContext, ContextItem } from "./context.js"; // Provider context window limits (approximate token counts) -const PROVIDER_LIMITS: Record = { +export const PROVIDER_LIMITS: Record = { anthropic: 200000, openai: 128000, google: 1000000, // Gemini supports 1M+ diff --git a/src/cli.ts b/src/cli.ts index 5fa045c..1064a94 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -22,6 +22,8 @@ Usage: rlmx init [--dir ] Scaffold rlmx.yaml config rlmx cache [options] Pre-warm cache or estimate context size rlmx batch [options] Bulk interrogation from questions file + rlmx benchmark [options] Run benchmarks (cost or oolong) + rlmx stats [options] Query run history and cost breakdowns Options: --context Path to context (directory or file) @@ -42,6 +44,7 @@ Options: --ext File extensions for context dirs (comma-separated) --thinking Thinking level: minimal, low, medium, high (Gemini 3) --cache Enable cache mode (full context in system prompt for provider caching) + --no-session Disable auto-save of session data --estimate Show context size and cost estimate without caching (cache command) --parallel Concurrent questions for batch command (default: 1) --batch-api Use Gemini Batch API for 50% cost reduction (batch command) @@ -72,7 +75,7 @@ Examples: interface CliOptions { query: string | null; - command: "query" | "init" | "help" | "version" | "cache" | "batch" | "config"; + command: "query" | "init" | "help" | "version" | "cache" | "batch" | "config" | "benchmark" | "stats"; context: string | null; output: "text" | "json" | "stream"; verbose: boolean; @@ -92,6 +95,7 @@ interface CliOptions { batchFile: string | null; parallel: number; batchApi: boolean; + noSession: boolean; } function parseCliArgs(args: string[]): CliOptions { @@ -118,6 +122,7 @@ function parseCliArgs(args: string[]): CliOptions { estimate: { type: "boolean", default: false }, parallel: { type: "string", default: "1" }, "batch-api": { type: "boolean", default: false }, + "no-session": { type: "boolean", default: false }, }, allowPositionals: true, strict: false, @@ -129,7 +134,7 @@ function parseCliArgs(args: string[]): CliOptions { verbose: false, maxIterations: 30, timeout: 300000, dir: process.cwd(), stats: false, log: 
null, tools: null, maxCost: null, maxTokens: null, maxDepth: null, ext: null, thinking: null, cache: false, estimate: false, - batchFile: null, parallel: 1, batchApi: false, + batchFile: null, parallel: 1, batchApi: false, noSession: false, }; } @@ -139,7 +144,7 @@ function parseCliArgs(args: string[]): CliOptions { verbose: false, maxIterations: 30, timeout: 300000, dir: process.cwd(), stats: false, log: null, tools: null, maxCost: null, maxTokens: null, maxDepth: null, ext: null, thinking: null, cache: false, estimate: false, - batchFile: null, parallel: 1, batchApi: false, + batchFile: null, parallel: 1, batchApi: false, noSession: false, }; } @@ -147,6 +152,8 @@ function parseCliArgs(args: string[]): CliOptions { : positionals[0] === "cache" ? "cache" : positionals[0] === "batch" ? "batch" : positionals[0] === "config" ? "config" + : positionals[0] === "benchmark" ? "benchmark" + : positionals[0] === "stats" ? "stats" : "query"; const query = command === "query" ? positionals[0] ?? null : null; const batchFile = command === "batch" ? positionals[1] ?? null : null; @@ -200,6 +207,7 @@ function parseCliArgs(args: string[]): CliOptions { batchFile, parallel: parseInt(values.parallel as string, 10) || 1, batchApi: values["batch-api"] as boolean, + noSession: values["no-session"] as boolean, }; } @@ -297,6 +305,36 @@ async function runQuery(opts: CliOptions): Promise { } } + // Validate context size and auto-adjust cache/storage modes + let storageMode = false; + if (context) { + const validation = validateContextSize(context, config.model.provider); + if (!validation.valid) { + // Context exceeds model limit — disable cache mode if it was enabled + if (opts.cache || config.cache.enabled) { + console.error( + `rlmx: context exceeds model limit (~${validation.estimatedTokens.toLocaleString()} tokens > ${validation.limit.toLocaleString()}), disabling cache mode` + ); + opts.cache = false; + config.cache.enabled = false; + } + // Signal storage mode when enabled is 'auto' or 'always' + if (config.storage.enabled === "auto" || config.storage.enabled === "always") { + storageMode = true; + console.error( + `rlmx: storage mode activated for large context (~${validation.estimatedTokens.toLocaleString()} tokens)` + ); + } + } + } + // Force storage mode when explicitly set to 'always' + if (config.storage.enabled === "always" && !storageMode) { + storageMode = true; + if (opts.verbose) { + console.error("rlmx: storage mode forced (storage.enabled: always)"); + } + } + // Read query from stdin if not provided as argument let query = opts.query; if (!query && !process.stdin.isTTY) { @@ -316,6 +354,7 @@ async function runQuery(opts: CliOptions): Promise { verbose: opts.verbose, output: opts.output, cache: opts.cache, + storageMode, logger, }); @@ -359,6 +398,33 @@ async function runQuery(opts: CliOptions): Promise { outputResult(result, opts.output); } + // Save session (unless --no-session) + if (!opts.noSession) { + try { + const { saveSession } = await import("./session.js"); + await saveSession({ + runId: logger.runId, + query: opts.query ?? 
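// Storage-mode gating in the block above reduces to a small rule: "never" opts out,
// "always" forces it, and "auto" switches it on only when the context no longer fits
// the model's window. A condensed sketch of that rule (not the literal implementation):
function shouldUseStorageMode(
  enabled: "auto" | "always" | "never",
  contextFitsModelWindow: boolean
): boolean {
  if (enabled === "never") return false;
  if (enabled === "always") return true;
  return !contextFitsModelWindow; // "auto"
}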
"(stdin)", + contextPath: opts.context, + model: `${config.model.provider}/${config.model.model}`, + answer: result.answer, + usage: { + inputTokens: result.usage.inputTokens, + outputTokens: result.usage.outputTokens, + cachedTokens: result.usage.cacheReadTokens, + totalCost: result.usage.totalCost, + iterations: result.iterations, + timeMs: timeMs, + model: `${config.model.provider}/${config.model.model}`, + }, + config: config as unknown as Record, + logPath: opts.log, + }); + } catch (err: unknown) { + console.error(`rlmx: session save failed: ${err instanceof Error ? err.message : String(err)}`); + } + } + // Exit with non-zero code on empty response abort (issue #14) if (result.budgetHit === "empty_responses") { process.exit(1); @@ -498,6 +564,30 @@ async function runBatchCommand(opts: CliOptions): Promise { } } + // Validate context size and auto-adjust cache/storage mode for batch + let batchCache = true; + let batchStorageMode = false; + if (context) { + const validation = validateContextSize(context, config.model.provider); + if (!validation.valid) { + console.error( + `rlmx: context exceeds model limit (~${validation.estimatedTokens.toLocaleString()} tokens > ${validation.limit.toLocaleString()}), disabling cache mode` + ); + batchCache = false; + config.cache.enabled = false; + if (config.storage.enabled === "auto" || config.storage.enabled === "always") { + batchStorageMode = true; + console.error( + `rlmx: storage mode activated for batch (~${validation.estimatedTokens.toLocaleString()} tokens)` + ); + } + } + } + // Force storage mode when explicitly set to 'always' + if (config.storage.enabled === "always" && !batchStorageMode) { + batchStorageMode = true; + } + if (opts.verbose) { console.error(`rlmx batch: processing ${opts.batchFile}`); } @@ -506,7 +596,8 @@ async function runBatchCommand(opts: CliOptions): Promise { maxIterations: opts.maxIterations, timeout: opts.timeout, verbose: opts.verbose, - cache: true, + cache: batchCache, + storageMode: batchStorageMode, maxCost: opts.maxCost ?? undefined, parallel: opts.parallel, }); @@ -602,6 +693,41 @@ async function runConfig(args: string[]): Promise { } } +async function runBenchmarkCommand(opts: CliOptions, args: string[]): Promise { + const mode = args[0]; + const configDir = process.cwd(); + const config = await loadConfig(configDir); + + if (opts.tools) config.toolsLevel = opts.tools; + + if (mode === "cost") { + const { runCostBenchmark, formatBenchmarkTable, saveBenchmarkResults } = await import("./benchmark.js"); + const outputIdx = args.indexOf("--output"); + const outputFormat = outputIdx >= 0 && args[outputIdx + 1] === "json" ? "json" as const : "table" as const; + const results = await runCostBenchmark(config, { outputFormat }); + if (outputFormat === "json") { + console.log(JSON.stringify(results, null, 2)); + } else { + console.error(formatBenchmarkTable(results)); + } + const savedPath = await saveBenchmarkResults(results); + console.error(`Results saved to ${savedPath}`); + } else if (mode === "oolong") { + const samplesIdx = args.indexOf("--samples"); + const samples = samplesIdx >= 0 ? parseInt(args[samplesIdx + 1], 10) : 5; + const idxArgIdx = args.indexOf("--idx"); + const idx = idxArgIdx >= 0 ? 
parseInt(args[idxArgIdx + 1], 10) : undefined; + + const { runOolongBenchmark, formatBenchmarkTable, saveBenchmarkResults } = await import("./benchmark.js"); + const results = await runOolongBenchmark(config, { samples, idx }); + console.error(formatBenchmarkTable(results)); + const savedPath = await saveBenchmarkResults(results); + console.error(`Results saved to ${savedPath}`); + } else { + console.log(`rlmx benchmark — compare RLM vs direct LLM\n\nUsage:\n rlmx benchmark cost Run cost benchmark with built-in dataset\n rlmx benchmark cost --output json Output results as JSON\n rlmx benchmark oolong Run Oolong Synth (auto-installs HF datasets)\n rlmx benchmark oolong --samples 5 Run N samples (default 5)\n rlmx benchmark oolong --idx 42 Run specific sample by index`); + } +} + async function main(): Promise { const opts = parseCliArgs(process.argv.slice(2)); @@ -639,6 +765,16 @@ async function main(): Promise { await runConfig(process.argv.slice(3)); break; + case "benchmark": + await runBenchmarkCommand(opts, process.argv.slice(3)); + break; + + case "stats": { + const { runStatsCommand } = await import("./stats.js"); + await runStatsCommand(process.argv.slice(3)); + break; + } + case "query": if (!opts.query && process.stdin.isTTY) { console.log(HELP); diff --git a/src/config.ts b/src/config.ts index 5391973..3f51a60 100644 --- a/src/config.ts +++ b/src/config.ts @@ -65,6 +65,17 @@ export interface OutputConfig { schema: Record | null; } +/** Storage configuration for pgserve-backed large context handling */ +export interface StorageConfig { + enabled: "auto" | "always" | "never"; + mode: "persistent" | "memory"; + dataDir: string; + port: number; + chunkSize: number | null; + chunkUtilization: number; + charsPerToken: number; +} + /** Tool level — controls which functions are available in the REPL */ export type ToolsLevel = "core" | "standard" | "full"; @@ -88,6 +99,8 @@ export interface RlmxConfig { gemini: GeminiConfig; /** Structured output configuration */ output: OutputConfig; + /** Storage configuration for pgserve */ + storage: StorageConfig; /** Config source: "yaml" | "md" | "defaults" */ configSource: "yaml" | "md" | "defaults"; } @@ -131,6 +144,16 @@ const DEFAULT_OUTPUT_CONFIG: OutputConfig = { schema: null, }; +export const DEFAULT_STORAGE_CONFIG: StorageConfig = { + enabled: "auto", + mode: "persistent", + dataDir: "~/.rlmx/data", + port: 0, + chunkSize: null, + chunkUtilization: 0.6, + charsPerToken: 4, +}; + // ─── YAML Schema ───────────────────────────────────────── /** Shape of rlmx.yaml on disk */ @@ -178,6 +201,15 @@ interface RawYamlConfig { ttl?: number; "expire-time"?: string; }; + storage?: { + enabled?: string; + mode?: string; + "data-dir"?: string; + port?: number; + "chunk-size"?: number | null; + "chunk-utilization"?: number; + "chars-per-token"?: number; + }; } // ─── YAML Loading ──────────────────────────────────────── @@ -256,13 +288,18 @@ function parseYamlConfig(content: string, dir: string): RlmxConfig { exclude: cfg.context?.exclude ?? 
DEFAULT_CONTEXT_CONFIG.exclude, }; - // Validate extensions format - for (const ext of contextConfig.extensions) { + // Validate and normalize extensions format + for (let i = 0; i < contextConfig.extensions.length; i++) { + const ext = contextConfig.extensions[i]; if (typeof ext !== "string") { throw new Error( `Invalid extension in context.extensions: expected string, got ${typeof ext}.` ); } + // Ensure leading dot: "mdx" → ".mdx" + if (ext.length > 0 && !ext.startsWith(".")) { + contextConfig.extensions[i] = `.${ext}`; + } } // Parse budget @@ -366,6 +403,53 @@ function parseYamlConfig(content: string, dir: string): RlmxConfig { ); } + // Parse storage config + const rawEnabled = cfg.storage?.enabled ?? DEFAULT_STORAGE_CONFIG.enabled; + if (!["auto", "always", "never"].includes(rawEnabled)) { + throw new Error( + `Invalid storage.enabled "${rawEnabled}" in rlmx.yaml. Must be one of: auto, always, never.` + ); + } + const rawMode = cfg.storage?.mode ?? DEFAULT_STORAGE_CONFIG.mode; + if (!["persistent", "memory"].includes(rawMode)) { + throw new Error( + `Invalid storage.mode "${rawMode}" in rlmx.yaml. Must be one of: persistent, memory.` + ); + } + const storagePort = cfg.storage?.port ?? DEFAULT_STORAGE_CONFIG.port; + if (typeof storagePort !== "number" || storagePort < 0 || !Number.isInteger(storagePort)) { + throw new Error( + `Invalid storage.port in rlmx.yaml: must be a non-negative integer, got ${storagePort}.` + ); + } + const chunkSize = cfg.storage?.["chunk-size"] ?? DEFAULT_STORAGE_CONFIG.chunkSize; + if (chunkSize !== null && (typeof chunkSize !== "number" || chunkSize <= 0)) { + throw new Error( + `Invalid storage.chunk-size in rlmx.yaml: must be a positive number or null, got ${chunkSize}.` + ); + } + const chunkUtilization = cfg.storage?.["chunk-utilization"] ?? DEFAULT_STORAGE_CONFIG.chunkUtilization; + if (typeof chunkUtilization !== "number" || chunkUtilization <= 0 || chunkUtilization > 1) { + throw new Error( + `Invalid storage.chunk-utilization in rlmx.yaml: must be a number between 0 (exclusive) and 1 (inclusive), got ${chunkUtilization}.` + ); + } + const charsPerToken = cfg.storage?.["chars-per-token"] ?? DEFAULT_STORAGE_CONFIG.charsPerToken; + if (typeof charsPerToken !== "number" || charsPerToken <= 0) { + throw new Error( + `Invalid storage.chars-per-token in rlmx.yaml: must be a positive number, got ${charsPerToken}.` + ); + } + const storage: StorageConfig = { + enabled: rawEnabled as StorageConfig["enabled"], + mode: rawMode as StorageConfig["mode"], + dataDir: cfg.storage?.["data-dir"] ?? 
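# Hypothetical rlmx.yaml storage block using the kebab-case keys parsed above; the
# values shown simply mirror DEFAULT_STORAGE_CONFIG and the validation rules
# (enabled: auto|always|never, mode: persistent|memory, chunk-utilization in (0, 1]).
storage:
  enabled: auto
  mode: persistent
  data-dir: ~/.rlmx/data
  port: 0                  # 0 = pick a free port at startup
  chunk-size: null         # null = derive from the provider's context limit
  chunk-utilization: 0.6
  chars-per-token: 4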
DEFAULT_STORAGE_CONFIG.dataDir, + port: storagePort, + chunkSize, + chunkUtilization, + charsPerToken, + }; + return { system, tools, @@ -378,6 +462,7 @@ function parseYamlConfig(content: string, dir: string): RlmxConfig { cache, gemini, output, + storage, configSource: "yaml", }; } @@ -494,6 +579,7 @@ async function loadConfigFromMd(dir: string): Promise { cache: { ...DEFAULT_CACHE_CONFIG }, gemini: { ...DEFAULT_GEMINI_CONFIG }, output: { ...DEFAULT_OUTPUT_CONFIG }, + storage: { ...DEFAULT_STORAGE_CONFIG }, configSource: "md", }; } @@ -514,6 +600,7 @@ function defaultConfig(dir: string): RlmxConfig { cache: { ...DEFAULT_CACHE_CONFIG }, gemini: { ...DEFAULT_GEMINI_CONFIG }, output: { ...DEFAULT_OUTPUT_CONFIG }, + storage: { ...DEFAULT_STORAGE_CONFIG }, configSource: "defaults", }; } diff --git a/src/context.ts b/src/context.ts index 4faecac..4168d49 100644 --- a/src/context.ts +++ b/src/context.ts @@ -140,8 +140,13 @@ export async function loadContextFromDir( dirPath: string, options?: Partial ): Promise { + // Normalize extensions: ensure each has a leading dot ("mdx" → ".mdx") + const rawExtensions = options?.extensions ?? DEFAULT_COLLECT_OPTIONS.extensions; + const normalizedExtensions = rawExtensions.map((ext) => + ext.startsWith(".") ? ext : `.${ext}` + ); const merged: CollectOptions = { - extensions: options?.extensions ?? DEFAULT_COLLECT_OPTIONS.extensions, + extensions: normalizedExtensions, exclude: options?.exclude ?? DEFAULT_COLLECT_OPTIONS.exclude, }; const items = await collectFiles(dirPath, dirPath, merged); diff --git a/src/ipc.ts b/src/ipc.ts index f710c76..b49e9ae 100644 --- a/src/ipc.ts +++ b/src/ipc.ts @@ -60,7 +60,12 @@ export interface LLMRequest { | "rlm_query_batched" | "web_search" | "fetch_url" - | "generate_image"; + | "generate_image" + | "pg_search" + | "pg_slice" + | "pg_time" + | "pg_count" + | "pg_query"; prompts: string[]; model?: string; } diff --git a/src/llm.ts b/src/llm.ts index ecd35fe..f6a9be7 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -11,6 +11,7 @@ import { spawn } from "node:child_process"; import type { RlmxConfig, ModelConfig, GeminiConfig } from "./config.js"; import type { LLMRequest } from "./ipc.js"; import type { Logger } from "./logger.js"; +import type { PgStorage } from "./storage.js"; import { buildGeminiOnPayload, isGoogleProvider, type ThinkingLevel } from "./gemini.js"; /** Token usage tracking. */ @@ -384,7 +385,8 @@ export async function handleLLMRequest( config: RlmxConfig, usage: UsageStats, signal?: AbortSignal, - geminiCounts?: GeminiCallCounts + geminiCounts?: GeminiCallCounts, + storage?: PgStorage ): Promise { const subCallModel: ModelConfig = config.model.subCallModel ? 
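// With the normalization above, context extensions may be passed with or without a
// leading dot. A minimal usage sketch (the directory path is hypothetical):
import { loadContextFromDir } from "./context.js";

const ctx = await loadContextFromDir("./docs", {
  extensions: ["md", ".mdx", "txt"], // normalized to [".md", ".mdx", ".txt"]
});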
{ ...config.model, model: config.model.subCallModel } @@ -487,6 +489,40 @@ export async function handleLLMRequest( return [igResp.text]; } + case "pg_search": { + if (!storage) return [`Error: storage not available`]; + const params = JSON.parse(request.prompts[0]); + const rows = await storage.search(params.pattern, params.limit); + return [JSON.stringify(rows)]; + } + + case "pg_slice": { + if (!storage) return [`Error: storage not available`]; + const params = JSON.parse(request.prompts[0]); + const rows = await storage.slice(params.start, params.end); + return [JSON.stringify(rows)]; + } + + case "pg_time": { + if (!storage) return [`Error: storage not available`]; + const params = JSON.parse(request.prompts[0]); + const rows = await storage.timeRange(params.from, params.to); + return [JSON.stringify(rows)]; + } + + case "pg_count": { + if (!storage) return [`Error: storage not available`]; + const cnt = await storage.count(); + return [JSON.stringify({ count: cnt })]; + } + + case "pg_query": { + if (!storage) return [`Error: storage not available`]; + const params = JSON.parse(request.prompts[0]); + const rows = await storage.query(params.sql); + return [JSON.stringify(rows)]; + } + default: return request.prompts.map( () => `Error: unknown request type "${request.request_type}"` diff --git a/src/observe.ts b/src/observe.ts new file mode 100644 index 0000000..4a3b928 --- /dev/null +++ b/src/observe.ts @@ -0,0 +1,186 @@ +/** + * ObservabilityRecorder — fire-and-forget event recording to pgserve. + * + * Records every LLM call, REPL execution, and sub-call into rlmx_sessions + * and rlmx_events tables. All methods are fire-and-forget: errors are logged + * to stderr but never thrown or block the main RLM loop. + */ + +import type { PgStorage } from "./storage.js"; + +/** Usage info for recording an LLM call. */ +export interface LLMCallUsage { + inputTokens: number; + outputTokens: number; + cost: number; +} + +/** Total usage for recording session completion. */ +export interface TotalUsage { + inputTokens: number; + outputTokens: number; + cachedTokens?: number; + totalCost: number; +} + +export class ObservabilityRecorder { + private storage: PgStorage; + private sessionId: string | null = null; + + constructor(storage: PgStorage) { + this.storage = storage; + } + + /** + * Create a new session record. + */ + startSession( + runId: string, + query: string, + model: string, + provider: string, + contextPath?: string, + config?: Record + ): void { + this.sessionId = runId; + this.fire(async () => { + const client = this.storage.getClient(); + if (!client) return; + await client.query( + `INSERT INTO rlmx_sessions (id, query, context_path, model, provider, config) + VALUES ($1, $2, $3, $4, $5, $6) + ON CONFLICT (id) DO NOTHING`, + [runId, query, contextPath ?? null, model, provider, config ? JSON.stringify(config) : null] + ); + }); + } + + /** + * Record an LLM call event. + */ + recordLLMCall( + iteration: number, + usage: LLMCallUsage, + model: string, + durationMs: number + ): void { + this.fire(async () => { + const client = this.storage.getClient(); + if (!client || !this.sessionId) return; + await client.query( + `INSERT INTO rlmx_events (session_id, iteration, kind, input_tokens, output_tokens, cost, model, duration_ms) + VALUES ($1, $2, 'llm_call', $3, $4, $5, $6, $7)`, + [this.sessionId, iteration, usage.inputTokens, usage.outputTokens, usage.cost, model, durationMs] + ); + }); + } + + /** + * Record a REPL execution event. 
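// Once storage is threaded through, a pg_* request carries a JSON-encoded parameter
// object in prompts[0] and is routed to the matching PgStorage method. A sketch with
// hypothetical values (config, usage, and a started storage assumed in scope):
import { handleLLMRequest } from "./llm.js";
import type { LLMRequest } from "./ipc.js";

const req: LLMRequest = {
  request_type: "pg_search",
  prompts: [JSON.stringify({ pattern: "connection timeout", limit: 10 })],
};
const [resultJson] = await handleLLMRequest(req, config, usage, undefined, undefined, storage);
const matches = JSON.parse(resultJson); // rows returned by storage.search()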
+ */ + recordReplExec( + iteration: number, + code: string, + stdout: string, + stderr: string, + durationMs: number, + isError?: boolean + ): void { + this.fire(async () => { + const client = this.storage.getClient(); + if (!client || !this.sessionId) return; + await client.query( + `INSERT INTO rlmx_events (session_id, iteration, kind, code, stdout, stderr, duration_ms, is_error, error_message) + VALUES ($1, $2, 'repl_exec', $3, $4, $5, $6, $7, $8)`, + [ + this.sessionId, iteration, + code.slice(0, 10000), stdout.slice(0, 10000), stderr.slice(0, 5000), + durationMs, isError ?? false, isError ? stderr.slice(0, 1000) : null, + ] + ); + }); + } + + /** + * Record a sub-call event (pg_search, llm_query from REPL, etc.). + */ + recordSubCall( + iteration: number, + requestType: string, + promptPreview: string, + durationMs: number, + isError?: boolean, + errorMessage?: string + ): void { + this.fire(async () => { + const client = this.storage.getClient(); + if (!client || !this.sessionId) return; + await client.query( + `INSERT INTO rlmx_events (session_id, iteration, kind, request_type, prompt_preview, duration_ms, is_error, error_message) + VALUES ($1, $2, 'sub_call', $3, $4, $5, $6, $7)`, + [ + this.sessionId, iteration, + requestType, promptPreview.slice(0, 500), + durationMs, isError ?? false, errorMessage?.slice(0, 1000) ?? null, + ] + ); + }); + } + + /** + * Record session completion with final answer and totals. + */ + recordFinal( + answer: string, + iterations: number, + totalUsage: TotalUsage + ): void { + this.fire(async () => { + const client = this.storage.getClient(); + if (!client || !this.sessionId) return; + await client.query( + `UPDATE rlmx_sessions SET + status = 'completed', + ended_at = now(), + iterations = $2, + input_tokens = $3, + output_tokens = $4, + cached_tokens = $5, + total_cost = $6, + answer_length = $7 + WHERE id = $1`, + [ + this.sessionId, iterations, + totalUsage.inputTokens, totalUsage.outputTokens, + totalUsage.cachedTokens ?? 0, totalUsage.totalCost, + answer.length, + ] + ); + }); + } + + /** + * Record session failure. + */ + recordError(errorMessage: string): void { + this.fire(async () => { + const client = this.storage.getClient(); + if (!client || !this.sessionId) return; + await client.query( + `UPDATE rlmx_sessions SET status = 'failed', ended_at = now(), budget_hit = $2 WHERE id = $1`, + [this.sessionId, errorMessage.slice(0, 500)] + ); + }); + } + + /** + * Fire-and-forget: run an async operation, swallow errors. + */ + private fire(fn: () => Promise): void { + fn().catch((err) => { + process.stderr.write( + `rlmx: observability recording error: ${err instanceof Error ? err.message : String(err)}\n` + ); + }); + } +} diff --git a/src/repl.ts b/src/repl.ts index 16efef4..deab5f1 100644 --- a/src/repl.ts +++ b/src/repl.ts @@ -30,6 +30,7 @@ const __dirname = dirname(__filename); const REPL_SERVER_PATH = join(__dirname, "..", "..", "python", "repl_server.py"); const BATTERIES_PATH = join(__dirname, "..", "..", "python", "batteries.py"); const GEMINI_BATTERIES_PATH = join(__dirname, "..", "..", "python", "gemini_batteries.py"); +const PG_BATTERIES_PATH = join(__dirname, "..", "..", "python", "pg_batteries.py"); /** Battery function names — tracked for stats. */ const BATTERY_FUNCTION_NAMES = [ @@ -60,6 +61,8 @@ export interface REPLStartOptions { toolsLevel?: ToolsLevel; /** Whether to load Gemini batteries (web_search, fetch_url, generate_image). */ loadGeminiBatteries?: boolean; + /** Whether to load pg_batteries (pg_search, pg_slice, etc.) 
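// Minimal sketch of driving the recorder around one LLM call; the run id, model name,
// and numbers are hypothetical, and every record* call is fire-and-forget:
import { ObservabilityRecorder } from "./observe.js";

const recorder = new ObservabilityRecorder(storage); // storage: a started PgStorage
recorder.startSession(runId, "summarize the incident logs", "google/gemini-2.0-flash", "google");
recorder.recordLLMCall(0, { inputTokens: 1200, outputTokens: 300, cost: 0.0004 }, "google/gemini-2.0-flash", 950);
recorder.recordFinal("done", 1, { inputTokens: 1200, outputTokens: 300, totalCost: 0.0004 });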
for storage mode. */ + loadPgBatteries?: boolean; /** Python executable path (default: "python3"). */ pythonPath?: string; /** Path to repl_server.py (auto-detected). */ @@ -167,6 +170,11 @@ export class REPL { await this._loadGeminiBatteries(); } } + + // Load pg_batteries when storage mode is active (independent of tools level) + if (options.loadPgBatteries) { + await this._loadPgBatteries(); + } } /** Execute Python code in the REPL and return the result. */ @@ -272,6 +280,13 @@ export class REPL { this._skipTracking = false; } + private async _loadPgBatteries(): Promise { + const code = await readFile(PG_BATTERIES_PATH, "utf-8"); + this._skipTracking = true; + await this.execute(code); + this._skipTracking = false; + } + /** Track which battery functions appear in executed code. */ private _trackBatteryUsage(code: string): void { if (this._skipTracking) return; diff --git a/src/rlm.ts b/src/rlm.ts index 92fefdb..29e8283 100644 --- a/src/rlm.ts +++ b/src/rlm.ts @@ -9,10 +9,13 @@ * - Recursive sub-calls via llm_query/rlm_query */ +import { randomUUID } from "node:crypto"; import type { RlmxConfig, ToolDef } from "./config.js"; import type { LoadedContext, ContextItem } from "./context.js"; import { buildCachedSystemPrompt, computeContentHash, buildSessionId, estimateTokens } from "./cache.js"; import { REPL } from "./repl.js"; +import { PgStorage } from "./storage.js"; +import { ObservabilityRecorder } from "./observe.js"; import { llmComplete, handleLLMRequest, @@ -42,6 +45,8 @@ export interface RLMOptions { verbose: boolean; output: "text" | "json" | "stream"; cache: boolean; + /** When true, route context through pgserve storage instead of REPL variable. */ + storageMode?: boolean; logger?: Logger; } @@ -66,7 +71,8 @@ function isStructuredOutputMode(config: RlmxConfig): boolean { */ function buildSystemPrompt( config: RlmxConfig, - _context: LoadedContext | null + _context: LoadedContext | null, + storageRecordCount?: number ): string { // Use SYSTEM.md content or paper default (from scaffold) let system = config.system ?? ""; @@ -86,6 +92,19 @@ function buildSystemPrompt( config.criteria; } + // Append storage mode instructions when context is in PostgreSQL + if (storageRecordCount !== undefined) { + system += + `\n\n## Context Storage\n\n` + + `Context is stored in PostgreSQL (~${storageRecordCount.toLocaleString()} records). 
Use these tools to query it:\n` + + `- pg_search("pattern") — full-text search\n` + + `- pg_slice(start, end) — get lines by range\n` + + `- pg_time("HH:MM", "HH:MM") — filter by timestamp\n` + + `- pg_count() — total records\n` + + `- pg_query("SQL") — raw SQL (read-only)\n` + + `Do NOT try to access the \`context\` variable directly — it is not loaded in memory.`; + } + return system; } @@ -175,11 +194,45 @@ export async function rlmLoop( const geminiCounts = createGeminiCallCounts(); const budget = new BudgetTracker(config.budget); - // Build system prompt — cache mode embeds full context, normal mode uses metadata only + // ── Storage mode setup ────────────────────────────────── + let storage: PgStorage | undefined; + let recorder: ObservabilityRecorder | undefined; + let storageRecordCount: number | undefined; + const runId = randomUUID(); + + if (opts.storageMode) { + storage = new PgStorage(); + await storage.start(config.storage); + + // Ingest context into Postgres + if (context) { + storageRecordCount = await storage.ingest(context); + if (opts.verbose) { + process.stderr.write(`rlmx: ingested ${storageRecordCount} records into pgserve storage\n`); + } + } + + // Set up observability recorder + recorder = new ObservabilityRecorder(storage); + recorder.startSession( + runId, + query, + `${config.model.provider}/${config.model.model}`, + config.model.provider, + undefined, + config as unknown as Record + ); + } + + // Build system prompt — cache mode embeds full context, storage mode adds pg_* tools, normal mode uses metadata only const systemPrompt = opts.cache ? buildCachedSystemPrompt(config, context) - : buildSystemPrompt(config, context); - const contextMetadata = buildContextMetadata(context); + : buildSystemPrompt(config, context, storageRecordCount); + + // In storage mode, override context metadata to describe storage + const contextMetadata = opts.storageMode && storageRecordCount !== undefined + ? `Context is stored in PostgreSQL (~${storageRecordCount.toLocaleString()} records). Use pg_search(), pg_slice(), pg_time(), pg_count(), pg_query() to query it.` + : buildContextMetadata(context); // Build cache config for LLM calls (passed through to pi/ai completeSimple) let cacheConfig: CacheLLMConfig | undefined; @@ -211,8 +264,8 @@ export async function rlmLoop( const repl = new REPL(); try { - // Start REPL with context and custom tools - const replContext = prepareReplContext(context); + // Start REPL — in storage mode, skip raw context injection and load pg_batteries + const replContext = opts.storageMode ? undefined : prepareReplContext(context); const toolsMap: Record = {}; for (const tool of config.tools) { toolsMap[tool.name] = tool.code; @@ -222,24 +275,47 @@ export async function rlmLoop( context: replContext as string | string[] | Record, tools: Object.keys(toolsMap).length > 0 ? 
toolsMap : undefined, loadGeminiBatteries: isGoogleProvider(config.model.provider) && (config.toolsLevel === "standard" || config.toolsLevel === "full"), + loadPgBatteries: !!opts.storageMode, toolsLevel: config.toolsLevel, }); - // Set up LLM request handler for REPL IPC + // Set up LLM request handler for REPL IPC — pass storage for pg_* routes repl.onLLMRequest(async (request) => { - return handleLLMRequest( + const startMs = Date.now(); + const results = await handleLLMRequest( request, config, usage, abortController.signal, - geminiCounts + geminiCounts, + storage ); + // Record sub-calls to observability + if (recorder && request.request_type !== "llm_query" && request.request_type !== "llm_query_batched") { + recorder.recordSubCall( + 0, // iteration not available here; will be approximate + request.request_type, + request.prompts[0]?.slice(0, 200) ?? "", + Date.now() - startMs + ); + } + return results; }); - /** Cleanup timeout/REPL and build the final result. */ + /** Cleanup timeout/REPL/storage and build the final result. */ const finalize = async (answer: string, iterations: number): Promise => { clearTimeout(timeoutHandle); + // Record final observability event + if (recorder) { + recorder.recordFinal(answer, iterations, { + inputTokens: usage.inputTokens, + outputTokens: usage.outputTokens, + cachedTokens: usage.cacheReadTokens, + totalCost: usage.totalCost, + }); + } await repl.stop(); + if (storage) await storage.stop(); return buildResult(answer, usage, iterations, config, budget.getState().budgetHit, geminiCounts, repl.getGeminiBatteriesUsed()); }; @@ -275,6 +351,7 @@ export async function rlmLoop( } // Call LLM + const llmStartMs = Date.now(); const response = await llmComplete(messages, config.model, { signal: abortController.signal, cacheConfig, @@ -282,9 +359,20 @@ export async function rlmLoop( outputSchema: config.output.schema, geminiConfig: config.gemini, }); + const llmDurationMs = Date.now() - llmStartMs; mergeUsage(usage, response.usage); budget.record(response.usage.inputTokens, response.usage.outputTokens, response.usage.totalCost); + // Record LLM call to observability + if (recorder) { + recorder.recordLLMCall( + iteration, + { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens, cost: response.usage.totalCost }, + `${config.model.provider}/${config.model.model}`, + llmDurationMs + ); + } + // Track thought signatures for Gemini stats if (response.thoughtSignatureCount) { geminiCounts.thoughtSignatures += response.thoughtSignatureCount; @@ -355,7 +443,9 @@ export async function rlmLoop( logVerbose(iteration, `executing code (${block.code.length} chars)`); } + const execStartMs = Date.now(); const execResult = await repl.execute(block.code); + const execDurationMs = Date.now() - execStartMs; executions.push({ code: block.code, @@ -365,6 +455,14 @@ export async function rlmLoop( error: execResult.error, }); + // Record REPL execution to observability + if (recorder) { + recorder.recordReplExec( + iteration, block.code, execResult.stdout, execResult.stderr ?? "", + execDurationMs, !!execResult.error + ); + } + if (execResult.final) { return finalize(execResult.final.value, iteration + 1); } @@ -470,7 +568,9 @@ export async function rlmLoop( `rlmx: 3 consecutive empty LLM responses — aborting. 
Context may exceed API limits.\n` ); clearTimeout(timeoutHandle); + if (recorder) recorder.recordError("empty_responses"); await repl.stop(); + if (storage) await storage.stop(); return buildResult( "Error: aborted after 3 consecutive empty LLM responses. Context may exceed API token limits.", @@ -493,7 +593,9 @@ export async function rlmLoop( return finalize(forcedResult, actualIterations); } catch (err: unknown) { clearTimeout(timeoutHandle); + if (recorder) recorder.recordError(err instanceof Error ? err.message : String(err)); await repl.stop().catch(() => {}); + if (storage) await storage.stop().catch(() => {}); if ((err instanceof Error && err.name === "AbortError") || abortController.signal.aborted) { return buildResult( diff --git a/src/session.ts b/src/session.ts new file mode 100644 index 0000000..84ab26c --- /dev/null +++ b/src/session.ts @@ -0,0 +1,91 @@ +/** + * Session persistence — auto-save every rlmx run to ~/.rlmx/sessions//. + * + * Each session directory contains: + * meta.json — run metadata (runId, query, context, timestamp, version) + * usage.json — token usage and cost statistics + * answer.txt — final answer text + * config.yaml — snapshot of the RlmxConfig used + * trajectory.jsonl — copy of the JSONL log (if --log was specified) + */ + +import { mkdir, writeFile, copyFile, stat } from "node:fs/promises"; +import { join } from "node:path"; +import { homedir } from "node:os"; +import { createRequire } from "node:module"; +import yaml from "js-yaml"; + +/** Data required to persist a session. */ +export interface SessionData { + runId: string; + query: string; + contextPath: string | null; + model: string; + answer: string; + usage: { + inputTokens: number; + outputTokens: number; + cachedTokens: number; + totalCost: number; + iterations: number; + timeMs: number; + model: string; + }; + config: Record; + logPath: string | null; +} + +/** + * Save session artifacts to ~/.rlmx/sessions//. + * Returns the session directory path. + */ +export async function saveSession(data: SessionData): Promise { + const sessionsDir = join(homedir(), ".rlmx", "sessions", data.runId); + await mkdir(sessionsDir, { recursive: true }); + + // Read package version + let version = "unknown"; + try { + const require = createRequire(import.meta.url); + const pkg = require("../../package.json"); + version = pkg.version ?? "unknown"; + } catch { + // If we can't read the package, continue with "unknown" + } + + // 1. meta.json + const meta = { + runId: data.runId, + query: data.query, + contextPath: data.contextPath, + timestamp: new Date().toISOString(), + version, + }; + await writeFile(join(sessionsDir, "meta.json"), JSON.stringify(meta, null, 2) + "\n"); + + // 2. usage.json + await writeFile(join(sessionsDir, "usage.json"), JSON.stringify(data.usage, null, 2) + "\n"); + + // 3. answer.txt + await writeFile(join(sessionsDir, "answer.txt"), data.answer); + + // 4. config.yaml + const configYaml = yaml.dump(data.config, { lineWidth: 120, noRefs: true }); + await writeFile(join(sessionsDir, "config.yaml"), configYaml); + + // 5. 
trajectory.jsonl — copy from logPath if it exists, otherwise write empty file + const trajectoryPath = join(sessionsDir, "trajectory.jsonl"); + if (data.logPath) { + try { + await stat(data.logPath); + await copyFile(data.logPath, trajectoryPath); + } catch { + // Log file doesn't exist or can't be read — write empty + await writeFile(trajectoryPath, ""); + } + } else { + await writeFile(trajectoryPath, ""); + } + + return sessionsDir; +} diff --git a/src/stats.ts b/src/stats.ts new file mode 100644 index 0000000..f21c6e6 --- /dev/null +++ b/src/stats.ts @@ -0,0 +1,408 @@ +/** + * Stats query functions for rlmx observability data. + * + * Connects to the persistent pgserve data at ~/.rlmx/data to query + * rlmx_sessions and rlmx_events tables. Starts pgserve temporarily, + * queries, then stops. + */ + +import { existsSync } from "node:fs"; +import { homedir } from "node:os"; +import { PgStorage } from "./storage.js"; +import { loadConfig } from "./config.js"; + +/** A session row from rlmx_sessions */ +export interface SessionRow { + id: string; + query: string; + model: string; + provider: string; + status: string; + iterations: number | null; + input_tokens: number; + output_tokens: number; + total_cost: number; + started_at: string; + ended_at: string | null; + duration_ms: number | null; +} + +/** An event row from rlmx_events */ +export interface EventRow { + id: number; + iteration: number | null; + kind: string; + input_tokens: number | null; + output_tokens: number | null; + cost: number | null; + model: string | null; + code: string | null; + stdout: string | null; + stderr: string | null; + request_type: string | null; + prompt_preview: string | null; + duration_ms: number | null; + is_error: boolean; + error_message: string | null; + created_at: string; +} + +/** Cost breakdown row from v_cost_breakdown */ +export interface CostRow { + session_id: string; + model: string; + calls: number; + total_input: number; + total_output: number; + total_cost: number; + avg_duration_ms: number; +} + +/** Tool usage row from v_repl_usage */ +export interface ToolRow { + session_id: string; + request_type: string; + calls: number; + errors: number; + avg_duration_ms: number; +} + +/** + * Check if persistent data directory exists. + */ +export async function hasStatsData(): Promise { + const config = await loadConfig(process.cwd()); + const dataDir = config.storage.dataDir.replace(/^~/, homedir()); + return existsSync(dataDir); +} + +/** + * Parse a "since" duration string (e.g., "24h", "7d", "30m") into a JS Date cutoff. + */ +function parseSinceCutoff(since: string): Date { + const match = since.match(/^(\d+)([mhd])$/); + if (!match) throw new Error(`Invalid --since format "${since}". Use Nh, Nd, or Nm (e.g., 24h, 7d, 30m).`); + const num = parseInt(match[1], 10); + const unit = match[2]; + const msMap: Record = { m: 60_000, h: 3_600_000, d: 86_400_000 }; + return new Date(Date.now() - num * msMap[unit]); +} + +/** + * Create a temporary PgStorage connected to the configured data directory for querying stats. + */ +async function createStatsStorage(): Promise { + const config = await loadConfig(process.cwd()); + const storage = new PgStorage(); + await storage.start({ + ...config.storage, + mode: "persistent", + enabled: "always", + }); + return storage; +} + +/** + * List recent runs. 
+ */ +export async function listRuns(storage: PgStorage, limit = 20): Promise { + const rows = await storage.query( + `SELECT id, query, model, provider, status, iterations, + input_tokens::int, output_tokens::int, total_cost::float, + started_at::text, ended_at::text, + EXTRACT(EPOCH FROM (COALESCE(ended_at, now()) - started_at))::int * 1000 AS duration_ms + FROM rlmx_sessions + ORDER BY started_at DESC + LIMIT $1`, + [limit] + ); + return rows as SessionRow[]; +} + +/** + * Get events for a specific run. + */ +export async function getRun(storage: PgStorage, runId: string): Promise { + const rows = await storage.query( + `SELECT id, iteration, kind, input_tokens, output_tokens, cost::float, + model, code, stdout, stderr, request_type, prompt_preview, + duration_ms, is_error, error_message, created_at::text + FROM rlmx_events + WHERE session_id = $1 + ORDER BY id`, + [runId] + ); + return rows as EventRow[]; +} + +/** + * Get cost breakdown by model. + */ +export async function costBreakdown(storage: PgStorage, since?: string): Promise { + if (since) { + const cutoff = parseSinceCutoff(since); + const rows = await storage.query( + `SELECT e.session_id, e.model, + COUNT(*)::int AS calls, + SUM(e.input_tokens)::int AS total_input, + SUM(e.output_tokens)::int AS total_output, + SUM(e.cost)::float AS total_cost, + AVG(e.duration_ms)::int AS avg_duration_ms + FROM rlmx_events e + WHERE e.created_at >= $1 AND e.kind = 'llm_call' + GROUP BY e.session_id, e.model + ORDER BY total_cost DESC`, + [cutoff.toISOString()] + ); + return rows as CostRow[]; + } + const rows = await storage.query( + `SELECT e.session_id, e.model, + COUNT(*)::int AS calls, + SUM(e.input_tokens)::int AS total_input, + SUM(e.output_tokens)::int AS total_output, + SUM(e.cost)::float AS total_cost, + AVG(e.duration_ms)::int AS avg_duration_ms + FROM rlmx_events e + WHERE e.kind = 'llm_call' + GROUP BY e.session_id, e.model + ORDER BY total_cost DESC` + ); + return rows as CostRow[]; +} + +/** + * Get tool/sub-call usage. + */ +export async function toolUsage(storage: PgStorage, since?: string): Promise { + if (since) { + const cutoff = parseSinceCutoff(since); + const rows = await storage.query( + `SELECT e.session_id, e.request_type, + COUNT(*)::int AS calls, + SUM(CASE WHEN e.is_error THEN 1 ELSE 0 END)::int AS errors, + AVG(e.duration_ms)::int AS avg_duration_ms + FROM rlmx_events e + WHERE e.created_at >= $1 AND e.kind IN ('sub_call', 'pg_query', 'repl_exec') + GROUP BY e.session_id, e.request_type + ORDER BY calls DESC`, + [cutoff.toISOString()] + ); + return rows as ToolRow[]; + } + const rows = await storage.query( + `SELECT e.session_id, e.request_type, + COUNT(*)::int AS calls, + SUM(CASE WHEN e.is_error THEN 1 ELSE 0 END)::int AS errors, + AVG(e.duration_ms)::int AS avg_duration_ms + FROM rlmx_events e + WHERE e.kind IN ('sub_call', 'pg_query', 'repl_exec') + GROUP BY e.session_id, e.request_type + ORDER BY calls DESC` + ); + return rows as ToolRow[]; +} + +// ─── Formatting ───────────────────────────────────────── + +function pad(s: string, len: number, right = false): string { + return right ? s.padStart(len) : s.padEnd(len); +} + +function truncate(s: string, max: number): string { + return s.length > max ? s.slice(0, max - 1) + "…" : s; +} + +function formatCost(n: number): string { + return n < 0.01 ? 
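// The exported query helpers can also be driven programmatically against a started
// PgStorage pointed at the persistent data directory (storage assumed in scope):
import { listRuns, costBreakdown, formatRunsTable, formatCostTable } from "./stats.js";

const runs = await listRuns(storage, 10);         // ten most recent sessions
console.log(formatRunsTable(runs));

const costs = await costBreakdown(storage, "7d"); // per-session/model costs, last 7 days
console.log(formatCostTable(costs));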
`$${n.toFixed(6)}` : `$${n.toFixed(4)}`; +} + +function formatDuration(ms: number | null): string { + if (ms === null) return "-"; + if (ms < 1000) return `${ms}ms`; + return `${(ms / 1000).toFixed(1)}s`; +} + +/** + * Format session rows as a terminal table. + */ +export function formatRunsTable(rows: SessionRow[]): string { + if (rows.length === 0) return "No runs found."; + + const header = [ + pad("ID", 10), + pad("Query", 30), + pad("Model", 25), + pad("Iter", 5, true), + pad("Cost", 10, true), + pad("Status", 10), + pad("Duration", 10, true), + ].join(" "); + + const sep = "-".repeat(header.length); + + const lines = rows.map((r) => [ + pad(r.id.slice(0, 8) + "..", 10), + pad(truncate(r.query, 30), 30), + pad(truncate(r.model, 25), 25), + pad(r.iterations !== null ? String(r.iterations) : "-", 5, true), + pad(formatCost(r.total_cost), 10, true), + pad(r.status, 10), + pad(formatDuration(r.duration_ms), 10, true), + ].join(" ")); + + return [header, sep, ...lines].join("\n"); +} + +/** + * Format event rows as a terminal table. + */ +export function formatEventsTable(rows: EventRow[]): string { + if (rows.length === 0) return "No events found for this run."; + + const header = [ + pad("#", 4, true), + pad("Iter", 5, true), + pad("Kind", 12), + pad("In", 8, true), + pad("Out", 8, true), + pad("Cost", 10, true), + pad("Time", 8, true), + pad("Detail", 40), + ].join(" "); + + const sep = "-".repeat(header.length); + + const lines = rows.map((r) => { + let detail = ""; + if (r.kind === "llm_call") detail = r.model ?? ""; + else if (r.kind === "repl_exec") detail = truncate(r.code ?? "", 40); + else if (r.kind === "sub_call") detail = r.request_type ?? ""; + + return [ + pad(String(r.id), 4, true), + pad(r.iteration !== null ? String(r.iteration) : "-", 5, true), + pad(r.kind, 12), + pad(r.input_tokens !== null ? String(r.input_tokens) : "-", 8, true), + pad(r.output_tokens !== null ? String(r.output_tokens) : "-", 8, true), + pad(r.cost !== null ? formatCost(r.cost) : "-", 10, true), + pad(formatDuration(r.duration_ms), 8, true), + pad(truncate(detail, 40), 40), + ].join(" "); + }); + + return [header, sep, ...lines].join("\n"); +} + +/** + * Format cost breakdown as a terminal table. + */ +export function formatCostTable(rows: CostRow[]): string { + if (rows.length === 0) return "No cost data found."; + + const header = [ + pad("Model", 30), + pad("Calls", 6, true), + pad("Input", 10, true), + pad("Output", 10, true), + pad("Cost", 12, true), + pad("Avg Time", 10, true), + ].join(" "); + + const sep = "-".repeat(header.length); + + const lines = rows.map((r) => [ + pad(truncate(r.model, 30), 30), + pad(String(r.calls), 6, true), + pad(r.total_input.toLocaleString(), 10, true), + pad(r.total_output.toLocaleString(), 10, true), + pad(formatCost(r.total_cost), 12, true), + pad(formatDuration(r.avg_duration_ms), 10, true), + ].join(" ")); + + return [header, sep, ...lines].join("\n"); +} + +/** + * Format tool usage as a terminal table. + */ +export function formatToolTable(rows: ToolRow[]): string { + if (rows.length === 0) return "No tool usage data found."; + + const header = [ + pad("Type", 20), + pad("Calls", 6, true), + pad("Errors", 7, true), + pad("Avg Time", 10, true), + ].join(" "); + + const sep = "-".repeat(header.length); + + const lines = rows.map((r) => [ + pad(r.request_type ?? 
"-", 20), + pad(String(r.calls), 6, true), + pad(String(r.errors), 7, true), + pad(formatDuration(r.avg_duration_ms), 10, true), + ].join(" ")); + + return [header, sep, ...lines].join("\n"); +} + +/** + * Run the stats command with parsed args. + */ +export async function runStatsCommand(args: string[]): Promise { + // Check for data + if (!(await hasStatsData())) { + console.log("No stats yet. Run a query first."); + return; + } + + // Parse args + const runIdx = args.indexOf("--run"); + const runId = runIdx >= 0 ? args[runIdx + 1] : undefined; + const costsFlag = args.includes("--costs"); + const toolsFlag = args.includes("--tools"); + const sinceIdx = args.indexOf("--since"); + const since = sinceIdx >= 0 ? args[sinceIdx + 1] : undefined; + const outputIdx = args.indexOf("--output"); + const jsonOutput = outputIdx >= 0 && args[outputIdx + 1] === "json"; + + // Start temporary pgserve for querying + let storage: PgStorage | undefined; + try { + storage = await createStatsStorage(); + + if (runId) { + const events = await getRun(storage, runId); + if (jsonOutput) { + console.log(JSON.stringify(events, null, 2)); + } else { + console.log(formatEventsTable(events)); + } + } else if (costsFlag) { + const costs = await costBreakdown(storage, since); + if (jsonOutput) { + console.log(JSON.stringify(costs, null, 2)); + } else { + console.log(formatCostTable(costs)); + } + } else if (toolsFlag) { + const tools = await toolUsage(storage, since); + if (jsonOutput) { + console.log(JSON.stringify(tools, null, 2)); + } else { + console.log(formatToolTable(tools)); + } + } else { + const runs = await listRuns(storage); + if (jsonOutput) { + console.log(JSON.stringify(runs, null, 2)); + } else { + console.log(formatRunsTable(runs)); + } + } + } finally { + if (storage) await storage.stop(); + } +} diff --git a/src/storage.ts b/src/storage.ts new file mode 100644 index 0000000..bf07086 --- /dev/null +++ b/src/storage.ts @@ -0,0 +1,573 @@ +/** + * PgStorage — embedded pgserve lifecycle, context ingestion, and query interface. + * + * Spawns pgserve as a child process (Bun-based TCP proxy over embedded PostgreSQL), + * connects via node-postgres, and provides methods for context storage and retrieval. 
+ */ + +import { spawn, type ChildProcess } from "node:child_process"; +import { createServer } from "node:net"; +import { resolve, join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import { homedir } from "node:os"; +import { mkdirSync, existsSync } from "node:fs"; +import pg from "pg"; +import type { StorageConfig } from "./config.js"; +import type { LoadedContext, ContextItem } from "./context.js"; +import { PROVIDER_LIMITS } from "./cache.js"; + +const { Client } = pg; + +/** Database name used for rlmx context storage */ +const RLMX_DB = "rlmx"; + +/** Schema DDL for the records table */ +const SCHEMA_DDL = ` +CREATE TABLE IF NOT EXISTS records ( + line_num INT PRIMARY KEY, + timestamp TIMESTAMPTZ, + type TEXT, + source TEXT, + session_id TEXT, + content TEXT NOT NULL, + content_tsvector TSVECTOR GENERATED ALWAYS AS (to_tsvector('english', content)) STORED +); + +CREATE INDEX IF NOT EXISTS idx_records_tsvector ON records USING GIN (content_tsvector); +CREATE INDEX IF NOT EXISTS idx_records_timestamp ON records (timestamp) WHERE timestamp IS NOT NULL; +`; + +/** Schema DDL for observability tables */ +const OBSERVABILITY_DDL = ` +CREATE TABLE IF NOT EXISTS rlmx_sessions ( + id TEXT PRIMARY KEY, + query TEXT NOT NULL, + context_path TEXT, + model TEXT NOT NULL, + provider TEXT NOT NULL, + status TEXT DEFAULT 'running', + config JSONB, + started_at TIMESTAMPTZ DEFAULT now(), + ended_at TIMESTAMPTZ, + iterations INT, + input_tokens BIGINT DEFAULT 0, + output_tokens BIGINT DEFAULT 0, + cached_tokens BIGINT DEFAULT 0, + total_cost NUMERIC(10,6) DEFAULT 0, + answer_length INT, + budget_hit TEXT +); + +CREATE TABLE IF NOT EXISTS rlmx_events ( + id BIGSERIAL PRIMARY KEY, + session_id TEXT REFERENCES rlmx_sessions(id), + iteration INT, + kind TEXT NOT NULL, + input_tokens INT, + output_tokens INT, + cost NUMERIC(10,6), + model TEXT, + code TEXT, + stdout TEXT, + stderr TEXT, + request_type TEXT, + prompt_preview TEXT, + duration_ms INT, + is_error BOOLEAN DEFAULT false, + error_message TEXT, + data JSONB, + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE OR REPLACE VIEW v_cost_breakdown AS +SELECT session_id, model, + COUNT(*) AS calls, + SUM(input_tokens) AS total_input, + SUM(output_tokens) AS total_output, + SUM(cost) AS total_cost, + AVG(duration_ms)::INT AS avg_duration_ms +FROM rlmx_events WHERE kind = 'llm_call' +GROUP BY session_id, model; + +CREATE OR REPLACE VIEW v_repl_usage AS +SELECT session_id, request_type, + COUNT(*) AS calls, + SUM(CASE WHEN is_error THEN 1 ELSE 0 END) AS errors, + AVG(duration_ms)::INT AS avg_duration_ms +FROM rlmx_events WHERE kind IN ('sub_call', 'pg_query', 'repl_exec') +GROUP BY session_id, request_type; +`; + +/** Resolve ~ in paths to the user's home directory */ +function expandHome(p: string): string { + if (p.startsWith("~/") || p === "~") { + return join(homedir(), p.slice(1)); + } + return p; +} + +/** + * Find a free TCP port by binding to port 0 and reading the assigned port. + */ +async function findFreePort(): Promise { + return new Promise((resolve, reject) => { + const srv = createServer(); + srv.listen(0, "127.0.0.1", () => { + const addr = srv.address(); + if (addr && typeof addr === "object") { + const port = addr.port; + srv.close(() => resolve(port)); + } else { + srv.close(() => reject(new Error("Failed to get free port"))); + } + }); + srv.on("error", reject); + }); +} + +/** + * Compute adaptive chunk size based on provider limits and storage config. 
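// The two views defined above can be read back through PgStorage.query(); a sketch
// with a hypothetical session id (storage assumed started):
const breakdown = await storage.query(
  "SELECT model, calls, total_cost FROM v_cost_breakdown WHERE session_id = $1",
  [runId]
);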
+ */ +export function getChunkSize(provider: string, config: StorageConfig): number { + if (config.chunkSize) return config.chunkSize; + const limit = PROVIDER_LIMITS[provider] ?? 128000; + return Math.floor(limit * config.chunkUtilization * config.charsPerToken); +} + +/** + * PgStorage manages an embedded pgserve instance for large context handling. + */ +export class PgStorage { + private process: ChildProcess | null = null; + private client: InstanceType | null = null; + private port = 0; + private stopping = false; + private cleanupRegistered = false; + + /** Connection string for the running pgserve instance */ + get connectionString(): string { + return `postgresql://localhost:${this.port}/${RLMX_DB}`; + } + + /** Get the underlying pg Client (for observability recorder). */ + getClient(): InstanceType | null { + return this.client; + } + + /** + * Start pgserve and connect to it. + * Returns the connection string once ready. + */ + async start(config: StorageConfig): Promise { + // Resolve pgserve binary path + const pgserveBin = this.findPgserveBin(); + + // Build CLI args + const args: string[] = []; + + // Port: 0 means auto-assign a free port; otherwise use the specified port + const requestedPort = config.port === 0 ? await findFreePort() : config.port; + args.push("--port", String(requestedPort)); + + // Mode: persistent uses dataDir, memory uses temp + if (config.mode === "persistent") { + const dataDir = expandHome(config.dataDir); + mkdirSync(dataDir, { recursive: true }); + args.push("--data", dataDir); + } + // memory mode: no --data flag = in-memory (pgserve default) + + // Quiet output for embedded use + args.push("--log", "error"); + args.push("--no-cluster"); + args.push("--no-stats"); + + // Spawn pgserve + this.process = spawn(pgserveBin, args, { + stdio: ["ignore", "pipe", "pipe"], + detached: false, + }); + + // Register cleanup handlers (only once per process) + if (!this.cleanupRegistered) { + this.registerCleanup(); + this.cleanupRegistered = true; + } + + // Wait for pgserve to be ready by polling for connection + this.port = requestedPort; + await this.waitForReady(); + + // Connect pg client and create schema + this.client = new Client({ connectionString: this.connectionString }); + await this.client.connect(); + await this.client.query(SCHEMA_DDL); + await this.client.query(OBSERVABILITY_DDL); + + return this.connectionString; + } + + /** + * Ingest a loaded context into the records table. + * For JSONL: parses each line as JSON, extracts timestamp/type fields. + * For other text: one record per line. 
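// Worked example of the adaptive chunk size: with the default utilization (0.6) and
// chars-per-token (4), an Anthropic-sized window (200,000 tokens in PROVIDER_LIMITS)
// yields floor(200000 * 0.6 * 4) = 480,000 characters per chunk.
import { getChunkSize } from "./storage.js";
import { DEFAULT_STORAGE_CONFIG } from "./config.js";

const chunkChars = getChunkSize("anthropic", DEFAULT_STORAGE_CONFIG); // 480000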
+ */ + async ingest(context: LoadedContext, sessionId?: string): Promise { + if (!this.client) throw new Error("PgStorage not started"); + + // Clear stale records so re-runs with different context don't keep old data + await this.client.query("TRUNCATE records"); + + // Collect all lines from context, tracking source file when available + let lines: Array<{ text: string; source: string | null }>; + if (context.type === "list") { + const items = context.content as ContextItem[]; + lines = items.flatMap((item) => + item.content.split("\n").map((text) => ({ text, source: item.path })) + ); + } else { + lines = (context.content as string) + .split("\n") + .map((text) => ({ text, source: null })); + } + + // Batch insert using multi-row VALUES + const BATCH_SIZE = 500; + let ingested = 0; + + for (let i = 0; i < lines.length; i += BATCH_SIZE) { + const batch = lines.slice(i, i + BATCH_SIZE); + const values: unknown[] = []; + const placeholders: string[] = []; + + for (let j = 0; j < batch.length; j++) { + const lineNum = i + j; + const { text, source } = batch[j]; + if (text.trim() === "") continue; + + const { timestamp, type, content } = parseLine(text); + const idx = values.length; + placeholders.push( + `($${idx + 1}, $${idx + 2}, $${idx + 3}, $${idx + 4}, $${idx + 5}, $${idx + 6})` + ); + values.push(lineNum, timestamp, type, source, sessionId ?? null, content); + } + + if (placeholders.length > 0) { + await this.client.query( + `INSERT INTO records (line_num, timestamp, type, source, session_id, content) + VALUES ${placeholders.join(", ")}`, + values + ); + ingested += placeholders.length; + } + } + + return ingested; + } + + /** + * Full-text search via tsvector. + */ + async search( + pattern: string, + limit = 20 + ): Promise> { + if (!this.client) throw new Error("PgStorage not started"); + + // Convert pattern to tsquery: split words and join with & + const tsquery = pattern + .trim() + .split(/\s+/) + .map((w) => w.replace(/[^a-zA-Z0-9]/g, "")) + .filter(Boolean) + .join(" & "); + + if (!tsquery) return []; + + const result = await this.client.query( + `SELECT line_num, source, content, ts_rank(content_tsvector, to_tsquery('english', $1)) AS rank + FROM records + WHERE content_tsvector @@ to_tsquery('english', $1) + ORDER BY rank DESC + LIMIT $2`, + [tsquery, limit] + ); + + return result.rows.map((r) => ({ + line_num: r.line_num, + source: r.source, + content: r.content, + rank: parseFloat(r.rank), + })); + } + + /** + * Get records by line number range (inclusive). + */ + async slice( + start: number, + end: number + ): Promise> { + if (!this.client) throw new Error("PgStorage not started"); + + const result = await this.client.query( + `SELECT line_num, source, content FROM records + WHERE line_num >= $1 AND line_num < $2 + ORDER BY line_num`, + [start, end] + ); + + return result.rows; + } + + /** + * Filter records by timestamp range. + */ + async timeRange( + from: string, + to: string + ): Promise< + Array<{ line_num: number; timestamp: string; content: string }> + > { + if (!this.client) throw new Error("PgStorage not started"); + + const result = await this.client.query( + `SELECT line_num, timestamp, content FROM records + WHERE timestamp >= $1::timestamptz AND timestamp <= $2::timestamptz + ORDER BY timestamp`, + [from, to] + ); + + return result.rows; + } + + /** + * Count total records. 
+ */ + async count(): Promise { + if (!this.client) throw new Error("PgStorage not started"); + const result = await this.client.query("SELECT COUNT(*)::int AS cnt FROM records"); + return result.rows[0].cnt; + } + + /** + * Execute raw SQL (read-only). Supports parameterized queries. + */ + async query(sql: string, params?: unknown[]): Promise { + if (!this.client) throw new Error("PgStorage not started"); + + // Wrap in read-only transaction for safety + await this.client.query("BEGIN TRANSACTION READ ONLY"); + try { + const result = params + ? await this.client.query(sql, params) + : await this.client.query(sql); + await this.client.query("COMMIT"); + return result.rows; + } catch (err) { + await this.client.query("ROLLBACK").catch(() => {}); + throw err; + } + } + + /** + * Stop pgserve: graceful 3s timeout, then SIGKILL. + */ + async stop(): Promise { + if (this.stopping) return; + this.stopping = true; + + // Close pg client + if (this.client) { + try { + await this.client.end(); + } catch { + // Ignore client close errors + } + this.client = null; + } + + // Kill pgserve process + const proc = this.process; + if (proc && proc.pid && !proc.killed) { + await new Promise((resolve) => { + const forceKillTimer = setTimeout(() => { + try { + proc.kill("SIGKILL"); + } catch { + // Already dead + } + resolve(); + }, 3000); + + proc.once("exit", () => { + clearTimeout(forceKillTimer); + resolve(); + }); + + try { + proc.kill("SIGTERM"); + } catch { + clearTimeout(forceKillTimer); + resolve(); + } + }); + } + + this.process = null; + this.stopping = false; + } + + // ─── Private helpers ────────────────────────────────────── + + /** Find the pgserve CLI binary in node_modules */ + private findPgserveBin(): string { + // Try package-relative first (works for global installs), then cwd fallback + const __dirname = dirname(fileURLToPath(import.meta.url)); + const pkgBin = join(__dirname, "..", "..", "node_modules", ".bin", "pgserve"); + if (existsSync(pkgBin)) return pkgBin; + + // Fallback: resolve from cwd (works for local dev) + return resolve("node_modules", ".bin", "pgserve"); + } + + /** + * Wait for pgserve to accept connections (poll with backoff). + * Throws if not ready within 10 seconds. + */ + private async waitForReady(): Promise { + const deadline = Date.now() + 10_000; + let lastError: Error | null = null; + + // Also capture process errors + const procError = new Promise((_, reject) => { + if (!this.process) return; + this.process.once("exit", (code) => { + if (code !== null && code !== 0) { + reject(new Error(`pgserve exited with code ${code}`)); + } + }); + this.process.once("error", (err) => { + reject(new Error(`pgserve spawn error: ${err.message}`)); + }); + }); + + while (Date.now() < deadline) { + // Fail fast if the process already exited + if (this.process && typeof this.process.exitCode === 'number') { + throw new Error(`pgserve exited with code ${this.process.exitCode} before becoming ready`); + } + + try { + const testClient = new Client({ + connectionString: this.connectionString, + connectionTimeoutMillis: 1000, + }); + await Promise.race([testClient.connect(), procError]); + await testClient.end(); + return; + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + await new Promise((r) => setTimeout(r, 200)); + } + } + + throw new Error( + `pgserve did not become ready within 10s: ${lastError?.message ?? 
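// End-to-end lifecycle sketch for PgStorage: start an in-memory instance, ingest a
// loaded context, query it, and shut down. `context` is assumed to come from
// loadContextFromDir; the search pattern is hypothetical.
import { PgStorage } from "./storage.js";
import { DEFAULT_STORAGE_CONFIG } from "./config.js";

const storage = new PgStorage();
await storage.start({ ...DEFAULT_STORAGE_CONFIG, mode: "memory" });

const records = await storage.ingest(context);              // one row per non-empty line
const hits = await storage.search("deployment failed", 5);  // tsvector full-text search
const head = await storage.slice(0, 50);                    // rows with line_num 0..49
const total = await storage.count();

await storage.stop();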
"unknown error"}` + ); + } + + /** Register process exit handlers for cleanup */ + private registerCleanup(): void { + const cleanup = () => { + if (this.process && this.process.pid && !this.process.killed) { + try { + this.process.kill("SIGKILL"); + } catch { + // Best effort + } + } + }; + + process.once("exit", cleanup); + process.once("SIGTERM", () => { + cleanup(); + process.exit(128 + 15); + }); + process.once("SIGINT", () => { + cleanup(); + process.exit(128 + 2); + }); + process.once("uncaughtException", (err) => { + console.error("rlmx: uncaught exception, cleaning up pgserve:", err.message); + cleanup(); + process.exit(1); + }); + } +} + +// ─── Line parsing ─────────────────────────────────────── + +/** Common timestamp field names in JSONL data */ +const TIMESTAMP_FIELDS = [ + "timestamp", + "ts", + "time", + "datetime", + "date", + "created_at", + "createdAt", + "@timestamp", +]; + +/** Common type/category field names in JSONL data */ +const TYPE_FIELDS = ["type", "kind", "level", "severity", "category", "event"]; + +/** + * Parse a single line of context data. + * Tries JSON first (JSONL), falls back to plain text. + */ +function parseLine(line: string): { + timestamp: string | null; + type: string | null; + content: string; +} { + const trimmed = line.trim(); + if (!trimmed.startsWith("{")) { + return { timestamp: null, type: null, content: trimmed }; + } + + try { + const obj = JSON.parse(trimmed); + if (typeof obj !== "object" || obj === null) { + return { timestamp: null, type: null, content: trimmed }; + } + + // Extract timestamp + let timestamp: string | null = null; + for (const field of TIMESTAMP_FIELDS) { + if (obj[field] !== undefined && obj[field] !== null) { + timestamp = String(obj[field]); + break; + } + } + + // Extract type + let type: string | null = null; + for (const field of TYPE_FIELDS) { + if (obj[field] !== undefined && obj[field] !== null) { + type = String(obj[field]); + break; + } + } + + return { timestamp, type, content: trimmed }; + } catch { + // Malformed JSON — log warning and treat as plain text + process.stderr.write( + `rlmx: warning: skipping malformed JSONL line: ${trimmed.slice(0, 80)}...\n` + ); + return { timestamp: null, type: null, content: trimmed }; + } +} diff --git a/src/version.ts b/src/version.ts index aa3187d..db853f1 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1 +1 @@ -export const VERSION = '0.260330.13'; +export const VERSION = '0.260331.1'; diff --git a/tests/benchmark.test.ts b/tests/benchmark.test.ts new file mode 100644 index 0000000..b3f604b --- /dev/null +++ b/tests/benchmark.test.ts @@ -0,0 +1,302 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import { mkdtemp, readFile, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; + +import { + formatBenchmarkTable, + saveBenchmarkResults, + aggregateTotals, + calculateSavings, + calculateCostSavings, + type BenchmarkQuestion, + type BenchmarkRunResult, + type BenchmarkResults, +} from "../src/benchmark.js"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +async function loadDataset(): Promise { + // From dist/tests/ -> ../../src/benchmark-data.json + const jsonPath = join(__dirname, "..", "..", "src", "benchmark-data.json"); + const raw = await readFile(jsonPath, "utf-8"); + return JSON.parse(raw) as BenchmarkQuestion[]; +} + +// ─── Dataset Validation ────────────────────────────────── + 
+describe("benchmark-data.json", () => { + it("has valid structure for all entries", async () => { + const dataset = await loadDataset(); + + assert.ok(Array.isArray(dataset), "dataset should be an array"); + assert.ok(dataset.length >= 5, `dataset should have at least 5 entries, got ${dataset.length}`); + + for (const entry of dataset) { + assert.ok(typeof entry.id === "string" && entry.id.length > 0, `entry must have non-empty id`); + assert.ok(typeof entry.name === "string" && entry.name.length > 0, `entry ${entry.id} must have non-empty name`); + assert.ok(typeof entry.question === "string" && entry.question.length > 0, `entry ${entry.id} must have non-empty question`); + assert.ok(typeof entry.context === "string" && entry.context.length > 0, `entry ${entry.id} must have non-empty context`); + assert.ok(typeof entry.category === "string" && entry.category.length > 0, `entry ${entry.id} must have non-empty category`); + } + }); + + it("covers required categories", async () => { + const dataset = await loadDataset(); + + const categories = new Set(dataset.map((d) => d.category)); + assert.ok(categories.has("extraction"), "should have extraction category"); + assert.ok(categories.has("summarization"), "should have summarization category"); + assert.ok(categories.has("reasoning"), "should have reasoning category"); + assert.ok(categories.has("comparison"), "should have comparison category"); + assert.ok(categories.has("synthesis"), "should have synthesis category"); + }); + + it("has unique ids", async () => { + const dataset = await loadDataset(); + + const ids = dataset.map((d) => d.id); + const uniqueIds = new Set(ids); + assert.equal(ids.length, uniqueIds.size, "all ids must be unique"); + }); +}); + +// ─── Savings Calculation ───────────────────────────────── + +describe("calculateSavings", () => { + it("computes correct percentage", () => { + assert.equal(calculateSavings(100, 70), 30); + assert.equal(calculateSavings(200, 100), 50); + }); + + it("returns 0 for zero direct tokens", () => { + assert.equal(calculateSavings(0, 100), 0); + }); + + it("handles equal tokens", () => { + assert.equal(calculateSavings(100, 100), 0); + }); + + it("handles RLM using more tokens (negative savings)", () => { + const result = calculateSavings(100, 150); + assert.ok(result < 0, "savings should be negative when RLM uses more"); + assert.equal(result, -50); + }); +}); + +describe("calculateCostSavings", () => { + it("computes correct percentage", () => { + const result = calculateCostSavings(1.0, 0.7); + assert.ok(Math.abs(result - 30) < 0.001, `expected ~30, got ${result}`); + }); + + it("returns 0 for zero direct cost", () => { + assert.equal(calculateCostSavings(0, 0.5), 0); + }); + + it("handles equal cost", () => { + assert.equal(calculateCostSavings(0.5, 0.5), 0); + }); +}); + +// ─── Table Formatting ──────────────────────────────────── + +function makeMockResults(): BenchmarkResults { + return { + timestamp: "2025-01-15T10:30:00.000Z", + mode: "cost", + model: "google/gemini-2.0-flash", + runs: [ + { + questionId: "cost-001", + questionName: "API extraction", + direct: { + tokens_input: 10000, + tokens_output: 2450, + cost: 0.0012, + latency_ms: 2300, + answer: "Direct answer", + }, + rlm: { + tokens_input: 6000, + tokens_output: 2200, + cost: 0.0008, + latency_ms: 4100, + iterations: 3, + answer: "RLM answer", + }, + savings: { + tokens_pct: 34.1, + cost_pct: 33.3, + }, + }, + { + questionId: "cost-002", + questionName: "Summary test", + direct: { + tokens_input: 20000, + tokens_output: 
5000, + cost: 0.0025, + latency_ms: 3500, + answer: "Direct summary", + }, + rlm: { + tokens_input: 12000, + tokens_output: 4000, + cost: 0.0016, + latency_ms: 6200, + iterations: 5, + answer: "RLM summary", + }, + savings: { + tokens_pct: 36.0, + cost_pct: 36.0, + }, + }, + ], + totals: { + direct: { tokens: 37450, cost: 0.0037, latency_ms: 5800 }, + rlm: { tokens: 24200, cost: 0.0024, latency_ms: 10300, avg_iterations: 4.0 }, + savings: { tokens_pct: 35.4, cost_pct: 35.1 }, + }, + }; +} + +describe("formatBenchmarkTable", () => { + it("contains expected header", () => { + const table = formatBenchmarkTable(makeMockResults()); + assert.ok(table.includes("rlmx benchmark"), "should contain header"); + assert.ok(table.includes("cost comparison"), "should contain mode label"); + assert.ok(table.includes("Question"), "should contain Question column"); + assert.ok(table.includes("Mode"), "should contain Mode column"); + assert.ok(table.includes("Tokens"), "should contain Tokens column"); + assert.ok(table.includes("Cost"), "should contain Cost column"); + assert.ok(table.includes("Latency"), "should contain Latency column"); + assert.ok(table.includes("Iters"), "should contain Iters column"); + }); + + it("contains row data for each run", () => { + const table = formatBenchmarkTable(makeMockResults()); + assert.ok(table.includes("API extraction"), "should contain first question name"); + assert.ok(table.includes("Summary test"), "should contain second question name"); + assert.ok(table.includes("Direct"), "should contain Direct mode"); + assert.ok(table.includes("RLM"), "should contain RLM mode"); + assert.ok(table.includes("Savings"), "should contain Savings mode"); + }); + + it("contains box-drawing characters", () => { + const table = formatBenchmarkTable(makeMockResults()); + assert.ok(table.includes("┌"), "should contain top-left corner"); + assert.ok(table.includes("┐"), "should contain top-right corner"); + assert.ok(table.includes("└"), "should contain bottom-left corner"); + assert.ok(table.includes("┘"), "should contain bottom-right corner"); + assert.ok(table.includes("│"), "should contain vertical bar"); + assert.ok(table.includes("─"), "should contain horizontal bar"); + assert.ok(table.includes("├"), "should contain left tee"); + assert.ok(table.includes("┤"), "should contain right tee"); + }); + + it("contains TOTALS row", () => { + const table = formatBenchmarkTable(makeMockResults()); + assert.ok(table.includes("TOTALS"), "should contain TOTALS label"); + }); + + it("uses oolong label for oolong mode", () => { + const results = makeMockResults(); + results.mode = "oolong"; + const table = formatBenchmarkTable(results); + assert.ok(table.includes("oolong accuracy"), "should contain oolong mode label"); + }); +}); + +// ─── Aggregation ───────────────────────────────────────── + +describe("aggregateTotals", () => { + it("sums tokens and costs correctly", () => { + const runs: BenchmarkRunResult[] = [ + { + questionId: "q1", + questionName: "Q1", + direct: { tokens_input: 100, tokens_output: 50, cost: 0.01, latency_ms: 1000, answer: "" }, + rlm: { tokens_input: 60, tokens_output: 30, cost: 0.006, latency_ms: 2000, iterations: 2, answer: "" }, + savings: { tokens_pct: 40, cost_pct: 40 }, + }, + { + questionId: "q2", + questionName: "Q2", + direct: { tokens_input: 200, tokens_output: 100, cost: 0.02, latency_ms: 1500, answer: "" }, + rlm: { tokens_input: 120, tokens_output: 80, cost: 0.012, latency_ms: 3000, iterations: 4, answer: "" }, + savings: { tokens_pct: 33.3, cost_pct: 40 }, 
+      },
+    ];
+
+    const totals = aggregateTotals(runs);
+
+    assert.equal(totals.direct.tokens, 450, "direct tokens: 100+50+200+100");
+    assert.equal(totals.rlm.tokens, 290, "rlm tokens: 60+30+120+80");
+    assert.equal(totals.direct.latency_ms, 2500, "direct latency sum");
+    assert.equal(totals.rlm.latency_ms, 5000, "rlm latency sum");
+    assert.equal(totals.rlm.avg_iterations, 3, "avg iterations: (2+4)/2");
+
+    // Check cost sums (approximate comparison due to floating point)
+    assert.ok(Math.abs(totals.direct.cost - 0.03) < 0.0001, "direct cost should be ~0.03");
+    assert.ok(Math.abs(totals.rlm.cost - 0.018) < 0.0001, "rlm cost should be ~0.018");
+  });
+
+  it("handles empty runs", () => {
+    const totals = aggregateTotals([]);
+    assert.equal(totals.direct.tokens, 0);
+    assert.equal(totals.rlm.tokens, 0);
+    assert.equal(totals.direct.cost, 0);
+    assert.equal(totals.rlm.cost, 0);
+    assert.equal(totals.rlm.avg_iterations, 0);
+    assert.equal(totals.savings.tokens_pct, 0);
+    assert.equal(totals.savings.cost_pct, 0);
+  });
+
+  it("computes savings percentages from aggregated totals", () => {
+    const runs: BenchmarkRunResult[] = [
+      {
+        questionId: "q1",
+        questionName: "Q1",
+        direct: { tokens_input: 100, tokens_output: 0, cost: 0.10, latency_ms: 100, answer: "" },
+        rlm: { tokens_input: 50, tokens_output: 0, cost: 0.05, latency_ms: 200, iterations: 1, answer: "" },
+        savings: { tokens_pct: 50, cost_pct: 50 },
+      },
+    ];
+
+    const totals = aggregateTotals(runs);
+    assert.equal(totals.savings.tokens_pct, 50);
+    assert.equal(totals.savings.cost_pct, 50);
+  });
+});
+
+// ─── Results Saving ──────────────────────────────────────
+
+describe("saveBenchmarkResults", () => {
+  it("writes results as parseable JSON file", async () => {
+    const tempDir = await mkdtemp(join(tmpdir(), "rlmx-bench-test-"));
+    // Override homedir by setting env for the save call
+    const origHome = process.env.HOME;
+    process.env.HOME = tempDir;
+
+    try {
+      const results = makeMockResults();
+      const savedPath = await saveBenchmarkResults(results);
+
+      assert.ok(savedPath.endsWith(".json"), "saved file should be .json");
+      assert.ok(savedPath.includes("benchmark-cost"), "saved file should include mode");
+
+      const content = await readFile(savedPath, "utf-8");
+      const parsed = JSON.parse(content);
+      assert.equal(parsed.mode, "cost");
+      assert.equal(parsed.runs.length, 2);
+      assert.equal(parsed.model, "google/gemini-2.0-flash");
+    } finally {
+      process.env.HOME = origHome;
+      await rm(tempDir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/tests/cache.test.ts b/tests/cache.test.ts
index b4848c9..8a845b0 100644
--- a/tests/cache.test.ts
+++ b/tests/cache.test.ts
@@ -42,6 +42,7 @@ function makeConfig(overrides: Partial<RlmxConfig> = {}): RlmxConfig {
     configSource: overrides.configSource ?? "yaml",
     gemini: overrides.gemini ?? { thinkingLevel: null, googleSearch: false, urlContext: false, codeExecution: false, computerUse: false, mapsGrounding: false, fileSearch: false, mediaResolution: null },
     output: overrides.output ?? { schema: null },
+    storage: overrides.storage ?? { enabled: "auto", mode: "persistent", dataDir: "~/.rlmx/data", port: 0, chunkSize: null, chunkUtilization: 0.6, charsPerToken: 4 },
   };
 }
diff --git a/tests/context.test.ts b/tests/context.test.ts
index 2cf0b48..bcd8d14 100644
--- a/tests/context.test.ts
+++ b/tests/context.test.ts
@@ -5,6 +5,7 @@ import { join } from "node:path";
 import { tmpdir } from "node:os";
 import { loadContext, loadContextFromDir, loadContextFromFile } from "../src/context.js";
 import type { ContextItem } from "../src/context.js";
+import { loadConfig } from "../src/config.js";
 
 describe("context loading", () => {
   let dir: string;
@@ -105,4 +106,66 @@ describe("context loading", () => {
     assert.equal(items.length, 0);
     await rm(dir, { recursive: true });
   });
+
+  it("extensions filter loads only matching types (regression #28)", async () => {
+    dir = await mkdtemp(join(tmpdir(), "rlmx-ctx-"));
+    await writeFile(join(dir, "readme.md"), "markdown");
+    await writeFile(join(dir, "app.ts"), "typescript");
+    await writeFile(join(dir, "doc.mdx"), "mdx content");
+    await writeFile(join(dir, "data.json"), '{"key": "value"}');
+    await writeFile(join(dir, "style.css"), "body {}");
+
+    const ctx = await loadContextFromDir(dir, { extensions: [".mdx", ".json"] });
+    const items = ctx.content as ContextItem[];
+    const paths = items.map((i) => i.path).sort();
+
+    assert.equal(items.length, 2, `Expected 2 files but got ${items.length}: ${paths.join(", ")}`);
+    assert.deepEqual(paths, ["data.json", "doc.mdx"]);
+    await rm(dir, { recursive: true });
+  });
+
+  it("extensions without leading dots are normalized (regression #28)", async () => {
+    dir = await mkdtemp(join(tmpdir(), "rlmx-ctx-"));
+    await writeFile(join(dir, "readme.md"), "markdown");
+    await writeFile(join(dir, "app.ts"), "typescript");
+    await writeFile(join(dir, "doc.mdx"), "mdx content");
+    await writeFile(join(dir, "data.json"), '{"key": "value"}');
+
+    // Pass extensions WITHOUT leading dots — should still work
+    const ctx = await loadContextFromDir(dir, { extensions: ["mdx", "json"] });
+    const items = ctx.content as ContextItem[];
+    const paths = items.map((i) => i.path).sort();
+
+    assert.equal(items.length, 2, `Expected 2 files but got ${items.length}: ${paths.join(", ")}`);
+    assert.deepEqual(paths, ["data.json", "doc.mdx"]);
+    await rm(dir, { recursive: true });
+  });
+
+  it("yaml context.extensions propagates to config correctly (regression #28)", async () => {
+    dir = await mkdtemp(join(tmpdir(), "rlmx-ctx-"));
+    // Create an rlmx.yaml with custom extensions (without dots, as users often write)
+    await writeFile(
+      join(dir, "rlmx.yaml"),
+      "context:\n extensions: [mdx, json]\n"
+    );
+
+    const config = await loadConfig(dir);
+    // Extensions should be normalized to have leading dots
+    assert.deepEqual(config.contextConfig.extensions, [".mdx", ".json"]);
+    // Exclude should fall back to defaults
+    assert.ok(config.contextConfig.exclude.includes("node_modules"));
+    await rm(dir, { recursive: true });
+  });
+
+  it("yaml context.extensions with dots propagates correctly (regression #28)", async () => {
+    dir = await mkdtemp(join(tmpdir(), "rlmx-ctx-"));
+    await writeFile(
+      join(dir, "rlmx.yaml"),
+      "context:\n extensions: [.mdx, .json]\n"
+    );
+
+    const config = await loadConfig(dir);
+    assert.deepEqual(config.contextConfig.extensions, [".mdx", ".json"]);
+    await rm(dir, { recursive: true });
+  });
 });
diff --git a/tests/session.test.ts b/tests/session.test.ts
new file mode 100644
index 0000000..a05d786
--- /dev/null
+++ b/tests/session.test.ts
@@ -0,0 +1,156 @@
+import { describe, it, before, after } from "node:test";
+import assert from "node:assert/strict";
+import { rm, mkdir, readFile, writeFile, stat } from "node:fs/promises";
+import { join } from "node:path";
+import { tmpdir } from "node:os";
+import yaml from "js-yaml";
+import { saveSession, type SessionData } from "../src/session.js";
+
+function makeSessionData(overrides?: Partial<SessionData>): SessionData {
+  return {
+    runId: "test-run-id-1234",
+    query: "What is the meaning of life?",
+    contextPath: "/tmp/context",
+    model: "google/gemini-2.0-flash",
+    answer: "The answer is 42.",
+    usage: {
+      inputTokens: 1000,
+      outputTokens: 500,
+      cachedTokens: 200,
+      totalCost: 0.0015,
+      iterations: 3,
+      timeMs: 4500,
+      model: "google/gemini-2.0-flash",
+    },
+    config: {
+      model: { provider: "google", model: "gemini-2.0-flash" },
+      toolsLevel: "core",
+      budget: { maxCost: 1.0, maxTokens: 100000 },
+    },
+    logPath: null,
+    ...overrides,
+  };
+}
+
+describe("saveSession", () => {
+  const testDir = join(tmpdir(), `rlmx-session-test-${Date.now()}`);
+  const originalHome = process.env.HOME;
+
+  before(async () => {
+    await mkdir(testDir, { recursive: true });
+    process.env.HOME = testDir;
+  });
+
+  after(async () => {
+    process.env.HOME = originalHome;
+    await rm(testDir, { recursive: true, force: true });
+  });
+
+  it("creates session directory and returns its path", async () => {
+    const data = makeSessionData();
+    const sessionDir = await saveSession(data);
+    assert.ok(sessionDir.includes(data.runId));
+    const dirStat = await stat(sessionDir);
+    assert.ok(dirStat.isDirectory());
+  });
+
+  it("writes all 5 expected files", async () => {
+    const data = makeSessionData({ runId: "test-all-files" });
+    const sessionDir = await saveSession(data);
+
+    const files = ["meta.json", "usage.json", "answer.txt", "config.yaml", "trajectory.jsonl"];
+    for (const file of files) {
+      const fileStat = await stat(join(sessionDir, file));
+      assert.ok(fileStat.isFile(), `Expected ${file} to exist`);
+    }
+  });
+
+  it("meta.json has correct fields", async () => {
+    const data = makeSessionData({ runId: "test-meta-fields" });
+    const sessionDir = await saveSession(data);
+
+    const raw = await readFile(join(sessionDir, "meta.json"), "utf-8");
+    const meta = JSON.parse(raw);
+    assert.equal(meta.runId, "test-meta-fields");
+    assert.equal(meta.query, data.query);
+    assert.equal(meta.contextPath, data.contextPath);
+    assert.ok(typeof meta.timestamp === "string");
+    assert.ok(meta.timestamp.includes("T")); // ISO string
+    assert.ok(typeof meta.version === "string");
+  });
+
+  it("usage.json has correct token counts", async () => {
+    const data = makeSessionData({ runId: "test-usage-fields" });
+    const sessionDir = await saveSession(data);
+
+    const raw = await readFile(join(sessionDir, "usage.json"), "utf-8");
+    const usage = JSON.parse(raw);
+    assert.equal(usage.inputTokens, 1000);
+    assert.equal(usage.outputTokens, 500);
+    assert.equal(usage.cachedTokens, 200);
+    assert.equal(usage.totalCost, 0.0015);
+    assert.equal(usage.iterations, 3);
+    assert.equal(usage.timeMs, 4500);
+    assert.equal(usage.model, "google/gemini-2.0-flash");
+  });
+
+  it("answer.txt contains the answer text", async () => {
+    const data = makeSessionData({ runId: "test-answer-txt" });
+    const sessionDir = await saveSession(data);
+
+    const answer = await readFile(join(sessionDir, "answer.txt"), "utf-8");
+    assert.equal(answer, "The answer is 42.");
+  });
+
+  it("config.yaml is valid YAML matching the config snapshot", async () => {
+    const data = makeSessionData({ runId: "test-config-yaml" });
+    const sessionDir = await saveSession(data);
+
+    const raw = await readFile(join(sessionDir, "config.yaml"), "utf-8");
+    const parsed = yaml.load(raw) as Record<string, unknown>;
+    assert.ok(typeof parsed === "object");
+    assert.deepEqual(parsed, data.config);
+  });
+
+  it("copies trajectory when logPath is provided", async () => {
+    const logDir = join(testDir, "logs");
+    await mkdir(logDir, { recursive: true });
+    const logPath = join(logDir, "test-run.jsonl");
+    const logContent = '{"event":"run_start","run_id":"abc"}\n{"event":"run_end","run_id":"abc"}\n';
+    await writeFile(logPath, logContent);
+
+    const data = makeSessionData({ runId: "test-trajectory-copy", logPath });
+    const sessionDir = await saveSession(data);
+
+    const trajectory = await readFile(join(sessionDir, "trajectory.jsonl"), "utf-8");
+    assert.equal(trajectory, logContent);
+  });
+
+  it("writes empty trajectory when logPath is null", async () => {
+    const data = makeSessionData({ runId: "test-trajectory-null", logPath: null });
+    const sessionDir = await saveSession(data);
+
+    const trajectory = await readFile(join(sessionDir, "trajectory.jsonl"), "utf-8");
+    assert.equal(trajectory, "");
+  });
+
+  it("writes empty trajectory when logPath points to non-existent file", async () => {
+    const data = makeSessionData({
+      runId: "test-trajectory-missing",
+      logPath: "/tmp/does-not-exist-rlmx-test.jsonl",
+    });
+    const sessionDir = await saveSession(data);
+
+    const trajectory = await readFile(join(sessionDir, "trajectory.jsonl"), "utf-8");
+    assert.equal(trajectory, "");
+  });
+
+  it("handles null contextPath in meta.json", async () => {
+    const data = makeSessionData({ runId: "test-null-context", contextPath: null });
+    const sessionDir = await saveSession(data);
+
+    const raw = await readFile(join(sessionDir, "meta.json"), "utf-8");
+    const meta = JSON.parse(raw);
+    assert.equal(meta.contextPath, null);
+  });
+});