Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/RELEASE-MANUAL-SMOKE.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Applies to every release, all platforms.
- [ ] **First launch flow completes for a brand-new user** — Fresh OS user account, no `~/.openhuman` directory. Walk through onboarding to first agent reply. Expected: no crashes, no permission deadlocks, no stale-config errors.
- [ ] **Auto-update download + relaunch succeeds** — Install the previous release, point the updater feed at this release, trigger an update check. Expected: download completes, relaunch installs the new binary, version string in `Settings > About` matches the release tag.
- [ ] **Logging out + logging back in preserves nothing private** — Sign out, sign in as a different user. Expected: no leaked memory, threads, or skill state from the previous session (regression watch — see #900).
- [ ] **`memory_tree` migrates WAL→TRUNCATE on upgrade with memory intact** — Install a previous (WAL-era) build, use it enough to populate memory so a `chunks.db-wal`/`-shm` pair exists under `~/.openhuman/.../workspace/memory_tree/`, then upgrade to this build. Expected on first launch: `chunks.db` has left WAL mode — the app's own connection reports `truncate` for `PRAGMA journal_mode`, while an external `sqlite3 chunks.db` session reports the default `delete` (only WAL is persisted in the database file; every other journal mode is per-connection) and must never report `wal` — the `-wal`/`-shm` side-files are gone, previously-captured memories still surface in recall, and no `Failed to initialize memory_tree schema` errors appear.

---

Expand Down
52 changes: 32 additions & 20 deletions src/openhuman/memory/tree/jobs/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ pub fn start(config: Config) {
);
tokio::time::sleep(Duration::from_secs(1)).await;
} else if is_sqlite_io_transient(&err) {
// I/O errors (IOERR_TRUNCATE 1546, IOERR_SHMMAP 4874,
// CANTOPEN 14) or circuit breaker open — transient
// I/O errors (IOERR_TRUNCATE 1546, the `-shm` family
// 4618/4874/5386, IN_PAGE 8714, CANTOPEN 14) or circuit
// breaker open — transient
// filesystem / WAL condition. Back off 30 s and let the
// connection cache try a fresh open on next poll. These
// are NOT reported to Sentry (they are transient and were
Expand Down Expand Up @@ -243,17 +244,21 @@ pub async fn run_once(config: &Config) -> Result<bool> {
/// silently backed off without a Sentry report (#2206).
///
/// Covers:
/// - `SQLITE_IOERR_TRUNCATE` (extended code 1546): WAL truncation failed —
/// usually a transient filesystem hiccup.
/// - `SQLITE_IOERR_SHMMAP` (extended code 4874): shared-memory mapping
/// failed — WAL side-file temporarily unavailable.
/// - `SQLITE_CANTOPEN` / `CannotOpen` (extended code 14): DB file temporarily
/// inaccessible.
/// - `SQLITE_IOERR_TRUNCATE` (1546): WAL truncation failed — usually a
/// transient filesystem hiccup.
/// - WAL `-shm` family — `SHMOPEN` (4618, the macOS cold-start failure),
/// `SHMSIZE` (4874), `SHMMAP` (5386): shared-memory side-file temporarily
/// unavailable. (4874 is SHMSIZE, not SHMMAP — the real SHMMAP is 5386.)
/// - `SQLITE_IOERR_IN_PAGE` (8714): mmap-page I/O fault.
/// - `SQLITE_CANTOPEN` / `CannotOpen` (14): DB file temporarily inaccessible.
/// - Text fallback: circuit breaker message, or rusqlite phrases that don't
/// downcast cleanly after multiple `.context()` layers.
fn is_sqlite_io_transient(err: &anyhow::Error) -> bool {
if let Some(rusqlite::Error::SqliteFailure(f, _)) = err.downcast_ref::<rusqlite::Error>() {
if matches!(f.extended_code, 1546 | 4874 | 14) {
// 14 CANTOPEN, 1546 TRUNCATE, 4618 SHMOPEN, 4874 SHMSIZE, 5386 SHMMAP,
// 8714 IN_PAGE — the WAL `-shm` cold-start family (4874 is SHMSIZE, not
// SHMMAP; the real SHMMAP is 5386).
if matches!(f.extended_code, 14 | 1546 | 4618 | 4874 | 5386 | 8714) {
return true;
}
if f.code == rusqlite::ErrorCode::CannotOpen {
Expand Down Expand Up @@ -396,18 +401,25 @@ mod tests {
assert!(is_sqlite_io_transient(&anyhow::Error::from(raw)));
}

/// SQLITE_IOERR_SHMMAP (extended code 4874) must be classified as
/// transient — WAL shared-memory mapping is a filesystem hiccup.
/// The WAL `-shm` family must classify as transient via the NUMERIC arm
/// (the message deliberately avoids the text-fallback phrases). 4618
/// SHMOPEN is the macOS cold-start failure; 4874 is SHMSIZE; 5386 is the
/// real SHMMAP; 8714 is IN_PAGE.
#[test]
fn is_sqlite_io_transient_matches_ioerr_shmmap() {
let raw = rusqlite::Error::SqliteFailure(
rusqlite::ffi::Error {
code: rusqlite::ErrorCode::SystemIoFailure,
extended_code: 4874, // SQLITE_IOERR_SHMMAP
},
Some("xshmmap failed".into()),
);
assert!(is_sqlite_io_transient(&anyhow::Error::from(raw)));
fn is_sqlite_io_transient_matches_shm_family() {
for ext in [4618, 4874, 5386, 8714] {
let raw = rusqlite::Error::SqliteFailure(
rusqlite::ffi::Error {
code: rusqlite::ErrorCode::SystemIoFailure,
extended_code: ext,
},
Some("sqlite extended io failure".into()),
);
assert!(
is_sqlite_io_transient(&anyhow::Error::from(raw)),
"extended_code {ext} must classify as transient (numeric arm)"
);
}
}

/// SQLITE_CANTOPEN (code CannotOpen, extended code 14) must be
Expand Down
88 changes: 67 additions & 21 deletions src/openhuman/memory/tree/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
//! `with_connection()` previously opened a new SQLite connection and re-ran
//! the full schema init (8 tables, 15+ indexes, 8+ migrations) on **every**
//! call. With 4 workers polling every 5 s this amounted to ~69K connection
//! opens/day, and three I/O error codes (1546 IOERR_TRUNCATE, 4874
//! IOERR_SHMMAP, 14 CANTOPEN) flooded Sentry with ~19K events in 4 days.
//! opens/day, and a family of WAL/SHM cold-start I/O codes (1546
//! IOERR_TRUNCATE, 4618 IOERR_SHMOPEN, 4874 IOERR_SHMSIZE, 14 CANTOPEN)
//! flooded Sentry with ~19K events in 4 days.
//!
//! Fix: a process-level `ConnectionCache` keyed by DB path. Each entry holds
//! one `parking_lot::Mutex<Connection>` that is initialised once (schema +
Expand Down Expand Up @@ -792,25 +793,41 @@ pub(crate) fn schema_apply_count_for_path_for_tests(path: &Path) -> usize {
.unwrap_or(0)
}

/// SQLite extended result code `CANTOPEN` — surfaces when a cold-start
/// caller races the lockfile/WAL creation done by another connection.
// SQLite extended result codes that fire during cold-start WAL/SHM bootstrap
// races. NOTE on values: extended codes are `SQLITE_IOERR (10) | (sub << 8)`.
// 4874 is `IOERR_SHMSIZE` (sub 19), NOT `SHMMAP` — the real `SHMMAP` is 5386
// (sub 21) and the "open a new shared-memory segment" failure is `SHMOPEN`
// 4618 (sub 18), which is what surfaced on macOS. The whole `-shm` family is
// listed so the classifiers don't miss any of them.
/// `CANTOPEN` — racing the lockfile/WAL creation done by another connection.
const SQLITE_CANTOPEN: i32 = 14;
/// SQLite extended result code `IOERR_TRUNCATE` — fires when the WAL is
/// being truncated by another connection during bootstrap.
/// `IOERR_TRUNCATE` — the WAL/db is being truncated during bootstrap.
const SQLITE_IOERR_TRUNCATE: i32 = 1546;
/// SQLite extended result code `IOERR_SHMMAP` — fires when the shared
/// memory file is resized by another connection during bootstrap.
const SQLITE_IOERR_SHMMAP: i32 = 4874;

/// True if `err` (or anything in its cause chain) is one of the three
/// SQLite codes that fire during cold-start WAL/SHM bootstrap races:
/// `CANTOPEN`, `IOERR_TRUNCATE`, `IOERR_SHMMAP`.
/// `IOERR_SHMOPEN` — opening a new `-shm` shared-memory segment failed (the
/// macOS cold-start failure, e.g. Sentry TAURI-RUST-X1).
const SQLITE_IOERR_SHMOPEN: i32 = 4618;
/// `IOERR_SHMSIZE` — the `-shm` file is being resized during bootstrap.
const SQLITE_IOERR_SHMSIZE: i32 = 4874;
/// `IOERR_SHMMAP` — mapping a page of the `-shm` wal-index failed.
const SQLITE_IOERR_SHMMAP: i32 = 5386;
/// `IOERR_IN_PAGE` — an mmap-page I/O fault, also seen under WAL cold-start.
const SQLITE_IOERR_IN_PAGE: i32 = 8714;

/// True if `err` (or anything in its cause chain) is one of the SQLite codes
/// that fire during cold-start WAL/SHM bootstrap races: `CANTOPEN`,
/// `IOERR_TRUNCATE`, the `-shm` family (`SHMOPEN` / `SHMSIZE` / `SHMMAP`), and
/// `IOERR_IN_PAGE`.
pub(crate) fn is_transient_cold_start(err: &anyhow::Error) -> bool {
fn is_transient_sqlite(e: &(dyn std::error::Error + 'static)) -> bool {
if let Some(rusqlite::Error::SqliteFailure(ffi, _)) = e.downcast_ref::<rusqlite::Error>() {
return matches!(
ffi.extended_code,
SQLITE_CANTOPEN | SQLITE_IOERR_TRUNCATE | SQLITE_IOERR_SHMMAP
SQLITE_CANTOPEN
| SQLITE_IOERR_TRUNCATE
| SQLITE_IOERR_SHMOPEN
| SQLITE_IOERR_SHMSIZE
| SQLITE_IOERR_SHMMAP
| SQLITE_IOERR_IN_PAGE
);
}
false
Expand Down Expand Up @@ -963,8 +980,8 @@ pub(crate) fn try_cleanup_stale_files(db_path: &std::path::Path) -> bool {
cleaned
}

/// Run the full one-time DB initialisation (WAL, schema, migrations) against
/// an already-open `Connection`. Used by `get_or_init_connection`.
/// Run the full one-time DB initialisation (journal mode, schema, migrations)
/// against an already-open `Connection`. Used by `get_or_init_connection`.
fn init_db(conn: &Connection, config: &Config) -> Result<()> {
conn.busy_timeout(SQLITE_BUSY_TIMEOUT)
.context("Failed to configure memory_tree busy timeout")?;
Expand All @@ -975,6 +992,11 @@ fn init_db(conn: &Connection, config: &Config) -> Result<()> {
// on.
conn.execute_batch("PRAGMA foreign_keys = ON;")
.context("Failed to enable memory_tree foreign_keys pragma")?;
// memory_tree runs the TRUNCATE rollback journal (see `apply_schema`), so
// crash-safety requires synchronous=FULL — NORMAL is only corruption-safe
// under WAL. Set explicitly so a future global default can't weaken it.
conn.execute_batch("PRAGMA synchronous = FULL;")
.context("Failed to set memory_tree synchronous=FULL")?;
apply_schema(conn)?;
// #1574 §7: one-shot, version-gated legacy→sidecar embedding migration.
migrate_legacy_embeddings_to_sidecar(conn, config)?;
Expand All @@ -984,9 +1006,27 @@ fn init_db(conn: &Connection, config: &Config) -> Result<()> {
fn apply_schema(conn: &Connection) -> Result<()> {
// Note: `init_db` runs the `#1574 §7` legacy→sidecar embedding migration
// after this returns, so the dim-equal copy step is not duplicated here.
if let Err(wal_err) = conn.execute_batch("PRAGMA journal_mode=WAL;") {
// memory_tree uses the TRUNCATE rollback journal, NOT WAL. WAL's `-shm`
// shared-memory index + `-wal` checkpoint machinery are the root of the
// cold-start IOERR_SHMOPEN (macOS) / IOERR_TRUNCATE (Windows, AV-held
// handles) failures (Sentry TAURI-RUST-EV / TAURI-RUST-X1). All tree
// access serialises on the single cached `PMutex<Connection>` (see
Comment thread
graycyrus marked this conversation as resolved.
// `get_or_init_connection`), so WAL's only real benefit — concurrent
// readers — is unused here, which makes WAL pure liability. The sibling
// tree DBs (cron / vault / redirect_links) already run the default
// rollback journal without issue.
//
// Requesting TRUNCATE on a database a prior release left in WAL mode
// checkpoints the `-wal` back into the main file and removes the
// `-wal`/`-shm` side-files, so this also migrates existing WAL databases
// in place on upgrade.
let journal_mode: String = conn
.query_row("PRAGMA journal_mode=TRUNCATE", [], |row| row.get(0))
.context("Failed to set memory_tree journal_mode=TRUNCATE")?;
if !journal_mode.eq_ignore_ascii_case("truncate") {
log::warn!(
"[memory_tree] Failed to enable WAL mode (filesystem may not support it): {wal_err}"
"[memory_tree] journal_mode is '{journal_mode}' after requesting TRUNCATE \
— a prior WAL connection or a locked -wal may be blocking the switch"
);
}
conn.execute_batch(SCHEMA)
Expand Down Expand Up @@ -1037,9 +1077,15 @@ fn apply_schema(conn: &Connection) -> Result<()> {
/// stale-file cleanup + single retry before giving up.
fn is_io_open_error(err: &anyhow::Error) -> bool {
if let Some(rusqlite::Error::SqliteFailure(f, _)) = err.downcast_ref::<rusqlite::Error>() {
// 1546 = SQLITE_IOERR_TRUNCATE, 4874 = SQLITE_IOERR_SHMMAP, 14 = SQLITE_CANTOPEN
return matches!(f.extended_code, 1546 | 4874 | 14)
|| f.code == rusqlite::ErrorCode::CannotOpen;
return matches!(
f.extended_code,
SQLITE_CANTOPEN
| SQLITE_IOERR_TRUNCATE
| SQLITE_IOERR_SHMOPEN
| SQLITE_IOERR_SHMSIZE
| SQLITE_IOERR_SHMMAP
| SQLITE_IOERR_IN_PAGE
) || f.code == rusqlite::ErrorCode::CannotOpen;
}
let msg = format!("{err:#}").to_ascii_lowercase();
msg.contains("disk i/o error")
Expand Down
90 changes: 84 additions & 6 deletions src/openhuman/memory/tree/store_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,9 @@ fn schema_has_content_path_and_content_sha256_columns() {
/// Regression: OPENHUMAN-TAURI-HH / -ZM / -MB.
///
/// Before this fix, N `tree_jobs_worker` tasks racing into
/// `with_connection` on a cold workspace would trigger one of three
/// SQLite cold-start codes — 14 (CANTOPEN), 1546 (IOERR_TRUNCATE),
/// or 4874 (IOERR_SHMMAP) — surfaced as
/// `with_connection` on a cold workspace would trigger a WAL/SHM
/// cold-start code — 14 (CANTOPEN), 1546 (IOERR_TRUNCATE), or a
/// `-shm` code (4618 SHMOPEN / 4874 SHMSIZE / 5386 SHMMAP) — surfaced as
/// `Failed to initialize memory_tree schema`. The mutex-gated init set
/// in `store::open_and_init_with_retry` serialises the WAL+SHM
/// bootstrap so only one thread runs `apply_schema` per DB path.
Expand Down Expand Up @@ -324,12 +324,16 @@ fn is_transient_cold_start_classifies_known_extended_codes() {
use rusqlite::ffi;
use rusqlite::ErrorCode;

// The three SHMmap/WAL bootstrap codes that fire under cold-start
// contention. All must classify as transient → retried.
// The WAL/SHM cold-start codes that fire under contention. All must
// classify as transient → retried. (4618 SHMOPEN is the macOS failure;
// 5386 is the real SHMMAP; 4874 is SHMSIZE — all of the `-shm` family.)
for extended in [
14, // CANTOPEN
1546, // IOERR_TRUNCATE
4874, // IOERR_SHMMAP
4618, // IOERR_SHMOPEN
4874, // IOERR_SHMSIZE
5386, // IOERR_SHMMAP
8714, // IOERR_IN_PAGE
] {
let err = anyhow::Error::from(rusqlite::Error::SqliteFailure(
ffi::Error {
Expand Down Expand Up @@ -585,3 +589,77 @@ fn stale_shm_cleanup_removes_files() {
assert!(!shm.exists(), "shm must be removed");
assert!(!wal.exists(), "wal must be removed");
}

/// Pins the journal configuration of a freshly initialised memory_tree DB.
///
/// The connection handed out by `with_connection` must report the TRUNCATE
/// rollback journal (never WAL) together with `synchronous = FULL`, and no
/// `-shm` side-file may be left next to `chunks.db`. WAL's `-shm`/`-wal`
/// machinery is what produced the cold-start `-shm` / IOERR_TRUNCATE I/O
/// failures (Sentry TAURI-RUST-EV / TAURI-RUST-X1), and a single cached
/// connection gets nothing out of WAL's reader concurrency.
#[test]
fn memory_tree_uses_truncate_journal_not_wal() {
    let (_tmp, cfg) = test_config();

    let outcome = with_connection(&cfg, |db| {
        // Journal mode as seen by the very connection the tree code uses.
        let mode = db.query_row("PRAGMA journal_mode", [], |row| row.get::<_, String>(0))?;
        assert!(
            mode.eq_ignore_ascii_case("truncate"),
            "memory_tree journal_mode must be TRUNCATE, got '{mode}'"
        );

        // `PRAGMA synchronous` reports a numeric level; 2 is FULL.
        let sync_level = db.query_row("PRAGMA synchronous", [], |row| row.get::<_, i64>(0))?;
        assert_eq!(
            sync_level, 2,
            "rollback journal requires synchronous=FULL (2)"
        );
        Ok(())
    });
    outcome.expect("with_connection");

    // Only WAL mode ever creates a `-shm` shared-memory side-file.
    let tree_dir = cfg.workspace_dir.join("memory_tree");
    let shm_path = tree_dir.join("chunks.db-shm");
    assert!(
        !shm_path.exists(),
        "no -shm file must exist under TRUNCATE journal"
    );
}

/// Upgrade path: a `chunks.db` that an earlier release left in WAL mode must
/// come up in TRUNCATE mode the next time it is opened through
/// `with_connection`, keep every row committed under WAL, and leave no
/// `-wal`/`-shm` side-files behind.
#[test]
fn existing_wal_db_migrates_to_truncate() {
    let (_tmp, cfg) = test_config();
    let db_path = cfg.workspace_dir.join("memory_tree").join("chunks.db");
    let tree_dir = db_path.parent().unwrap();
    std::fs::create_dir_all(tree_dir).expect("mkdir");

    // Stand in for the old release: switch the file to WAL and commit one row,
    // which persists the WAL marker in the database header.
    let legacy = rusqlite::Connection::open(&db_path).expect("open wal db");
    let legacy_mode = legacy
        .query_row("PRAGMA journal_mode=WAL", [], |row| row.get::<_, String>(0))
        .expect("set wal");
    assert!(
        legacy_mode.eq_ignore_ascii_case("wal"),
        "precondition: db in WAL"
    );
    legacy
        .execute_batch("CREATE TABLE legacy_marker(x); INSERT INTO legacy_marker VALUES (1);")
        .expect("seed");
    // Closing the connection does not undo WAL mode: the header still records it.
    drop(legacy);

    // Drop any cached connection for isolation, then reopen through the normal
    // entry point.
    clear_connection_cache();
    let migrated = with_connection(&cfg, |db| {
        let mode = db.query_row("PRAGMA journal_mode", [], |row| row.get::<_, String>(0))?;
        assert!(
            mode.eq_ignore_ascii_case("truncate"),
            "WAL db must migrate to TRUNCATE on open, got '{mode}'"
        );

        // The checkpoint-and-switch must not lose rows committed under WAL.
        let seeded = db.query_row("SELECT x FROM legacy_marker", [], |row| row.get::<_, i64>(0))?;
        assert_eq!(seeded, 1, "row committed under WAL must survive migration");
        Ok(())
    });
    migrated.expect("with_connection migrates");

    // Both WAL side-files have to be gone once the switch has happened.
    for suffix in ["-shm", "-wal"] {
        let side_file = db_path.with_file_name(format!("chunks.db{suffix}"));
        assert!(
            !side_file.exists(),
            "{suffix} must be gone after WAL→TRUNCATE migration"
        );
    }
}
Loading