From b333d15fa8d2d2b2ea6eaecf84b8255ada3e287c Mon Sep 17 00:00:00 2001 From: Clint Ecker Date: Fri, 17 Apr 2026 14:44:02 -0500 Subject: [PATCH 1/5] refactor(tracker): library API hardening for v1.0 Batches six related pre-1.0 fixes flagged in the expert panel review of PR #101 (#102, #103, #104, #105, #106, #109). All changes touch the v0.18.0 library surface and are breaking, so ship together: - Thread context.Context through Doctor, Diagnose, DiagnoseMostRecent, Audit, and Simulate. Provider probes and binary version lookups honor ctx; getBinaryVersion now uses exec.CommandContext with a 5-second timeout to match getDippinVersion. - Introduce typed CheckStatus and SuggestionKind so consumers can switch-exhaust. Constant values are unchanged. - NDJSONWriter.Write now returns error. First write failure is still logged once to stderr; subsequent failures surface via return value. - Rename NDJSONEvent -> StreamEvent (wire format unchanged). - Move DoctorConfig.TrackerVersion/TrackerCommit off the config struct behind a WithVersionInfo functional option. - Add LogWriter to DoctorConfig/DiagnoseConfig/AuditConfig. CLI sets it to io.Discard so users stop seeing stray library warnings from audit/list/diagnose on os.Stderr. - sanitizeProviderError strips API keys and bearer tokens from provider error bodies before they land in CheckDetail.Message. - Panic-recover the NDJSON handler closures (pipeline, agent, LLM trace) so a misbehaving writer cannot crash the caller goroutine. - Stream activity.jsonl in Diagnose via bufio.Scanner (1 MB buffer limit), matching LoadActivityLog and avoiding memory spikes on large runs. - Rename internal *Lib/*NF/*ForDiagnose helpers to plain names. - CHANGELOG notes deterministic diagnose suggestion ordering (already alphabetical post-#101, documented as a visible change). Closes #102, #103, #104, #105, #106, #109. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + CHANGELOG.md | 27 ++++ README.md | 4 +- cmd/tracker/audit.go | 6 +- cmd/tracker/diagnose.go | 6 +- cmd/tracker/doctor.go | 9 +- cmd/tracker/simulate.go | 3 +- tracker_activity.go | 4 +- tracker_audit.go | 37 ++++- tracker_audit_test.go | 31 +++- tracker_diagnose.go | 124 +++++++++++----- tracker_diagnose_test.go | 9 +- tracker_doctor.go | 292 +++++++++++++++++++++++--------------- tracker_doctor_test.go | 48 ++++++- tracker_doctor_unix.go | 8 +- tracker_doctor_windows.go | 4 +- tracker_events.go | 65 ++++++--- tracker_events_test.go | 90 +++++++++--- tracker_simulate.go | 13 +- tracker_simulate_test.go | 11 +- 20 files changed, 568 insertions(+), 224 deletions(-) diff --git a/.gitignore b/.gitignore index 8a6e0c6..05f21ec 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ coverage.out .claude/settings.local.json .private-journal/ a/ +runs/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 7681fea..5402777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **Library API hardening for v1.0** (#102, #103, #104, #106, #109): + - Typed enum-like strings for `CheckStatus` and `SuggestionKind` so consumers can switch-exhaust. Existing constants (`SuggestionRetryPattern`, etc.) retain their underlying string values. + - `tracker.WithVersionInfo(version, commit)` functional option replaces the CLI-only `DoctorConfig.TrackerVersion` / `TrackerCommit` fields. + - `DoctorConfig.LogWriter` / `DiagnoseConfig.LogWriter` / `AuditConfig.LogWriter` — optional `io.Writer` for non-fatal parse warnings. Nil is treated as `io.Discard` so library callers no longer see stray warnings on `os.Stderr`. The `tracker` CLI sets this to `io.Discard` for user-facing commands. 
+ - `Doctor`, `Diagnose`, `DiagnoseMostRecent`, `Audit`, `Simulate` now accept `context.Context`, honored by provider probes and binary version lookups. `getBinaryVersion` now uses `exec.CommandContext` with a 5-second timeout, matching `getDippinVersion`. + - Provider probe error bodies are now sanitized (API keys and bearer tokens stripped) before they land in `CheckDetail.Message`. + - `NDJSON` handler closures (pipeline, agent, LLM trace) now `recover()` from panics in the underlying writer so a misbehaving sink cannot crash the caller goroutine. + - `Diagnose` now streams `activity.jsonl` with `bufio.Scanner` instead of `os.ReadFile` → `strings.Split`, matching `LoadActivityLog` and avoiding a memory spike on large runs. + +### Changed + +- **BREAKING** (library): + - `tracker.Doctor(cfg)` → `tracker.Doctor(ctx, cfg, opts...)`. + - `tracker.Diagnose(runDir)` → `tracker.Diagnose(ctx, runDir, opts...)`. + - `tracker.DiagnoseMostRecent(workdir)` → `tracker.DiagnoseMostRecent(ctx, workdir, opts...)`. + - `tracker.Audit(runDir)` → `tracker.Audit(ctx, runDir, opts...)`. + - `tracker.Simulate(source)` → `tracker.Simulate(ctx, source)`. + - `tracker.ListRuns(workdir)` now accepts optional `...AuditConfig`. + - `tracker.NDJSONEvent` → `tracker.StreamEvent`. Wire-format JSON tags unchanged. + - `NDJSONWriter.Write` now returns `error` so callers can detect a broken stream. First failure is still logged to `os.Stderr` once (unchanged behavior); subsequent failures are surfaced via the return value. + - `DoctorConfig.TrackerVersion` and `DoctorConfig.TrackerCommit` removed — use `tracker.WithVersionInfo(version, commit)` instead. + - `CheckResult.Status` and `CheckDetail.Status` are now typed as `tracker.CheckStatus` (underlying string). Untyped string literal comparisons (`status == "ok"`) keep working. + - `Suggestion.Kind` is now typed as `tracker.SuggestionKind` (underlying string). +- `tracker diagnose` suggestion order is now deterministic (alphabetical by node ID). 
Previously suggestions printed in Go map-iteration order, which varied between runs. + ## [0.18.0] - 2026-04-17 ### Added diff --git a/README.md b/README.md index 7f208c2..d9b46c4 100644 --- a/README.md +++ b/README.md @@ -576,7 +576,7 @@ result, _ := tracker.Run(ctx, source, tracker.Config{ ```go import tracker "github.com/2389-research/tracker" -report, err := tracker.DiagnoseMostRecent(".") +report, err := tracker.DiagnoseMostRecent(ctx, ".") if err != nil { log.Fatal(err) } for _, f := range report.Failures { @@ -588,7 +588,7 @@ for _, s := range report.Suggestions { } ``` -`tracker.Audit`, `tracker.Simulate`, and `tracker.Doctor` follow the same pattern and return JSON-serializable reports. +`tracker.Audit`, `tracker.Simulate`, and `tracker.Doctor` follow the same pattern and return JSON-serializable reports. All four accept `context.Context` as their first argument; `Audit`, `Diagnose`/`DiagnoseMostRecent`, and `Doctor` also take a config struct (`AuditConfig`, `DiagnoseConfig`, `DoctorConfig`) with a `LogWriter` for non-fatal parse warnings — set it to `io.Discard` to silence warnings in embedded callers. To stream events programmatically in the same NDJSON format as `tracker --json`, use `tracker.NewNDJSONWriter`: diff --git a/cmd/tracker/audit.go b/cmd/tracker/audit.go index e4e56ad..562ff85 100644 --- a/cmd/tracker/audit.go +++ b/cmd/tracker/audit.go @@ -3,7 +3,9 @@ package main import ( + "context" "fmt" + "io" "time" tracker "github.com/2389-research/tracker" ) // listRuns shows all available runs with their status and node count. 
func listRuns(workdir string) error { - runs, err := tracker.ListRuns(workdir) + runs, err := tracker.ListRuns(workdir, tracker.AuditConfig{LogWriter: io.Discard}) if err != nil { return err } @@ -55,7 +57,7 @@ func runAudit(workdir, runID string) error { if err != nil { return err } - report, err := tracker.Audit(runDir) + report, err := tracker.Audit(context.Background(), runDir) if err != nil { return err } diff --git a/cmd/tracker/diagnose.go b/cmd/tracker/diagnose.go index 292b7e6..e228508 100644 --- a/cmd/tracker/diagnose.go +++ b/cmd/tracker/diagnose.go @@ -3,7 +3,9 @@ package main import ( + "context" "fmt" + "io" "os" "sort" "strings" @@ -14,7 +16,7 @@ import ( // diagnoseMostRecent finds and diagnoses the most recent run. func diagnoseMostRecent(workdir string) error { - report, err := tracker.DiagnoseMostRecent(workdir) + report, err := tracker.DiagnoseMostRecent(context.Background(), workdir, tracker.DiagnoseConfig{LogWriter: io.Discard}) if err != nil { return err } @@ -28,7 +30,7 @@ func runDiagnose(workdir, runID string) error { if err != nil { return err } - report, err := tracker.Diagnose(runDir) + report, err := tracker.Diagnose(context.Background(), runDir, tracker.DiagnoseConfig{LogWriter: io.Discard}) if err != nil { return err } diff --git a/cmd/tracker/doctor.go b/cmd/tracker/doctor.go index 06fb520..a543f39 100644 --- a/cmd/tracker/doctor.go +++ b/cmd/tracker/doctor.go @@ -4,7 +4,9 @@ package main import ( + "context" "fmt" + "io" "os" "path/filepath" "strings" @@ -73,14 +75,13 @@ func runDoctorWithConfig(workdir string, cfg DoctorConfig) error { workdir = wd } - report, err := tracker.Doctor(tracker.DoctorConfig{ + report, err := tracker.Doctor(context.Background(), tracker.DoctorConfig{ WorkDir: workdir, Backend: cfg.backend, ProbeProviders: cfg.probe, PipelineFile: cfg.pipelineFile, - TrackerVersion: version, - TrackerCommit: commit, - }) + LogWriter: io.Discard, + }, tracker.WithVersionInfo(version, commit)) if err != nil { return err } 
diff --git a/cmd/tracker/simulate.go b/cmd/tracker/simulate.go index a1f1d83..084b644 100644 --- a/cmd/tracker/simulate.go +++ b/cmd/tracker/simulate.go @@ -3,6 +3,7 @@ package main import ( + "context" "fmt" "io" "os" @@ -58,7 +59,7 @@ func runSimulateCmd(pipelineFile, formatOverride string, w io.Writer) error { source = string(data) } - report, err := tracker.Simulate(source) + report, err := tracker.Simulate(context.Background(), source) if err != nil { return err } diff --git a/tracker_activity.go b/tracker_activity.go index 10733e1..5d6788e 100644 --- a/tracker_activity.go +++ b/tracker_activity.go @@ -159,7 +159,7 @@ func ParseActivityLine(line string) (ActivityEntry, bool) { if err := json.Unmarshal([]byte(line), &raw); err != nil { return ActivityEntry{}, false } - ts, ok := parseActivityTimestampLib(raw.Timestamp) + ts, ok := parseActivityTimestamp(raw.Timestamp) if !ok { return ActivityEntry{}, false } @@ -173,7 +173,7 @@ func ParseActivityLine(line string) (ActivityEntry, bool) { }, true } -func parseActivityTimestampLib(s string) (time.Time, bool) { +func parseActivityTimestamp(s string) (time.Time, bool) { if ts, err := time.Parse(time.RFC3339Nano, s); err == nil { return ts, true } diff --git a/tracker_audit.go b/tracker_audit.go index 5e14769..8ec4007 100644 --- a/tracker_audit.go +++ b/tracker_audit.go @@ -3,7 +3,9 @@ package tracker import ( + "context" "fmt" + "io" "os" "path/filepath" "sort" @@ -12,6 +14,15 @@ import ( "github.com/2389-research/tracker/pipeline" ) +// AuditConfig configures an Audit() or ListRuns() call. +type AuditConfig struct { + // LogWriter receives non-fatal warnings (unreadable activity.jsonl + // in a run directory, etc.). Nil is treated as io.Discard so + // embedded library callers do not see warnings on os.Stderr. The + // tracker CLI sets this to io.Discard for user-facing commands. + LogWriter io.Writer +} + // AuditReport is the structured result of Audit(). 
type AuditReport struct { RunID string `json:"run_id"` @@ -68,7 +79,14 @@ type RunSummary struct { // and activity.jsonl directly under it. For user-supplied input, resolve // the path via ResolveRunDir or use MostRecentRunID first, which enforce // the .tracker/runs/ layout. -func Audit(runDir string) (*AuditReport, error) { +// +// ctx is accepted for future extensibility. A nil context is treated as +// context.Background(). +func Audit(ctx context.Context, runDir string, opts ...AuditConfig) (*AuditReport, error) { + if ctx == nil { + ctx = context.Background() + } + _ = firstAuditConfig(opts) // reserved for future fields cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) if err != nil { return nil, fmt.Errorf("load checkpoint: %w", err) @@ -98,7 +116,9 @@ func Audit(runDir string) (*AuditReport, error) { } // ListRuns returns all runs under workdir/.tracker/runs, sorted newest first. -func ListRuns(workdir string) ([]RunSummary, error) { +func ListRuns(workdir string, opts ...AuditConfig) ([]RunSummary, error) { + cfg := firstAuditConfig(opts) + logW := logWriterOrDiscard(cfg.LogWriter) runsDir := filepath.Join(workdir, ".tracker", "runs") entries, err := os.ReadDir(runsDir) if err != nil { @@ -112,7 +132,7 @@ func ListRuns(workdir string) ([]RunSummary, error) { if !e.IsDir() { continue } - rs, ok := buildLibRunSummary(runsDir, e.Name()) + rs, ok := buildRunSummary(runsDir, e.Name(), logW) if ok { runs = append(runs, rs) } @@ -121,6 +141,13 @@ func ListRuns(workdir string) ([]RunSummary, error) { return runs, nil } +func firstAuditConfig(opts []AuditConfig) AuditConfig { + if len(opts) == 0 { + return AuditConfig{} + } + return opts[0] +} + func classifyStatus(cp *pipeline.Checkpoint, activity []ActivityEntry) string { for i := len(activity) - 1; i >= 0; i-- { switch activity[i].Type { @@ -213,7 +240,7 @@ func buildAuditRecommendations(cp *pipeline.Checkpoint, status string, total tim return recs } -func buildLibRunSummary(runsDir, 
name string) (RunSummary, bool) { +func buildRunSummary(runsDir, name string, logW io.Writer) (RunSummary, bool) { runDir := filepath.Join(runsDir, name) cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) if err != nil { @@ -221,7 +248,7 @@ func buildLibRunSummary(runsDir, name string) (RunSummary, bool) { } activity, lerr := LoadActivityLog(runDir) if lerr != nil { - fmt.Fprintf(os.Stderr, "warning: run %s: cannot read activity log: %v\n", name, lerr) + fmt.Fprintf(logW, "warning: run %s: cannot read activity log: %v\n", name, lerr) activity = nil // continue with nil so the summary still builds } SortActivityByTime(activity) diff --git a/tracker_audit_test.go b/tracker_audit_test.go index 5ba0f88..25efcfd 100644 --- a/tracker_audit_test.go +++ b/tracker_audit_test.go @@ -1,13 +1,15 @@ package tracker import ( + "bytes" + "context" "os" "path/filepath" "testing" ) func TestAudit_CompletedRun(t *testing.T) { - r, err := Audit("testdata/runs/ok") + r, err := Audit(context.Background(), "testdata/runs/ok") if err != nil { t.Fatalf("Audit: %v", err) } @@ -23,7 +25,7 @@ func TestAudit_CompletedRun(t *testing.T) { } func TestAudit_FailedRun(t *testing.T) { - r, err := Audit("testdata/runs/failed") + r, err := Audit(context.Background(), "testdata/runs/failed") if err != nil { t.Fatalf("Audit: %v", err) } @@ -65,3 +67,28 @@ func TestListRuns_MultipleRuns(t *testing.T) { t.Errorf("first = %q, want r2 (newest first)", runs[0].RunID) } } + +func TestListRuns_LogWriterSilencesWarnings(t *testing.T) { + // Build a run directory whose checkpoint loads fine but whose activity.jsonl + // is unreadable (EISDIR). buildRunSummary should emit a warning to the + // LogWriter rather than os.Stderr. 
+ workdir := t.TempDir() + runsDir := filepath.Join(workdir, ".tracker", "runs") + must(t, os.MkdirAll(filepath.Join(runsDir, "r1"), 0o755)) + must(t, os.WriteFile(filepath.Join(runsDir, "r1", "checkpoint.json"), + []byte(`{"run_id":"r1","completed_nodes":["A"],"timestamp":"2026-04-17T10:00:00Z"}`), 0o644)) + // Make activity.jsonl a directory so os.ReadFile fails with EISDIR. + must(t, os.MkdirAll(filepath.Join(runsDir, "r1", "activity.jsonl"), 0o755)) + + var logBuf bytes.Buffer + runs, err := ListRuns(workdir, AuditConfig{LogWriter: &logBuf}) + if err != nil { + t.Fatalf("ListRuns: %v", err) + } + if len(runs) != 1 { + t.Fatalf("got %d runs, want 1", len(runs)) + } + if logBuf.Len() == 0 { + t.Error("expected log writer to receive a warning about activity.jsonl") + } +} diff --git a/tracker_diagnose.go b/tracker_diagnose.go index c1fd421..55afd04 100644 --- a/tracker_diagnose.go +++ b/tracker_diagnose.go @@ -3,8 +3,11 @@ package tracker import ( + "bufio" + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" "sort" @@ -14,6 +17,16 @@ import ( "github.com/2389-research/tracker/pipeline" ) +// DiagnoseConfig configures a Diagnose() run. +type DiagnoseConfig struct { + // LogWriter receives non-fatal parse warnings (malformed status.json + // files, unreadable artifact directories). Nil is treated as + // io.Discard so library callers do not see stray warnings on + // os.Stderr. The tracker CLI sets this to io.Discard for user-facing + // commands. + LogWriter io.Writer +} + // DiagnoseReport is the structured output of Diagnose / DiagnoseMostRecent. type DiagnoseReport struct { RunID string `json:"run_id"` @@ -50,22 +63,27 @@ type BudgetHalt struct { Message string `json:"message"` } +// SuggestionKind is the typed string identifying which template produced a +// Suggestion. The underlying string values are stable; new kinds may be +// added additively. +type SuggestionKind string + // Suggestion is an actionable recommendation produced by Diagnose. 
type Suggestion struct { - NodeID string `json:"node_id,omitempty"` - Kind string `json:"kind"` - Message string `json:"message"` + NodeID string `json:"node_id,omitempty"` + Kind SuggestionKind `json:"kind"` + Message string `json:"message"` } // Suggestion kinds (stable; new ones may be added additively). const ( - SuggestionRetryPattern = "retry_pattern" - SuggestionEscalateLimit = "escalate_limit" - SuggestionNoOutput = "no_output" - SuggestionShellCommand = "shell_command" - SuggestionGoTest = "go_test" - SuggestionSuspiciousTiming = "suspicious_timing" - SuggestionBudget = "budget" + SuggestionRetryPattern SuggestionKind = "retry_pattern" + SuggestionEscalateLimit SuggestionKind = "escalate_limit" + SuggestionNoOutput SuggestionKind = "no_output" + SuggestionShellCommand SuggestionKind = "shell_command" + SuggestionGoTest SuggestionKind = "go_test" + SuggestionSuspiciousTiming SuggestionKind = "suspicious_timing" + SuggestionBudget SuggestionKind = "budget" ) // Diagnose analyzes a run directory and returns a structured report. @@ -75,7 +93,17 @@ const ( // under it. For user-supplied input, resolve the path via // ResolveRunDir or DiagnoseMostRecent first, which enforce the // .tracker/runs/ layout. -func Diagnose(runDir string) (*DiagnoseReport, error) { +// +// ctx is accepted for future extensibility (cancellation of long parses); +// current callers may pass context.Background(). A nil context is treated +// as context.Background(). 
+func Diagnose(ctx context.Context, runDir string, opts ...DiagnoseConfig) (*DiagnoseReport, error) { + if ctx == nil { + ctx = context.Background() + } + cfg := firstDiagnoseConfig(opts) + logW := logWriterOrDiscard(cfg.LogWriter) + cpPath := filepath.Join(runDir, "checkpoint.json") cp, err := pipeline.LoadCheckpoint(cpPath) if err != nil { @@ -85,25 +113,39 @@ func Diagnose(runDir string) (*DiagnoseReport, error) { RunID: cp.RunID, CompletedNodes: len(cp.CompletedNodes), } - failures := collectNodeFailuresLib(runDir) - report.BudgetHalt = enrichFromActivityLib(runDir, failures) + failures := collectNodeFailures(runDir, logW) + report.BudgetHalt = enrichFromActivity(ctx, runDir, failures) report.Failures = sortedFailures(failures) report.Suggestions = buildSuggestions(report.Failures, report.BudgetHalt) return report, nil } // DiagnoseMostRecent finds the most recent run under workdir and diagnoses it. -func DiagnoseMostRecent(workdir string) (*DiagnoseReport, error) { +func DiagnoseMostRecent(ctx context.Context, workdir string, opts ...DiagnoseConfig) (*DiagnoseReport, error) { id, err := MostRecentRunID(workdir) if err != nil { return nil, err } - return Diagnose(filepath.Join(workdir, ".tracker", "runs", id)) + return Diagnose(ctx, filepath.Join(workdir, ".tracker", "runs", id), opts...) 
+} + +func firstDiagnoseConfig(opts []DiagnoseConfig) DiagnoseConfig { + if len(opts) == 0 { + return DiagnoseConfig{} + } + return opts[0] +} + +func logWriterOrDiscard(w io.Writer) io.Writer { + if w == nil { + return io.Discard + } + return w } // ----- internals ----- -func collectNodeFailuresLib(runDir string) map[string]*NodeFailure { +func collectNodeFailures(runDir string, logW io.Writer) map[string]*NodeFailure { failures := make(map[string]*NodeFailure) entries, err := os.ReadDir(runDir) if err != nil { @@ -113,14 +155,14 @@ func collectNodeFailuresLib(runDir string) map[string]*NodeFailure { if !e.IsDir() { continue } - if f := loadNodeFailureLib(runDir, e.Name()); f != nil { + if f := loadNodeFailure(runDir, e.Name(), logW); f != nil { failures[e.Name()] = f } } return failures } -func loadNodeFailureLib(runDir, nodeID string) *NodeFailure { +func loadNodeFailure(runDir, nodeID string, logW io.Writer) *NodeFailure { statusPath := filepath.Join(runDir, nodeID, "status.json") data, err := os.ReadFile(statusPath) if err != nil { @@ -131,7 +173,7 @@ func loadNodeFailureLib(runDir, nodeID string) *NodeFailure { ContextUpdates map[string]string `json:"context_updates"` } if err := json.Unmarshal(data, &status); err != nil { - fmt.Fprintf(os.Stderr, "warning: cannot parse %s: %v\n", statusPath, err) + fmt.Fprintf(logW, "warning: cannot parse %s: %v\n", statusPath, err) return nil } if status.Outcome != "fail" { @@ -145,8 +187,8 @@ func loadNodeFailureLib(runDir, nodeID string) *NodeFailure { return f } -// diagnoseEntryLib is a parsed activity.jsonl line with fields needed for diagnosis. -type diagnoseEntryLib struct { +// diagnoseEntry is a parsed activity.jsonl line with fields needed for diagnosis. 
+type diagnoseEntry struct { Timestamp string `json:"ts"` Type string `json:"type"` NodeID string `json:"node_id"` @@ -159,27 +201,30 @@ type diagnoseEntryLib struct { WallElapsedMs int64 `json:"wall_elapsed_ms"` } -func enrichFromActivityLib(runDir string, failures map[string]*NodeFailure) *BudgetHalt { +func enrichFromActivity(ctx context.Context, runDir string, failures map[string]*NodeFailure) *BudgetHalt { path := filepath.Join(runDir, "activity.jsonl") - data, err := os.ReadFile(path) + f, err := os.Open(path) if err != nil { return nil } + defer f.Close() + stageStarts := map[string]time.Time{} failSignatures := map[string][]string{} - halt := parseActivityLinesForDiagnose(string(data), failures, stageStarts, failSignatures) - applyRetryAnalysisLib(failures, failSignatures) - return halt -} - -func parseActivityLinesForDiagnose(data string, failures map[string]*NodeFailure, stageStarts map[string]time.Time, failSignatures map[string][]string) *BudgetHalt { var halt *BudgetHalt - for _, line := range strings.Split(data, "\n") { - line = strings.TrimSpace(line) + + scanner := bufio.NewScanner(f) + // Match LoadActivityLog: allow 1 MB lines. 
+ scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + for scanner.Scan() { + if ctx.Err() != nil { + break + } + line := strings.TrimSpace(scanner.Text()) if line == "" { continue } - var entry diagnoseEntryLib + var entry diagnoseEntry if err := json.Unmarshal([]byte(line), &entry); err != nil { continue } @@ -191,24 +236,25 @@ func parseActivityLinesForDiagnose(data string, failures map[string]*NodeFailure Message: entry.Message, } } - enrichFromEntryNF(entry, failures, stageStarts, failSignatures) + enrichFromEntry(entry, failures, stageStarts, failSignatures) } + applyRetryAnalysis(failures, failSignatures) return halt } -func enrichFromEntryNF(entry diagnoseEntryLib, failures map[string]*NodeFailure, stageStarts map[string]time.Time, failSignatures map[string][]string) { - ts, _ := parseActivityTimestampLib(entry.Timestamp) +func enrichFromEntry(entry diagnoseEntry, failures map[string]*NodeFailure, stageStarts map[string]time.Time, failSignatures map[string][]string) { + ts, _ := parseActivityTimestamp(entry.Timestamp) switch entry.Type { case "stage_started": if !ts.IsZero() { stageStarts[entry.NodeID] = ts } case "stage_failed": - updateFailureTimingNF(failures[entry.NodeID], stageStarts, entry, ts) + updateFailureTiming(failures[entry.NodeID], stageStarts, entry, ts) sig := entry.Error + "\x00" + entry.ToolErr failSignatures[entry.NodeID] = append(failSignatures[entry.NodeID], sig) case "stage_completed": - updateFailureTimingNF(failures[entry.NodeID], stageStarts, entry, ts) + updateFailureTiming(failures[entry.NodeID], stageStarts, entry, ts) } if entry.NodeID == "" { return @@ -225,7 +271,7 @@ func enrichFromEntryNF(entry diagnoseEntryLib, failures map[string]*NodeFailure, } } -func updateFailureTimingNF(f *NodeFailure, stageStarts map[string]time.Time, entry diagnoseEntryLib, ts time.Time) { +func updateFailureTiming(f *NodeFailure, stageStarts map[string]time.Time, entry diagnoseEntry, ts time.Time) { if f == nil { return } @@ -237,7 +283,7 @@ func 
updateFailureTimingNF(f *NodeFailure, stageStarts map[string]time.Time, ent } } -func applyRetryAnalysisLib(failures map[string]*NodeFailure, failSignatures map[string][]string) { +func applyRetryAnalysis(failures map[string]*NodeFailure, failSignatures map[string][]string) { for nodeID, sigs := range failSignatures { f, ok := failures[nodeID] if !ok { diff --git a/tracker_diagnose_test.go b/tracker_diagnose_test.go index cbd8961..2f578c7 100644 --- a/tracker_diagnose_test.go +++ b/tracker_diagnose_test.go @@ -1,11 +1,12 @@ package tracker import ( + "context" "testing" ) func TestDiagnose_CleanRun(t *testing.T) { - r, err := Diagnose("testdata/runs/ok") + r, err := Diagnose(context.Background(), "testdata/runs/ok") if err != nil { t.Fatalf("Diagnose: %v", err) } @@ -24,7 +25,7 @@ func TestDiagnose_CleanRun(t *testing.T) { } func TestDiagnose_FailureWithRetries(t *testing.T) { - r, err := Diagnose("testdata/runs/failed") + r, err := Diagnose(context.Background(), "testdata/runs/failed") if err != nil { t.Fatalf("Diagnose: %v", err) } @@ -44,7 +45,7 @@ func TestDiagnose_FailureWithRetries(t *testing.T) { if f.Handler != "tool" { t.Errorf("handler = %q", f.Handler) } - kinds := map[string]bool{} + kinds := map[SuggestionKind]bool{} for _, s := range r.Suggestions { kinds[s.Kind] = true } @@ -57,7 +58,7 @@ func TestDiagnose_FailureWithRetries(t *testing.T) { } func TestDiagnose_BudgetHalt(t *testing.T) { - r, err := Diagnose("testdata/runs/budget_halted") + r, err := Diagnose(context.Background(), "testdata/runs/budget_halted") if err != nil { t.Fatalf("Diagnose: %v", err) } diff --git a/tracker_doctor.go b/tracker_doctor.go index 42ab4cd..058db31 100644 --- a/tracker_doctor.go +++ b/tracker_doctor.go @@ -5,6 +5,7 @@ package tracker import ( "context" "fmt" + "io" "os" "os/exec" "path/filepath" @@ -37,11 +38,33 @@ type DoctorConfig struct { // PipelineFile, when non-empty, adds a "Pipeline File" check that parses // and validates the given .dip / .dot file. 
PipelineFile string - // TrackerVersion and TrackerCommit are surfaced in the "Version - // Compatibility" check. They are populated by the CLI from build-time - // ldflags; library callers may leave them empty. - TrackerVersion string - TrackerCommit string + // LogWriter receives non-fatal parse warnings (bad status.json, + // unreadable activity.jsonl, etc.). Nil is treated as io.Discard so + // embedded library callers do not see warnings on os.Stderr. The + // tracker CLI sets this to io.Discard for user-facing commands, + // matching DiagnoseConfig.LogWriter and AuditConfig.LogWriter. + LogWriter io.Writer + // versionInfo is populated by WithVersionInfo. Unexported so callers + // use the functional option rather than setting CLI-specific fields. + versionInfo versionInfo +} + +// versionInfo carries CLI-provided build metadata into a Doctor run. +type versionInfo struct { + version string + commit string +} + +// DoctorOption configures a Doctor run via a functional option. +type DoctorOption func(*DoctorConfig) + +// WithVersionInfo attaches a tracker version and commit hash for display in +// the "Version Compatibility" check. CLI callers populate these from +// build-time ldflags; library callers typically do not need this. +func WithVersionInfo(version, commit string) DoctorOption { + return func(c *DoctorConfig) { + c.versionInfo = versionInfo{version: version, commit: commit} + } } // DoctorReport is the structured result of a Doctor() call. @@ -52,10 +75,25 @@ type DoctorReport struct { Errors int `json:"errors"` } +// CheckStatus is the status of a CheckResult or CheckDetail. Enum-like +// typed string so consumers can switch-exhaust. "hint" is only valid on +// CheckDetail.Status (informational sub-items such as optional providers +// not configured). +type CheckStatus string + +// CheckStatus values. 
+const ( + CheckStatusOK CheckStatus = "ok" + CheckStatusWarn CheckStatus = "warn" + CheckStatusError CheckStatus = "error" + CheckStatusSkip CheckStatus = "skip" + CheckStatusHint CheckStatus = "hint" +) + // CheckResult is one section of a DoctorReport. type CheckResult struct { Name string `json:"name"` - Status string `json:"status"` // "ok" | "warn" | "error" | "skip" + Status CheckStatus `json:"status"` // "ok" | "warn" | "error" | "skip" Message string `json:"message,omitempty"` Hint string `json:"hint,omitempty"` Details []CheckDetail `json:"details,omitempty"` @@ -64,9 +102,9 @@ type CheckResult struct { // CheckDetail is one sub-line within a CheckResult — used for per-item // status lines (per-provider, per-binary, per-subdirectory). type CheckDetail struct { - Status string `json:"status"` // "ok" | "warn" | "error" | "hint" — "hint" is used for informational sub-items (e.g. optional providers not configured) - Message string `json:"message"` - Hint string `json:"hint,omitempty"` + Status CheckStatus `json:"status"` // "ok" | "warn" | "error" | "hint" + Message string `json:"message"` + Hint string `json:"hint,omitempty"` } // Doctor runs a suite of preflight checks and returns a structured report. @@ -78,10 +116,20 @@ type CheckDetail struct { // flag; library callers should leave it false unless they specifically // want live credential verification. // +// Provider probes and binary version lookups honor ctx: cancelling the +// context aborts in-flight checks. A nil context is treated as +// context.Background(). +// // Write side effects (gitignore fix-up, workdir creation prompts) are NOT // performed by Doctor — callers inspect the report and apply any fixes // themselves. 
-func Doctor(cfg DoctorConfig) (*DoctorReport, error) { +func Doctor(ctx context.Context, cfg DoctorConfig, opts ...DoctorOption) (*DoctorReport, error) { + if ctx == nil { + ctx = context.Background() + } + for _, opt := range opts { + opt(&cfg) + } if cfg.WorkDir == "" { wd, err := os.Getwd() if err != nil { @@ -92,25 +140,25 @@ func Doctor(cfg DoctorConfig) (*DoctorReport, error) { r := &DoctorReport{} r.Checks = append(r.Checks, - checkEnvWarningsLib(), - checkProvidersLib(cfg.ProbeProviders), - checkDippinLib(), - checkVersionCompatLib(cfg.TrackerVersion, cfg.TrackerCommit), - checkOtherBinariesLib(cfg.Backend), - checkWorkdirLib(cfg.WorkDir), - checkArtifactDirsLib(cfg.WorkDir), - checkDiskSpaceLib(cfg.WorkDir), + checkEnvWarnings(), + checkProviders(ctx, cfg.ProbeProviders), + checkDippin(ctx), + checkVersionCompat(ctx, cfg.versionInfo.version, cfg.versionInfo.commit), + checkOtherBinaries(ctx, cfg.Backend), + checkWorkdir(cfg.WorkDir), + checkArtifactDirs(cfg.WorkDir), + checkDiskSpace(cfg.WorkDir), ) if cfg.PipelineFile != "" { - r.Checks = append(r.Checks, checkPipelineFileLib(cfg.PipelineFile)) + r.Checks = append(r.Checks, checkPipelineFile(cfg.PipelineFile)) } r.OK = true for _, c := range r.Checks { switch c.Status { - case "warn": + case CheckStatusWarn: r.Warnings++ - case "error": + case CheckStatusError: r.Errors++ r.OK = false } @@ -118,8 +166,8 @@ func Doctor(cfg DoctorConfig) (*DoctorReport, error) { return r, nil } -// checkEnvWarningsLib warns when opt-in security overrides are active. -func checkEnvWarningsLib() CheckResult { +// checkEnvWarnings warns when opt-in security overrides are active. 
+func checkEnvWarnings() CheckResult { dangerousVars := map[string]string{ "TRACKER_PASS_ENV": "passes all env vars to tool subprocesses (security risk)", "TRACKER_PASS_API_KEYS": "passes API keys to tool subprocesses (security risk)", @@ -131,11 +179,11 @@ func checkEnvWarningsLib() CheckResult { } } if len(found) == 0 { - return CheckResult{Name: "Environment Warnings", Status: "ok", Message: "no dangerous environment variables detected"} + return CheckResult{Name: "Environment Warnings", Status: CheckStatusOK, Message: "no dangerous environment variables detected"} } return CheckResult{ Name: "Environment Warnings", - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("dangerous variables set: %s", strings.Join(found, "; ")), Hint: "unset TRACKER_PASS_ENV and TRACKER_PASS_API_KEYS to restore default security posture", } @@ -199,9 +247,9 @@ var knownProviders = []providerDef{ }, } -// checkProvidersLib reports on each configured LLM provider. When probe +// checkProviders reports on each configured LLM provider. When probe // is true, a 1-token API call verifies auth for each configured provider. 
-func checkProvidersLib(probe bool) CheckResult { +func checkProviders(ctx context.Context, probe bool) CheckResult { out := CheckResult{Name: "LLM Providers"} var configuredNames []string var missingNames []string @@ -216,7 +264,7 @@ func checkProvidersLib(probe bool) CheckResult { masked := maskKey(key) if !isValidAPIKey(p.name, key) { out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("%-15s %s=%s (invalid format)", p.name, envName, masked), Hint: fmt.Sprintf("%s keys should match expected format — run `tracker setup`", p.name), }) @@ -224,10 +272,10 @@ func checkProvidersLib(probe bool) CheckResult { continue } if probe && p.buildAdapter != nil { - authOk, authMsg := probeProvider(p, key) + authOk, authMsg := probeProvider(ctx, p, key) if !authOk { out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("%-15s %s=%s (auth failed: %s)", p.name, envName, masked, authMsg), Hint: fmt.Sprintf("your %s key is invalid or expired — export a fresh key or run `tracker setup`", p.name), }) @@ -235,12 +283,12 @@ func checkProvidersLib(probe bool) CheckResult { continue } out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%-15s %s=%s (auth verified)", p.name, envName, masked), }) } else { out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%-15s %s=%s", p.name, envName, masked), }) } @@ -252,14 +300,14 @@ func checkProvidersLib(probe bool) CheckResult { for _, pd := range knownProviders { if pd.name == name { out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("%-15s %s not set", pd.name, pd.envVars[0]), }) break } } } - out.Status = "error" + out.Status = CheckStatusError out.Message = "no LLM providers configured" out.Hint = "run `tracker setup` or export ANTHROPIC_API_KEY / 
OPENAI_API_KEY / GEMINI_API_KEY" return out @@ -269,15 +317,15 @@ func checkProvidersLib(probe bool) CheckResult { // "not configured" is informational when at least one provider works — // rendered as a hint line, not an error or warning, so Status=hint. out.Details = append(out.Details, CheckDetail{ - Status: "hint", + Status: CheckStatusHint, Message: fmt.Sprintf("not configured: %s (optional)", strings.Join(missingNames, ", ")), }) } if hasProviderErrors { - out.Status = "warn" + out.Status = CheckStatusWarn } else { - out.Status = "ok" + out.Status = CheckStatusOK } if probe { out.Message = fmt.Sprintf("%d provider(s) configured and auth verified", len(configuredNames)) @@ -296,7 +344,7 @@ func findProviderKey(envVars []string) (key, envName string) { return "", "" } -func probeProvider(p providerDef, key string) (bool, string) { +func probeProvider(ctx context.Context, p providerDef, key string) (bool, string) { adapter, err := p.buildAdapter(key) if err != nil { return false, fmt.Sprintf("build adapter: %v", err) } @@ -306,7 +354,7 @@ func probeProvider(p providerDef, key string) (bool, string) { return false, fmt.Sprintf("create client: %v", err) } defer client.Close() - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + probeCtx, cancel := context.WithTimeout(ctx, 15*time.Second) defer cancel() maxTok := 1 req := &llm.Request{ @@ -314,17 +362,37 @@ func probeProvider(p providerDef, key string) (bool, string) { Messages: []llm.Message{llm.UserMessage("ping")}, MaxTokens: &maxTok, } - _, err = client.Complete(ctx, req) + _, err = client.Complete(probeCtx, req) if err != nil { msg := err.Error() if isAuthError(msg) { return false, "invalid or expired API key" } - return false, trimErrMsg(msg, 80) + return false, trimErrMsg(sanitizeProviderError(msg), 80) } return true, "" } +// sanitizeProviderError strips API keys and bearer tokens from provider error +// text so they never land in CheckDetail.Message (which library consumers may +// 
log or forward to webhooks). +var ( + apiKeyPatterns = []*regexp.Regexp{ + regexp.MustCompile(`sk-ant-[A-Za-z0-9_\-]{6,}`), + regexp.MustCompile(`sk-[A-Za-z0-9_\-]{10,}`), + regexp.MustCompile(`AIza[0-9A-Za-z_\-]{20,}`), + } + bearerPattern = regexp.MustCompile(`(?i)bearer\s+[A-Za-z0-9._\-]{6,}`) +) + +func sanitizeProviderError(msg string) string { + for _, re := range apiKeyPatterns { + msg = re.ReplaceAllString(msg, "[redacted-key]") + } + msg = bearerPattern.ReplaceAllString(msg, "Bearer [redacted]") + return msg +} + func isAuthError(msg string) bool { lower := strings.ToLower(msg) for _, kw := range []string{"401", "403", "unauthorized", "authentication", "invalid api key", "api key", "forbidden"} { @@ -342,38 +410,38 @@ func trimErrMsg(msg string, maxLen int) string { return msg[:maxLen] + "..." } -// checkDippinLib verifies the dippin binary is installed. The full "dippin +// checkDippin verifies the dippin binary is installed. The full "dippin // at " string goes into the details so the CLI can print a // per-item line; the composite summary carries the shorter "dippin " // form. Historically the CLI emits both lines. 
-func checkDippinLib() CheckResult { +func checkDippin(ctx context.Context) CheckResult { path, err := exec.LookPath("dippin") if err != nil { return CheckResult{ Name: "Dippin Language", - Status: "error", + Status: CheckStatusError, Message: "dippin binary not found in PATH", Hint: "install from https://github.com/2389-research/dippin-lang (required for pipeline linting)", } } - ver := getDippinVersion(path) + ver := getDippinVersion(ctx, path) return CheckResult{ Name: "Dippin Language", - Status: "ok", + Status: CheckStatusOK, Details: []CheckDetail{{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("dippin %s at %s", ver, path), }}, Message: fmt.Sprintf("dippin %s", ver), } } -func getDippinVersion(path string) string { - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) +func getDippinVersion(ctx context.Context, path string) string { + probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() - out, err := exec.CommandContext(ctx, path, "--version").CombinedOutput() + out, err := exec.CommandContext(probeCtx, path, "--version").CombinedOutput() if err != nil { - out, err = exec.CommandContext(ctx, path, "version").CombinedOutput() + out, err = exec.CommandContext(probeCtx, path, "version").CombinedOutput() if err != nil { return "(version unknown)" } @@ -387,25 +455,25 @@ func getDippinVersion(path string) string { return ver } -// checkVersionCompatLib verifies the installed dippin version matches the +// checkVersionCompat verifies the installed dippin version matches the // go.mod pin (on major and minor). trackerVersion / trackerCommit, when // non-empty, are surfaced as a detail line. 
-func checkVersionCompatLib(trackerVersion, trackerCommit string) CheckResult { +func checkVersionCompat(ctx context.Context, trackerVersion, trackerCommit string) CheckResult { out := CheckResult{Name: "Version Compatibility"} if trackerVersion != "" { msg := fmt.Sprintf("tracker %s", trackerVersion) if trackerCommit != "" { msg = fmt.Sprintf("tracker %s (commit %s)", trackerVersion, trackerCommit) } - out.Details = append(out.Details, CheckDetail{Status: "ok", Message: msg}) + out.Details = append(out.Details, CheckDetail{Status: CheckStatusOK, Message: msg}) } dippinPath, err := exec.LookPath("dippin") if err != nil { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: "dippin not found — skipping version compatibility check", }) - out.Status = "warn" + out.Status = CheckStatusWarn if trackerVersion != "" { out.Message = fmt.Sprintf("tracker %s / dippin not found", trackerVersion) } else { @@ -413,18 +481,18 @@ func checkVersionCompatLib(trackerVersion, trackerCommit string) CheckResult { } return out } - cliVer := getDippinVersion(dippinPath) + cliVer := getDippinVersion(ctx, dippinPath) out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("dippin %s (installed) / %s (go.mod pin)", cliVer, PinnedDippinVersion), }) if mismatch, msg := checkDippinVersionMismatch(cliVer, PinnedDippinVersion); mismatch { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("dippin version mismatch: %s", msg), }) - out.Status = "warn" + out.Status = CheckStatusWarn if trackerVersion != "" { out.Message = fmt.Sprintf("tracker %s / dippin %s (mismatched — expected %s)", trackerVersion, cliVer, PinnedDippinVersion) } else { @@ -433,7 +501,7 @@ func checkVersionCompatLib(trackerVersion, trackerCommit string) CheckResult { out.Hint = fmt.Sprintf("install dippin %s to match the go.mod pin", PinnedDippinVersion) return out } - 
out.Status = "ok" + out.Status = CheckStatusOK if trackerVersion != "" { out.Message = fmt.Sprintf("tracker %s / dippin %s", trackerVersion, cliVer) } else { @@ -471,63 +539,65 @@ func parseVersionMajorMinor(ver string) (major, minor int, ok bool) { return major, minor, true } -// checkOtherBinariesLib checks for git (recommended) and claude (required +// checkOtherBinaries checks for git (recommended) and claude (required // when backend == "claude-code", optional otherwise). -func checkOtherBinariesLib(backend string) CheckResult { +func checkOtherBinaries(ctx context.Context, backend string) CheckResult { out := CheckResult{Name: "Optional Binaries"} hasErr := false hasWarn := false if _, err := exec.LookPath("git"); err == nil { out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: "git found (recommended for pipeline versioning)", }) } else { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: "git not found in PATH (recommended for pipeline versioning)", }) hasWarn = true } claudePath, claudeErr := exec.LookPath("claude") if claudeErr == nil { - claudeVer := getBinaryVersion(claudePath, "--version") + claudeVer := getBinaryVersion(ctx, claudePath, "--version") out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("claude %s (for --backend claude-code)", claudeVer), }) } else if backend == "claude-code" { out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: "claude CLI not found in PATH (required for --backend claude-code)", Hint: "install the Claude CLI from https://claude.ai/code", }) hasErr = true } else { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: "claude not found in PATH (install for --backend claude-code support)", }) hasWarn = true } switch { case hasErr: - out.Status = "error" + out.Status = 
CheckStatusError out.Message = "required binary missing for selected backend" out.Hint = "install the Claude CLI from https://claude.ai/code" case hasWarn: - out.Status = "warn" + out.Status = CheckStatusWarn out.Message = "some optional binaries missing" out.Hint = "install git and/or the Claude CLI to unlock all tracker features" default: - out.Status = "ok" + out.Status = CheckStatusOK out.Message = "optional binaries available" } return out } -func getBinaryVersion(path, flag string) string { - out, err := exec.Command(path, flag).CombinedOutput() +func getBinaryVersion(ctx context.Context, path, flag string) string { + probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + out, err := exec.CommandContext(probeCtx, path, flag).CombinedOutput() if err != nil { return "(version unknown)" } @@ -538,27 +608,27 @@ func getBinaryVersion(path, flag string) string { return strings.TrimSpace(lines[0]) } -// checkWorkdirLib verifies the working directory exists and is writable. +// checkWorkdir verifies the working directory exists and is writable. // It also detects missing .gitignore entries but does NOT modify the file — // the CLI applies any fix-up separately. 
-func checkWorkdirLib(workdir string) CheckResult { +func checkWorkdir(workdir string) CheckResult { out := CheckResult{Name: "Working Directory"} info, err := os.Stat(workdir) if err != nil { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s does not exist", workdir) out.Hint = fmt.Sprintf("create the directory: mkdir -p %s", workdir) return out } if !info.IsDir() { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s is not a directory", workdir) out.Hint = "point --workdir at a directory, not a file" return out } f, err := os.CreateTemp(workdir, ".tracker_probe_*") if err != nil { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s is not writable", workdir) out.Hint = fmt.Sprintf("check permissions: chmod u+w %s", workdir) return out @@ -570,7 +640,7 @@ func checkWorkdirLib(workdir string) CheckResult { home, _ := os.UserHomeDir() if workdir == home || workdir == "/" { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("%s (risk of accidental data loss — use a project subdirectory)", workdir), }) hasWarn = true @@ -579,21 +649,21 @@ func checkWorkdirLib(workdir string) CheckResult { // Detect missing .gitignore entries without modifying the file. 
if missing := missingGitignoreEntries(workdir); missing != "" { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: missing, }) hasWarn = true } out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%s (writable)", workdir), }) if hasWarn { - out.Status = "warn" + out.Status = CheckStatusWarn out.Message = fmt.Sprintf("%s is writable (with warnings)", workdir) } else { - out.Status = "ok" + out.Status = CheckStatusOK out.Message = fmt.Sprintf("%s is writable", workdir) } return out @@ -636,9 +706,9 @@ func missingGitignoreEntries(workdir string) string { return "" } -// checkArtifactDirsLib verifies the .ai artifact directory is usable +// checkArtifactDirs verifies the .ai artifact directory is usable // (either exists and is writable, or can be created). -func checkArtifactDirsLib(workdir string) CheckResult { +func checkArtifactDirs(workdir string) CheckResult { out := CheckResult{Name: "Artifact Directories"} allOk := true aiDir := filepath.Join(workdir, ".ai") @@ -646,47 +716,47 @@ func checkArtifactDirsLib(workdir string) CheckResult { switch { case !info.IsDir(): out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: ".ai is not a directory", }) allOk = false case !isDirWritable(aiDir): out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("%s exists but is not writable", aiDir), Hint: fmt.Sprintf("check permissions: chmod u+w %s", aiDir), }) allOk = false default: out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%s exists and is writable", aiDir), }) } } else { if isDirWritable(workdir) { out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%s will be created on first run", aiDir), }) } else { out.Details = append(out.Details, 
CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("%s cannot be created (parent not writable)", aiDir), }) allOk = false } } if allOk { - out.Status = "ok" + out.Status = CheckStatusOK out.Message = "artifact directories writable" return out } // Promote to "error" if any detail is an error (not just a warning). - out.Status = "warn" + out.Status = CheckStatusWarn for _, d := range out.Details { - if d.Status == "error" { - out.Status = "error" + if d.Status == CheckStatusError { + out.Status = CheckStatusError break } } @@ -705,15 +775,15 @@ func isDirWritable(dir string) bool { return true } -// checkDiskSpaceLib warns when available disk space under workdir is low. +// checkDiskSpace warns when available disk space under workdir is low. // The implementation is platform-specific; see tracker_doctor_unix.go and // tracker_doctor_windows.go. -// checkPipelineFileLib parses and validates a pipeline file. -func checkPipelineFileLib(pipelineFile string) CheckResult { +// checkPipelineFile parses and validates a pipeline file. 
+func checkPipelineFile(pipelineFile string) CheckResult { out := CheckResult{Name: "Pipeline File"} if _, err := os.Stat(pipelineFile); err != nil { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s does not exist", pipelineFile) out.Hint = fmt.Sprintf("check the file path: %s", pipelineFile) return out @@ -721,21 +791,21 @@ func checkPipelineFileLib(pipelineFile string) CheckResult { hasWarn := false if !strings.HasSuffix(pipelineFile, ".dip") && !strings.HasSuffix(pipelineFile, ".dot") { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("%s is not a .dip or .dot file — may not be a valid pipeline", pipelineFile), }) hasWarn = true } fileBytes, err := os.ReadFile(pipelineFile) if err != nil { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s: read error: %v", pipelineFile, err) out.Hint = "check file permissions" return out } graph, err := parsePipelineSource(string(fileBytes), detectSourceFormat(string(fileBytes))) if err != nil { - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s: parse error: %v", pipelineFile, err) out.Hint = "run `tracker validate " + pipelineFile + "` for full details" return out @@ -745,17 +815,17 @@ func checkPipelineFileLib(pipelineFile string) CheckResult { if ve != nil && len(ve.Errors) > 0 { for _, e := range ve.Errors { out.Details = append(out.Details, CheckDetail{ - Status: "error", + Status: CheckStatusError, Message: fmt.Sprintf("error: %s", e), }) } for _, w := range ve.Warnings { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: w, }) } - out.Status = "error" + out.Status = CheckStatusError out.Message = fmt.Sprintf("%s failed validation (%d error(s))", pipelineFile, len(ve.Errors)) out.Hint = "run `tracker validate " + pipelineFile + "` for full details" return out @@ -763,28 +833,28 @@ func 
checkPipelineFileLib(pipelineFile string) CheckResult { if ve != nil && len(ve.Warnings) > 0 { for _, w := range ve.Warnings { out.Details = append(out.Details, CheckDetail{ - Status: "warn", + Status: CheckStatusWarn, Message: w, }) } out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%s valid (%d nodes, %d edges, %d warning(s))", pipelineFile, len(graph.Nodes), len(graph.Edges), len(ve.Warnings)), }) - out.Status = "warn" + out.Status = CheckStatusWarn out.Message = fmt.Sprintf("%s valid with %d warning(s)", pipelineFile, len(ve.Warnings)) return out } out.Details = append(out.Details, CheckDetail{ - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%s valid (%d nodes, %d edges)", pipelineFile, len(graph.Nodes), len(graph.Edges)), }) if hasWarn { - out.Status = "warn" + out.Status = CheckStatusWarn out.Message = fmt.Sprintf("%s is valid but has warnings", pipelineFile) } else { - out.Status = "ok" + out.Status = CheckStatusOK out.Message = fmt.Sprintf("%s is valid", pipelineFile) } return out diff --git a/tracker_doctor_test.go b/tracker_doctor_test.go index 71d1569..f2625ec 100644 --- a/tracker_doctor_test.go +++ b/tracker_doctor_test.go @@ -3,6 +3,7 @@ package tracker import ( + "context" "os" "path/filepath" "testing" @@ -12,7 +13,7 @@ func TestDoctor_NoProbe_KeyPresent(t *testing.T) { workdir := t.TempDir() t.Setenv("ANTHROPIC_API_KEY", "sk-ant-test-12345678901234567890") - r, err := Doctor(DoctorConfig{WorkDir: workdir, ProbeProviders: false}) + r, err := Doctor(context.Background(), DoctorConfig{WorkDir: workdir, ProbeProviders: false}) if err != nil { t.Fatalf("Doctor: %v", err) } @@ -36,7 +37,7 @@ func TestDoctor_NoProviderKeys(t *testing.T) { t.Setenv(k, "") } - r, err := Doctor(DoctorConfig{WorkDir: workdir, ProbeProviders: false}) + r, err := Doctor(context.Background(), DoctorConfig{WorkDir: workdir, ProbeProviders: false}) if err != nil { t.Fatalf("Doctor: %v", err) } @@ -62,7 
+63,7 @@ func TestDoctor_PipelineFileValidation(t *testing.T) { ` must(t, os.WriteFile(pf, []byte(src), 0o644)) - r, err := Doctor(DoctorConfig{WorkDir: workdir, PipelineFile: pf, ProbeProviders: false}) + r, err := Doctor(context.Background(), DoctorConfig{WorkDir: workdir, PipelineFile: pf, ProbeProviders: false}) if err != nil { t.Fatalf("Doctor: %v", err) } @@ -76,3 +77,44 @@ func TestDoctor_PipelineFileValidation(t *testing.T) { t.Fatal("Pipeline File check missing when PipelineFile set") } } + +func TestSanitizeProviderError(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + { + name: "anthropic key", + in: "auth failed: sk-ant-api03-abcdef1234567890abcdef", + want: "auth failed: [redacted-key]", + }, + { + name: "openai key", + in: "invalid key sk-abcdef1234567890abcdef", + want: "invalid key [redacted-key]", + }, + { + name: "google key", + in: "request failed AIzaSyAbcDef1234567890abcdef_01", + want: "request failed [redacted-key]", + }, + { + name: "bearer token", + in: "401 Unauthorized: Bearer abc.def.ghi12345", + want: "401 Unauthorized: Bearer [redacted]", + }, + { + name: "plain message", + in: "connection refused", + want: "connection refused", + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := sanitizeProviderError(c.in); got != c.want { + t.Errorf("sanitizeProviderError(%q) = %q, want %q", c.in, got, c.want) + } + }) + } +} diff --git a/tracker_doctor_unix.go b/tracker_doctor_unix.go index 9b546dd..155f651 100644 --- a/tracker_doctor_unix.go +++ b/tracker_doctor_unix.go @@ -8,12 +8,12 @@ import ( "syscall" ) -func checkDiskSpaceLib(workdir string) CheckResult { +func checkDiskSpace(workdir string) CheckResult { var stat syscall.Statfs_t if err := syscall.Statfs(workdir, &stat); err != nil { return CheckResult{ Name: "Disk Space", - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("could not determine disk space: %v", err), } } @@ -23,14 +23,14 @@ func 
checkDiskSpaceLib(workdir string) CheckResult { if availableGB < minGB { return CheckResult{ Name: "Disk Space", - Status: "warn", + Status: CheckStatusWarn, Message: fmt.Sprintf("low disk space: %.2f GB available (recommended: %.1f GB+)", availableGB, minGB), Hint: "free up disk space before running long pipelines", } } return CheckResult{ Name: "Disk Space", - Status: "ok", + Status: CheckStatusOK, Message: fmt.Sprintf("%.2f GB available", availableGB), } } diff --git a/tracker_doctor_windows.go b/tracker_doctor_windows.go index 9003275..da14707 100644 --- a/tracker_doctor_windows.go +++ b/tracker_doctor_windows.go @@ -3,10 +3,10 @@ package tracker -func checkDiskSpaceLib(_ string) CheckResult { +func checkDiskSpace(_ string) CheckResult { return CheckResult{ Name: "Disk Space", - Status: "ok", + Status: CheckStatusOK, Message: "disk space check not available on Windows", } } diff --git a/tracker_events.go b/tracker_events.go index af57df0..b5b42c6 100644 --- a/tracker_events.go +++ b/tracker_events.go @@ -17,9 +17,9 @@ import ( const ndjsonTimestampLayout = "2006-01-02T15:04:05.000Z07:00" -// NDJSONEvent is the stable wire format for the tracker --json mode. Field +// StreamEvent is the stable wire format for the tracker --json mode. Field // tags are stable; new optional fields may be added without a major bump. -type NDJSONEvent struct { +type StreamEvent struct { Timestamp string `json:"ts"` Source string `json:"source"` // "pipeline", "llm", "agent" Type string `json:"type"` // event type within source @@ -33,9 +33,16 @@ type NDJSONEvent struct { Content string `json:"content,omitempty"` // text content (LLM output, tool output) } -// NDJSONWriter is a thread-safe writer that serializes NDJSONEvents line by +// NDJSONWriter is a thread-safe writer that serializes StreamEvents line by // line onto an io.Writer. Library consumers use it to produce the same // stream as the tracker CLI's --json mode. 
+// +// Backpressure note: Write holds an internal mutex for the duration of the +// underlying io.Writer.Write call. When three handler sources (pipeline, +// agent, LLM trace) share one writer, a slow backing writer serializes +// handler callbacks across those sources. If the backing writer can block +// (network socket, pipe), wrap it in a bufio.Writer or a channel-backed +// forwarder to decouple producers from the slow sink. type NDJSONWriter struct { mu sync.Mutex w io.Writer @@ -47,12 +54,15 @@ func NewNDJSONWriter(w io.Writer) *NDJSONWriter { return &NDJSONWriter{w: w} } -// Write serializes evt as a JSON line. Safe to call from multiple goroutines. -func (s *NDJSONWriter) Write(evt NDJSONEvent) { +// Write serializes evt as a JSON line. Safe to call from multiple +// goroutines. Returns a non-nil error if marshalling or writing to the +// underlying io.Writer fails; the first write error is also logged to +// os.Stderr once so long-running callers that ignore the error still +// surface it. +func (s *NDJSONWriter) Write(evt StreamEvent) error { data, err := json.Marshal(evt) if err != nil { - fmt.Fprintf(os.Stderr, "tracker: marshal NDJSON event: %v\n", err) - return + return fmt.Errorf("marshal NDJSON event: %w", err) } data = append(data, '\n') s.mu.Lock() @@ -61,14 +71,19 @@ func (s *NDJSONWriter) Write(evt NDJSONEvent) { s.errOnce.Do(func() { fmt.Fprintf(os.Stderr, "tracker: NDJSON stream write error: %v (further write errors suppressed)\n", werr) }) + return werr } + return nil } // PipelineHandler returns a pipeline.PipelineEventHandler that writes events -// to this stream. +// to this stream. Panics in the underlying writer are recovered and logged +// to os.Stderr once so a misbehaving sink cannot crash the pipeline +// goroutine. 
func (s *NDJSONWriter) PipelineHandler() pipeline.PipelineEventHandler { return pipeline.PipelineEventHandlerFunc(func(evt pipeline.PipelineEvent) { - entry := NDJSONEvent{ + defer recoverNDJSONPanic("pipeline") + entry := StreamEvent{ Timestamp: evt.Timestamp.Format(ndjsonTimestampLayout), Source: "pipeline", Type: string(evt.Type), @@ -79,14 +94,17 @@ func (s *NDJSONWriter) PipelineHandler() pipeline.PipelineEventHandler { if evt.Err != nil { entry.Error = evt.Err.Error() } - s.Write(entry) + _ = s.Write(entry) }) } -// TraceObserver returns an llm.TraceObserver that writes trace events to this stream. +// TraceObserver returns an llm.TraceObserver that writes trace events to +// this stream. Panics in the underlying writer are recovered (see +// PipelineHandler). func (s *NDJSONWriter) TraceObserver() llm.TraceObserver { return llm.TraceObserverFunc(func(evt llm.TraceEvent) { - s.Write(NDJSONEvent{ + defer recoverNDJSONPanic("llm") + _ = s.Write(StreamEvent{ Timestamp: time.Now().Format(ndjsonTimestampLayout), Source: "llm", Type: string(evt.Kind), @@ -98,9 +116,12 @@ func (s *NDJSONWriter) TraceObserver() llm.TraceObserver { }) } -// AgentHandler returns an agent.EventHandler that writes agent events to this stream. +// AgentHandler returns an agent.EventHandler that writes agent events to +// this stream. Panics in the underlying writer are recovered (see +// PipelineHandler). 
func (s *NDJSONWriter) AgentHandler() agent.EventHandler { return agent.EventHandlerFunc(func(evt agent.Event) { + defer recoverNDJSONPanic("agent") content := evt.ToolOutput if content == "" { content = evt.Text @@ -109,7 +130,7 @@ func (s *NDJSONWriter) AgentHandler() agent.EventHandler { if ts.IsZero() { ts = time.Now() } - entry := NDJSONEvent{ + entry := StreamEvent{ Timestamp: ts.Format(ndjsonTimestampLayout), Source: "agent", Type: string(evt.Type), @@ -119,12 +140,22 @@ func (s *NDJSONWriter) AgentHandler() agent.EventHandler { ToolName: evt.ToolName, Content: content, } - entry.Error = buildNDJSONEntryError(evt) - s.Write(entry) + entry.Error = buildStreamEntryError(evt) + _ = s.Write(entry) }) } -func buildNDJSONEntryError(evt agent.Event) string { +var ndjsonPanicOnce sync.Once + +func recoverNDJSONPanic(source string) { + if r := recover(); r != nil { + ndjsonPanicOnce.Do(func() { + fmt.Fprintf(os.Stderr, "tracker: NDJSON %s handler recovered from panic: %v (further panics suppressed)\n", source, r) + }) + } +} + +func buildStreamEntryError(evt agent.Event) string { if evt.ToolError == "" && evt.Err == nil { return "" } diff --git a/tracker_events_test.go b/tracker_events_test.go index 113a12d..a7a2adb 100644 --- a/tracker_events_test.go +++ b/tracker_events_test.go @@ -1,4 +1,4 @@ -// ABOUTME: Tests for the public NDJSONWriter and NDJSONEvent types in the tracker package. +// ABOUTME: Tests for the public NDJSONWriter and StreamEvent types in the tracker package. // ABOUTME: Covers write, stable JSON tags, concurrency, and handler factory methods. 
package tracker @@ -19,10 +19,10 @@ import ( func TestNDJSONWriter_Write(t *testing.T) { var buf bytes.Buffer w := NewNDJSONWriter(&buf) - w.Write(NDJSONEvent{Timestamp: "2026-04-17T10:00:00Z", Source: "pipeline", Type: "stage_started", NodeID: "N1"}) + w.Write(StreamEvent{Timestamp: "2026-04-17T10:00:00Z", Source: "pipeline", Type: "stage_started", NodeID: "N1"}) line := strings.TrimSuffix(buf.String(), "\n") - var got NDJSONEvent + var got StreamEvent if err := json.Unmarshal([]byte(line), &got); err != nil { t.Fatalf("unmarshal: %v", err) } @@ -34,7 +34,7 @@ func TestNDJSONWriter_Write(t *testing.T) { func TestNDJSONWriter_StableJSONTags(t *testing.T) { var buf bytes.Buffer w := NewNDJSONWriter(&buf) - w.Write(NDJSONEvent{ + w.Write(StreamEvent{ Timestamp: "t", Source: "agent", Type: "tool_call", RunID: "r1", NodeID: "n1", Message: "m", Error: "e", Provider: "p", Model: "mo", ToolName: "tn", Content: "c", @@ -60,7 +60,7 @@ func TestNDJSONWriter_ConcurrentWrites(t *testing.T) { for i := 0; i < n; i++ { go func() { defer wg.Done() - w.Write(NDJSONEvent{Timestamp: "t", Source: "pipeline", Type: "x"}) + w.Write(StreamEvent{Timestamp: "t", Source: "pipeline", Type: "x"}) }() } wg.Wait() @@ -70,7 +70,7 @@ func TestNDJSONWriter_ConcurrentWrites(t *testing.T) { t.Fatalf("got %d lines, want %d", len(lines), n) } for i, l := range lines { - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(l), &evt); err != nil { t.Fatalf("line %d: unmarshal: %v; got %q", i, err, l) } @@ -80,7 +80,7 @@ func TestNDJSONWriter_ConcurrentWrites(t *testing.T) { func TestNDJSONWriter_OmitsEmptyFields(t *testing.T) { var buf bytes.Buffer w := NewNDJSONWriter(&buf) - w.Write(NDJSONEvent{ + w.Write(StreamEvent{ Timestamp: "2026-04-17T10:00:00Z", Source: "llm", Type: "request_start", @@ -108,7 +108,7 @@ func TestNDJSONWriter_PipelineHandler(t *testing.T) { Message: "started", }) - var evt NDJSONEvent + var evt StreamEvent if err := 
json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -142,7 +142,7 @@ func TestNDJSONWriter_PipelineHandlerWithError(t *testing.T) { Err: errors.New("context cancelled"), }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -163,7 +163,7 @@ func TestNDJSONWriter_TraceObserver(t *testing.T) { Preview: "hello world", }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -193,7 +193,7 @@ func TestNDJSONWriter_TraceObserverWithToolName(t *testing.T) { ToolName: "execute_command", }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -215,7 +215,7 @@ func TestNDJSONWriter_AgentHandler(t *testing.T) { ToolOutput: "file contents here", }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -243,7 +243,7 @@ func TestNDJSONWriter_AgentHandlerFallsBackToText(t *testing.T) { Text: "thinking about the problem", }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -263,7 +263,7 @@ func TestNDJSONWriter_AgentHandlerWithToolError(t *testing.T) { ToolError: "command timed out", }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -284,7 +284,7 @@ func TestNDJSONWriter_AgentHandlerCombinesErrors(t *testing.T) { Err: errors.New("process killed"), }) - var evt NDJSONEvent + var evt StreamEvent if err := 
json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -300,7 +300,7 @@ func TestNDJSONWriter_AgentHandler_ContentPriority(t *testing.T) { h := w.AgentHandler() h.HandleEvent(agent.Event{Type: agent.EventToolCallEnd, ToolOutput: "tool-out", Text: "text-out"}) - var got NDJSONEvent + var got StreamEvent if err := json.Unmarshal(bytes.TrimRight(buf.Bytes(), "\n"), &got); err != nil { t.Fatalf("unmarshal: %v", err) } @@ -320,7 +320,7 @@ func TestNDJSONWriter_AgentHandlerErrorOnlyFromErr(t *testing.T) { Err: errors.New("session failed"), }) - var evt NDJSONEvent + var evt StreamEvent if err := json.Unmarshal([]byte(strings.TrimSpace(buf.String())), &evt); err != nil { t.Fatalf("invalid JSON: %v", err) } @@ -328,3 +328,59 @@ func TestNDJSONWriter_AgentHandlerErrorOnlyFromErr(t *testing.T) { t.Errorf("error = %q, want 'session failed'", evt.Error) } } + +// errWriter always returns an error on Write, used to exercise NDJSONWriter's +// error-propagation path. +type errWriter struct{ err error } + +func (e *errWriter) Write(_ []byte) (int, error) { return 0, e.err } + +func TestNDJSONWriter_WriteReturnsError(t *testing.T) { + w := NewNDJSONWriter(&errWriter{err: errors.New("sink closed")}) + err := w.Write(StreamEvent{Timestamp: "t", Source: "pipeline", Type: "stage_started"}) + if err == nil { + t.Fatal("expected non-nil error when backing writer fails") + } + if !strings.Contains(err.Error(), "sink closed") { + t.Errorf("error = %q, want to contain 'sink closed'", err.Error()) + } +} + +// panicWriter panics on Write, used to exercise handler panic-recovery paths. 
+type panicWriter struct{} + +func (p *panicWriter) Write(_ []byte) (int, error) { panic("sink exploded") } + +func TestNDJSONWriter_PipelineHandler_PanicRecovery(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("PipelineHandler should recover from panic, got: %v", r) + } + }() + w := NewNDJSONWriter(&panicWriter{}) + w.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ + Type: pipeline.EventPipelineStarted, + Timestamp: time.Now(), + RunID: "r", + }) +} + +func TestNDJSONWriter_AgentHandler_PanicRecovery(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("AgentHandler should recover from panic, got: %v", r) + } + }() + w := NewNDJSONWriter(&panicWriter{}) + w.AgentHandler().HandleEvent(agent.Event{Type: agent.EventTextDelta, Text: "x"}) +} + +func TestNDJSONWriter_TraceObserver_PanicRecovery(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("TraceObserver should recover from panic, got: %v", r) + } + }() + w := NewNDJSONWriter(&panicWriter{}) + w.TraceObserver().HandleTraceEvent(llm.TraceEvent{Kind: llm.TraceRequestStart}) +} diff --git a/tracker_simulate.go b/tracker_simulate.go index e7960d8..bda7837 100644 --- a/tracker_simulate.go +++ b/tracker_simulate.go @@ -3,6 +3,7 @@ package tracker import ( + "context" "fmt" "sort" @@ -47,8 +48,16 @@ type PlanStep struct { Edges []SimEdge `json:"edges,omitempty"` } -// Simulate parses source and returns a SimulateReport. Format is detected from content. -func Simulate(source string) (*SimulateReport, error) { +// Simulate parses source and returns a SimulateReport. Format is detected +// from content. +// +// ctx is accepted for future extensibility (e.g. cancelling a slow parse +// on very large graphs). A nil context is treated as context.Background(). 
+func Simulate(ctx context.Context, source string) (*SimulateReport, error) { + if ctx == nil { + ctx = context.Background() + } + _ = ctx // reserved for future use format := detectSourceFormat(source) graph, err := parsePipelineSource(source, format) if err != nil { diff --git a/tracker_simulate_test.go b/tracker_simulate_test.go index c69cf91..d6210f0 100644 --- a/tracker_simulate_test.go +++ b/tracker_simulate_test.go @@ -2,6 +2,7 @@ package tracker import ( + "context" "strings" "testing" ) @@ -24,7 +25,7 @@ const simpleSource = `workflow X ` func TestSimulate_BasicGraph(t *testing.T) { - r, err := Simulate(simpleSource) + r, err := Simulate(context.Background(), simpleSource) if err != nil { t.Fatalf("Simulate: %v", err) } @@ -57,7 +58,7 @@ func TestSimulate_UnreachableDetection(t *testing.T) { Start -> S; S -> E; }` - r, err := Simulate(src) + r, err := Simulate(context.Background(), src) if err != nil { t.Fatalf("Simulate: %v", err) } @@ -81,7 +82,7 @@ func TestSimulate_EdgeConditionPropagated(t *testing.T) { edges S -> E when ctx.outcome = success ` - r, err := Simulate(src) + r, err := Simulate(context.Background(), src) if err != nil { t.Fatalf("Simulate: %v", err) } @@ -94,7 +95,7 @@ func TestSimulate_EdgeConditionPropagated(t *testing.T) { } func TestSimulate_InvalidSource(t *testing.T) { - _, err := Simulate("this is not a pipeline") + _, err := Simulate(context.Background(), "this is not a pipeline") if err == nil { t.Fatal("expected error for invalid source") } @@ -109,7 +110,7 @@ func TestSimulate_GraphAttrsPopulated(t *testing.T) { End [shape=Msquare label="End"]; Start -> End; }` - r, err := Simulate(src) + r, err := Simulate(context.Background(), src) if err != nil { t.Fatalf("Simulate: %v", err) } From ec3f68b567ab09ed4d66974cf26570c1c6b888ef Mon Sep 17 00:00:00 2001 From: Clint Ecker Date: Fri, 17 Apr 2026 16:42:19 -0500 Subject: [PATCH 2/5] fix: address PR #113 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - Diagnose: propagate ctx.Err() instead of silently truncating the report on cancellation. Partial reports are never returned as success, so deadline-aware callers can distinguish complete from truncated output. (Codex P1) - enrichFromActivity: surface bufio.Scanner errors (1 MB line overflow, I/O) instead of dropping them on the floor. Returns the error up to Diagnose and logs it via the LogWriter. (Codex P2, CodeRabbit) - NDJSONWriter: panicOnce is now a per-instance field, not a package-level sync.Once. Before this, one misbehaving sink would silence panic logging across every NDJSONWriter in the process. Added a test to verify per-instance isolation. (CodeRabbit) - Drop unused DoctorConfig.LogWriter. Doctor doesn't emit stderr warnings — the field was added symmetrically with Audit/Diagnose but had no consumers. (CodeRabbit) - Drop ineffectual ctx nil-check in Audit and Simulate. ctx is reserved for future extensibility; silence the linter with `_ = ctx` and drop the dead nil-guard. (CodeRabbit, golangci-lint) - README: correct the library-API sentence — Simulate does not take a config struct; Doctor uses functional options, not AuditConfig-shaped options. 
(CodeRabbit) Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 6 +++--- README.md | 2 +- cmd/tracker/doctor.go | 2 -- tracker_audit.go | 9 ++++----- tracker_diagnose.go | 36 +++++++++++++++++++++++++++--------- tracker_diagnose_test.go | 15 +++++++++++++++ tracker_doctor.go | 7 ------- tracker_events.go | 28 ++++++++++++++++------------ tracker_events_test.go | 24 ++++++++++++++++++++++++ tracker_simulate.go | 8 +++----- 10 files changed, 93 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5402777..40ad8b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Library API hardening for v1.0** (#102, #103, #104, #106, #109): - Typed enum-like strings for `CheckStatus` and `SuggestionKind` so consumers can switch-exhaust. Existing constants (`SuggestionRetryPattern`, etc.) retain their underlying string values. - `tracker.WithVersionInfo(version, commit)` functional option replaces the CLI-only `DoctorConfig.TrackerVersion` / `TrackerCommit` fields. - - `DoctorConfig.LogWriter` / `DiagnoseConfig.LogWriter` / `AuditConfig.LogWriter` — optional `io.Writer` for non-fatal parse warnings. Nil is treated as `io.Discard` so library callers no longer see stray warnings on `os.Stderr`. The `tracker` CLI sets this to `io.Discard` for user-facing commands. + - `DiagnoseConfig.LogWriter` / `AuditConfig.LogWriter` — optional `io.Writer` for non-fatal parse warnings. Nil is treated as `io.Discard` so library callers no longer see stray warnings on `os.Stderr`. The `tracker` CLI sets this to `io.Discard` for user-facing commands. `Doctor` has no warnings to suppress so it deliberately does not carry a `LogWriter` field. - `Doctor`, `Diagnose`, `DiagnoseMostRecent`, `Audit`, `Simulate` now accept `context.Context`, honored by provider probes and binary version lookups. 
`getBinaryVersion` now uses `exec.CommandContext` with a 5-second timeout, matching `getDippinVersion`. - Provider probe error bodies are now sanitized (API keys and bearer tokens stripped) before they land in `CheckDetail.Message`. - - `NDJSON` handler closures (pipeline, agent, LLM trace) now `recover()` from panics in the underlying writer so a misbehaving sink cannot crash the caller goroutine. - - `Diagnose` now streams `activity.jsonl` with `bufio.Scanner` instead of `os.ReadFile` → `strings.Split`, matching `LoadActivityLog` and avoiding a memory spike on large runs. + - `NDJSON` handler closures (pipeline, agent, LLM trace) now `recover()` from panics in the underlying writer so a misbehaving sink cannot crash the caller goroutine. Panic suppression is per-`NDJSONWriter` instance (not package-level), so one misbehaving sink cannot silence unrelated writers in the same process. + - `Diagnose` now streams `activity.jsonl` with `bufio.Scanner` instead of `os.ReadFile` → `strings.Split`, matching `LoadActivityLog` and avoiding a memory spike on large runs. Scanner errors (1 MB line-length overflow, I/O) and `ctx.Err()` now propagate out of `Diagnose` as a real error — partial reports are never returned as success, so automation with deadlines can distinguish complete from truncated analysis. ### Changed diff --git a/README.md b/README.md index d9b46c4..e53e79d 100644 --- a/README.md +++ b/README.md @@ -588,7 +588,7 @@ for _, s := range report.Suggestions { } ``` -`tracker.Audit`, `tracker.Simulate`, and `tracker.Doctor` follow the same pattern and return JSON-serializable reports. All four accept `context.Context` as their first argument and an optional config struct (`AuditConfig`, `DiagnoseConfig`, `DoctorConfig`) with a `LogWriter` for non-fatal parse warnings — set it to `io.Discard` to silence warnings in embedded callers. 
+`tracker.Audit`, `tracker.DiagnoseMostRecent`, `tracker.Simulate`, and `tracker.Doctor` all accept `context.Context` as their first argument and return JSON-serializable reports. `Audit` and `DiagnoseMostRecent`/`Diagnose` additionally accept an optional config (`AuditConfig`, `DiagnoseConfig`) with a `LogWriter` for non-fatal parse warnings — set it to `io.Discard` to silence warnings in embedded callers. `Simulate` takes just `ctx` and source text; `Doctor` takes a required `DoctorConfig` plus optional functional options (e.g., `tracker.WithVersionInfo`). To stream events programmatically in the same NDJSON format as `tracker --json`, use `tracker.NewNDJSONWriter`: diff --git a/cmd/tracker/doctor.go b/cmd/tracker/doctor.go index a543f39..583917b 100644 --- a/cmd/tracker/doctor.go +++ b/cmd/tracker/doctor.go @@ -6,7 +6,6 @@ package main import ( "context" "fmt" - "io" "os" "path/filepath" "strings" @@ -80,7 +79,6 @@ func runDoctorWithConfig(workdir string, cfg DoctorConfig) error { Backend: cfg.backend, ProbeProviders: cfg.probe, PipelineFile: cfg.pipelineFile, - LogWriter: io.Discard, }, tracker.WithVersionInfo(version, commit)) if err != nil { return err diff --git a/tracker_audit.go b/tracker_audit.go index 8ec4007..30ee4f7 100644 --- a/tracker_audit.go +++ b/tracker_audit.go @@ -80,12 +80,11 @@ type RunSummary struct { // the path via ResolveRunDir or use MostRecentRunID first, which enforce // the .tracker/runs/ layout. // -// ctx is accepted for future extensibility. A nil context is treated as -// context.Background(). +// ctx is accepted for future extensibility (cancellation of checkpoint or +// activity log parsing). It is currently unused but part of the public +// signature so callers do not need to re-thread it later. 
func Audit(ctx context.Context, runDir string, opts ...AuditConfig) (*AuditReport, error) { - if ctx == nil { - ctx = context.Background() - } + _ = ctx // reserved for future cancellation plumbing _ = firstAuditConfig(opts) // reserved for future fields cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) if err != nil { diff --git a/tracker_diagnose.go b/tracker_diagnose.go index 55afd04..a02ac87 100644 --- a/tracker_diagnose.go +++ b/tracker_diagnose.go @@ -94,9 +94,10 @@ const ( // ResolveRunDir or DiagnoseMostRecent first, which enforce the // .tracker/runs/ layout. // -// ctx is accepted for future extensibility (cancellation of long parses); -// current callers may pass context.Background(). A nil context is treated -// as context.Background(). +// If ctx is cancelled mid-parse, Diagnose returns ctx.Err() — a partial +// report is never returned as a success, so callers using deadlines can +// distinguish complete from truncated analysis. A nil ctx is treated as +// context.Background() (no cancellation possible). 
func Diagnose(ctx context.Context, runDir string, opts ...DiagnoseConfig) (*DiagnoseReport, error) { if ctx == nil { ctx = context.Background() @@ -114,7 +115,11 @@ func Diagnose(ctx context.Context, runDir string, opts ...DiagnoseConfig) (*Diag CompletedNodes: len(cp.CompletedNodes), } failures := collectNodeFailures(runDir, logW) - report.BudgetHalt = enrichFromActivity(ctx, runDir, failures) + halt, err := enrichFromActivity(ctx, runDir, failures, logW) + if err != nil { + return nil, err + } + report.BudgetHalt = halt report.Failures = sortedFailures(failures) report.Suggestions = buildSuggestions(report.Failures, report.BudgetHalt) return report, nil @@ -201,11 +206,20 @@ type diagnoseEntry struct { WallElapsedMs int64 `json:"wall_elapsed_ms"` } -func enrichFromActivity(ctx context.Context, runDir string, failures map[string]*NodeFailure) *BudgetHalt { +// enrichFromActivity streams activity.jsonl, populating failures + detecting +// budget halt events. Returns (nil, nil) if activity.jsonl does not exist +// (runs that never started). Returns ctx.Err() if cancellation fires +// mid-parse, and scanner.Err() if the scanner aborts (buffer overflow at +// 1 MB, I/O error) — both surface truncation to the caller so partial +// analysis is never silently treated as authoritative. +func enrichFromActivity(ctx context.Context, runDir string, failures map[string]*NodeFailure, logW io.Writer) (*BudgetHalt, error) { path := filepath.Join(runDir, "activity.jsonl") f, err := os.Open(path) if err != nil { - return nil + if os.IsNotExist(err) { + return nil, nil + } + return nil, fmt.Errorf("open activity log: %w", err) } defer f.Close() @@ -217,8 +231,8 @@ func enrichFromActivity(ctx context.Context, runDir string, failures map[string] // Match LoadActivityLog: allow 1 MB lines. 
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) for scanner.Scan() { - if ctx.Err() != nil { - break + if err := ctx.Err(); err != nil { + return nil, err } line := strings.TrimSpace(scanner.Text()) if line == "" { @@ -238,8 +252,12 @@ func enrichFromActivity(ctx context.Context, runDir string, failures map[string] } enrichFromEntry(entry, failures, stageStarts, failSignatures) } + if err := scanner.Err(); err != nil { + fmt.Fprintf(logW, "warning: activity log scanner stopped at %s: %v\n", path, err) + return nil, fmt.Errorf("scan activity log: %w", err) + } applyRetryAnalysis(failures, failSignatures) - return halt + return halt, nil } func enrichFromEntry(entry diagnoseEntry, failures map[string]*NodeFailure, stageStarts map[string]time.Time, failSignatures map[string][]string) { diff --git a/tracker_diagnose_test.go b/tracker_diagnose_test.go index 2f578c7..7d5a26c 100644 --- a/tracker_diagnose_test.go +++ b/tracker_diagnose_test.go @@ -72,3 +72,18 @@ func TestDiagnose_BudgetHalt(t *testing.T) { t.Error("empty breach message") } } + +// TestDiagnose_CtxCancelled verifies that a cancelled context propagates +// out of Diagnose — a partial report is never returned as a success, so +// automation with deadlines can distinguish complete from truncated output. 
+func TestDiagnose_CtxCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel before the call + _, err := Diagnose(ctx, "testdata/runs/failed") + if err == nil { + t.Fatal("expected ctx.Err() to propagate, got nil") + } + if err != context.Canceled { + t.Errorf("err = %v, want context.Canceled", err) + } +} diff --git a/tracker_doctor.go b/tracker_doctor.go index 058db31..aee70f6 100644 --- a/tracker_doctor.go +++ b/tracker_doctor.go @@ -5,7 +5,6 @@ package tracker import ( "context" "fmt" - "io" "os" "os/exec" "path/filepath" @@ -38,12 +37,6 @@ type DoctorConfig struct { // PipelineFile, when non-empty, adds a "Pipeline File" check that parses // and validates the given .dip / .dot file. PipelineFile string - // LogWriter receives non-fatal parse warnings (bad status.json, - // unreadable activity.jsonl, etc.). Nil is treated as io.Discard so - // embedded library callers do not see warnings on os.Stderr. The - // tracker CLI sets this to os.Stderr for factory/webhook consumers - // and io.Discard for user-facing commands. - LogWriter io.Writer // versionInfo is populated by WithVersionInfo. Unexported so callers // use the functional option rather than setting CLI-specific fields. versionInfo versionInfo diff --git a/tracker_events.go b/tracker_events.go index b5b42c6..ef4ab73 100644 --- a/tracker_events.go +++ b/tracker_events.go @@ -44,9 +44,10 @@ type StreamEvent struct { // (network socket, pipe), wrap it in a bufio.Writer or a channel-backed // forwarder to decouple producers from the slow sink. type NDJSONWriter struct { - mu sync.Mutex - w io.Writer - errOnce sync.Once + mu sync.Mutex + w io.Writer + errOnce sync.Once + panicOnce sync.Once } // NewNDJSONWriter returns a new writer backed by w. @@ -78,11 +79,11 @@ func (s *NDJSONWriter) Write(evt StreamEvent) error { // PipelineHandler returns a pipeline.PipelineEventHandler that writes events // to this stream. 
Panics in the underlying writer are recovered and logged -// to os.Stderr once so a misbehaving sink cannot crash the pipeline -// goroutine. +// to os.Stderr once (per writer instance) so a misbehaving sink cannot +// crash the pipeline goroutine. func (s *NDJSONWriter) PipelineHandler() pipeline.PipelineEventHandler { return pipeline.PipelineEventHandlerFunc(func(evt pipeline.PipelineEvent) { - defer recoverNDJSONPanic("pipeline") + defer s.recoverPanic("pipeline") entry := StreamEvent{ Timestamp: evt.Timestamp.Format(ndjsonTimestampLayout), Source: "pipeline", @@ -103,7 +104,7 @@ func (s *NDJSONWriter) PipelineHandler() pipeline.PipelineEventHandler { // PipelineHandler). func (s *NDJSONWriter) TraceObserver() llm.TraceObserver { return llm.TraceObserverFunc(func(evt llm.TraceEvent) { - defer recoverNDJSONPanic("llm") + defer s.recoverPanic("llm") _ = s.Write(StreamEvent{ Timestamp: time.Now().Format(ndjsonTimestampLayout), Source: "llm", @@ -121,7 +122,7 @@ func (s *NDJSONWriter) TraceObserver() llm.TraceObserver { // PipelineHandler). func (s *NDJSONWriter) AgentHandler() agent.EventHandler { return agent.EventHandlerFunc(func(evt agent.Event) { - defer recoverNDJSONPanic("agent") + defer s.recoverPanic("agent") content := evt.ToolOutput if content == "" { content = evt.Text @@ -145,11 +146,14 @@ func (s *NDJSONWriter) AgentHandler() agent.EventHandler { }) } -var ndjsonPanicOnce sync.Once - -func recoverNDJSONPanic(source string) { +// recoverPanic recovers from a handler panic and logs the first occurrence +// per writer instance. Using a per-instance sync.Once (not package-level) +// means multiple NDJSONWriter instances (e.g., different runs streaming to +// different sinks) each get their own suppression state, so one misbehaving +// sink does not silence unrelated panics elsewhere in the process. 
+func (s *NDJSONWriter) recoverPanic(source string) { if r := recover(); r != nil { - ndjsonPanicOnce.Do(func() { + s.panicOnce.Do(func() { fmt.Fprintf(os.Stderr, "tracker: NDJSON %s handler recovered from panic: %v (further panics suppressed)\n", source, r) }) } diff --git a/tracker_events_test.go b/tracker_events_test.go index a7a2adb..a3607b7 100644 --- a/tracker_events_test.go +++ b/tracker_events_test.go @@ -384,3 +384,27 @@ func TestNDJSONWriter_TraceObserver_PanicRecovery(t *testing.T) { w := NewNDJSONWriter(&panicWriter{}) w.TraceObserver().HandleTraceEvent(llm.TraceEvent{Kind: llm.TraceRequestStart}) } + +// TestNDJSONWriter_PanicSuppressionIsPerInstance verifies that one writer +// recovering from a panic does not silence panic logging on a separate +// writer instance. Both writers must independently report their first +// panic; regressions here mean package-level state re-crept in. +func TestNDJSONWriter_PanicSuppressionIsPerInstance(t *testing.T) { + defer func() { + if r := recover(); r != nil { + t.Fatalf("handlers should recover, got: %v", r) + } + }() + w1 := NewNDJSONWriter(&panicWriter{}) + w2 := NewNDJSONWriter(&panicWriter{}) + // First panic on w1 — must not consume w2's Once. + w1.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ + Type: pipeline.EventPipelineStarted, + Timestamp: time.Now(), + }) + // Second panic on w2 — still the first on its own instance. + w2.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ + Type: pipeline.EventPipelineStarted, + Timestamp: time.Now(), + }) +} diff --git a/tracker_simulate.go b/tracker_simulate.go index bda7837..fc5d76c 100644 --- a/tracker_simulate.go +++ b/tracker_simulate.go @@ -52,12 +52,10 @@ type PlanStep struct { // from content. // // ctx is accepted for future extensibility (e.g. cancelling a slow parse -// on very large graphs). A nil context is treated as context.Background(). +// on very large graphs). 
It is currently unused but part of the public +// signature so callers do not need to re-thread it later. func Simulate(ctx context.Context, source string) (*SimulateReport, error) { - if ctx == nil { - ctx = context.Background() - } - _ = ctx // reserved for future use + _ = ctx // reserved for future cancellation plumbing format := detectSourceFormat(source) graph, err := parsePipelineSource(source, format) if err != nil { From 3d653daaeee1abeb778069a26105ae3194525f09 Mon Sep 17 00:00:00 2001 From: Clint Ecker Date: Mon, 20 Apr 2026 10:46:46 -0500 Subject: [PATCH 3/5] fix: address PR #113 round-2 review feedback - probeProvider: sanitize full error message BEFORE trimming to 80 chars. Previously sanitizeProviderError(trimErrMsg(msg, 80)) trimmed first, so a key straddling the 80-char boundary could be cut into a shorter prefix that no longer matched the regex minimum, leaking the prefix. New order: trim(sanitize(msg), 80). Added a regression test. (Copilot, security) - Audit, Simulate: coalesce nil ctx to context.Background() instead of just _ = ctx. Previously dropping the nil-check entirely (to satisfy the earlier ineffassign warning) meant nil-callers would regress if ctx ever gets wired into real work. Now: nil-check preserves callers, and _ = ctx keeps the linter quiet. (CodeRabbit nitpick) Co-Authored-By: Claude Opus 4.7 (1M context) --- tracker_audit.go | 7 +++++-- tracker_doctor.go | 6 +++++- tracker_doctor_test.go | 24 ++++++++++++++++++++++++ tracker_simulate.go | 7 +++++-- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/tracker_audit.go b/tracker_audit.go index 30ee4f7..331248d 100644 --- a/tracker_audit.go +++ b/tracker_audit.go @@ -81,9 +81,12 @@ type RunSummary struct { // the .tracker/runs/ layout. // // ctx is accepted for future extensibility (cancellation of checkpoint or -// activity log parsing). It is currently unused but part of the public -// signature so callers do not need to re-thread it later. +// activity log parsing). 
Nil is coalesced to context.Background() so that +// when cancellation is wired in later, nil-callers do not regress. func Audit(ctx context.Context, runDir string, opts ...AuditConfig) (*AuditReport, error) { + if ctx == nil { + ctx = context.Background() + } _ = ctx // reserved for future cancellation plumbing _ = firstAuditConfig(opts) // reserved for future fields cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) diff --git a/tracker_doctor.go b/tracker_doctor.go index aee70f6..dbca912 100644 --- a/tracker_doctor.go +++ b/tracker_doctor.go @@ -361,7 +361,11 @@ func probeProvider(ctx context.Context, p providerDef, key string) (bool, string if isAuthError(msg) { return false, "invalid or expired API key" } - return false, sanitizeProviderError(trimErrMsg(msg, 80)) + // Sanitize FIRST, then trim. If we trim first, a key that + // straddles the 80-char boundary gets cut into a shorter prefix + // that no longer matches the regex, leaking the prefix. Sanitize + // the full message, then trim whatever's left. + return false, trimErrMsg(sanitizeProviderError(msg), 80) } return true, "" } diff --git a/tracker_doctor_test.go b/tracker_doctor_test.go index f2625ec..95d2e6a 100644 --- a/tracker_doctor_test.go +++ b/tracker_doctor_test.go @@ -6,6 +6,7 @@ import ( "context" "os" "path/filepath" + "strings" "testing" ) @@ -118,3 +119,26 @@ func TestSanitizeProviderError(t *testing.T) { }) } } + +// TestSanitizeThenTrim_NoPartialKeyLeak verifies the sanitize-before-trim +// ordering in probeProvider. A key that straddles the trim boundary must +// not produce a leaked prefix after truncation. Regression guard for PR +// feedback on issue #106 follow-up. +func TestSanitizeThenTrim_NoPartialKeyLeak(t *testing.T) { + // Construct a message where the key starts at char 50 and runs past + // the 80-char truncation point. 
Trimming first would leave a 30-char + // prefix of the key that's shorter than the regex minimum, so the + // regex would miss it and the prefix would leak. + key := "sk-ant-api03-" + strings.Repeat("A", 60) + msg := strings.Repeat("x", 50) + key + + // Correct order: sanitize first, then trim. + got := trimErrMsg(sanitizeProviderError(msg), 80) + + if strings.Contains(got, "sk-ant-") { + t.Errorf("got = %q; leaked key prefix (must be redacted before trim)", got) + } + if !strings.Contains(got, "[redacted-key]") { + t.Errorf("got = %q; want [redacted-key] substitution", got) + } +} diff --git a/tracker_simulate.go b/tracker_simulate.go index fc5d76c..e323a3c 100644 --- a/tracker_simulate.go +++ b/tracker_simulate.go @@ -52,9 +52,12 @@ type PlanStep struct { // from content. // // ctx is accepted for future extensibility (e.g. cancelling a slow parse -// on very large graphs). It is currently unused but part of the public -// signature so callers do not need to re-thread it later. +// on very large graphs). Nil is coalesced to context.Background() so that +// when cancellation is wired in later, nil-callers do not regress. func Simulate(ctx context.Context, source string) (*SimulateReport, error) { + if ctx == nil { + ctx = context.Background() + } _ = ctx // reserved for future cancellation plumbing format := detectSourceFormat(source) graph, err := parsePipelineSource(source, format) From be5fa3ce2eb89ef2a13a259eff7d2cd110b4c9cf Mon Sep 17 00:00:00 2001 From: Clint Ecker Date: Mon, 20 Apr 2026 11:22:52 -0500 Subject: [PATCH 4/5] fix: address PR #113 round-3 CodeRabbit feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - checkWorkdir: distinguish os.IsNotExist vs os.IsPermission vs other stat failures. Previously any stat error told the user the path "does not exist" and suggested mkdir, even when the real issue was permission-denied. Applied same pattern to checkPipelineFile. 
- checkArtifactDirs: a non-ENOENT stat failure on .ai (permission denied, I/O error) is now reported as an error with the actual error text. Previously any stat error fell through to the missing- dir branch and reported "will be created on first run", hiding the real permission problem. - Simulate, Audit: check ctx.Err() at function entry. Full mid-parse cancellation would require threading ctx through dippin-lang's parser and the pipeline graph layer, which is out of scope today (both paths are fast and bounded). But honoring caller cancellation at entry is minimum-viable — a caller that passes an already-cancelled context now gets context.Canceled immediately instead of unnecessary work. - Tests: regression guard for each case (permission-denied in checkWorkdir, non-ENOENT in checkArtifactDirs, ctx-cancelled at entry for Audit and Simulate). Co-Authored-By: Claude Opus 4.7 (1M context) --- tracker_audit.go | 13 +++++--- tracker_audit_test.go | 12 +++++++ tracker_doctor.go | 41 ++++++++++++++++++++---- tracker_doctor_test.go | 68 ++++++++++++++++++++++++++++++++++++++++ tracker_simulate.go | 13 +++++--- tracker_simulate_test.go | 11 +++++++ 6 files changed, 144 insertions(+), 14 deletions(-) diff --git a/tracker_audit.go b/tracker_audit.go index 331248d..8ec5189 100644 --- a/tracker_audit.go +++ b/tracker_audit.go @@ -80,14 +80,19 @@ type RunSummary struct { // the path via ResolveRunDir or use MostRecentRunID first, which enforce // the .tracker/runs/ layout. // -// ctx is accepted for future extensibility (cancellation of checkpoint or -// activity log parsing). Nil is coalesced to context.Background() so that -// when cancellation is wired in later, nil-callers do not regress. +// ctx is checked at entry so a caller that passes an already-cancelled +// context gets an immediate error instead of silent work. 
Full +// cancellation mid-parse would require threading ctx through +// pipeline.LoadCheckpoint and LoadActivityLog, which is out of scope +// today (both are fast and bounded). Nil is coalesced to +// context.Background(). func Audit(ctx context.Context, runDir string, opts ...AuditConfig) (*AuditReport, error) { if ctx == nil { ctx = context.Background() } - _ = ctx // reserved for future cancellation plumbing + if err := ctx.Err(); err != nil { + return nil, err + } _ = firstAuditConfig(opts) // reserved for future fields cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) if err != nil { diff --git a/tracker_audit_test.go b/tracker_audit_test.go index 25efcfd..de768dd 100644 --- a/tracker_audit_test.go +++ b/tracker_audit_test.go @@ -92,3 +92,15 @@ func TestListRuns_LogWriterSilencesWarnings(t *testing.T) { t.Error("expected log writer to receive a warning about activity.jsonl") } } + +// TestAudit_CtxCancelledAtEntry verifies Audit returns the caller's +// cancellation error immediately rather than silently proceeding with the +// expensive checkpoint + activity log reads. 
+func TestAudit_CtxCancelledAtEntry(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, err := Audit(ctx, "testdata/runs/ok") + if err != context.Canceled { + t.Errorf("err = %v, want context.Canceled", err) + } +} diff --git a/tracker_doctor.go b/tracker_doctor.go index dbca912..891d1e9 100644 --- a/tracker_doctor.go +++ b/tracker_doctor.go @@ -613,8 +613,17 @@ func checkWorkdir(workdir string) CheckResult { info, err := os.Stat(workdir) if err != nil { out.Status = CheckStatusError - out.Message = fmt.Sprintf("%s does not exist", workdir) - out.Hint = fmt.Sprintf("create the directory: mkdir -p %s", workdir) + switch { + case os.IsNotExist(err): + out.Message = fmt.Sprintf("%s does not exist", workdir) + out.Hint = fmt.Sprintf("create the directory: mkdir -p %s", workdir) + case os.IsPermission(err): + out.Message = fmt.Sprintf("permission denied accessing %s", workdir) + out.Hint = fmt.Sprintf("check permissions on %s or a parent directory", workdir) + default: + out.Message = fmt.Sprintf("cannot stat %s: %v", workdir, err) + out.Hint = "check the path and its parent directories" + } return out } if !info.IsDir() { @@ -709,7 +718,9 @@ func checkArtifactDirs(workdir string) CheckResult { out := CheckResult{Name: "Artifact Directories"} allOk := true aiDir := filepath.Join(workdir, ".ai") - if info, err := os.Stat(aiDir); err == nil { + info, err := os.Stat(aiDir) + switch { + case err == nil: switch { case !info.IsDir(): out.Details = append(out.Details, CheckDetail{ @@ -730,7 +741,7 @@ func checkArtifactDirs(workdir string) CheckResult { Message: fmt.Sprintf("%s exists and is writable", aiDir), }) } - } else { + case os.IsNotExist(err): if isDirWritable(workdir) { out.Details = append(out.Details, CheckDetail{ Status: CheckStatusOK, @@ -743,6 +754,15 @@ func checkArtifactDirs(workdir string) CheckResult { }) allOk = false } + default: + // Non-ENOENT stat failure — permission denied, I/O error, etc. 
+ // Report the real failure instead of pretending .ai is missing. + out.Details = append(out.Details, CheckDetail{ + Status: CheckStatusError, + Message: fmt.Sprintf("cannot inspect %s: %v", aiDir, err), + Hint: fmt.Sprintf("check permissions on %s and its parents", aiDir), + }) + allOk = false } if allOk { out.Status = CheckStatusOK @@ -781,8 +801,17 @@ func checkPipelineFile(pipelineFile string) CheckResult { out := CheckResult{Name: "Pipeline File"} if _, err := os.Stat(pipelineFile); err != nil { out.Status = CheckStatusError - out.Message = fmt.Sprintf("%s does not exist", pipelineFile) - out.Hint = fmt.Sprintf("check the file path: %s", pipelineFile) + switch { + case os.IsNotExist(err): + out.Message = fmt.Sprintf("%s does not exist", pipelineFile) + out.Hint = fmt.Sprintf("check the file path: %s", pipelineFile) + case os.IsPermission(err): + out.Message = fmt.Sprintf("permission denied reading %s", pipelineFile) + out.Hint = fmt.Sprintf("check permissions: chmod +r %s", pipelineFile) + default: + out.Message = fmt.Sprintf("cannot stat %s: %v", pipelineFile, err) + out.Hint = "check the file path and permissions" + } return out } hasWarn := false diff --git a/tracker_doctor_test.go b/tracker_doctor_test.go index 95d2e6a..808ca1f 100644 --- a/tracker_doctor_test.go +++ b/tracker_doctor_test.go @@ -142,3 +142,71 @@ func TestSanitizeThenTrim_NoPartialKeyLeak(t *testing.T) { t.Errorf("got = %q; want [redacted-key] substitution", got) } } + +// TestCheckWorkdir_DistinguishesErrorKinds verifies that permission-denied +// and other non-ENOENT stat failures are reported with the right remediation +// hint, rather than being reported as "does not exist" + an mkdir hint. 
+func TestCheckWorkdir_DistinguishesErrorKinds(t *testing.T) { + t.Run("missing path", func(t *testing.T) { + r := checkWorkdir(filepath.Join(t.TempDir(), "does-not-exist")) + if r.Status != CheckStatusError { + t.Fatalf("status = %q, want error", r.Status) + } + if !strings.Contains(r.Message, "does not exist") { + t.Errorf("message = %q, want 'does not exist'", r.Message) + } + if !strings.Contains(r.Hint, "mkdir") { + t.Errorf("hint = %q, want mkdir suggestion", r.Hint) + } + }) + t.Run("permission denied", func(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("permission tests are meaningless as root") + } + parent := t.TempDir() + inner := filepath.Join(parent, "locked") + must(t, os.Mkdir(inner, 0o755)) + // Revoke search/execute on parent → stat of inner fails with EACCES. + must(t, os.Chmod(parent, 0o000)) + t.Cleanup(func() { _ = os.Chmod(parent, 0o755) }) + + r := checkWorkdir(inner) + if r.Status != CheckStatusError { + t.Fatalf("status = %q, want error", r.Status) + } + if strings.Contains(r.Hint, "mkdir") { + t.Errorf("permission error should not suggest mkdir, got hint = %q", r.Hint) + } + if !strings.Contains(r.Message, "permission denied") && + !strings.Contains(r.Message, "cannot stat") { + t.Errorf("message = %q, want permission-denied or stat-error wording", r.Message) + } + }) +} + +// TestCheckArtifactDirs_NonENOENTStatError verifies that a stat failure on +// .ai that is not ENOENT (e.g. permission denied) is reported as an error +// rather than silently treated as "will be created on first run". +func TestCheckArtifactDirs_NonENOENTStatError(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("permission tests are meaningless as root") + } + workdir := t.TempDir() + aiDir := filepath.Join(workdir, ".ai") + must(t, os.Mkdir(aiDir, 0o755)) + // Make workdir unreadable/unsearchable so stat on .ai fails with EACCES + // (not ENOENT — .ai does exist). 
+ must(t, os.Chmod(workdir, 0o000)) + t.Cleanup(func() { _ = os.Chmod(workdir, 0o755) }) + + r := checkArtifactDirs(workdir) + if r.Status != CheckStatusError { + t.Fatalf("status = %q, want error (permission failure must not be reported as OK)", r.Status) + } + // No detail should say "will be created on first run" — that's the lie we're guarding against. + for _, d := range r.Details { + if strings.Contains(d.Message, "will be created") { + t.Errorf("detail %q hides the real permission error", d.Message) + } + } +} diff --git a/tracker_simulate.go b/tracker_simulate.go index e323a3c..11cce98 100644 --- a/tracker_simulate.go +++ b/tracker_simulate.go @@ -51,14 +51,19 @@ type PlanStep struct { // Simulate parses source and returns a SimulateReport. Format is detected // from content. // -// ctx is accepted for future extensibility (e.g. cancelling a slow parse -// on very large graphs). Nil is coalesced to context.Background() so that -// when cancellation is wired in later, nil-callers do not regress. +// ctx is checked at entry so a caller that passes an already-cancelled +// context gets an immediate error instead of silent work. Full +// cancellation mid-parse would require threading ctx through +// parsePipelineSource → dippin-lang's parser, which is out of scope +// today (parses are fast and O(n) anyway). Nil is coalesced to +// context.Background(). 
func Simulate(ctx context.Context, source string) (*SimulateReport, error) { if ctx == nil { ctx = context.Background() } - _ = ctx // reserved for future cancellation plumbing + if err := ctx.Err(); err != nil { + return nil, err + } format := detectSourceFormat(source) graph, err := parsePipelineSource(source, format) if err != nil { diff --git a/tracker_simulate_test.go b/tracker_simulate_test.go index d6210f0..7c54836 100644 --- a/tracker_simulate_test.go +++ b/tracker_simulate_test.go @@ -101,6 +101,17 @@ func TestSimulate_InvalidSource(t *testing.T) { } } +// TestSimulate_CtxCancelledAtEntry verifies Simulate returns the caller's +// cancellation error immediately rather than silently parsing. +func TestSimulate_CtxCancelledAtEntry(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, err := Simulate(ctx, simpleSource) + if err != context.Canceled { + t.Errorf("err = %v, want context.Canceled", err) + } +} + func TestSimulate_GraphAttrsPopulated(t *testing.T) { // Use DOT format to set graph-level attributes reliably without // depending on dippin-lang's specific syntax for workflow-level attrs. From 073d2fc485e4b5b862826a32e3221e6ca7f22462 Mon Sep 17 00:00:00 2001 From: Clint Ecker Date: Mon, 20 Apr 2026 11:36:28 -0500 Subject: [PATCH 5/5] fix: address PR #113 round-4 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot + CodeRabbit round 4. Seven items, all addressed with tests. CodeRabbit: - tracker_doctor_test.go: skip permission tests on Windows (chmod 000 doesn't have Unix semantics there). Imported runtime. - tracker_doctor.go: guard against nil DoctorOption in the public Doctor API — nil entries are skipped rather than dereferenced. v1.0 hardening for external callers. - tracker_doctor.go: probeProvider now returns (ok, msg, isAuthFail) so non-auth failures (DNS, timeout, transport, ctx-cancel) stop being mislabeled as "auth failed — rotate your key". 
The caller shows a network/transport-themed hint when isAuthFail is false. Copilot: - tracker_events.go NDJSONWriter.Write: detect short writes (n < len(data) with nil err) as io.ErrShortWrite. io.Writer.Write's contract allows this, so silently truncating was a latent bug. - tracker_audit.go: drop the `opts ...AuditConfig` variadic from Audit since it was ignored. Audit has no warnings to suppress, so the signature was misleading. Use ListRuns + AuditConfig for the bulk-enumeration case. - tracker_events_test.go: per-instance panic-suppression test now actually captures os.Stderr and asserts two log lines (one per writer). Previously only asserted no-panic, which a package-level sync.Once would also pass. - README.md: library example now initializes ctx and imports context. - tracker_diagnose.go: DiagnoseConfig.LogWriter doc narrowed to the real warning sources (malformed status.json, scanner errors on activity.jsonl) — removed the misleading mention of artifact dirs (Diagnose doesn't inspect those). New tests: - TestNDJSONWriter_Write_ShortWriteIsError - TestCheckWorkdir_DistinguishesErrorKinds with Windows skip - TestCheckArtifactDirs_NonENOENTStatError with Windows skip - TestNDJSONWriter_PanicSuppressionIsPerInstance rewritten to capture stderr via os.Pipe and count log lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- README.md | 9 +++-- tracker_audit.go | 7 ++-- tracker_diagnose.go | 12 ++++--- tracker_doctor.go | 38 +++++++++++++------- tracker_doctor_test.go | 7 ++++ tracker_events.go | 13 ++++--- tracker_events_test.go | 78 +++++++++++++++++++++++++++++++++++------- 8 files changed, 126 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01df426..22d1c38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,7 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `tracker.Doctor(cfg)` → `tracker.Doctor(ctx, cfg, opts...)`. 
- `tracker.Diagnose(runDir)` → `tracker.Diagnose(ctx, runDir, opts...)`. - `tracker.DiagnoseMostRecent(workdir)` → `tracker.DiagnoseMostRecent(ctx, workdir, opts...)`. - - `tracker.Audit(runDir)` → `tracker.Audit(ctx, runDir, opts...)`. + - `tracker.Audit(runDir)` → `tracker.Audit(ctx, runDir)`. (No config struct — Audit emits no suppressible warnings. Use `ListRuns` + `AuditConfig{LogWriter}` for bulk enumeration.) - `tracker.Simulate(source)` → `tracker.Simulate(ctx, source)`. - `tracker.ListRuns(workdir)` now accepts optional `...AuditConfig`. - `tracker.NDJSONEvent` → `tracker.StreamEvent`. Wire-format JSON tags unchanged. diff --git a/README.md b/README.md index e53e79d..946ad26 100644 --- a/README.md +++ b/README.md @@ -574,8 +574,13 @@ result, _ := tracker.Run(ctx, source, tracker.Config{ ### Analyzing past runs from code ```go -import tracker "github.com/2389-research/tracker" +import ( + "context" + tracker "github.com/2389-research/tracker" +) + +ctx := context.Background() report, err := tracker.DiagnoseMostRecent(ctx, ".") if err != nil { log.Fatal(err) } @@ -588,7 +593,7 @@ for _, s := range report.Suggestions { } ``` -`tracker.Audit`, `tracker.DiagnoseMostRecent`, `tracker.Simulate`, and `tracker.Doctor` all accept `context.Context` as their first argument and return JSON-serializable reports. `Audit` and `DiagnoseMostRecent`/`Diagnose` additionally accept an optional config (`AuditConfig`, `DiagnoseConfig`) with a `LogWriter` for non-fatal parse warnings — set it to `io.Discard` to silence warnings in embedded callers. `Simulate` takes just `ctx` and source text; `Doctor` takes a required `DoctorConfig` plus optional functional options (e.g., `tracker.WithVersionInfo`). +`tracker.Audit`, `tracker.DiagnoseMostRecent`, `tracker.Simulate`, and `tracker.Doctor` all accept `context.Context` as their first argument and return JSON-serializable reports. 
`tracker.ListRuns` and `DiagnoseMostRecent`/`Diagnose` accept an optional config (`AuditConfig`, `DiagnoseConfig`) with a `LogWriter` for non-fatal parse warnings — set it to `io.Discard` to silence warnings in embedded callers. `Audit` and `Simulate` currently take just `ctx` (plus their payload); `Doctor` takes a required `DoctorConfig` plus optional functional options (e.g., `tracker.WithVersionInfo`). To stream events programmatically in the same NDJSON format as `tracker --json`, use `tracker.NewNDJSONWriter`: diff --git a/tracker_audit.go b/tracker_audit.go index 8ec5189..ce210c9 100644 --- a/tracker_audit.go +++ b/tracker_audit.go @@ -86,14 +86,17 @@ type RunSummary struct { // pipeline.LoadCheckpoint and LoadActivityLog, which is out of scope // today (both are fast and bounded). Nil is coalesced to // context.Background(). -func Audit(ctx context.Context, runDir string, opts ...AuditConfig) (*AuditReport, error) { +// +// Audit does not accept AuditConfig — it emits no warnings to suppress. +// Use ListRuns + AuditConfig{LogWriter} for bulk enumeration where the +// summary builder may skip unreadable activity logs. +func Audit(ctx context.Context, runDir string) (*AuditReport, error) { if ctx == nil { ctx = context.Background() } if err := ctx.Err(); err != nil { return nil, err } - _ = firstAuditConfig(opts) // reserved for future fields cp, err := pipeline.LoadCheckpoint(filepath.Join(runDir, "checkpoint.json")) if err != nil { return nil, fmt.Errorf("load checkpoint: %w", err) diff --git a/tracker_diagnose.go b/tracker_diagnose.go index a02ac87..c3e6eda 100644 --- a/tracker_diagnose.go +++ b/tracker_diagnose.go @@ -19,11 +19,13 @@ import ( // DiagnoseConfig configures a Diagnose() run. type DiagnoseConfig struct { - // LogWriter receives non-fatal parse warnings (malformed status.json - // files, unreadable artifact directories). Nil is treated as - // io.Discard so library callers do not see stray warnings on - // os.Stderr. 
The tracker CLI sets this to io.Discard for user-facing - // commands. + // LogWriter receives non-fatal parse/read warnings — specifically + // malformed status.json content (one warning per bad file) and + // bufio.Scanner errors while reading activity.jsonl (e.g. lines + // exceeding the 1 MB buffer limit, I/O failures). Nil is treated + // as io.Discard so library callers do not see stray warnings on + // os.Stderr. The tracker CLI sets this to io.Discard for user- + // facing commands. LogWriter io.Writer } diff --git a/tracker_doctor.go b/tracker_doctor.go index 891d1e9..ffd31a9 100644 --- a/tracker_doctor.go +++ b/tracker_doctor.go @@ -121,6 +121,9 @@ func Doctor(ctx context.Context, cfg DoctorConfig, opts ...DoctorOption) (*Docto ctx = context.Background() } for _, opt := range opts { + if opt == nil { + continue + } opt(&cfg) } if cfg.WorkDir == "" { @@ -265,13 +268,19 @@ func checkProviders(ctx context.Context, probe bool) CheckResult { continue } if probe && p.buildAdapter != nil { - authOk, authMsg := probeProvider(ctx, p, key) - if !authOk { - out.Details = append(out.Details, CheckDetail{ - Status: CheckStatusError, - Message: fmt.Sprintf("%-15s %s=%s (auth failed: %s)", p.name, envName, masked, authMsg), - Hint: fmt.Sprintf("your %s key is invalid or expired — export a fresh key or run `tracker setup`", p.name), - }) + ok, probeMsg, isAuth := probeProvider(ctx, p, key) + if !ok { + detail := CheckDetail{Status: CheckStatusError} + if isAuth { + detail.Message = fmt.Sprintf("%-15s %s=%s (auth failed: %s)", p.name, envName, masked, probeMsg) + detail.Hint = fmt.Sprintf("your %s key is invalid or expired — export a fresh key or run `tracker setup`", p.name) + } else { + // DNS, timeout, transport, context cancel, or other non-auth failure. + // Do NOT tell the user to rotate a working key. 
+ detail.Message = fmt.Sprintf("%-15s %s=%s (probe failed: %s)", p.name, envName, masked, probeMsg) + detail.Hint = fmt.Sprintf("probe for %s failed on network/transport — verify connectivity and %s_BASE_URL before rotating keys", p.name, strings.ToUpper(p.name)) + } + out.Details = append(out.Details, detail) hasProviderErrors = true continue } @@ -337,14 +346,17 @@ func findProviderKey(envVars []string) (key, envName string) { return "", "" } -func probeProvider(ctx context.Context, p providerDef, key string) (bool, string) { +// probeProvider returns (ok, msg, isAuthFailure). The third return lets the +// caller distinguish an actual auth failure (rotate-the-key guidance) from +// a network/transport/timeout failure (don't rotate good keys). +func probeProvider(ctx context.Context, p providerDef, key string) (bool, string, bool) { adapter, err := p.buildAdapter(key) if err != nil { - return false, fmt.Sprintf("build adapter: %v", err) + return false, fmt.Sprintf("build adapter: %v", err), false } client, err := llm.NewClient(llm.WithProvider(adapter)) if err != nil { - return false, fmt.Sprintf("create client: %v", err) + return false, fmt.Sprintf("create client: %v", err), false } defer client.Close() probeCtx, cancel := context.WithTimeout(ctx, 15*time.Second) @@ -359,15 +371,15 @@ func probeProvider(ctx context.Context, p providerDef, key string) (bool, string if err != nil { msg := err.Error() if isAuthError(msg) { - return false, "invalid or expired API key" + return false, "invalid or expired API key", true } // Sanitize FIRST, then trim. If we trim first, a key that // straddles the 80-char boundary gets cut into a shorter prefix // that no longer matches the regex, leaking the prefix. Sanitize // the full message, then trim whatever's left. 
- return false, trimErrMsg(sanitizeProviderError(msg), 80) + return false, trimErrMsg(sanitizeProviderError(msg), 80), false } - return true, "" + return true, "", false } // sanitizeProviderError strips API keys and bearer tokens from provider error diff --git a/tracker_doctor_test.go b/tracker_doctor_test.go index 808ca1f..edb9cc0 100644 --- a/tracker_doctor_test.go +++ b/tracker_doctor_test.go @@ -6,6 +6,7 @@ import ( "context" "os" "path/filepath" + "runtime" "strings" "testing" ) @@ -160,6 +161,9 @@ func TestCheckWorkdir_DistinguishesErrorKinds(t *testing.T) { } }) t.Run("permission denied", func(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("permission semantics differ on Windows") + } if os.Geteuid() == 0 { t.Skip("permission tests are meaningless as root") } @@ -188,6 +192,9 @@ func TestCheckWorkdir_DistinguishesErrorKinds(t *testing.T) { // .ai that is not ENOENT (e.g. permission denied) is reported as an error // rather than silently treated as "will be created on first run". func TestCheckArtifactDirs_NonENOENTStatError(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("permission semantics differ on Windows") + } if os.Geteuid() == 0 { t.Skip("permission tests are meaningless as root") } diff --git a/tracker_events.go b/tracker_events.go index ef4ab73..6725f0a 100644 --- a/tracker_events.go +++ b/tracker_events.go @@ -57,9 +57,10 @@ func NewNDJSONWriter(w io.Writer) *NDJSONWriter { // Write serializes evt as a JSON line. Safe to call from multiple // goroutines. Returns a non-nil error if marshalling or writing to the -// underlying io.Writer fails; the first write error is also logged to -// os.Stderr once so long-running callers that ignore the error still -// surface it. +// underlying io.Writer fails, including short writes (io.Writer.Write +// may legally return n < len(data) with a nil error). The first write +// error is also logged to os.Stderr once so long-running callers that +// ignore the return value still surface it. 
func (s *NDJSONWriter) Write(evt StreamEvent) error { data, err := json.Marshal(evt) if err != nil { @@ -68,7 +69,11 @@ func (s *NDJSONWriter) Write(evt StreamEvent) error { data = append(data, '\n') s.mu.Lock() defer s.mu.Unlock() - if _, werr := s.w.Write(data); werr != nil { + n, werr := s.w.Write(data) + if werr == nil && n < len(data) { + werr = io.ErrShortWrite + } + if werr != nil { s.errOnce.Do(func() { fmt.Fprintf(os.Stderr, "tracker: NDJSON stream write error: %v (further write errors suppressed)\n", werr) }) diff --git a/tracker_events_test.go b/tracker_events_test.go index a3607b7..359c8b5 100644 --- a/tracker_events_test.go +++ b/tracker_events_test.go @@ -6,6 +6,8 @@ import ( "bytes" "encoding/json" "errors" + "io" + "os" "strings" "sync" "testing" @@ -335,6 +337,26 @@ type errWriter struct{ err error } func (e *errWriter) Write(_ []byte) (int, error) { return 0, e.err } +// shortWriter returns n < len(p) with a nil error — a legal but rare +// io.Writer behavior that NDJSONWriter must treat as a stream error so +// silent truncation cannot sneak through. +type shortWriter struct{} + +func (shortWriter) Write(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + return len(p) - 1, nil // one byte short, nil error +} + +func TestNDJSONWriter_Write_ShortWriteIsError(t *testing.T) { + w := NewNDJSONWriter(shortWriter{}) + err := w.Write(StreamEvent{Timestamp: "t", Source: "pipeline", Type: "x"}) + if err != io.ErrShortWrite { + t.Errorf("err = %v, want io.ErrShortWrite", err) + } +} + func TestNDJSONWriter_WriteReturnsError(t *testing.T) { w := NewNDJSONWriter(&errWriter{err: errors.New("sink closed")}) err := w.Write(StreamEvent{Timestamp: "t", Source: "pipeline", Type: "stage_started"}) @@ -387,24 +409,54 @@ func TestNDJSONWriter_TraceObserver_PanicRecovery(t *testing.T) { // TestNDJSONWriter_PanicSuppressionIsPerInstance verifies that one writer // recovering from a panic does not silence panic logging on a separate -// writer instance. 
Both writers must independently report their first -// panic; regressions here mean package-level state re-crept in. +// writer instance. Both writers must independently emit their first +// panic line to os.Stderr; regressions here mean package-level state +// re-crept in (a package-level sync.Once would log once and silently +// swallow the second, which the assertion below catches). func TestNDJSONWriter_PanicSuppressionIsPerInstance(t *testing.T) { defer func() { if r := recover(); r != nil { t.Fatalf("handlers should recover, got: %v", r) } }() - w1 := NewNDJSONWriter(&panicWriter{}) - w2 := NewNDJSONWriter(&panicWriter{}) - // First panic on w1 — must not consume w2's Once. - w1.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ - Type: pipeline.EventPipelineStarted, - Timestamp: time.Now(), - }) - // Second panic on w2 — still the first on its own instance. - w2.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ - Type: pipeline.EventPipelineStarted, - Timestamp: time.Now(), + + stderr := captureStderr(t, func() { + w1 := NewNDJSONWriter(&panicWriter{}) + w2 := NewNDJSONWriter(&panicWriter{}) + w1.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ + Type: pipeline.EventPipelineStarted, Timestamp: time.Now(), + }) + w2.PipelineHandler().HandlePipelineEvent(pipeline.PipelineEvent{ + Type: pipeline.EventPipelineStarted, Timestamp: time.Now(), + }) }) + + // Count the recovered-panic log lines. Each writer must emit its own. + // Package-level suppression would produce exactly 1; per-instance produces 2. + got := strings.Count(stderr, "NDJSON pipeline handler recovered from panic") + if got != 2 { + t.Errorf("got %d panic log lines, want 2 (each writer instance logs independently); stderr was:\n%s", got, stderr) + } +} + +// captureStderr swaps os.Stderr for an os.Pipe for the duration of fn, +// returning the captured output as a string. 
+func captureStderr(t *testing.T, fn func()) string { + t.Helper() + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("os.Pipe: %v", err) + } + origStderr := os.Stderr + os.Stderr = w + t.Cleanup(func() { os.Stderr = origStderr }) + + fn() + w.Close() + + data, err := io.ReadAll(r) + if err != nil { + t.Fatalf("read captured stderr: %v", err) + } + return string(data) }