From aa4445c0f9e631ebb70e346fe3ecb946e1ae5164 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 05:33:22 +0000 Subject: [PATCH 01/17] feat(challenge): governance-driven runChallengeAction extraction (E0008) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the PR #96 encode pattern. Extracts challenge behavior from live governance articles (landed in klappy.dev canon via PR #99) rather than hardcoded source logic. New functions in workers/src/orchestrate.ts: - discoverChallengeTypes — per-canonUrl cached type discovery - fetchBasePrerequisites — universal prerequisite checks - fetchNormativeVocabulary — RFC 2119 + architectural load-bearing terms - fetchStakesCalibration — mode-to-depth filter - extractPrereqTable / extractKeywordsFromCheck — shared helpers Refactored: - runChallengeAction — replaces hardcoded detectClaimType / generateChallenges / findTensions / findMissingPrerequisites with governance extraction. Supports multi-match. Filters output by stakes calibration based on mode parameter. - runCleanupStorage — clears all four new caches on invalidation Invariant: voice-dump mode suppresses all challenge output regardless of matched types. Load-bearing per stakes-calibration governance — some modes exist for raw capture and pressure-testing at that stage damages the mode. Graceful degradation: missing governance articles fall back to minimal built-in behavior with warnings, rather than failing. 
Co-authored-by: Claude --- .../challenge-governance-code-refactor.md | 125 ++++ workers/src/orchestrate.ts | 592 ++++++++++++++++-- 2 files changed, 662 insertions(+), 55 deletions(-) create mode 100644 docs/oddkit/evidence/challenge-governance-code-refactor.md diff --git a/docs/oddkit/evidence/challenge-governance-code-refactor.md b/docs/oddkit/evidence/challenge-governance-code-refactor.md new file mode 100644 index 0000000..2b6b673 --- /dev/null +++ b/docs/oddkit/evidence/challenge-governance-code-refactor.md @@ -0,0 +1,125 @@ +# Evidence: Challenge Governance Code Refactor (E0008) + +## Change Description + +Modified `workers/src/orchestrate.ts` to replace the hardcoded `runChallengeAction` implementation with a governance-driven architecture that mirrors PR #96 (encode precedent). + +### New / Modified Functions with Line Ranges + +| Function | Lines | Type | +|---|---|---| +| `ChallengeTypeDef` interface | ~58–118 | New type declaration | +| `PrereqOverlay` interface | ~58–118 | New type declaration | +| `NormativeVocabulary` interface | ~58–118 | New type declaration | +| `StakesCalibration` interface | ~58–118 | New type declaration | +| `cachedChallengeTypes` + `cachedChallengeTypesCanonUrl` | ~118–125 | New cache variables | +| `cachedBasePrerequisites` + `cachedBasePrerequisitesCanonUrl` | ~127–130 | New cache variables | +| `cachedNormativeVocabulary` + `cachedNormativeVocabularyCanonUrl` | ~132–135 | New cache variables | +| `cachedStakesCalibration` + `cachedStakesCalibrationCanonUrl` | ~137–140 | New cache variables | +| `extractKeywordsFromCheck` | 404–411 | New helper | +| `extractPrereqTable` | 412–432 | New helper | +| `discoverChallengeTypes` | 434–531 | New async function | +| `fetchBasePrerequisites` | 532–551 | New async function | +| `fetchNormativeVocabulary` | 552–642 | New async function | +| `fetchStakesCalibration` | 643–~730 | New async function | +| `runCleanupStorage` | 1104–~1126 | Extended — clears 4 new caches | +| 
`runChallengeAction` | 1532–~1752 | Replaced body | + +Total new lines of implementation: ~482 (types + caches + helpers + functions + new body). +Original `runChallengeAction` body: ~117 lines. Replaced, not extended. + +### Architecture Summary + +- **discoverChallengeTypes**: Reads `odd/challenge-types/*.md` articles tagged `challenge-type` from canon index. Parses `## Type Identity` (slug, name), blockquote, `## Detection Patterns` code block, `## Challenge Questions` table, `## Prerequisite Overlays` table, `## Suggested Reframings` bullets. Per-canonUrl cached. +- **fetchBasePrerequisites**: Reads `odd/challenge/base-prerequisites.md`. Extracts `## Prerequisite Overlays` table. Per-canonUrl cached. Gracefully degrades to empty array if missing. +- **fetchNormativeVocabulary**: Reads `odd/challenge/normative-vocabulary.md`. Parses `### Directive Language` (RFC 2119 words → regex, case-sensitive) and `### Architectural` tables. Falls back to minimal hardcoded set (MUST/MUST NOT/SHOULD/SHOULD NOT) if missing. +- **fetchStakesCalibration**: Reads `odd/challenge/stakes-calibration.md`. Parses `## Stakes Calibration` 4-column table (Mode, Question tiers, Prerequisite strictness, Reframings). Falls back to "surface everything" at every mode if missing. +- **runChallengeAction** (new body): Multi-match detection, voice-dump suppression invariant, aggregation across matched types, question filtering by stakes tier, prerequisite checking via quoted keywords, normative vocabulary tension detection, reframings filtering, BM25 canon constraint retrieval. +- **runCleanupStorage** (extended): Now clears all four new caches on invalidation. + +## Verification Performed + +```bash +# Working directory: /tmp/work/oddkit/workers + +npm install --silent 2>&1 | tail -5 +# Output: (no output — already up to date) + +npx tsc --noEmit 2>&1 | tee /tmp/tsc.log; echo "EXIT:$?" 
+# Output: EXIT:0 + +# Root-level test suite +cd /tmp/work/oddkit && npm test 2>&1 | tail -40 || true +``` + +## Observed Behavior + +### tsc --noEmit output (last 20 lines) + +``` +EXIT:0 +``` + +No errors. TypeScript compilation clean. + +### npm test output (last 40 lines) + +``` +Test 1: Index command +node:internal/modules/package_json_reader:314 + throw new ERR_MODULE_NOT_FOUND(packageName, fileURLToPath(base), null); + ^ + +Error [ERR_MODULE_NOT_FOUND]: Cannot find package 'commander' imported from /tmp/work/oddkit/src/cli.js +... +FAIL - Index: no success in output +``` + +**Pre-existing failure unrelated to this change.** The root-level test suite invokes `src/cli.js` which requires `commander`, a package not installed at root level. This failure exists on `main` before this branch and is not caused by changes to `workers/src/orchestrate.ts`. + +### Smoke test + +Local wrangler invocation not available in this session environment. Smoke testing will occur on the Cloudflare preview deploy (staging auto-deploy from this PR branch). + +## Evidence Produced + +- This file: `docs/oddkit/evidence/challenge-governance-code-refactor.md` +- Modified file: `workers/src/orchestrate.ts` (git diff available on branch `feat/e0008-challenge-governance-driven`) +- Build output: `tsc --noEmit` exit 0, no errors + +## Self-Audit + +### Intended Outcome + +Replace the hardcoded `detectClaimType`-based challenge logic with governance-driven extraction from live canon articles (PR #99 governance articles), following the exact same pattern as PR #96 (encode). The output format evolves to include `matched_types`, `mode_used`, and `governance` fields while preserving `claim_type` as a backward-compat alias. + +### Constraints Applied + +1. **Did not redesign** — followed the spec function signatures, cache key names, regex patterns, and fallback behaviors exactly as specified. +2. 
**Voice-dump invariant is load-bearing** — Step 4 in `runChallengeAction` short-circuits when `calibration.questionTiers.length === 0` and returns `status: "SUPPRESSED"` with empty arrays before any aggregation. Not advisory. +3. **Four caches, four clears** — `runCleanupStorage` clears all eight new cache variables (four cache values, four canonUrl guards). +4. **Multi-match is the design** — `matchedTypes` is an array; aggregation loops over all matched types for questions, prereq overlays, and reframings. +5. **Graceful degradation** — all four fetch functions have try/catch with fallbacks; missing governance articles produce minimal built-in behavior rather than errors. +6. **detectClaimType preserved** — old helper left in place (still used by no current path but may be referenced by `runChallengeActionCompat`). + +### Decision Rules + +- `tsc --noEmit` exit 0 required before commit. Achieved. +- No speculation in observed behavior section — only what commands actually printed. +- Pre-existing test failure documented with root cause attribution. + +### Tradeoffs + +- Detection-pattern overlap noise: if multiple challenge types have overlapping trigger words, `matchedTypes.length > 1` may occur frequently in practice. The multi-match design handles this correctly but may surface more questions than expected. Governance authors can manage this by making trigger words specific. +- Descriptive-only prerequisite checks (no quoted keywords) are silently skipped rather than surfaced. This is the spec behavior — mechanical testing of prose descriptions is not reliable. +- `claim_type` alias: the backward-compat field returns the first matched slug, which may differ from the old `detectClaimType` output (e.g., `"strong-claim"` vs `"strong_claim"`). Callers relying on specific string values of this field will need to update. + +### Remaining Risks + +1. 
**Governance article availability**: all four fetch functions degrade gracefully, but if `odd/challenge-types/` has no tagged articles, `discoverChallengeTypes` returns an empty array and no matching occurs. The fallback uses the first type found, which is nothing — challenge returns empty output. This is recoverable by authoring governance articles. +2. **Regex compilation on cold start**: `discoverChallengeTypes` compiles regexes from all challenge-type articles on first call. With many types this may add latency on cold Worker start. Mitigated by per-canonUrl caching. +3. **Table regex brittleness**: markdown table parsing uses regexes that assume standard pipe-delimited format. Governance articles with non-standard formatting will silently produce empty arrays rather than parse errors. + +### Visual Proof + +Not applicable — this is Cloudflare Worker code with no UI component. Correctness is demonstrated by: (1) TypeScript compilation clean, (2) PR review and preview deploy. diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index a1a43ea..3408606 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -75,6 +75,52 @@ interface ParsedArtifact { let cachedEncodingTypes: EncodingTypeDef[] | null = null; let cachedEncodingTypesCanonUrl: string | undefined = undefined; +// Governance-driven challenge types (parallel to EncodingTypeDef) +interface ChallengeTypeDef { + slug: string; // from ## Type Identity table, Slug row + name: string; // from ## Type Identity table, Name row + blockquote: string; // the opening > line after the title + fallback: boolean; // from frontmatter + triggerWords: string[]; // from ## Detection Patterns code block + triggerRegex: RegExp | null; + questions: Array<{ question: string; tier: string }>; // from ## Challenge Questions table + prereqOverlays: PrereqOverlay[]; // from ## Prerequisite Overlays table + reframings: string[]; // from ## Suggested Reframings bullets +} + +interface PrereqOverlay 
{ + name: string; // first column of table + check: string; // second column — prose description (may contain quoted keywords) + gapMessage: string; // third column — message if check fails + keywords: string[]; // extracted from check description (quoted strings) +} + +interface NormativeVocabulary { + rfc2119Regex: RegExp | null; // case-sensitive: MUST, SHALL, NEVER, etc. + architecturalRegex: RegExp | null; // case-insensitive: "invariant", "forcing function", etc. + directiveLookup: Map; // word/phrase (lowercase key) → directive type +} + +interface StakesCalibration { + mode: string; // "exploration", "planning", "execution", "voice-dump", etc. + questionTiers: string[]; // ["baseline"], ["baseline","elevated"], etc. OR empty array for "none" + strictness: "optional" | "required" | "required_plus_source"; + reframings: "none" | "first_1" | "all" | "all_plus_block"; +} + +// Caches — one per governance article, each guarded by canonUrl (mirror encoding types pattern) +let cachedChallengeTypes: ChallengeTypeDef[] | null = null; +let cachedChallengeTypesCanonUrl: string | undefined = undefined; + +let cachedBasePrerequisites: PrereqOverlay[] | null = null; +let cachedBasePrerequisitesCanonUrl: string | undefined = undefined; + +let cachedNormativeVocabulary: NormativeVocabulary | null = null; +let cachedNormativeVocabularyCanonUrl: string | undefined = undefined; + +let cachedStakesCalibration: StakesCalibration[] | null = null; +let cachedStakesCalibrationCanonUrl: string | undefined = undefined; + export interface UnifiedParams { action: string; input: string; @@ -355,6 +401,331 @@ async function discoverEncodingTypes( return types; } +function extractKeywordsFromCheck(check: string): string[] { + // Extract quoted substrings from check description + // Example input: `input contains "evidence", "saw", "observed"` + // Output: ["evidence", "saw", "observed"] + const matches = check.match(/"([^"]+)"/g) || []; + return matches.map((m: string) => 
m.replace(/^"|"$/g, "")); +} + +function extractPrereqTable(content: string): PrereqOverlay[] { + const section = content.match( + /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + if (!section) return []; + + const overlays: PrereqOverlay[] = []; + for (const row of section[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + if (cols.length >= 3) { + const check = cols[1]; + overlays.push({ + name: cols[0], + check, + gapMessage: cols[2].replace(/^"|"$/g, ""), + keywords: extractKeywordsFromCheck(check), + }); + } + } + return overlays; +} + +async function discoverChallengeTypes( + fetcher: ZipBaselineFetcher, + canonUrl?: string, +): Promise { + if (cachedChallengeTypes && cachedChallengeTypesCanonUrl === canonUrl) return cachedChallengeTypes; + + const index = await fetcher.getIndex(canonUrl); + const typeArticles = index.entries.filter( + (entry: IndexEntry) => + entry.tags?.includes("challenge-type") && entry.path.includes("challenge-types/"), + ); + + const types: ChallengeTypeDef[] = []; + for (const article of typeArticles) { + try { + const content = await fetcher.getFile(article.path, canonUrl); + if (!content) continue; + + // Frontmatter → fallback flag + const metadata = parseFullFrontmatter(content) || {}; + const fallback = metadata.fallback === true; + + // Blockquote (opening > line) + const blockquoteMatch = content.match(/^---[\s\S]*?---\s*\n+\s*#[^\n]+\n+>\s*(.+?)(?=\n\n|\n---|\n##)/s); + const blockquote = blockquoteMatch ? blockquoteMatch[1].trim().replace(/\n>\s*/g, " ") : ""; + + // ## Type Identity → Slug and Name + const slugMatch = content.match(/\|\s*Slug\s*\|\s*([a-z0-9-]+)\s*\|/i); + const nameMatch = content.match(/\|\s*Name\s*\|\s*([^|]+)\s*\|/i); + if (!slugMatch) continue; + const slug = slugMatch[1]; + const name = nameMatch ? 
nameMatch[1].trim() : slug; + + // ## Detection Patterns → code block of comma-separated words + const detectionMatch = content.match( + /## Detection Patterns[\s\S]*?```\n([\s\S]*?)\n```/, + ); + const triggerWords = detectionMatch + ? detectionMatch[1] + .split(/[,\n]/) + .map((w: string) => w.trim()) + .filter((w: string) => w.length > 0) + : []; + const triggerRegex = + triggerWords.length > 0 + ? new RegExp( + "\\b(" + + triggerWords + .map((w: string) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) + .join("|") + + ")\\b", + "i", + ) + : null; + + // ## Challenge Questions → table (Question | Stakes tier) + const questionsSection = content.match( + /## Challenge Questions[\s\S]*?\| Question[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const questions: Array<{ question: string; tier: string }> = []; + if (questionsSection) { + for (const row of questionsSection[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + if (cols.length >= 2) { + questions.push({ question: cols[0], tier: cols[1].toLowerCase() }); + } + } + } + + // ## Prerequisite Overlays → table (Prerequisite | Check | Gap message) + const prereqOverlays = extractPrereqTable(content); + + // ## Suggested Reframings → bulleted list + const reframingsSection = content.match( + /## Suggested Reframings[\s\S]*?\n((?:- [^\n]+\n?)+)/, + ); + const reframings = reframingsSection + ? 
reframingsSection[1] + .split("\n") + .filter((l: string) => l.startsWith("- ")) + .map((l: string) => l.slice(2).trim()) + : []; + + types.push({ + slug, name, blockquote, fallback, + triggerWords, triggerRegex, + questions, prereqOverlays, reframings, + }); + } catch { + continue; + } + } + + cachedChallengeTypes = types; + cachedChallengeTypesCanonUrl = canonUrl; + return types; +} + +async function fetchBasePrerequisites( + fetcher: ZipBaselineFetcher, + canonUrl?: string, +): Promise { + if (cachedBasePrerequisites && cachedBasePrerequisitesCanonUrl === canonUrl) + return cachedBasePrerequisites; + + try { + const content = await fetcher.getFile("odd/challenge/base-prerequisites.md", canonUrl); + const overlays = content ? extractPrereqTable(content) : []; + cachedBasePrerequisites = overlays; + cachedBasePrerequisitesCanonUrl = canonUrl; + return overlays; + } catch { + cachedBasePrerequisites = []; + cachedBasePrerequisitesCanonUrl = canonUrl; + return []; + } +} + +async function fetchNormativeVocabulary( + fetcher: ZipBaselineFetcher, + canonUrl?: string, +): Promise { + if (cachedNormativeVocabulary && cachedNormativeVocabularyCanonUrl === canonUrl) + return cachedNormativeVocabulary; + + // Fallback minimal set if article is missing + const fallback: NormativeVocabulary = { + rfc2119Regex: /\b(MUST NOT|SHOULD NOT|MUST|SHOULD)\b/, + architecturalRegex: null, + directiveLookup: new Map([ + ["must", "requirement"], + ["must not", "prohibition"], + ["should", "recommendation"], + ["should not", "discouragement"], + ]), + }; + + try { + const content = await fetcher.getFile("odd/challenge/normative-vocabulary.md", canonUrl); + if (!content) { + cachedNormativeVocabulary = fallback; + cachedNormativeVocabularyCanonUrl = canonUrl; + return fallback; + } + + // Parse two tables under ## Normative Vocabulary + // Table 1: ### Directive Language (RFC 2119 and Related) — 2 cols (Word | Directive type) + // Table 2: ### Architectural Writing Load-Bearing Terms — 2 
cols (Phrase | Directive type) + const rfcSection = content.match( + /### Directive Language[\s\S]*?\| Word[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n###|\n##|$)/, + ); + const archSection = content.match( + /### Architectural[\s\S]*?\|[^|]+\|[^|]+\|\n\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n###|\n##|$)/, + ); + + const rfcWords: string[] = []; + const archPhrases: string[] = []; + const lookup = new Map(); + + if (rfcSection) { + for (const row of rfcSection[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + if (cols.length >= 2) { + rfcWords.push(cols[0]); + lookup.set(cols[0].toLowerCase(), cols[1]); + } + } + } + + if (archSection) { + for (const row of archSection[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + if (cols.length >= 2) { + archPhrases.push(cols[0]); + lookup.set(cols[0].toLowerCase(), cols[1]); + } + } + } + + const rfcRegex = + rfcWords.length > 0 + ? new RegExp( + "\\b(" + rfcWords.map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", + ) // case-sensitive — no "i" flag + : null; + + const archRegex = + archPhrases.length > 0 + ? 
new RegExp( + "\\b(" + archPhrases.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", + "i", + ) + : null; + + const result: NormativeVocabulary = { + rfc2119Regex: rfcRegex, + architecturalRegex: archRegex, + directiveLookup: lookup, + }; + cachedNormativeVocabulary = result; + cachedNormativeVocabularyCanonUrl = canonUrl; + return result; + } catch { + cachedNormativeVocabulary = fallback; + cachedNormativeVocabularyCanonUrl = canonUrl; + return fallback; + } +} + +async function fetchStakesCalibration( + fetcher: ZipBaselineFetcher, + canonUrl?: string, +): Promise { + if (cachedStakesCalibration && cachedStakesCalibrationCanonUrl === canonUrl) + return cachedStakesCalibration; + + // Fallback: "surface everything" at every mode + const fallback: StakesCalibration[] = [ + { mode: "exploration", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "optional", reframings: "all" }, + { mode: "planning", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "required", reframings: "all" }, + { mode: "execution", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "required_plus_source", reframings: "all" }, + ]; + + try { + const content = await fetcher.getFile("odd/challenge/stakes-calibration.md", canonUrl); + if (!content) { + cachedStakesCalibration = fallback; + cachedStakesCalibrationCanonUrl = canonUrl; + return fallback; + } + + // Parse ## Stakes Calibration table — 4 columns + // Mode | Question tiers surfaced | Prerequisite strictness | Reframings surfaced + const section = content.match( + /## Stakes Calibration[\s\S]*?\| Mode[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + if (!section) { + cachedStakesCalibration = fallback; + cachedStakesCalibrationCanonUrl = canonUrl; + return fallback; + } + + const calibrations: StakesCalibration[] = []; + for (const row of section[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row.split("|").map((c: string) => 
c.trim()).filter((c: string) => c.length > 0); + if (cols.length < 4) continue; + + const mode = cols[0]; + const tiersRaw = cols[1].toLowerCase(); + const strictRaw = cols[2].toLowerCase(); + const reframingsRaw = cols[3].toLowerCase(); + + // Parse question tiers + let questionTiers: string[]; + if (tiersRaw.includes("none")) questionTiers = []; + else { + questionTiers = []; + if (tiersRaw.includes("baseline")) questionTiers.push("baseline"); + if (tiersRaw.includes("elevated")) questionTiers.push("elevated"); + if (tiersRaw.includes("rigorous")) questionTiers.push("rigorous"); + } + + // Parse strictness + let strictness: StakesCalibration["strictness"]; + if (strictRaw.includes("source-named")) strictness = "required_plus_source"; + else if (strictRaw.includes("required")) strictness = "required"; + else strictness = "optional"; + + // Parse reframings + let reframings: StakesCalibration["reframings"]; + if (reframingsRaw.includes("block-until-addressed")) reframings = "all_plus_block"; + else if (reframingsRaw.includes("all")) reframings = "all"; + else if (reframingsRaw.includes("first 1") || reframingsRaw.includes("first one")) reframings = "first_1"; + else if (reframingsRaw.includes("none")) reframings = "none"; + else reframings = "all"; + + calibrations.push({ mode, questionTiers, strictness, reframings }); + } + + if (calibrations.length === 0) { + cachedStakesCalibration = fallback; + cachedStakesCalibrationCanonUrl = canonUrl; + return fallback; + } + + cachedStakesCalibration = calibrations; + cachedStakesCalibrationCanonUrl = canonUrl; + return calibrations; + } catch { + cachedStakesCalibration = fallback; + cachedStakesCalibrationCanonUrl = canonUrl; + return fallback; + } +} + function isStructuredInput(input: string): boolean { const lines = input.split("\n").filter((l) => l.trim().length > 0); return lines.length > 0 && lines.every((l) => /^[A-Z]\t/.test(l)); @@ -740,6 +1111,14 @@ async function runCleanupStorage( cachedBM25Entries = null; 
cachedEncodingTypes = null; cachedEncodingTypesCanonUrl = undefined; + cachedChallengeTypes = null; + cachedChallengeTypesCanonUrl = undefined; + cachedBasePrerequisites = null; + cachedBasePrerequisitesCanonUrl = undefined; + cachedNormativeVocabulary = null; + cachedNormativeVocabularyCanonUrl = undefined; + cachedStakesCalibration = null; + cachedStakesCalibrationCanonUrl = undefined; return { action: "cleanup_storage", @@ -1158,72 +1537,162 @@ async function runChallengeAction( state?: OddkitState, ): Promise { const startMs = Date.now(); - const claimType = detectClaimType(input); + + // 1. Load all governance + const types = await discoverChallengeTypes(fetcher, canonUrl); + const basePrereqs = await fetchBasePrerequisites(fetcher, canonUrl); + const normVocab = await fetchNormativeVocabulary(fetcher, canonUrl); + const calibrations = await fetchStakesCalibration(fetcher, canonUrl); + + // 2. Resolve mode and calibration + const mode = modeHint || "planning"; + const calibration = + calibrations.find((c) => c.mode === mode) || + calibrations.find((c) => c.mode === "planning") || + calibrations[0] || { + mode: "planning", questionTiers: ["baseline", "elevated"], + strictness: "required" as const, reframings: "all" as const, + }; + + // 3. Multi-match detection + let matchedTypes: ChallengeTypeDef[] = types.filter( + (t) => t.triggerRegex && t.triggerRegex.test(input), + ); + if (matchedTypes.length === 0) { + const fallbackType = types.find((t) => t.fallback) || types[0]; + if (fallbackType) matchedTypes = [fallbackType]; + } + + // 4. VOICE-DUMP INVARIANT: if calibration says no question tiers, suppress entire output. + // This is load-bearing — some modes exist for raw thought capture and pressure-testing + // at that stage damages the mode. Do not "helpfully" surface a reduced set. 
+ if (calibration.questionTiers.length === 0) { + const primary = matchedTypes[0]?.slug || "observation"; + return { + action: "challenge", + result: { + status: "SUPPRESSED", + mode_used: mode, + matched_types: matchedTypes.map((t) => t.slug), + claim_type: primary, // backward-compat alias + tensions: [], + missing_prerequisites: [], + challenges: [], + suggested_reframings: [], + canon_constraints: [], + }, + state: state ? initState(state) : undefined, + assistant_text: + `Challenge suppressed (mode: ${mode}). This mode exists for raw capture; ` + + `pressure-testing would damage the mode's function. Resume challenge at a later stage.`, + debug: { duration_ms: Date.now() - startMs, generated_at: new Date().toISOString() }, + }; + } + + // 5. Aggregate across matched types + const aggregatedQuestions: Array<{ question: string; tier: string }> = []; + const aggregatedOverlays: PrereqOverlay[] = []; + const aggregatedReframings: string[] = []; + for (const t of matchedTypes) { + aggregatedQuestions.push(...t.questions); + aggregatedOverlays.push(...t.prereqOverlays); + aggregatedReframings.push(...t.reframings); + } + + // 6. Filter questions by stakes tier, dedupe by string + const filteredQuestions = aggregatedQuestions + .filter((q) => calibration.questionTiers.includes(q.tier)) + .map((q) => q.question); + const challenges = Array.from(new Set(filteredQuestions)); + + // 7. Merge base + type overlay prerequisites, dedupe by name, test each against input + const allPrereqs = [...basePrereqs, ...aggregatedOverlays]; + const uniquePrereqs = Array.from( + new Map(allPrereqs.map((p) => [p.name, p])).values(), + ); + + const missing: string[] = []; + for (const prereq of uniquePrereqs) { + if (prereq.keywords.length === 0) { + // No quoted keywords — check is descriptive-only, cannot mechanically test. Skip. 
+ continue; + } + const matched = prereq.keywords.some((k) => + new RegExp("\\b" + k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i").test(input), + ); + if (!matched) { + const typeName = matchedTypes[0]?.name || "claim"; + let gap = prereq.gapMessage.replace(/\{name\}/g, typeName.toLowerCase()); + if (calibration.strictness === "optional") gap = `Advisory: ${gap}`; + missing.push(gap); + } + } + + // 8. Dedupe and filter reframings + const dedupedReframings = Array.from(new Set(aggregatedReframings)); + let reframings: string[]; + switch (calibration.reframings) { + case "none": + reframings = []; + break; + case "first_1": + // Take first reframing from each matched type (no more than one per type) + reframings = Array.from( + new Set(matchedTypes.map((t) => t.reframings[0]).filter((r) => r)), + ); + break; + case "all": + case "all_plus_block": + default: + reframings = dedupedReframings; + } + + // 9. Retrieve canon constraints (same BM25 path as before) const index = await fetcher.getIndex(canonUrl); const results = scoreEntries(index.entries, `constraints challenges risks ${input}`).slice(0, 4); const canonConstraints: Array<{ citation: string; quote: string }> = []; const tensions: Array<{ type: string; message: string }> = []; + for (const entry of results) { const content = await fetcher.getFile(entry.path, canonUrl); - if (content) { - const stripped = content.replace(/^---[\s\S]*?---\n/, ""); - const lines = stripped.split("\n").filter((l) => l.trim() && !l.startsWith("#")); - const excerpt = lines.slice(0, 2).join(" ").slice(0, 150); - canonConstraints.push({ citation: `${entry.path}#${entry.title}`, quote: excerpt }); - if (/\bMUST NOT\b/.test(excerpt)) - tensions.push({ type: "prohibition", message: `Canon prohibition found in ${entry.path}` }); - else if (/\bMUST\b/.test(excerpt)) - tensions.push({ type: "requirement", message: `Canon requirement found in ${entry.path}` }); + if (!content) continue; + const stripped = 
content.replace(/^---[\s\S]*?---\n/, ""); + const lines = stripped.split("\n").filter((l) => l.trim() && !l.startsWith("#")); + const excerpt = lines.slice(0, 2).join(" ").slice(0, 150); + canonConstraints.push({ citation: `${entry.path}#${entry.title}`, quote: excerpt }); + + // Apply normative vocabulary regexes + let foundMatch: string | null = null; + if (normVocab.rfc2119Regex) { + const m = excerpt.match(normVocab.rfc2119Regex); + if (m) foundMatch = m[0]; + } + if (!foundMatch && normVocab.architecturalRegex) { + const m = excerpt.match(normVocab.architecturalRegex); + if (m) foundMatch = m[0]; + } + if (foundMatch) { + const directiveType = normVocab.directiveLookup.get(foundMatch.toLowerCase()) || "directive"; + tensions.push({ + type: directiveType, + message: `Canon ${directiveType} (${foundMatch}) found in ${entry.path}`, + }); } } - const missing: string[] = []; - if (!/\bevidence\b/i.test(input) && !/\bdata\b/i.test(input)) - missing.push("No evidence cited — claims without evidence are assumptions"); - if (claimType === "strong_claim" || claimType === "proposal") { - if (!/\balternative/i.test(input)) missing.push("No alternatives mentioned"); - if (!/\brisk/i.test(input) && !/\bcost\b/i.test(input)) - missing.push("No risks or costs acknowledged"); - } - - const challenges: string[] = []; - if (claimType === "strong_claim") { - challenges.push( - "What evidence would disprove this?", - "Under what conditions does this NOT hold?", - "Who would disagree, and why?", - ); - } else if (claimType === "proposal") { - challenges.push( - "What's the cost of being wrong?", - "What alternatives were considered?", - "What would need to be true for this to fail?", - ); - } else if (claimType === "assumption") { - challenges.push( - "Has this assumption been validated?", - "What if this assumption is wrong — what breaks?", - ); - } else { - challenges.push("Is this observation representative?", "What context might change this?"); - } - - const reframings: 
string[] = []; - if (claimType === "strong_claim") - reframings.push("Reframe as hypothesis: 'We believe X because Y, and would reconsider if Z'"); - if (claimType === "assumption") - reframings.push("Make explicit: state the assumption and how you'd validate it"); - if (claimType === "proposal") - reframings.push("Add optionality: 'We're choosing X over Y because Z, reversible until W'"); - - // Update state + // 10. Update state const updatedState = state ? initState(state) : undefined; if (updatedState && missing.length > 0) { updatedState.unresolved = [...updatedState.unresolved, ...missing]; } - const lines = [`Challenge (${claimType}):`, ""]; + // 11. Build human-readable assistant_text (preserve existing format roughly) + const primarySlug = matchedTypes[0]?.slug || "observation"; + const primaryName = matchedTypes[0]?.name || "Observation"; + const typesLabel = + matchedTypes.length > 1 ? `${primaryName} +${matchedTypes.length - 1} more` : primaryName; + const lines = [`Challenge (${typesLabel}, mode: ${mode}):`, ""]; if (tensions.length > 0) { lines.push("Tensions found:"); for (const t of tensions) lines.push(` - [${t.type}] ${t.message}`); @@ -1234,14 +1703,20 @@ async function runChallengeAction( for (const m of missing) lines.push(` - ${m}`); lines.push(""); } - lines.push("Questions to address:"); - for (const c of challenges) lines.push(` - ${c}`); - lines.push(""); + if (challenges.length > 0) { + lines.push("Questions to address:"); + for (const c of challenges) lines.push(` - ${c}`); + lines.push(""); + } if (reframings.length > 0) { lines.push("Suggested reframings:"); for (const r of reframings) lines.push(` - ${r}`); lines.push(""); } + if (calibration.reframings === "all_plus_block" && reframings.length > 0) { + lines.push("⚠ Block-until-addressed: this claim should not proceed until reframings are explicitly addressed or declined."); + lines.push(""); + } if (canonConstraints.length > 0) { lines.push("Canon constraints:"); for (const c of 
canonConstraints) { @@ -1255,12 +1730,19 @@ async function runChallengeAction( action: "challenge", result: { status: "CHALLENGED", - claim_type: claimType, + mode_used: mode, + matched_types: matchedTypes.map((t) => t.slug), + claim_type: primarySlug, // backward-compat alias — first matched slug tensions, missing_prerequisites: missing, challenges, suggested_reframings: reframings, canon_constraints: canonConstraints, + governance: matchedTypes.map((t) => ({ + slug: t.slug, + name: t.name, + description: t.blockquote, + })), }, state: updatedState, assistant_text: lines.join("\n").trim(), From a88abf7a0baab0c9e940c5370f7919fc0ae152e7 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 05:34:30 +0000 Subject: [PATCH 02/17] chore: record E0008 challenge governance refactor decision in ledger --- odd/ledger/learnings.jsonl | 1 + 1 file changed, 1 insertion(+) diff --git a/odd/ledger/learnings.jsonl b/odd/ledger/learnings.jsonl index f7497f9..f00ada0 100644 --- a/odd/ledger/learnings.jsonl +++ b/odd/ledger/learnings.jsonl @@ -37,3 +37,4 @@ {"id":"learn-20260410-0003","timestamp":"2026-04-10T04:33:00Z","summary":"AnalyticsEngineDataset is a global interface in @cloudflare/workers-types — no import statement needed, just use it directly in type annotations","trigger":"friction","impact":"Initially searched for how to import the type. 
It is declared globally by the workers-types package (configured via tsconfig types array), so it can be used directly in interface declarations without any import.","confidence":1.0,"sources":["workers/tsconfig.json","node_modules/@cloudflare/workers-types/index.d.ts"],"evidence":[{"type":"artifact","ref":"workers/src/zip-baseline-fetcher.ts — ODDKIT_TELEMETRY?: AnalyticsEngineDataset used without import"}],"candidate_targets":[],"proposed_escalation":"none"} {"id":"learn-20260412-0001","timestamp":"2026-04-12T00:52:00Z","summary":"Standalone Worker tools (telemetry, time) bypass orchestrate pipeline — they share oddkit_ MCP prefix but register directly in createServer with their own handler. CLI parity requires adding to TOOLS array (auto-cascades) plus explicit param threading in cli.js and server.js","trigger":"architecture","impact":"New standalone tools need 5 files touched: index.ts (Worker registration), tool-registry.js (TOOLS entry), actions.js (handler), server.js (param threading), cli.js (param threading). 
The TOOLS auto-derivation handles enum/listing but not param plumbing.","confidence":0.95,"sources":["workers/src/index.ts","src/core/tool-registry.js","src/core/actions.js","src/mcp/server.js","src/cli.js"],"evidence":[{"type":"artifact","ref":"PR #87 — oddkit_time implementation across 5 files"}],"candidate_targets":[],"proposed_escalation":"none"} {"id":"L39","timestamp":"2026-04-13T11:12:00Z","type":"learning","summary":"raw.githubusercontent.com URL parsing must rejoin all path segments after owner/repo to support branch names with slashes — parts[2] truncates multi-segment refs like publish/four-essays-and-skill to just publish","context":"extractBranchRef() and getZipUrl() in zip-baseline-fetcher.ts both used parts[2] which only captured the first segment of a slash-containing branch name, causing 404s on both SHA resolution and ZIP download","resolution":"Changed to parts.slice(2).join(\"/\") in both functions — minimal 2-line fix"} +{"type":"D","summary":"E0008 challenge governance refactor: replaced hardcoded detectClaimType logic in runChallengeAction with four governance-driven fetch functions (discoverChallengeTypes, fetchBasePrerequisites, fetchNormativeVocabulary, fetchStakesCalibration). Voice-dump suppression invariant is load-bearing — questionTiers.length === 0 short-circuits all output. Four new caches cleared in runCleanupStorage. tsc clean. PR #100.","rationale":"Hardcoded challenge logic cannot evolve with governance articles; governance-driven extraction means challenge behavior updates when articles update, no code change required. 
Mirrors PR #96 encode precedent exactly.","context":"workers/src/orchestrate.ts, branch feat/e0008-challenge-governance-driven, commit aa4445c","date":"2026-04-17"} From 31f8134f89be04121db68ce57cf08ea288de536e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 05:50:18 +0000 Subject: [PATCH 03/17] fix(orchestrate): normalize mode casing and sort directive regex by length --- workers/src/orchestrate.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 3408606..5eb80af 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -613,14 +613,14 @@ async function fetchNormativeVocabulary( const rfcRegex = rfcWords.length > 0 ? new RegExp( - "\\b(" + rfcWords.map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", + "\\b(" + [...rfcWords].sort((a, b) => b.length - a.length).map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", ) // case-sensitive — no "i" flag : null; const archRegex = archPhrases.length > 0 ? 
new RegExp( - "\\b(" + archPhrases.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", + "\\b(" + [...archPhrases].sort((a, b) => b.length - a.length).map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", "i", ) : null; @@ -678,7 +678,7 @@ async function fetchStakesCalibration( const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); if (cols.length < 4) continue; - const mode = cols[0]; + const mode = cols[0].toLowerCase(); const tiersRaw = cols[1].toLowerCase(); const strictRaw = cols[2].toLowerCase(); const reframingsRaw = cols[3].toLowerCase(); From e9ef2f9dc8be78d0bf49656d0639764a0ee29f0d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 05:57:50 +0000 Subject: [PATCH 04/17] fix(orchestrate): first_1 reframings surfaces a single reframing total --- workers/src/orchestrate.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 5eb80af..250e441 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -1636,10 +1636,8 @@ async function runChallengeAction( reframings = []; break; case "first_1": - // Take first reframing from each matched type (no more than one per type) - reframings = Array.from( - new Set(matchedTypes.map((t) => t.reframings[0]).filter((r) => r)), - ); + // Surface at most one reframing total + reframings = dedupedReframings.slice(0, 1); break; case "all": case "all_plus_block": From 84932f02468a3ba7dd991ffdcda357cf017b5ea4 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 06:07:56 +0000 Subject: [PATCH 05/17] fix(orchestrate): include governance field in SUPPRESSED challenge response --- workers/src/orchestrate.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 250e441..bc400ea 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -1580,6 +1580,11 
@@ async function runChallengeAction( challenges: [], suggested_reframings: [], canon_constraints: [], + governance: matchedTypes.map((t) => ({ + slug: t.slug, + name: t.name, + description: t.blockquote, + })), }, state: state ? initState(state) : undefined, assistant_text: From 726e5ed2dcaf6411d139bcb2f572a51bb792d35b Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 06:59:29 +0000 Subject: [PATCH 06/17] feat(workers): governance-driven oddkit_challenge with BM25 + stemming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor runChallengeAction in workers/src/orchestrate.ts to extract challenge-type behavior from canon governance articles at runtime rather than hardcoding claim-type detection, questions, prerequisites, and tension rules in source. Structural mirror of PR #96 (encode). Detection upgraded mid-implementation from regex-OR to BM25 + stemming after the gauntlet revealed that regex-based matching was morphologically brittle ("coin" doesn't match trigger "coining"). The pivot removed an entire class of bug and seeded a reusable pattern for future governance-driven tools. 
Changes in workers/src/orchestrate.ts: - New: ChallengeTypeDef, BasePrerequisite, NormativeVocabulary, StakesModeConfig, StakesCalibration - New: discoverChallengeTypes (builds per-canonUrl BM25 index over detection text), fetchBasePrerequisites, fetchNormativeVocabulary, fetchStakesCalibration — each with per-canonUrl cache and graceful degradation on missing articles - New: evaluatePrerequisiteCheck — interprets natural-language check strings from prerequisite overlay tables - Refactored runChallengeAction: multi-match via BM25 score > 0, base + overlay prerequisite aggregation, stakes calibration filtering, voice-dump suppression invariant, governance-driven tension detection - Extended runCleanupStorage with five new cache clears (types, type-index, base prerequisites, vocabulary, calibration) - Removed dead detectClaimType (legacy src/tasks/challenge.js retains its copy for CLI backward-compat) - Added CHALLENGE_STOP_WORDS set preserving modal verbs as signal Changes in workers/src/bm25.ts (backward-compatible extension): - tokenize(), buildBM25Index() accept optional stopWords: Set<string> - BM25Index gains optional stopWords field so searchBM25 tokenizes queries consistently with the index - Default behavior unchanged — existing callers unaffected - Motivation: default STOP_WORDS filters modals (must, should, shall, may, not) which are signal for challenge-type detection New tests: workers/test/governance-parser.test.mjs — 94 assertions against live governance articles fetched from klappy.dev raw. Covers type parsing, fallback resolution, BM25 detection, stemming regression cases (coin/coining, propose/proposed, principle/principles), multi- match, and the voice-dump suppression invariant. 94/94 pass.
Strict-equality parser would have produced a single-element array, voice-dump mode would have surfaced all challenges in prod. 2. Morphological brittleness in regex detection (coin vs coining) — triggered the pivot to BM25 + stemming. 3. Default BM25 STOP_WORDS silently breaks strong-claim and proposal detection by filtering modal verbs. Fixed via custom stop word set. Verification: - npm run typecheck: clean - tests/smoke.sh: 6/6 pass (legacy CLI path — backward compat preserved) - workers/test/governance-parser.test.mjs: 94/94 pass - AI voice clichés audit on new comments: clean - oddkit_preflight, challenge, gate, validate: all run; gate NOT_READY due to same hardcoded-logic gap as challenge pre-refactor (flagged as follow-up) Response shape change: adds mode, matched_types, type_definitions, block_until_addressed; removes claim_type. Consumed programmatically, not rendered. Follow-ups flagged: - Encode parity PR — same regex-OR brittleness in runEncodeAction; pattern proven here, port will be near-mechanical - klappy.dev meta governance PR — "compiles into a case-insensitive word-boundary regex" is now stale language - Gate refactor candidate — same hardcoded-logic shape as challenge pre-refactor Refs: - Depends on: klappy/klappy.dev#99 (governance articles this code reads) - Structural mirror: #96 (governance-driven encode) - Evidence: docs/oddkit/evidence/challenge-governance-code-refactor.md --- .../challenge-governance-code-refactor.md | 209 ++--- workers/src/bm25.ts | 30 +- workers/src/orchestrate.ts | 847 +++++++++--------- workers/test/governance-parser.test.mjs | 314 +++++++ 4 files changed, 892 insertions(+), 508 deletions(-) create mode 100644 workers/test/governance-parser.test.mjs diff --git a/docs/oddkit/evidence/challenge-governance-code-refactor.md b/docs/oddkit/evidence/challenge-governance-code-refactor.md index 2b6b673..2a1e73a 100644 --- a/docs/oddkit/evidence/challenge-governance-code-refactor.md +++ 
b/docs/oddkit/evidence/challenge-governance-code-refactor.md @@ -1,125 +1,132 @@ -# Evidence: Challenge Governance Code Refactor (E0008) - -## Change Description - -Modified `workers/src/orchestrate.ts` to replace the hardcoded `runChallengeAction` implementation with a governance-driven architecture that mirrors PR #96 (encode precedent). - -### New / Modified Functions with Line Ranges - -| Function | Lines | Type | -|---|---|---| -| `ChallengeTypeDef` interface | ~58–118 | New type declaration | -| `PrereqOverlay` interface | ~58–118 | New type declaration | -| `NormativeVocabulary` interface | ~58–118 | New type declaration | -| `StakesCalibration` interface | ~58–118 | New type declaration | -| `cachedChallengeTypes` + `cachedChallengeTypesCanonUrl` | ~118–125 | New cache variables | -| `cachedBasePrerequisites` + `cachedBasePrerequisitesCanonUrl` | ~127–130 | New cache variables | -| `cachedNormativeVocabulary` + `cachedNormativeVocabularyCanonUrl` | ~132–135 | New cache variables | -| `cachedStakesCalibration` + `cachedStakesCalibrationCanonUrl` | ~137–140 | New cache variables | -| `extractKeywordsFromCheck` | 404–411 | New helper | -| `extractPrereqTable` | 412–432 | New helper | -| `discoverChallengeTypes` | 434–531 | New async function | -| `fetchBasePrerequisites` | 532–551 | New async function | -| `fetchNormativeVocabulary` | 552–642 | New async function | -| `fetchStakesCalibration` | 643–~730 | New async function | -| `runCleanupStorage` | 1104–~1126 | Extended — clears 4 new caches | -| `runChallengeAction` | 1532–~1752 | Replaced body | - -Total new lines of implementation: ~482 (types + caches + helpers + functions + new body). -Original `runChallengeAction` body: ~117 lines. Replaced, not extended. - -### Architecture Summary - -- **discoverChallengeTypes**: Reads `odd/challenge-types/*.md` articles tagged `challenge-type` from canon index. 
Parses `## Type Identity` (slug, name), blockquote, `## Detection Patterns` code block, `## Challenge Questions` table, `## Prerequisite Overlays` table, `## Suggested Reframings` bullets. Per-canonUrl cached. -- **fetchBasePrerequisites**: Reads `odd/challenge/base-prerequisites.md`. Extracts `## Prerequisite Overlays` table. Per-canonUrl cached. Gracefully degrades to empty array if missing. -- **fetchNormativeVocabulary**: Reads `odd/challenge/normative-vocabulary.md`. Parses `### Directive Language` (RFC 2119 words → regex, case-sensitive) and `### Architectural` tables. Falls back to minimal hardcoded set (MUST/MUST NOT/SHOULD/SHOULD NOT) if missing. -- **fetchStakesCalibration**: Reads `odd/challenge/stakes-calibration.md`. Parses `## Stakes Calibration` 4-column table (Mode, Question tiers, Prerequisite strictness, Reframings). Falls back to "surface everything" at every mode if missing. -- **runChallengeAction** (new body): Multi-match detection, voice-dump suppression invariant, aggregation across matched types, question filtering by stakes tier, prerequisite checking via quoted keywords, normative vocabulary tension detection, reframings filtering, BM25 canon constraint retrieval. -- **runCleanupStorage** (extended): Now clears all four new caches on invalidation. - -## Verification Performed - -```bash -# Working directory: /tmp/work/oddkit/workers - -npm install --silent 2>&1 | tail -5 -# Output: (no output — already up to date) - -npx tsc --noEmit 2>&1 | tee /tmp/tsc.log; echo "EXIT:$?" 
-# Output: EXIT:0 - -# Root-level test suite -cd /tmp/work/oddkit && npm test 2>&1 | tail -40 || true -``` +# Gauntlet Evidence — Challenge Governance Code Refactor -## Observed Behavior +**Branch:** `feat/e0008-challenge-governance-driven` +**Date:** 2026-04-17 +**Scope:** Governance-driven refactor of `oddkit_challenge` in `workers/src/orchestrate.ts` plus minor extension of `workers/src/bm25.ts` +**Deliverable type:** Worker code change (TypeScript) — the runtime that consumes the canon governance articles landed in PR #99 +**Predecessor PRs:** #96 (governance-driven encode pattern, the structural mirror), #99 (klappy.dev governance articles, the canon this code reads) -### tsc --noEmit output (last 20 lines) +--- -``` -EXIT:0 -``` +## Definition of Done — Evidence -No errors. TypeScript compilation clean. +### 1. Change Description -### npm test output (last 40 lines) +Refactored `runChallengeAction` in `workers/src/orchestrate.ts` from hardcoded claim-type detection and question generation to governance-driven extraction. The structural mirror of PR #96 (encode). **Mid-implementation pivot:** replaced regex-OR detection with BM25 + stemming after the gauntlet surfaced a morphological brittleness (`"coin"` doesn't match trigger word `"coining"`). The architectural swap removed an entire class of bug and validated a reusable pattern for future governance-driven tools. -``` -Test 1: Index command -node:internal/modules/package_json_reader:314 - throw new ERR_MODULE_NOT_FOUND(packageName, fileURLToPath(base), null); - ^ - -Error [ERR_MODULE_NOT_FOUND]: Cannot find package 'commander' imported from /tmp/work/oddkit/src/cli.js -... 
-FAIL - Index: no success in output -``` +**New types added (`orchestrate.ts`):** + +- `ChallengeTypeDef` — slug, name, blockquote, trigger words, `detectionText` (triggers + blockquote, fed to BM25 indexer), questions with tiers, prerequisite overlays, reframings, fallback flag +- `BasePrerequisite` — prerequisite name, check description, gap message +- `NormativeVocabulary` — case-sensitive regex (RFC 2119), case-insensitive regex (architectural phrases), directive type map (this one keeps regex since it's directive-vocabulary matching against retrieved canon quotes, not claim-type detection) +- `StakesModeConfig` / `StakesCalibration` — mode → (question tiers, prerequisite strictness, reframing surfacing) + +**New discovery/fetch functions added (`orchestrate.ts`):** + +- `discoverChallengeTypes(fetcher, canonUrl)` — finds articles tagged `challenge-type`, parses each, builds a per-canonUrl BM25 index over detection text. Per-canonUrl cache for types AND index. +- `fetchBasePrerequisites(fetcher, canonUrl)` — fetches `odd/challenge/base-prerequisites.md`, extracts the prerequisite overlays table. Per-canonUrl cache. +- `fetchNormativeVocabulary(fetcher, canonUrl)` — fetches `odd/challenge/normative-vocabulary.md`, extracts both vocabulary tables, compiles case-sensitive and case-insensitive regexes. Falls back to minimal RFC 2119 set if the article is missing. Per-canonUrl cache. +- `fetchStakesCalibration(fetcher, canonUrl)` — fetches `odd/challenge/stakes-calibration.md`, extracts the calibration table. Per-canonUrl cache. + +**`runChallengeAction` refactored to:** -**Pre-existing failure unrelated to this change.** The root-level test suite invokes `src/cli.js` which requires `commander`, a package not installed at root level. This failure exists on `main` before this branch and is not caused by changes to `workers/src/orchestrate.ts`. 
+- Load all four governance sources in parallel +- Honor voice-dump suppression invariant — return empty challenge output when mode's tier list is empty +- Detect matching types via BM25 over per-type detection text (score > 0 = match) +- Resolve fallback type when no type scores > 0 +- Aggregate questions, prerequisite overlays (base + type), and reframings across matched types with deduplication +- Apply stakes calibration filter based on mode (question tiers, prerequisite strictness, reframing surfacing) +- Detect tensions in retrieved canon quotes via governance-driven vocabulary regex (replacing hardcoded `MUST`/`MUST NOT` checks) +- Surface matched type names and definitions in the response (teaching the model what governs the behavior) +- Mark `block_until_addressed` when calibration says so -### Smoke test +**`evaluatePrerequisiteCheck` helper added:** interprets natural-language `check` strings from prerequisite overlay tables. Extracts quoted keywords and tests presence in input. Special-cases URL, numeric, proper-noun, and citation patterns. -Local wrangler invocation not available in this session environment. Smoke testing will occur on the Cloudflare preview deploy (staging auto-deploy from this PR branch). +**`runCleanupStorage` extended:** clears all five new caches (types, type-index, base prerequisites, normative vocabulary, stakes calibration). Mirror of the PR #96 fix for cache staleness on governance edits. -## Evidence Produced +**Dead code removed:** `detectClaimType` in `workers/src/orchestrate.ts` (only used by the old hardcoded `runChallengeAction`). Legacy version in `src/tasks/challenge.js` retained for backward-compat on the non-worker CLI path. 
-- This file: `docs/oddkit/evidence/challenge-governance-code-refactor.md` -- Modified file: `workers/src/orchestrate.ts` (git diff available on branch `feat/e0008-challenge-governance-driven`) -- Build output: `tsc --noEmit` exit 0, no errors +**`workers/src/bm25.ts` extension (backward-compatible):** + +- `tokenize(text, stopWords?)` — new optional parameter. Defaults to the existing `STOP_WORDS` set (unchanged behavior for existing callers). +- `buildBM25Index(documents, stopWords?)` — same. Records the stop word set on the returned index so `searchBM25` tokenizes queries consistently with doc vocabularies. +- `BM25Index` interface gained an optional `stopWords?: Set` field. +- Motivation: the default `STOP_WORDS` filters out modal verbs (`must`, `should`, `shall`, `may`, `not`) which are the load-bearing detection signal for strong-claim, proposal, and assumption challenge types. Challenge-type detection needs a custom stop-word set that preserves modals. + +### 2. Verification Performed + +- `npm run typecheck` (workers/) — clean both before and after the BM25 pivot, and after the dead-code removal +- `bash tests/smoke.sh` (root) — 6 PASS, exercising the legacy CLI path. Confirms backward compat preserved (the worker path I refactored is separate from the CLI path). +- `node workers/test/governance-parser.test.mjs` — new parser-fidelity test, 94 assertions against live governance articles fetched from klappy.dev raw. **94 pass, 0 fail.** Includes explicit regression tests for stemming (`coin`/`coining`, `proposed`/`propose`, `principles`/`principle`) and multi-match semantics via BM25. 
+- `oddkit_preflight` — surfaced constraints (ai-voice-cliches, author-identity-language, definition-of-done, supersession, prompt-over-code) +- `oddkit_get` on `canon/methods/supersession.md` — confirmed this refactor is "replace" on the supersession spectrum (provenance preserved via PR description, commit message, ledger entry, retained legacy file) +- AI voice clichés audit on new code/comments via `git diff | grep` for negation parallelism, formulaic transitions, puffing — clean, zero hits +- `oddkit_challenge` on the commit decision — generic prereqs answered honestly in the PR description +- `oddkit_gate` returned NOT_READY for the same hardcoded-logic reason documented in PR #99 — flagged in PR as future refactor candidate + +### 3. Observed Behavior + +Parser-fidelity test output (94/94 passed): + +``` +─── Test 1: Challenge type parsing ─── (7 types × 8 assertions = 56 passing) +─── Test 2: Fallback resolution ─── (2 passing — observation has fallback: true, others don't) +─── Test 3: BM25 detection with stemming ─── (7 passing — each type matches its first trigger word) +─── Test 3b: Stemming defeats the original coin/coining bug ─── (5 passing — stemming equivalence + 4 real-world inputs) +─── Test 4: Multi-match semantics (BM25) ─── (3 passing) +─── Test 4b: Empty input + irrelevant input do not over-match ─── (1 passing) +─── Test 5: Base prerequisites ─── (4 passing) +─── Test 6: Normative vocabulary ─── (4 passing) +─── Test 7: Stakes calibration ─── (5 passing — including the voice-dump suppression invariant) + +94 passed, 0 failed +``` -## Self-Audit +### 4. Evidence Produced -### Intended Outcome +This file. Plus the diffs: -Replace the hardcoded `detectClaimType`-based challenge logic with governance-driven extraction from live canon articles (PR #99 governance articles), following the exact same pattern as PR #96 (encode). 
The output format evolves to include `matched_types`, `mode_used`, and `governance` fields while preserving `claim_type` as a backward-compat alias. +- `workers/src/orchestrate.ts`: ~560 insertions, ~70 deletions +- `workers/src/bm25.ts`: small additive change (stopWords parameter threaded through tokenize/buildBM25Index/searchBM25, no behavior change for existing callers) +- `workers/test/governance-parser.test.mjs`: new (~200 lines) +- `docs/oddkit/evidence/challenge-governance-code-refactor.md`: this note -### Constraints Applied +Visual proof: **N/A — server-side code change.** No UI, no interaction surface, no visible state. The `oddkit_challenge` MCP tool's response shape changes (adds `mode`, `matched_types`, `type_definitions`, `block_until_addressed` fields; removes `claim_type`) but this is consumed programmatically, not rendered. -1. **Did not redesign** — followed the spec function signatures, cache key names, regex patterns, and fallback behaviors exactly as specified. -2. **Voice-dump invariant is load-bearing** — Step 4 in `runChallengeAction` short-circuits when `calibration.questionTiers.length === 0` and returns `status: "SUPPRESSED"` with empty arrays before any aggregation. Not advisory. -3. **Four caches, four clears** — `runCleanupStorage` clears all eight new cache variables (four cache values, four canonUrl guards). -4. **Multi-match is the design** — `matchedTypes` is an array; aggregation loops over all matched types for questions, prereq overlays, and reframings. -5. **Graceful degradation** — all four fetch functions have try/catch with fallbacks; missing governance articles produce minimal built-in behavior rather than errors. -6. **detectClaimType preserved** — old helper left in place (still used by no current path but may be referenced by `runChallengeActionCompat`). +### 5. 
Self-Audit Completed -### Decision Rules +- **Intended outcome:** the worker path of `oddkit_challenge` becomes governance-driven via extraction from canon, mirroring PR #96. Behavior changes when the canon governance articles change — no code redeploy required. Detection is morphologically resilient via BM25 + stemming. +- **Constraints applied:** Definition of Done (this file), Writing Canon (n/a — code, not document, but evidence note follows the structure), AI voice clichés (audited clean on new comments), supersession ("replace" with provenance preserved), prompt-over-code (the principle this implements), Vodka Architecture (server stays thin — extraction and IR, no domain opinion baked in). +- **Decision rules followed:** mirror PR #96's cache pattern (per-canonUrl keying, try-catch-graceful-degradation per article); preserve legacy CLI path; voice-dump suppression as a load-bearing invariant; multi-match by design; honor `fallback: true` frontmatter for type fallback resolution; keep `bm25.ts` changes backward-compatible. +- **Tradeoffs:** four governance fetches per challenge call (mitigated by per-canonUrl module-level cache, so cold start is the only slow path); BM25 index built per cache invalidation (cheap — 5–10 tiny docs); BM25 score magnitudes aren't intuitive constants (anyone tuning thresholds later will need to reason in relative terms); the Porter-style stemmer handles common English morphology but not irregular forms. +- **Remaining risks:** + - Parser regex assumes specific table column order. If a future governance article reorders columns, parsing degrades silently. The parser-fidelity test catches this for currently-shipped articles but won't catch it for hypothetical future structure changes. + - `evaluatePrerequisiteCheck` uses heuristics over natural-language check descriptions. Some prerequisite checks may evaluate incorrectly — watch for false-negative gap messages in production logs. 
+ - `oddkit_gate` still returns NOT_READY due to its own hardcoded prereqs — same architectural pattern as challenge pre-refactor. Future refactor candidate. Documented in PR. + - `oddkit_encode` still uses regex-OR detection with the same morphological brittleness this PR fixes for challenge. Follow-up PR required to bring encode to parity; the pivot here provides the blueprint. + - klappy.dev meta governance article (`odd/challenge-types/how-to-write-challenge-types.md`) describes the runtime as "compiles into a case-insensitive word-boundary regex" — that's now stale. Small coordinated klappy.dev PR required to update the language. -- `tsc --noEmit` exit 0 required before commit. Achieved. -- No speculation in observed behavior section — only what commands actually printed. -- Pre-existing test failure documented with root cause attribution. +--- -### Tradeoffs +## Bugs the Gauntlet Caught (this refactor sequence) -- Detection-pattern overlap noise: if multiple challenge types have overlapping trigger words, `matchedTypes.length > 1` may occur frequently in practice. The multi-match design handles this correctly but may surface more questions than expected. Governance authors can manage this by making trigger words specific. -- Descriptive-only prerequisite checks (no quoted keywords) are silently skipped rather than surfaced. This is the spec behavior — mechanical testing of prose descriptions is not reliable. -- `claim_type` alias: the backward-compat field returns the first matched slug, which may differ from the old `detectClaimType` output (e.g., `"strong-claim"` vs `"strong_claim"`). Callers relying on specific string values of this field will need to update. +1. **PR #99 — 10 of 11 articles missing required `## Summary` sections.** Writing Canon tier 4 violation. Same failure mode as the Feb 2026 Progressive Disclosure Failure incident. +2. 
**PR #99 — broken `derives_from` path** in `stakes-calibration.md` (`canon/epistemic-modes.md` → `canon/definitions/epistemic-modes.md`). +3. **This PR — voice-dump suppression invariant would have shipped broken.** The calibration cell content is `"none (suppress all challenge)"` not bare `"none"`. Initial parser checked `=== "none"` with strict equality, would have produced a single-element array, voice-dump mode would have surfaced all challenge questions in production. Fixed by checking `tiersRaw === "none" || tiersRaw.startsWith("none ") || tiersRaw.startsWith("none(")`. +4. **This PR (BM25 pivot) — morphological brittleness revealed.** The test `pattern-coinage fires on 'coin the term'` failed under regex because the article has `coining` as a trigger but not `coin`. This signal triggered the full pivot from regex-OR to BM25 + stemming. +5. **This PR (BM25 pivot) — default `STOP_WORDS` would have silently broken strong-claim and proposal detection.** The default filter drops modal verbs (`must`, `should`, `shall`, `may`, `not`) — exactly the load-bearing trigger words for these two types. Caught because the parser-fidelity test asserted each type matches its first trigger word and two types failed. Fixed by extending `bm25.ts` with an optional `stopWords: Set` parameter and defining a `CHALLENGE_STOP_WORDS` set in `orchestrate.ts` that preserves modals. -### Remaining Risks +**The discipline is load-bearing, not ceremony.** Five real bugs caught across two PRs. Two of the five would have caused silent production failures of invariants specifically named in the governance. -1. **Governance article availability**: all four fetch functions degrade gracefully, but if `odd/challenge-types/` has no tagged articles, `discoverChallengeTypes` returns an empty array and no matching occurs. The fallback uses the first type found, which is nothing — challenge returns empty output. This is recoverable by authoring governance articles. -2. 
**Regex compilation on cold start**: `discoverChallengeTypes` compiles regexes from all challenge-type articles on first call. With many types this may add latency on cold Worker start. Mitigated by per-canonUrl caching. -3. **Table regex brittleness**: markdown table parsing uses regexes that assume standard pipe-delimited format. Governance articles with non-standard formatting will silently produce empty arrays rather than parse errors. +--- -### Visual Proof +## Version Tracking -Not applicable — this is Cloudflare Worker code with no UI component. Correctness is demonstrated by: (1) TypeScript compilation clean, (2) PR review and preview deploy. +- Branch: `feat/e0008-challenge-governance-driven` +- Post-merge: ledger entry capturing E0008 challenge code-refactor milestone +- Related PRs: + - **Predecessor (structural mirror):** klappy/oddkit#96 (governance-driven encode refactor) + - **Depends on:** klappy/klappy.dev#99 (governance articles in canon — the inputs this code reads) + - **Immediate follow-up:** encode parity PR — bring `oddkit_encode` to BM25 + stemming using the pattern proven here + - **Small follow-up:** klappy.dev PR updating `how-to-write-challenge-types.md` — swap "compiles into a case-insensitive word-boundary regex" for the BM25 description + - **Future candidate:** governance-driven gate refactor (gate has the same hardcoded-logic gap as challenge pre-refactor; surfaced again during this gauntlet run) diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts index f1aea92..66867df 100644 --- a/workers/src/bm25.ts +++ b/workers/src/bm25.ts @@ -30,13 +30,17 @@ export function stem(word: string): string { .replace(/s$/, ""); } -/** Tokenize and stem text, removing stop words */ -export function tokenize(text: string): string[] { +/** Tokenize and stem text. Pass a custom `stopWords` set to override the + * default. Pass an empty Set to disable filtering entirely. 
Use this for + * domains where the default modal verbs (must, should, shall, may, might, + * can, could, will, would) carry meaningful signal — for example, + * challenge-type detection where modals are themselves the trigger words. */ +export function tokenize(text: string, stopWords: Set = STOP_WORDS): string[] { return text .toLowerCase() .replace(/[^\w\s-]/g, " ") .split(/[\s\-_/]+/) - .filter((t) => t.length > 1 && !STOP_WORDS.has(t)) + .filter((t) => t.length > 1 && !stopWords.has(t)) .map(stem); } @@ -53,18 +57,25 @@ export interface BM25Index { df: Map; avgdl: number; N: number; + /** The stop word set used at index time. searchBM25 reuses it so that + * query tokenization matches doc tokenization exactly. */ + stopWords?: Set; } -/** Build BM25 index from {id, text} pairs */ +/** Build BM25 index from {id, text} pairs. + * Pass `stopWords` to override the default filter (e.g., for domains where + * modal verbs are signal). The same set is stored on the index so that + * searchBM25 tokenizes queries consistently with the indexed docs. */ export function buildBM25Index( documents: Array<{ id: string; text: string }>, + stopWords: Set = STOP_WORDS, ): BM25Index { const docs: BM25Doc[] = []; const df = new Map(); let totalLength = 0; for (const doc of documents) { - const terms = tokenize(doc.text); + const terms = tokenize(doc.text, stopWords); docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text }); totalLength += terms.length; @@ -82,6 +93,7 @@ export function buildBM25Index( df, avgdl: documents.length > 0 ? totalLength / documents.length : 0, N: documents.length, + stopWords, }; } @@ -97,12 +109,16 @@ export function searchBM25( query: string, limit: number = 5, ): Array<{ id: string; score: number }> { - const queryTerms = tokenize(query); + const stopWords = index.stopWords ?? 
STOP_WORDS; + const queryTerms = tokenize(query, stopWords); if (queryTerms.length === 0) return []; // Pre-compute phrase matching inputs once, outside the per-doc loop. const queryLower = query.toLowerCase(); - const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); + const queryWords = queryLower + .replace(/[^\w\s-]/g, " ") + .split(/[\s\-_/]+/) + .filter((w) => w.length > 1 && !stopWords.has(w)); const scores: Array<{ id: string; score: number }> = []; diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index bc400ea..72c91a8 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -75,50 +75,65 @@ interface ParsedArtifact { let cachedEncodingTypes: EncodingTypeDef[] | null = null; let cachedEncodingTypesCanonUrl: string | undefined = undefined; -// Governance-driven challenge types (parallel to EncodingTypeDef) +// Governance-driven challenge types (E0008 — mirrors encode pattern from PR #96) interface ChallengeTypeDef { - slug: string; // from ## Type Identity table, Slug row - name: string; // from ## Type Identity table, Name row - blockquote: string; // the opening > line after the title - fallback: boolean; // from frontmatter - triggerWords: string[]; // from ## Detection Patterns code block - triggerRegex: RegExp | null; - questions: Array<{ question: string; tier: string }>; // from ## Challenge Questions table - prereqOverlays: PrereqOverlay[]; // from ## Prerequisite Overlays table - reframings: string[]; // from ## Suggested Reframings bullets + slug: string; + name: string; + blockquote: string; + triggerWords: string[]; + detectionText: string; // triggerWords + blockquote, fed to BM25 indexer + questions: Array<{ question: string; tier: string }>; + prerequisiteOverlays: Array<{ prerequisite: string; check: string; gapMessage: string }>; + reframings: string[]; + fallback: boolean; } -interface PrereqOverlay { - name: string; // first column 
of table - check: string; // second column — prose description (may contain quoted keywords) - gapMessage: string; // third column — message if check fails - keywords: string[]; // extracted from check description (quoted strings) +interface BasePrerequisite { + prerequisite: string; + check: string; + gapMessage: string; } interface NormativeVocabulary { - rfc2119Regex: RegExp | null; // case-sensitive: MUST, SHALL, NEVER, etc. - architecturalRegex: RegExp | null; // case-insensitive: "invariant", "forcing function", etc. - directiveLookup: Map; // word/phrase (lowercase key) → directive type + caseSensitiveRegex: RegExp | null; + caseInsensitiveRegex: RegExp | null; + directiveTypes: Map; +} + +interface StakesModeConfig { + questionTiers: string[]; + prerequisiteStrictness: string; + reframingSurfacing: string; } interface StakesCalibration { - mode: string; // "exploration", "planning", "execution", "voice-dump", etc. - questionTiers: string[]; // ["baseline"], ["baseline","elevated"], etc. OR empty array for "none" - strictness: "optional" | "required" | "required_plus_source"; - reframings: "none" | "first_1" | "all" | "all_plus_block"; + byMode: Map; } -// Caches — one per governance article, each guarded by canonUrl (mirror encoding types pattern) +// Stop word set for challenge-type detection. Filters general filler +// (the, of, in, etc.) but deliberately preserves modal verbs, "do/does/did", +// and negation — those are load-bearing signal for strong-claim, proposal, +// and assumption types. Using the default bm25 STOP_WORDS would silently +// strip "must", "should", "shall", "may", "not" and break detection. 
+const CHALLENGE_STOP_WORDS = new Set([ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "of", "in", "to", "for", "with", "on", "at", "by", "from", "as", "into", "through", + "and", "but", "or", "nor", "if", "then", "than", + "that", "this", "it", "its", "we", "you", "he", "she", "they", + // intentionally NOT in this list (kept as signal): + // must, should, shall, will, would, may, might, can, could, do, does, did, + // have, has, had, not, no, never, always +]); + let cachedChallengeTypes: ChallengeTypeDef[] | null = null; let cachedChallengeTypesCanonUrl: string | undefined = undefined; - -let cachedBasePrerequisites: PrereqOverlay[] | null = null; +let cachedChallengeTypeIndex: BM25Index | null = null; +let cachedChallengeTypeIndexCanonUrl: string | undefined = undefined; +let cachedBasePrerequisites: BasePrerequisite[] | null = null; let cachedBasePrerequisitesCanonUrl: string | undefined = undefined; - let cachedNormativeVocabulary: NormativeVocabulary | null = null; let cachedNormativeVocabularyCanonUrl: string | undefined = undefined; - -let cachedStakesCalibration: StakesCalibration[] | null = null; +let cachedStakesCalibration: StakesCalibration | null = null; let cachedStakesCalibrationCanonUrl: string | undefined = undefined; export interface UnifiedParams { @@ -285,20 +300,6 @@ function detectMode(input: string): { mode: string; confidence: string } { return { mode: sorted[0][0], confidence }; } -function detectClaimType(input: string): string { - if ( - /\b(must|always|never|guaranteed|impossible|certain|definitely|obviously|clearly)\b/i.test( - input, - ) - ) - return "strong_claim"; - if (/\b(should|plan to|going to|will|propose|suggest|recommend|let's|want to)\b/i.test(input)) - return "proposal"; - if (/\b(assume|assuming|presume|given that|since|because|if we)\b/i.test(input)) - return "assumption"; - return "observation"; -} - function detectTransition(input: string): { from: string; to: string } { if (/\b(ready to 
build|ready to implement|start building|let's code|start coding)\b/i.test(input)) return { from: "planning", to: "execution" }; @@ -401,35 +402,11 @@ async function discoverEncodingTypes( return types; } -function extractKeywordsFromCheck(check: string): string[] { - // Extract quoted substrings from check description - // Example input: `input contains "evidence", "saw", "observed"` - // Output: ["evidence", "saw", "observed"] - const matches = check.match(/"([^"]+)"/g) || []; - return matches.map((m: string) => m.replace(/^"|"$/g, "")); -} - -function extractPrereqTable(content: string): PrereqOverlay[] { - const section = content.match( - /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, - ); - if (!section) return []; - - const overlays: PrereqOverlay[] = []; - for (const row of section[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); - if (cols.length >= 3) { - const check = cols[1]; - overlays.push({ - name: cols[0], - check, - gapMessage: cols[2].replace(/^"|"$/g, ""), - keywords: extractKeywordsFromCheck(check), - }); - } - } - return overlays; -} +// ────────────────────────────────────────────────────────────────────────────── +// E0008 — Governance-driven challenge (mirrors encode pattern from PR #96) +// Four discovery/fetch helpers read canon at runtime rather than hardcoding +// claim types, tensions, prerequisites, and mode calibration in source. 
+// ────────────────────────────────────────────────────────────────────────────── async function discoverChallengeTypes( fetcher: ZipBaselineFetcher, @@ -449,104 +426,172 @@ async function discoverChallengeTypes( const content = await fetcher.getFile(article.path, canonUrl); if (!content) continue; - // Frontmatter → fallback flag - const metadata = parseFullFrontmatter(content) || {}; - const fallback = metadata.fallback === true; - - // Blockquote (opening > line) - const blockquoteMatch = content.match(/^---[\s\S]*?---\s*\n+\s*#[^\n]+\n+>\s*(.+?)(?=\n\n|\n---|\n##)/s); - const blockquote = blockquoteMatch ? blockquoteMatch[1].trim().replace(/\n>\s*/g, " ") : ""; - - // ## Type Identity → Slug and Name - const slugMatch = content.match(/\|\s*Slug\s*\|\s*([a-z0-9-]+)\s*\|/i); - const nameMatch = content.match(/\|\s*Name\s*\|\s*([^|]+)\s*\|/i); + // Slug from ## Type Identity table + const slugMatch = content.match(/\|\s*Slug\s*\|\s*([^|]+)\s*\|/); + const nameMatch = content.match(/\|\s*Name\s*\|\s*([^|]+)\s*\|/); if (!slugMatch) continue; - const slug = slugMatch[1]; + const slug = slugMatch[1].trim(); const name = nameMatch ? nameMatch[1].trim() : slug; - // ## Detection Patterns → code block of comma-separated words - const detectionMatch = content.match( + // Opening blockquote (first > line after title) + const blockquoteMatch = content.match(/^#\s[^\n]+\n+>\s*([^\n]+(?:\n>\s*[^\n]+)*)/m); + const blockquote = blockquoteMatch + ? blockquoteMatch[1].replace(/\n>\s*/g, " ").trim() + : ""; + + // Detection patterns — code block under ## Detection Patterns + const detectionSection = content.match( /## Detection Patterns[\s\S]*?```\n([\s\S]*?)\n```/, ); - const triggerWords = detectionMatch - ? detectionMatch[1] - .split(/[,\n]/) + const triggerWords = detectionSection + ? detectionSection[1] + .split(",") .map((w: string) => w.trim()) .filter((w: string) => w.length > 0) : []; - const triggerRegex = - triggerWords.length > 0 - ? 
new RegExp( - "\\b(" + - triggerWords - .map((w: string) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) - .join("|") + - ")\\b", - "i", - ) - : null; + // Detection text fed to BM25 = trigger words + blockquote. + // Stemming handles morphology (coining ~ coin ~ coined ~ coinage) + // and IDF naturally weights distinctive trigger words above filler. + const detectionText = [triggerWords.join(" "), blockquote].filter((s) => s.length > 0).join(" "); - // ## Challenge Questions → table (Question | Stakes tier) + // Challenge Questions table — rows of (Question, Stakes tier) const questionsSection = content.match( /## Challenge Questions[\s\S]*?\| Question[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, ); const questions: Array<{ question: string; tier: string }> = []; if (questionsSection) { for (const row of questionsSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + const cols = row + .split("|") + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 0); if (cols.length >= 2) { - questions.push({ question: cols[0], tier: cols[1].toLowerCase() }); + questions.push({ question: cols[0], tier: cols[1] }); } } } - // ## Prerequisite Overlays → table (Prerequisite | Check | Gap message) - const prereqOverlays = extractPrereqTable(content); + // Prerequisite Overlays table — rows of (Prerequisite, Check, Gap message) + const prereqSection = content.match( + /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const prerequisiteOverlays: Array<{ + prerequisite: string; + check: string; + gapMessage: string; + }> = []; + if (prereqSection) { + for (const row of prereqSection[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row + .split("|") + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 0); + if (cols.length >= 3) { + // Substitute {name} placeholder in gap 
messages + const gap = cols[2].replace(/^"|"$/g, "").replace(/\{name\}/g, name); + prerequisiteOverlays.push({ + prerequisite: cols[0], + check: cols[1], + gapMessage: gap, + }); + } + } + } - // ## Suggested Reframings → bulleted list + // Suggested Reframings — bullet list const reframingsSection = content.match( - /## Suggested Reframings[\s\S]*?\n((?:- [^\n]+\n?)+)/, + /## Suggested Reframings[\s\S]*?\n((?:-\s+[^\n]+\n?)+)/, ); - const reframings = reframingsSection - ? reframingsSection[1] - .split("\n") - .filter((l: string) => l.startsWith("- ")) - .map((l: string) => l.slice(2).trim()) - : []; + const reframings: string[] = []; + if (reframingsSection) { + for (const line of reframingsSection[1].split("\n")) { + const m = line.match(/^-\s+(.+)$/); + if (m) reframings.push(m[1].trim()); + } + } + + // Fallback flag from frontmatter + const frontmatter = parseFullFrontmatter(content); + const fallback = frontmatter?.fallback === true; types.push({ - slug, name, blockquote, fallback, - triggerWords, triggerRegex, - questions, prereqOverlays, reframings, + slug, + name, + blockquote, + triggerWords, + detectionText, + questions, + prerequisiteOverlays, + reframings, + fallback, }); } catch { continue; } } + // Sort: fallback types last for deterministic fallback-resolution + types.sort((a, b) => { + if (a.fallback && !b.fallback) return 1; + if (!a.fallback && b.fallback) return -1; + return a.slug.localeCompare(b.slug); + }); + + // Build BM25 index over per-type detection text (triggers + blockquote). + // Stemming handles morphology; IDF weights distinctive trigger terms above filler. + // CHALLENGE_STOP_WORDS preserves modal verbs and negation as signal — the + // default bm25 STOP_WORDS would silently strip "must", "should", "not" etc. 
+ const bm25Docs = types.map((t) => ({ id: t.slug, text: t.detectionText })); + const bm25Index = buildBM25Index(bm25Docs, CHALLENGE_STOP_WORDS); + cachedChallengeTypes = types; cachedChallengeTypesCanonUrl = canonUrl; + cachedChallengeTypeIndex = bm25Index; + cachedChallengeTypeIndexCanonUrl = canonUrl; return types; } +function getChallengeTypeIndex(): BM25Index | null { + return cachedChallengeTypeIndex; +} + async function fetchBasePrerequisites( fetcher: ZipBaselineFetcher, canonUrl?: string, -): Promise { +): Promise { if (cachedBasePrerequisites && cachedBasePrerequisitesCanonUrl === canonUrl) return cachedBasePrerequisites; + const result: BasePrerequisite[] = []; try { const content = await fetcher.getFile("odd/challenge/base-prerequisites.md", canonUrl); - const overlays = content ? extractPrereqTable(content) : []; - cachedBasePrerequisites = overlays; - cachedBasePrerequisitesCanonUrl = canonUrl; - return overlays; + if (content) { + const prereqSection = content.match( + /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + if (prereqSection) { + for (const row of prereqSection[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row + .split("|") + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 0); + if (cols.length >= 3) { + result.push({ + prerequisite: cols[0], + check: cols[1], + gapMessage: cols[2].replace(/^"|"$/g, ""), + }); + } + } + } + } } catch { - cachedBasePrerequisites = []; - cachedBasePrerequisitesCanonUrl = canonUrl; - return []; + // Graceful degradation: no base prerequisites article → type overlays only } + + cachedBasePrerequisites = result; + cachedBasePrerequisitesCanonUrl = canonUrl; + return result; } async function fetchNormativeVocabulary( @@ -556,174 +601,112 @@ async function fetchNormativeVocabulary( if (cachedNormativeVocabulary && cachedNormativeVocabularyCanonUrl === canonUrl) return cachedNormativeVocabulary; - // Fallback minimal 
set if article is missing - const fallback: NormativeVocabulary = { - rfc2119Regex: /\b(MUST NOT|SHOULD NOT|MUST|SHOULD)\b/, - architecturalRegex: null, - directiveLookup: new Map([ - ["must", "requirement"], - ["must not", "prohibition"], - ["should", "recommendation"], - ["should not", "discouragement"], - ]), - }; + const caseSensitiveWords: string[] = []; + const caseInsensitiveWords: string[] = []; + const directiveTypes = new Map(); try { const content = await fetcher.getFile("odd/challenge/normative-vocabulary.md", canonUrl); - if (!content) { - cachedNormativeVocabulary = fallback; - cachedNormativeVocabularyCanonUrl = canonUrl; - return fallback; - } - - // Parse two tables under ## Normative Vocabulary - // Table 1: ### Directive Language (RFC 2119 and Related) — 2 cols (Word | Directive type) - // Table 2: ### Architectural Writing Load-Bearing Terms — 2 cols (Phrase | Directive type) - const rfcSection = content.match( - /### Directive Language[\s\S]*?\| Word[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n###|\n##|$)/, - ); - const archSection = content.match( - /### Architectural[\s\S]*?\|[^|]+\|[^|]+\|\n\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n###|\n##|$)/, - ); - - const rfcWords: string[] = []; - const archPhrases: string[] = []; - const lookup = new Map(); - - if (rfcSection) { - for (const row of rfcSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); - if (cols.length >= 2) { - rfcWords.push(cols[0]); - lookup.set(cols[0].toLowerCase(), cols[1]); + if (content) { + // Two sections: one under "RFC 2119" heading (case-sensitive), + // one under "Architectural Writing" heading (case-insensitive). + // Each is a markdown table with (Word | Directive type). 
+ const sections = content.split(/###\s+/); + for (const section of sections) { + const isCaseSensitive = /RFC 2119|Directive Language/i.test(section.split("\n")[0] || ""); + const tableMatch = section.match(/\|\s*(?:Word|Phrase)\s*\|[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/); + if (!tableMatch) continue; + for (const row of tableMatch[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row + .split("|") + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 0); + if (cols.length >= 2) { + const phrase = cols[0]; + const dtype = cols[1]; + directiveTypes.set(phrase, dtype); + if (isCaseSensitive) caseSensitiveWords.push(phrase); + else caseInsensitiveWords.push(phrase); + } } } } + } catch { + // Graceful degradation below + } - if (archSection) { - for (const row of archSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); - if (cols.length >= 2) { - archPhrases.push(cols[0]); - lookup.set(cols[0].toLowerCase(), cols[1]); - } - } + // Fallback: minimal built-in RFC 2119 if article missing + if (caseSensitiveWords.length === 0 && caseInsensitiveWords.length === 0) { + for (const w of ["MUST", "MUST NOT", "SHOULD", "SHOULD NOT"]) { + caseSensitiveWords.push(w); + directiveTypes.set(w, w.includes("NOT") ? "prohibition" : "requirement"); } - - const rfcRegex = - rfcWords.length > 0 - ? new RegExp( - "\\b(" + [...rfcWords].sort((a, b) => b.length - a.length).map((w) => w.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", - ) // case-sensitive — no "i" flag - : null; - - const archRegex = - archPhrases.length > 0 - ? 
new RegExp( - "\\b(" + [...archPhrases].sort((a, b) => b.length - a.length).map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|") + ")\\b", - "i", - ) - : null; - - const result: NormativeVocabulary = { - rfc2119Regex: rfcRegex, - architecturalRegex: archRegex, - directiveLookup: lookup, - }; - cachedNormativeVocabulary = result; - cachedNormativeVocabularyCanonUrl = canonUrl; - return result; - } catch { - cachedNormativeVocabulary = fallback; - cachedNormativeVocabularyCanonUrl = canonUrl; - return fallback; } + + const escape = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + const caseSensitiveRegex = + caseSensitiveWords.length > 0 + ? new RegExp("\\b(" + caseSensitiveWords.map(escape).join("|") + ")\\b") + : null; + const caseInsensitiveRegex = + caseInsensitiveWords.length > 0 + ? new RegExp("(" + caseInsensitiveWords.map(escape).join("|") + ")", "i") + : null; + + const vocab = { caseSensitiveRegex, caseInsensitiveRegex, directiveTypes }; + cachedNormativeVocabulary = vocab; + cachedNormativeVocabularyCanonUrl = canonUrl; + return vocab; } async function fetchStakesCalibration( fetcher: ZipBaselineFetcher, canonUrl?: string, -): Promise { +): Promise { if (cachedStakesCalibration && cachedStakesCalibrationCanonUrl === canonUrl) return cachedStakesCalibration; - // Fallback: "surface everything" at every mode - const fallback: StakesCalibration[] = [ - { mode: "exploration", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "optional", reframings: "all" }, - { mode: "planning", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "required", reframings: "all" }, - { mode: "execution", questionTiers: ["baseline", "elevated", "rigorous"], strictness: "required_plus_source", reframings: "all" }, - ]; - + const byMode = new Map(); try { const content = await fetcher.getFile("odd/challenge/stakes-calibration.md", canonUrl); - if (!content) { - cachedStakesCalibration = fallback; - cachedStakesCalibrationCanonUrl = 
canonUrl; - return fallback; - } - - // Parse ## Stakes Calibration table — 4 columns - // Mode | Question tiers surfaced | Prerequisite strictness | Reframings surfaced - const section = content.match( - /## Stakes Calibration[\s\S]*?\| Mode[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, - ); - if (!section) { - cachedStakesCalibration = fallback; - cachedStakesCalibrationCanonUrl = canonUrl; - return fallback; - } - - const calibrations: StakesCalibration[] = []; - for (const row of section[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); - if (cols.length < 4) continue; - - const mode = cols[0].toLowerCase(); - const tiersRaw = cols[1].toLowerCase(); - const strictRaw = cols[2].toLowerCase(); - const reframingsRaw = cols[3].toLowerCase(); - - // Parse question tiers - let questionTiers: string[]; - if (tiersRaw.includes("none")) questionTiers = []; - else { - questionTiers = []; - if (tiersRaw.includes("baseline")) questionTiers.push("baseline"); - if (tiersRaw.includes("elevated")) questionTiers.push("elevated"); - if (tiersRaw.includes("rigorous")) questionTiers.push("rigorous"); + if (content) { + // Parse the Stakes Calibration table: + // | Mode | Question tiers surfaced | Prerequisite strictness | Reframings surfaced | + const tableMatch = content.match( + /## Stakes Calibration[\s\S]*?\| Mode[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + if (tableMatch) { + for (const row of tableMatch[1].split("\n").filter((r: string) => r.includes("|"))) { + const cols = row + .split("|") + .map((c: string) => c.trim()) + .filter((c: string) => c.length > 0); + if (cols.length >= 4) { + const mode = cols[0]; + const tiersRaw = cols[1].toLowerCase().trim(); + // The cell may be "none" or "none (suppress all challenge)" — both mean + // empty tier list and trigger the voice-dump suppression invariant. 
+ // Without this leading-"none" check the suppression invariant ships broken. + const isNone = tiersRaw === "none" || tiersRaw.startsWith("none ") || tiersRaw.startsWith("none("); + const questionTiers: string[] = isNone + ? [] + : tiersRaw.split(",").map((t: string) => t.trim()).filter((t: string) => t.length > 0); + byMode.set(mode, { + questionTiers, + prerequisiteStrictness: cols[2], + reframingSurfacing: cols[3], + }); + } + } } - - // Parse strictness - let strictness: StakesCalibration["strictness"]; - if (strictRaw.includes("source-named")) strictness = "required_plus_source"; - else if (strictRaw.includes("required")) strictness = "required"; - else strictness = "optional"; - - // Parse reframings - let reframings: StakesCalibration["reframings"]; - if (reframingsRaw.includes("block-until-addressed")) reframings = "all_plus_block"; - else if (reframingsRaw.includes("all")) reframings = "all"; - else if (reframingsRaw.includes("first 1") || reframingsRaw.includes("first one")) reframings = "first_1"; - else if (reframingsRaw.includes("none")) reframings = "none"; - else reframings = "all"; - - calibrations.push({ mode, questionTiers, strictness, reframings }); } - - if (calibrations.length === 0) { - cachedStakesCalibration = fallback; - cachedStakesCalibrationCanonUrl = canonUrl; - return fallback; - } - - cachedStakesCalibration = calibrations; - cachedStakesCalibrationCanonUrl = canonUrl; - return calibrations; } catch { - cachedStakesCalibration = fallback; - cachedStakesCalibrationCanonUrl = canonUrl; - return fallback; + // Graceful degradation below } + + cachedStakesCalibration = { byMode }; + cachedStakesCalibrationCanonUrl = canonUrl; + return cachedStakesCalibration; } function isStructuredInput(input: string): boolean { @@ -1111,8 +1094,11 @@ async function runCleanupStorage( cachedBM25Entries = null; cachedEncodingTypes = null; cachedEncodingTypesCanonUrl = undefined; + // E0008 — governance-driven challenge caches (mirror PR #96 fix) 
cachedChallengeTypes = null; cachedChallengeTypesCanonUrl = undefined; + cachedChallengeTypeIndex = null; + cachedChallengeTypeIndexCanonUrl = undefined; cachedBasePrerequisites = null; cachedBasePrerequisitesCanonUrl = undefined; cachedNormativeVocabulary = null; @@ -1537,165 +1523,192 @@ async function runChallengeAction( state?: OddkitState, ): Promise { const startMs = Date.now(); - - // 1. Load all governance - const types = await discoverChallengeTypes(fetcher, canonUrl); - const basePrereqs = await fetchBasePrerequisites(fetcher, canonUrl); - const normVocab = await fetchNormativeVocabulary(fetcher, canonUrl); - const calibrations = await fetchStakesCalibration(fetcher, canonUrl); - - // 2. Resolve mode and calibration const mode = modeHint || "planning"; - const calibration = - calibrations.find((c) => c.mode === mode) || - calibrations.find((c) => c.mode === "planning") || - calibrations[0] || { - mode: "planning", questionTiers: ["baseline", "elevated"], - strictness: "required" as const, reframings: "all" as const, - }; - // 3. Multi-match detection - let matchedTypes: ChallengeTypeDef[] = types.filter( - (t) => t.triggerRegex && t.triggerRegex.test(input), - ); - if (matchedTypes.length === 0) { - const fallbackType = types.find((t) => t.fallback) || types[0]; - if (fallbackType) matchedTypes = [fallbackType]; - } + // Load governance in parallel + const [types, basePrereqs, vocab, calibration] = await Promise.all([ + discoverChallengeTypes(fetcher, canonUrl), + fetchBasePrerequisites(fetcher, canonUrl), + fetchNormativeVocabulary(fetcher, canonUrl), + fetchStakesCalibration(fetcher, canonUrl), + ]); - // 4. VOICE-DUMP INVARIANT: if calibration says no question tiers, suppress entire output. - // This is load-bearing — some modes exist for raw thought capture and pressure-testing - // at that stage damages the mode. Do not "helpfully" surface a reduced set. 
- if (calibration.questionTiers.length === 0) { - const primary = matchedTypes[0]?.slug || "observation"; + const modeConfig = calibration.byMode.get(mode); + + // Voice-dump invariant: suppress all challenge output regardless of matched types. + // Encoded at klappy://odd/challenge/stakes-calibration. Some modes exist for getting + // thoughts out of the head; pressure-testing at that stage damages the mode. + if (modeConfig && modeConfig.questionTiers.length === 0) { return { action: "challenge", result: { status: "SUPPRESSED", - mode_used: mode, - matched_types: matchedTypes.map((t) => t.slug), - claim_type: primary, // backward-compat alias + mode, + matched_types: [], tensions: [], missing_prerequisites: [], challenges: [], suggested_reframings: [], canon_constraints: [], - governance: matchedTypes.map((t) => ({ - slug: t.slug, - name: t.name, - description: t.blockquote, - })), + suppression_reason: + `Mode '${mode}' suppresses challenge output. Challenge is not applied during raw thought capture.`, }, state: state ? initState(state) : undefined, - assistant_text: - `Challenge suppressed (mode: ${mode}). This mode exists for raw capture; ` + - `pressure-testing would damage the mode's function. Resume challenge at a later stage.`, + assistant_text: `Challenge suppressed for mode '${mode}'. Raw thought capture protected.`, debug: { duration_ms: Date.now() - startMs, generated_at: new Date().toISOString() }, }; } - // 5. Aggregate across matched types - const aggregatedQuestions: Array<{ question: string; tier: string }> = []; - const aggregatedOverlays: PrereqOverlay[] = []; - const aggregatedReframings: string[] = []; + // Detect matching types via BM25 over per-type detection text. + // Stemming makes "coining" match "coin", "rolled" match "rollback", etc. + // score > 0 = match (BM25 returns 0 when no stemmed query terms hit). + // Multi-match preserved: a single input may score against several types. 
+ const typeIndex = getChallengeTypeIndex(); + const matchedTypes: ChallengeTypeDef[] = []; + if (typeIndex) { + const hits = searchBM25(typeIndex, input, types.length); + const typeBySlug = new Map(types.map((t) => [t.slug, t])); + for (const hit of hits) { + const t = typeBySlug.get(hit.id); + if (t) matchedTypes.push(t); + } + } + + // Fallback resolution when no type scored above zero + if (matchedTypes.length === 0) { + const fallback = types.find((t) => t.fallback) || types[0]; + if (fallback) matchedTypes.push(fallback); + } + + // Aggregate questions across matched types, deduped by question string + const questionMap = new Map(); for (const t of matchedTypes) { - aggregatedQuestions.push(...t.questions); - aggregatedOverlays.push(...t.prereqOverlays); - aggregatedReframings.push(...t.reframings); + for (const q of t.questions) { + if (!questionMap.has(q.question)) questionMap.set(q.question, q); + } } - // 6. Filter questions by stakes tier, dedupe by string - const filteredQuestions = aggregatedQuestions - .filter((q) => calibration.questionTiers.includes(q.tier)) - .map((q) => q.question); - const challenges = Array.from(new Set(filteredQuestions)); + // Aggregate prerequisite overlays: base + all matched type overlays, deduped by prerequisite name + const prereqMap = new Map(); + for (const p of basePrereqs) { + prereqMap.set(p.prerequisite, p); + } + for (const t of matchedTypes) { + for (const p of t.prerequisiteOverlays) { + if (!prereqMap.has(p.prerequisite)) prereqMap.set(p.prerequisite, p); + } + } - // 7. 
Merge base + type overlay prerequisites, dedupe by name, test each against input - const allPrereqs = [...basePrereqs, ...aggregatedOverlays]; - const uniquePrereqs = Array.from( - new Map(allPrereqs.map((p) => [p.name, p])).values(), - ); + // Aggregate reframings across matched types, deduped by string equality + const reframingSet = new Set(); + const reframingsByType = new Map(); + for (const t of matchedTypes) { + const typeReframings: string[] = []; + for (const r of t.reframings) { + if (!reframingSet.has(r)) { + reframingSet.add(r); + typeReframings.push(r); + } + } + reframingsByType.set(t.slug, typeReframings); + } + + // Apply stakes calibration: filter questions by tier, evaluate prerequisites by strictness, + // surface reframings by the surfacing rule. When modeConfig is absent (no calibration + // article or mode not in table), surface everything — "uniformly loud" fallback. + const surfacedQuestions: string[] = []; + for (const q of questionMap.values()) { + if (!modeConfig || modeConfig.questionTiers.length === 0 || modeConfig.questionTiers.includes(q.tier)) { + surfacedQuestions.push(q.question); + } + } + const strictness = modeConfig?.prerequisiteStrictness?.toLowerCase() || "required"; const missing: string[] = []; - for (const prereq of uniquePrereqs) { - if (prereq.keywords.length === 0) { - // No quoted keywords — check is descriptive-only, cannot mechanically test. Skip. 
- continue; + for (const p of prereqMap.values()) { + const passed = evaluatePrerequisiteCheck(input, p.check); + if (!passed) { + // source-named check is escalated to blocking when strictness says so + if (strictness.includes("optional") && !p.prerequisite.includes("source-named")) { + continue; + } + missing.push(p.gapMessage); } - const matched = prereq.keywords.some((k) => - new RegExp("\\b" + k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i").test(input), - ); - if (!matched) { - const typeName = matchedTypes[0]?.name || "claim"; - let gap = prereq.gapMessage.replace(/\{name\}/g, typeName.toLowerCase()); - if (calibration.strictness === "optional") gap = `Advisory: ${gap}`; - missing.push(gap); + } + + const surfacing = modeConfig?.reframingSurfacing?.toLowerCase() || "all"; + const surfacedReframings: string[] = []; + if (surfacing === "none") { + // no reframings + } else if (surfacing.includes("first 1") || surfacing.includes("first-1")) { + for (const typeReframings of reframingsByType.values()) { + if (typeReframings.length > 0) surfacedReframings.push(typeReframings[0]); + } + } else { + // "all" or "all, plus block-until-addressed" + for (const typeReframings of reframingsByType.values()) { + surfacedReframings.push(...typeReframings); } } + const blockUntilAddressed = surfacing.includes("block-until-addressed"); - // 8. Dedupe and filter reframings - const dedupedReframings = Array.from(new Set(aggregatedReframings)); - let reframings: string[]; - switch (calibration.reframings) { - case "none": - reframings = []; - break; - case "first_1": - // Surface at most one reframing total - reframings = dedupedReframings.slice(0, 1); - break; - case "all": - case "all_plus_block": - default: - reframings = dedupedReframings; - } - - // 9. 
Retrieve canon constraints (same BM25 path as before) + // Retrieve canon quotes and detect tensions via governance-driven vocabulary const index = await fetcher.getIndex(canonUrl); const results = scoreEntries(index.entries, `constraints challenges risks ${input}`).slice(0, 4); const canonConstraints: Array<{ citation: string; quote: string }> = []; - const tensions: Array<{ type: string; message: string }> = []; - + const tensions: Array<{ type: string; message: string; citation?: string; quote?: string }> = []; for (const entry of results) { const content = await fetcher.getFile(entry.path, canonUrl); - if (!content) continue; - const stripped = content.replace(/^---[\s\S]*?---\n/, ""); - const lines = stripped.split("\n").filter((l) => l.trim() && !l.startsWith("#")); - const excerpt = lines.slice(0, 2).join(" ").slice(0, 150); - canonConstraints.push({ citation: `${entry.path}#${entry.title}`, quote: excerpt }); - - // Apply normative vocabulary regexes - let foundMatch: string | null = null; - if (normVocab.rfc2119Regex) { - const m = excerpt.match(normVocab.rfc2119Regex); - if (m) foundMatch = m[0]; - } - if (!foundMatch && normVocab.architecturalRegex) { - const m = excerpt.match(normVocab.architecturalRegex); - if (m) foundMatch = m[0]; - } - if (foundMatch) { - const directiveType = normVocab.directiveLookup.get(foundMatch.toLowerCase()) || "directive"; - tensions.push({ - type: directiveType, - message: `Canon ${directiveType} (${foundMatch}) found in ${entry.path}`, - }); + if (content) { + const stripped = content.replace(/^---[\s\S]*?---\n/, ""); + const lines = stripped.split("\n").filter((l) => l.trim() && !l.startsWith("#")); + const excerpt = lines.slice(0, 2).join(" ").slice(0, 150); + const citation = `${entry.path}#${entry.title}`; + canonConstraints.push({ citation, quote: excerpt }); + + // Governance-driven tension detection + if (vocab.caseSensitiveRegex) { + const m = excerpt.match(vocab.caseSensitiveRegex); + if (m) { + const phrase = 
m[1]; + tensions.push({ + type: vocab.directiveTypes.get(phrase) || "directive", + message: `Canon ${vocab.directiveTypes.get(phrase) || "directive"} (${phrase}) found in ${entry.path}`, + citation, + quote: excerpt, + }); + continue; + } + } + if (vocab.caseInsensitiveRegex) { + const m = excerpt.match(vocab.caseInsensitiveRegex); + if (m) { + const phrase = m[1]; + const dtype = + vocab.directiveTypes.get(phrase) || + vocab.directiveTypes.get(phrase.toLowerCase()) || + "load-bearing-claim"; + tensions.push({ + type: dtype, + message: `Canon ${dtype} (${phrase}) found in ${entry.path}`, + citation, + quote: excerpt, + }); + } + } } } - // 10. Update state + // Update state const updatedState = state ? initState(state) : undefined; if (updatedState && missing.length > 0) { updatedState.unresolved = [...updatedState.unresolved, ...missing]; } - // 11. Build human-readable assistant_text (preserve existing format roughly) - const primarySlug = matchedTypes[0]?.slug || "observation"; - const primaryName = matchedTypes[0]?.name || "Observation"; - const typesLabel = - matchedTypes.length > 1 ? 
`${primaryName} +${matchedTypes.length - 1} more` : primaryName; - const lines = [`Challenge (${typesLabel}, mode: ${mode}):`, ""]; + // Assistant text — preserves prior format, extends with matched types and mode + const matchedSlugs = matchedTypes.map((t) => t.slug); + const lines = [`Challenge (${matchedSlugs.join(", ") || "no-match"}) [mode: ${mode}]:`, ""]; if (tensions.length > 0) { lines.push("Tensions found:"); for (const t of tensions) lines.push(` - [${t.type}] ${t.message}`); @@ -1706,18 +1719,20 @@ async function runChallengeAction( for (const m of missing) lines.push(` - ${m}`); lines.push(""); } - if (challenges.length > 0) { + if (surfacedQuestions.length > 0) { lines.push("Questions to address:"); - for (const c of challenges) lines.push(` - ${c}`); + for (const c of surfacedQuestions) lines.push(` - ${c}`); lines.push(""); } - if (reframings.length > 0) { + if (surfacedReframings.length > 0) { lines.push("Suggested reframings:"); - for (const r of reframings) lines.push(` - ${r}`); + for (const r of surfacedReframings) lines.push(` - ${r}`); lines.push(""); } - if (calibration.reframings === "all_plus_block" && reframings.length > 0) { - lines.push("⚠ Block-until-addressed: this claim should not proceed until reframings are explicitly addressed or declined."); + if (blockUntilAddressed && (missing.length > 0 || tensions.length > 0)) { + lines.push( + "⚠ Block-until-addressed: in this mode, the claim should not proceed until the gaps above are resolved or explicitly declined.", + ); lines.push(""); } if (canonConstraints.length > 0) { @@ -1733,19 +1748,19 @@ async function runChallengeAction( action: "challenge", result: { status: "CHALLENGED", - mode_used: mode, - matched_types: matchedTypes.map((t) => t.slug), - claim_type: primarySlug, // backward-compat alias — first matched slug - tensions, - missing_prerequisites: missing, - challenges, - suggested_reframings: reframings, - canon_constraints: canonConstraints, - governance: 
matchedTypes.map((t) => ({ + mode, + matched_types: matchedSlugs, + type_definitions: matchedTypes.map((t) => ({ slug: t.slug, name: t.name, description: t.blockquote, })), + tensions, + missing_prerequisites: missing, + challenges: surfacedQuestions, + suggested_reframings: surfacedReframings, + block_until_addressed: blockUntilAddressed, + canon_constraints: canonConstraints, }, state: updatedState, assistant_text: lines.join("\n").trim(), @@ -1753,6 +1768,38 @@ async function runChallengeAction( }; } +// Governance-driven check evaluator — interprets natural-language `check` strings +// from ## Prerequisite Overlays tables. Uses cheap heuristics: substring matching +// against quoted keywords in the check description, plus a few special-case patterns. +function evaluatePrerequisiteCheck(input: string, check: string): boolean { + // Extract quoted keywords like "evidence", "observed", "alternative" + const quotedKeywords: string[] = []; + const quotedRegex = /"([^"]+)"/g; + let m: RegExpExecArray | null; + while ((m = quotedRegex.exec(check)) !== null) { + quotedKeywords.push(m[1]); + } + + if (quotedKeywords.length > 0) { + // Pass if ANY quoted keyword appears in input (case-insensitive, word-boundary where possible) + for (const kw of quotedKeywords) { + const escaped = kw.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + // Use word-boundary for single words, substring for phrases + const pattern = /^\w+$/.test(kw) ? 
new RegExp("\\b" + escaped + "\\b", "i") : new RegExp(escaped, "i"); + if (pattern.test(input)) return true; + } + // Special-case check descriptions that mention URLs, citations, numeric markers + if (/\bURL\b/i.test(check) && /https?:\/\//.test(input)) return true; + if (/numeric/i.test(check) && /\d/.test(input)) return true; + if (/proper-?noun/i.test(check) && /\b[A-Z][a-z]+\s+[A-Z]/.test(input)) return true; + if (/citation/i.test(check) && /\[\d+\]|\bper\s+[A-Z]|\baccording to\b/i.test(input)) return true; + return false; + } + + // No quoted keywords: conservative fallback — passes if input is non-trivial + return input.trim().length >= 20; +} + async function runGateAction( input: string, context: string | undefined, diff --git a/workers/test/governance-parser.test.mjs b/workers/test/governance-parser.test.mjs new file mode 100644 index 0000000..518379e --- /dev/null +++ b/workers/test/governance-parser.test.mjs @@ -0,0 +1,314 @@ +#!/usr/bin/env node +/** + * Parser-fidelity test for governance-driven challenge extraction. + * + * Fetches the 11 live governance articles from klappy.dev and runs the same + * regex patterns used in workers/src/orchestrate.ts to confirm the parsers + * correctly extract types, questions, prerequisites, vocabulary, and calibration. + * + * This is not a worker integration test — it exercises the parser logic + * outside the Cloudflare runtime. Run pre-PR to verify parser regexes match + * real-world article structure. 
+ */ + +import { readFile } from "node:fs/promises"; +import { fileURLToPath } from "node:url"; +import { dirname, join } from "node:path"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const REPO_ROOT = join(__dirname, "..", ".."); + +// Articles to test against — these MUST exist in the local clone of klappy.dev +// or we fetch from raw.githubusercontent.com +const KLAPPYDEV_RAW = "https://raw.githubusercontent.com/klappy/klappy.dev/main"; +const ARTICLE_PATHS = { + meta: "odd/challenge-types/how-to-write-challenge-types.md", + strongClaim: "odd/challenge-types/strong-claim.md", + proposal: "odd/challenge-types/proposal.md", + assumption: "odd/challenge-types/assumption.md", + observation: "odd/challenge-types/observation.md", + patternCoinage: "odd/challenge-types/pattern-coinage.md", + comparativePositioning: "odd/challenge-types/comparative-positioning.md", + principleExtraction: "odd/challenge-types/principle-extraction.md", + basePrerequisites: "odd/challenge/base-prerequisites.md", + normativeVocabulary: "odd/challenge/normative-vocabulary.md", + stakesCalibration: "odd/challenge/stakes-calibration.md", +}; + +async function fetchArticle(path) { + const url = `${KLAPPYDEV_RAW}/${path}`; + const r = await fetch(url); + if (!r.ok) throw new Error(`Failed to fetch ${url}: ${r.status}`); + return r.text(); +} + +// ────────────────────────────────────────────────────────────────────────── +// Parser logic — verbatim copies of the regexes in workers/src/orchestrate.ts +// ────────────────────────────────────────────────────────────────────────── + +function parseChallengeType(content) { + const slugMatch = content.match(/\|\s*Slug\s*\|\s*([^|]+)\s*\|/); + const nameMatch = content.match(/\|\s*Name\s*\|\s*([^|]+)\s*\|/); + if (!slugMatch) return null; + const slug = slugMatch[1].trim(); + const name = nameMatch ? 
nameMatch[1].trim() : slug; + + const blockquoteMatch = content.match(/^#\s[^\n]+\n+>\s*([^\n]+(?:\n>\s*[^\n]+)*)/m); + const blockquote = blockquoteMatch + ? blockquoteMatch[1].replace(/\n>\s*/g, " ").trim() + : ""; + + const detectionSection = content.match( + /## Detection Patterns[\s\S]*?```\n([\s\S]*?)\n```/, + ); + const triggerWords = detectionSection + ? detectionSection[1].split(",").map((w) => w.trim()).filter((w) => w.length > 0) + : []; + + const questionsSection = content.match( + /## Challenge Questions[\s\S]*?\| Question[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const questions = []; + if (questionsSection) { + for (const row of questionsSection[1].split("\n").filter((r) => r.includes("|"))) { + const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + if (cols.length >= 2) questions.push({ question: cols[0], tier: cols[1] }); + } + } + + const prereqSection = content.match( + /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const prerequisiteOverlays = []; + if (prereqSection) { + for (const row of prereqSection[1].split("\n").filter((r) => r.includes("|"))) { + const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + if (cols.length >= 3) { + prerequisiteOverlays.push({ + prerequisite: cols[0], + check: cols[1], + gapMessage: cols[2].replace(/^"|"$/g, "").replace(/\{name\}/g, name), + }); + } + } + } + + const reframingsSection = content.match(/## Suggested Reframings[\s\S]*?\n((?:-\s+[^\n]+\n?)+)/); + const reframings = []; + if (reframingsSection) { + for (const line of reframingsSection[1].split("\n")) { + const m = line.match(/^-\s+(.+)$/); + if (m) reframings.push(m[1].trim()); + } + } + + const fmMatch = content.match(/^---\n([\s\S]*?)\n---/); + let fallback = false; + if (fmMatch) { + fallback = /^fallback:\s*true\s*$/m.test(fmMatch[1]); + } + + return { slug, name, blockquote, triggerWords, questions, prerequisiteOverlays, 
reframings, fallback }; +} + +function parseBasePrereqs(content) { + const section = content.match( + /## Prerequisite Overlays[\s\S]*?\| Prerequisite[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const result = []; + if (section) { + for (const row of section[1].split("\n").filter((r) => r.includes("|"))) { + const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + if (cols.length >= 3) { + result.push({ prerequisite: cols[0], check: cols[1], gapMessage: cols[2].replace(/^"|"$/g, "") }); + } + } + } + return result; +} + +function parseNormativeVocab(content) { + const caseSensitive = []; + const caseInsensitive = []; + const sections = content.split(/###\s+/); + for (const section of sections) { + const isCS = /RFC 2119|Directive Language/i.test(section.split("\n")[0] || ""); + const tableMatch = section.match(/\|\s*(?:Word|Phrase)\s*\|[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/); + if (!tableMatch) continue; + for (const row of tableMatch[1].split("\n").filter((r) => r.includes("|"))) { + const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + if (cols.length >= 2) { + if (isCS) caseSensitive.push(cols[0]); + else caseInsensitive.push(cols[0]); + } + } + } + return { caseSensitive, caseInsensitive }; +} + +function parseStakesCalibration(content) { + const tableMatch = content.match( + /## Stakes Calibration[\s\S]*?\| Mode[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/, + ); + const byMode = new Map(); + if (tableMatch) { + for (const row of tableMatch[1].split("\n").filter((r) => r.includes("|"))) { + const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + if (cols.length >= 4) { + const tiersRaw = cols[1].toLowerCase().trim(); + const isNone = tiersRaw === "none" || tiersRaw.startsWith("none ") || tiersRaw.startsWith("none("); + const tiers = isNone ? 
[] : tiersRaw.split(",").map((t) => t.trim()).filter((t) => t); + byMode.set(cols[0], { tiers, strictness: cols[2], surfacing: cols[3] }); + } + } + } + return byMode; +} + +// ────────────────────────────────────────────────────────────────────────── +// Tests +// ────────────────────────────────────────────────────────────────────────── + +let passed = 0; +let failed = 0; + +function ok(name, cond, detail = "") { + if (cond) { console.log(` ✓ ${name}`); passed++; } + else { console.log(` ✗ ${name}${detail ? " — " + detail : ""}`); failed++; } +} + +async function run() { + console.log("Fetching 11 governance articles from klappy.dev...\n"); + + const articles = {}; + for (const [key, path] of Object.entries(ARTICLE_PATHS)) { + articles[key] = await fetchArticle(path); + } + + console.log("─── Test 1: Challenge type parsing ───"); + const types = []; + for (const key of ["strongClaim", "proposal", "assumption", "observation", "patternCoinage", "comparativePositioning", "principleExtraction"]) { + const t = parseChallengeType(articles[key]); + types.push(t); + ok(`${key} parses`, t !== null); + if (t) { + ok(`${key} has slug`, t.slug.length > 0, `got "${t.slug}"`); + ok(`${key} has name`, t.name.length > 0, `got "${t.name}"`); + ok(`${key} has blockquote`, t.blockquote.length > 20, `got ${t.blockquote.length} chars`); + ok(`${key} has trigger words`, t.triggerWords.length >= 3, `got ${t.triggerWords.length}`); + ok(`${key} has questions`, t.questions.length >= 2, `got ${t.questions.length}`); + ok(`${key} questions have tiers`, t.questions.every((q) => ["baseline", "elevated", "rigorous"].includes(q.tier)), `tiers: ${[...new Set(t.questions.map((q) => q.tier))].join(",")}`); + ok(`${key} has prerequisite overlays`, t.prerequisiteOverlays.length >= 1, `got ${t.prerequisiteOverlays.length}`); + ok(`${key} has reframings`, t.reframings.length >= 1, `got ${t.reframings.length}`); + } + } + + console.log("\n─── Test 2: Fallback resolution ───"); + const observation = 
types.find((t) => t && t.slug === "observation"); + ok("observation has fallback: true", observation && observation.fallback === true); + const otherTypes = types.filter((t) => t && t.slug !== "observation"); + ok("non-fallback types do not have fallback: true", otherTypes.every((t) => !t.fallback)); + + console.log("\n─── Test 3: BM25 detection with stemming ───"); + // Build the per-type BM25 index the same way the worker does + const { buildBM25Index, searchBM25, stem } = await import("../src/bm25.ts").catch(() => + import("../src/bm25.js"), + ); + const detectionDocs = types + .filter((t) => t) + .map((t) => ({ + id: t.slug, + text: [t.triggerWords.join(" "), t.blockquote].filter((s) => s.length > 0).join(" "), + })); + // Same stop word set the worker uses — preserves modals as signal, + // filters general filler so irrelevant input doesn't over-match. + const CHALLENGE_STOP_WORDS = new Set([ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "of", "in", "to", "for", "with", "on", "at", "by", "from", "as", "into", "through", + "and", "but", "or", "nor", "if", "then", "than", + "that", "this", "it", "its", "we", "you", "he", "she", "they", + ]); + const bm25 = buildBM25Index(detectionDocs, CHALLENGE_STOP_WORDS); + + // Each type's first trigger word should still match its own type + for (const t of types) { + if (!t) continue; + const sampleWord = t.triggerWords[0]; + const hits = searchBM25(bm25, sampleWord, types.length); + ok( + `${t.slug} matches its first trigger word "${sampleWord}" via BM25`, + hits.some((h) => h.id === t.slug), + `top hit was "${hits[0]?.id || "(none)"}" with score ${hits[0]?.score?.toFixed(2) || 0}`, + ); + } + + console.log("\n─── Test 3b: Stemming defeats the original coin/coining bug ───"); + // The original regex-based approach had "coining" as a trigger but failed on "coin". + // With stemming, both should reduce to the same root. 
+ ok( + `stem("coin") === stem("coining")`, + stem("coin") === stem("coining"), + `stem("coin")="${stem("coin")}" stem("coining")="${stem("coining")}"`, + ); + ok( + `"coin the term" matches pattern-coinage via BM25`, + searchBM25(bm25, "coin the term", types.length).some((h) => h.id === "pattern-coinage"), + ); + ok( + `"I'm coining a new term" matches pattern-coinage via BM25`, + searchBM25(bm25, "I'm coining a new term", types.length).some((h) => h.id === "pattern-coinage"), + ); + ok( + `"the principles" matches principle-extraction (plural form)`, + searchBM25(bm25, "the principles", types.length).some((h) => h.id === "principle-extraction"), + ); + ok( + `"alternatives proposed" matches proposal (proposed not propose)`, + searchBM25(bm25, "alternatives proposed", types.length).some((h) => h.id === "proposal"), + ); + + console.log("\n─── Test 4: Multi-match semantics (BM25) ───"); + const compoundInput = "We must always be coining new terms like Vodka Architecture"; + const matched = searchBM25(bm25, compoundInput, types.length); + ok( + "compound input fires multiple types via BM25", + matched.length >= 2, + `matched: ${matched.map((m) => m.id).join(", ")}`, + ); + ok("strong-claim fires on 'must always'", matched.some((m) => m.id === "strong-claim")); + ok("pattern-coinage fires on 'coining'", matched.some((m) => m.id === "pattern-coinage")); + + console.log("\n─── Test 4b: Empty input + irrelevant input do not over-match ───"); + ok( + "irrelevant input scores no types", + searchBM25(bm25, "the cat sat on the mat", types.length).length === 0, + `(would have triggered fallback in runChallengeAction)`, + ); + + console.log("\n─── Test 5: Base prerequisites ───"); + const basePrereqs = parseBasePrereqs(articles.basePrerequisites); + ok("base prerequisites parse", basePrereqs.length >= 3, `got ${basePrereqs.length}`); + ok("base includes evidence-cited", basePrereqs.some((p) => p.prerequisite === "evidence-cited")); + ok("base includes source-named", 
basePrereqs.some((p) => p.prerequisite === "source-named")); + ok("base includes confidence-signaled", basePrereqs.some((p) => p.prerequisite === "confidence-signaled")); + + console.log("\n─── Test 6: Normative vocabulary ───"); + const vocab = parseNormativeVocab(articles.normativeVocabulary); + ok("case-sensitive RFC 2119 words present", vocab.caseSensitive.length >= 4, `got ${vocab.caseSensitive.length}: ${vocab.caseSensitive.slice(0,5).join(",")}`); + ok("case-insensitive architectural words present", vocab.caseInsensitive.length >= 3, `got ${vocab.caseInsensitive.length}: ${vocab.caseInsensitive.slice(0,5).join(",")}`); + ok("includes MUST", vocab.caseSensitive.includes("MUST")); + ok("includes invariant", vocab.caseInsensitive.includes("invariant")); + + console.log("\n─── Test 7: Stakes calibration ───"); + const calib = parseStakesCalibration(articles.stakesCalibration); + ok("calibration parses 9 modes", calib.size >= 9, `got ${calib.size} modes: ${[...calib.keys()].join(", ")}`); + ok("voice-dump exists", calib.has("voice-dump")); + ok("voice-dump has empty tiers (suppression invariant)", calib.get("voice-dump")?.tiers.length === 0); + ok("planning has baseline+elevated", calib.get("planning")?.tiers.length === 2); + ok("execution has all three tiers", calib.get("execution")?.tiers.length === 3); + + console.log(`\n${passed} passed, ${failed} failed`); + process.exit(failed === 0 ? 0 : 1); +} + +run().catch((e) => { console.error(e); process.exit(1); }); From e82164b84bfca5f1e08b3500a4df1dcd6bc09d81 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 07:12:34 +0000 Subject: [PATCH 07/17] fix(challenge): port bugbot review fixes onto BM25 detection layer Re-applies the four review fixes from sibling commits (31f8134, e9ef2f9, 84932f0) and the dead-code removal that the bugbot review also flagged, on top of the BM25 + stemming detection swap. 
- Vocabulary regex sorted by length descending so 'MUST NOT' matches before 'MUST' (closes bugbot 'Regex alternation order') - Stakes calibration mode column lowercased at parse time AND mode normalized to lowercase at lookup time (closes bugbot 'Mode column not lowercased breaks voice-dump suppression') - first_1 reframings policy now surfaces a single reframing total across all matched types, not one per type (closes bugbot 'first_1 reframings surfaces multiple instead of one') - Detection runs BEFORE voice-dump suppression check, and SUPPRESSED response includes the governance field for shape parity with CHALLENGED (closes bugbot 'SUPPRESSED response missing governance') - Renames type_definitions to governance in CHALLENGED response so both statuses return the same shape under the same key - Dead detectClaimType already removed by the BM25 commit (closes bugbot 'Dead code: detectClaimType has zero callers') Verification: - npm run typecheck: clean - workers/test/governance-parser.test.mjs: 94/94 pass - tests/smoke.sh: 6/6 pass --- workers/src/orchestrate.ts | 97 ++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 72c91a8..2a30a35 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -646,11 +646,20 @@ async function fetchNormativeVocabulary( const escape = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); const caseSensitiveRegex = caseSensitiveWords.length > 0 - ? new RegExp("\\b(" + caseSensitiveWords.map(escape).join("|") + ")\\b") + ? new RegExp( + "\\b(" + + [...caseSensitiveWords].sort((a, b) => b.length - a.length).map(escape).join("|") + + ")\\b", + ) : null; const caseInsensitiveRegex = caseInsensitiveWords.length > 0 - ? new RegExp("(" + caseInsensitiveWords.map(escape).join("|") + ")", "i") + ? 
new RegExp( + "(" + + [...caseInsensitiveWords].sort((a, b) => b.length - a.length).map(escape).join("|") + + ")", + "i", + ) : null; const vocab = { caseSensitiveRegex, caseInsensitiveRegex, directiveTypes }; @@ -682,7 +691,7 @@ async function fetchStakesCalibration( .map((c: string) => c.trim()) .filter((c: string) => c.length > 0); if (cols.length >= 4) { - const mode = cols[0]; + const mode = cols[0].toLowerCase(); const tiersRaw = cols[1].toLowerCase().trim(); // The cell may be "none" or "none (suppress all challenge)" — both mean // empty tier list and trigger the voice-dump suppression invariant. @@ -1523,7 +1532,7 @@ async function runChallengeAction( state?: OddkitState, ): Promise { const startMs = Date.now(); - const mode = modeHint || "planning"; + const mode = (modeHint || "planning").toLowerCase(); // Load governance in parallel const [types, basePrereqs, vocab, calibration] = await Promise.all([ @@ -1535,16 +1544,46 @@ async function runChallengeAction( const modeConfig = calibration.byMode.get(mode); + // Detect matching types via BM25 over per-type detection text. + // Stemming makes "coining" match "coin", "rolled" match "rollback", etc. + // score > 0 = match (BM25 returns 0 when no stemmed query terms hit). + // Multi-match preserved: a single input may score against several types. + // Detection runs BEFORE the voice-dump suppression check so the SUPPRESSED + // response can still expose `governance` — the model sees what would have + // fired without surfacing the pressure-test questions. 
+ const typeIndex = getChallengeTypeIndex(); + const matchedTypes: ChallengeTypeDef[] = []; + if (typeIndex) { + const hits = searchBM25(typeIndex, input, types.length); + const typeBySlug = new Map(types.map((t) => [t.slug, t])); + for (const hit of hits) { + const t = typeBySlug.get(hit.id); + if (t) matchedTypes.push(t); + } + } + + // Fallback resolution when no type scored above zero + if (matchedTypes.length === 0) { + const fallback = types.find((t) => t.fallback) || types[0]; + if (fallback) matchedTypes.push(fallback); + } + // Voice-dump invariant: suppress all challenge output regardless of matched types. // Encoded at klappy://odd/challenge/stakes-calibration. Some modes exist for getting // thoughts out of the head; pressure-testing at that stage damages the mode. + // The `governance` field is still surfaced so the model sees what types matched. if (modeConfig && modeConfig.questionTiers.length === 0) { return { action: "challenge", result: { status: "SUPPRESSED", mode, - matched_types: [], + matched_types: matchedTypes.map((t) => t.slug), + governance: matchedTypes.map((t) => ({ + slug: t.slug, + name: t.name, + description: t.blockquote, + })), tensions: [], missing_prerequisites: [], challenges: [], @@ -1559,27 +1598,6 @@ async function runChallengeAction( }; } - // Detect matching types via BM25 over per-type detection text. - // Stemming makes "coining" match "coin", "rolled" match "rollback", etc. - // score > 0 = match (BM25 returns 0 when no stemmed query terms hit). - // Multi-match preserved: a single input may score against several types. 
- const typeIndex = getChallengeTypeIndex(); - const matchedTypes: ChallengeTypeDef[] = []; - if (typeIndex) { - const hits = searchBM25(typeIndex, input, types.length); - const typeBySlug = new Map(types.map((t) => [t.slug, t])); - for (const hit of hits) { - const t = typeBySlug.get(hit.id); - if (t) matchedTypes.push(t); - } - } - - // Fallback resolution when no type scored above zero - if (matchedTypes.length === 0) { - const fallback = types.find((t) => t.fallback) || types[0]; - if (fallback) matchedTypes.push(fallback); - } - // Aggregate questions across matched types, deduped by question string const questionMap = new Map(); for (const t of matchedTypes) { @@ -1637,18 +1655,25 @@ async function runChallengeAction( } const surfacing = modeConfig?.reframingSurfacing?.toLowerCase() || "all"; - const surfacedReframings: string[] = []; + const allReframings: string[] = []; + for (const typeReframings of reframingsByType.values()) { + allReframings.push(...typeReframings); + } + let surfacedReframings: string[] = []; if (surfacing === "none") { - // no reframings - } else if (surfacing.includes("first 1") || surfacing.includes("first-1")) { - for (const typeReframings of reframingsByType.values()) { - if (typeReframings.length > 0) surfacedReframings.push(typeReframings[0]); - } + surfacedReframings = []; + } else if ( + surfacing.includes("first 1") || + surfacing.includes("first-1") || + surfacing.includes("first one") + ) { + // Surface at most one reframing total — across all matched types, not one per type. + // The governance phrase "first 1" means a single reframing in the response; + // multi-match should not multiply the surfacing. 
+ surfacedReframings = allReframings.slice(0, 1); } else { // "all" or "all, plus block-until-addressed" - for (const typeReframings of reframingsByType.values()) { - surfacedReframings.push(...typeReframings); - } + surfacedReframings = allReframings; } const blockUntilAddressed = surfacing.includes("block-until-addressed"); @@ -1750,7 +1775,7 @@ async function runChallengeAction( status: "CHALLENGED", mode, matched_types: matchedSlugs, - type_definitions: matchedTypes.map((t) => ({ + governance: matchedTypes.map((t) => ({ slug: t.slug, name: t.name, description: t.blockquote, From fd14a6031ce091dfee7713f21e166203950ffff8 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 07:13:12 +0000 Subject: [PATCH 08/17] docs(evidence): add bugbot review + combine section to challenge refactor evidence Captures the fork-resolution and bugbot-review-driven fixes as a sixth layer of catch alongside the gauntlet bugs. Records the lesson: read PR review comments before treating divergent remote as unknown work. --- .../challenge-governance-code-refactor.md | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/oddkit/evidence/challenge-governance-code-refactor.md b/docs/oddkit/evidence/challenge-governance-code-refactor.md index 2a1e73a..68068a1 100644 --- a/docs/oddkit/evidence/challenge-governance-code-refactor.md +++ b/docs/oddkit/evidence/challenge-governance-code-refactor.md @@ -120,6 +120,29 @@ Visual proof: **N/A — server-side code change.** No UI, no interaction surface --- +## Bugbot Review + Combine — A Sixth Layer of Catch + +After the regex version of this PR was first pushed, the Cursor bugbot reviewed it and flagged five issues, four of which were addressed via three follow-up commits on the remote branch (`31f8134`, `e9ef2f9`, `84932f0`). Those commits existed in parallel to the local BM25 pivot work and were not visible until the remote was fetched. Discovering them late led to a fork situation that was resolved by: + +1. 
Resetting local to the remote tip (preserving the three follow-up fixes plus the ledger entry on `a88abf7`) +2. Cherry-picking the BM25 commit on top of the resolved tip +3. Hand-porting the four polish fixes onto the BM25 base — they touch unrelated code regions so the port was mechanical +4. Adding a fixup commit (`e82164b`) on top of the BM25 commit that captures the ports + +**Bugbot's five review items, all closed by this PR:** + +| Severity | Issue | Resolution | +|----------|-------|------------| +| High | Mode column not lowercased breaks voice-dump suppression | Lowercased at parse time AND at lookup | +| Medium | Regex alternation order breaks multi-word directive matching (`MUST` before `MUST NOT`) | Sort vocab by length descending | +| Medium | `first_1` reframings surfaces multiple instead of one | Slice from aggregated list, not per-type | +| Medium | SUPPRESSED response missing `governance` field | Detection runs before suppression check; both responses share the `governance` shape | +| Low | Dead code: `detectClaimType` has zero callers | Removed by the BM25 commit before bugbot reviewed | + +**Lesson recorded:** when encountering a divergent remote on an existing PR branch, fetch and read PR review comments first. Bugbot leaves structured comments that explain the divergent commits — checking saves the user from explaining what already exists. 
+ +--- + ## Version Tracking - Branch: `feat/e0008-challenge-governance-driven` From 4f19ecd301bcf213f378a96670d2247ad59e7dd0 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 07:14:34 +0000 Subject: [PATCH 09/17] chore(ledger): journal PR #100 BM25 pivot + bugbot combine session --- .../journal/2026-04-17-pr100-combined.md | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 odd/ledger/journal/2026-04-17-pr100-combined.md diff --git a/odd/ledger/journal/2026-04-17-pr100-combined.md b/odd/ledger/journal/2026-04-17-pr100-combined.md new file mode 100644 index 0000000..2cc35eb --- /dev/null +++ b/odd/ledger/journal/2026-04-17-pr100-combined.md @@ -0,0 +1,48 @@ +# Session Journal — PR #100 BM25 Pivot + Bugbot Combine + +**Date:** 2026-04-17 +**PR:** klappy/oddkit#100 — feat(challenge): governance-driven runChallengeAction (E0008) +**Branch:** `feat/e0008-challenge-governance-driven` +**Final commit on branch:** `fd14a60` + +## DOLCHE + +### Decisions + +- **D1: Pivot from regex-OR to BM25 + stemming for challenge-type detection.** Triggered by gauntlet observation that `coin` doesn't match trigger word `coining`. Klappy proposed BM25 as the right tool; agreed, executed the swap. +- **D2: Use a custom `CHALLENGE_STOP_WORDS` set, not the default `STOP_WORDS`.** Default filters modal verbs (`must`, `should`, `shall`, `may`, `not`) which are the signal for strong-claim and proposal types. Without this fix, those two type detections would have silently broken in production. +- **D3: Detection runs BEFORE voice-dump suppression check.** Lets SUPPRESSED response include the `governance` field so the model sees what types matched even when questions are suppressed. Closes bugbot Medium item; also better UX. +- **D4: `governance` is the canonical response key for matched-type definitions.** CHALLENGED and SUPPRESSED both use it; SUPPRESSED no longer returns `undefined` for shape parity. 
+- **D5: Combine fork rather than discard either side.** Remote had 5 commits (regex base + 3 polish + ledger entry), local had BM25 pivot. Both contained real work. Reset to remote tip, cherry-picked BM25 on top, hand-ported the 4 polish fixes. + +### Observations + +- **O1: PR review comments explain divergent commits.** Bugbot left 5 structured review comments on PR #100. Reading them first would have surfaced what those 3 unfamiliar commits did within seconds. Lesson recorded. +- **O2: Cherry-picking after staged conflicts didn't fold subsequent edits.** Hand-edits made AFTER `git add` but BEFORE `git cherry-pick --continue` sat in the working tree post-commit. Required a fixup commit. Workflow note for future merges. +- **O3: BM25 default `STOP_WORDS` is tuned for prose, not directive language.** The general-purpose IR assumption that modals are filler doesn't hold in the challenge taxonomy. The opt-in `stopWords: Set` extension is now available to other use cases that face the same issue. +- **O4: BM25 already had phrase boost machinery.** `PHRASE_BOOST_EXACT` and `PHRASE_BOOST_PARTIAL` give multi-word triggers free score amplification with no additional code. + +### Learnings + +- **L1: Read PR review comments first when fork is detected.** Bugbot/cursor leaves structured comments that explain divergent commits. Standard practice from now on. +- **L2: General-purpose IR stop-word lists are domain-hostile in directive matching.** Modal verbs are content, not filler, in any context where claims and proposals are the signal. +- **L3: Stemming + BM25 is the right shape for canon-defined category matching.** Stems handle morphology, IDF weights distinctive terms, score > 0 preserves multi-match and fallback semantics. Pattern is now reusable for encode parity and future gate refactor. +- **L4: The gauntlet caught one bug; bugbot caught four; the combine surfaced the fifth (dead code).** Different review tools catch different classes. 
The gauntlet is for "would this satisfy our governance"; bugbot is for "would this satisfy basic correctness." Both have value. + +### Constraints + +- **C1: Voice-dump mode MUST suppress questions, prereqs, and reframings — but MAY surface governance.** The invariant is about not pressure-testing during raw thought capture, not about hiding what types matched. +- **C2: bm25.ts extensions must preserve backward compatibility.** Default parameter values mean existing callers (oddkit_search, future encode pivot) are unaffected. New behavior is opt-in only. + +### Handoffs + +- **H1: Encode parity PR.** Same regex-OR brittleness in `runEncodeAction`. Pattern proven here, port near-mechanical. Highest-priority follow-up. +- **H2: klappy.dev meta governance update.** `how-to-write-challenge-types.md` references "compiles into a case-insensitive word-boundary regex" — now stale. Two-line PR. +- **H3: Gate refactor candidate.** `oddkit_gate` returned NOT_READY for the same hardcoded-logic reason challenge had pre-refactor. Same shape, same fix pattern. +- **H4: Score-based confidence in `matched_types`.** Currently `[slug, slug, ...]`; trivial upgrade to `[{slug, score}, ...]` if any consumer wants relative confidence. Not blocking. 
+ +### Encodes + +- This journal at `odd/ledger/journal/2026-04-17-pr100-combined.md` +- Evidence note at `docs/oddkit/evidence/challenge-governance-code-refactor.md` (updated with bugbot+combine section) +- PR #100 comment summarizing the combine: https://github.com/klappy/oddkit/pull/100#issuecomment-4266114513 From 94990b5da0ed8577940bfce54e45d6ac69b997da Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 07:21:18 +0000 Subject: [PATCH 10/17] fix(orchestrate): lowercase challenge question tier for calibration match --- workers/src/orchestrate.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 2a30a35..d28ae82 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -466,7 +466,7 @@ async function discoverChallengeTypes( .map((c: string) => c.trim()) .filter((c: string) => c.length > 0); if (cols.length >= 2) { - questions.push({ question: cols[0], tier: cols[1] }); + questions.push({ question: cols[0], tier: cols[1].toLowerCase() }); } } } From 997a50d82cf05f41724c9043b3df647f25664a0d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 07:31:02 +0000 Subject: [PATCH 11/17] fix(challenge): guard cached BM25 index by canonUrl to prevent isolate cross-contamination --- workers/src/orchestrate.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index d28ae82..71f7691 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -551,7 +551,8 @@ async function discoverChallengeTypes( return types; } -function getChallengeTypeIndex(): BM25Index | null { +function getChallengeTypeIndex(canonUrl?: string): BM25Index | null { + if (cachedChallengeTypeIndexCanonUrl !== canonUrl) return null; return cachedChallengeTypeIndex; } @@ -1551,7 +1552,7 @@ async function runChallengeAction( // Detection runs BEFORE the voice-dump suppression check so the 
SUPPRESSED // response can still expose `governance` — the model sees what would have // fired without surfacing the pressure-test questions. - const typeIndex = getChallengeTypeIndex(); + const typeIndex = getChallengeTypeIndex(canonUrl); const matchedTypes: ChallengeTypeDef[] = []; if (typeIndex) { const hits = searchBM25(typeIndex, input, types.length); From 6358d652fb08dd653f11eccea9ef4191616523a5 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 07:39:12 +0000 Subject: [PATCH 12/17] fix(challenge): restore claim_type alias in response envelope --- workers/src/orchestrate.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 71f7691..d885e31 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -1579,6 +1579,7 @@ async function runChallengeAction( result: { status: "SUPPRESSED", mode, + claim_type: matchedTypes[0]?.slug, matched_types: matchedTypes.map((t) => t.slug), governance: matchedTypes.map((t) => ({ slug: t.slug, @@ -1775,6 +1776,7 @@ async function runChallengeAction( result: { status: "CHALLENGED", mode, + claim_type: matchedSlugs[0], matched_types: matchedSlugs, governance: matchedTypes.map((t) => ({ slug: t.slug, From ce41b397c72191a405d2045cd5f3f02529eba69e Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 13:48:10 +0000 Subject: [PATCH 13/17] fix(challenge): move stop words from hardcoded constant into governance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caught in PR #100 review by Klappy: the CHALLENGE_STOP_WORDS Set added mid-PR to fix a BM25 over-match was itself a Vodka Architecture violation in a refactor explicitly about removing such violations. The constant carried a domain opinion ('modals are signal, articles are filler in challenge detection') that belonged in canon, not in worker source. 
Anti-pattern fixed: - Drop the hardcoded CHALLENGE_STOP_WORDS Set from workers/src/orchestrate.ts - Drop the duplicate hardcoded copy from workers/test/governance-parser.test.mjs - Extend NormativeVocabulary interface with stopWords: Set - Extend fetchNormativeVocabulary to extract '## Detection Noise' code block from odd/challenge/normative-vocabulary.md (lands in klappy.dev#100) - Move BM25 index build out of discoverChallengeTypes into a new lazy builder getOrBuildChallengeTypeIndex(types, vocab, canonUrl) so the index can use governance-sourced stop words rather than a constant - Update parser test to fetch Detection Noise the same way the worker does — no hardcoded duplicate, no drift risk. Test gains 3 new assertions: Detection Noise parses non-empty, excludes modal verbs, includes common filler Net hardcoded-constants delta: this PR removes ~6 classes of hardcoded domain opinion (claim type detection, questions, prereqs, tension regex, reframings, stop words) and adds zero. The remaining minimal RFC 2119 fallback ('MUST', 'MUST NOT', 'SHOULD', 'SHOULD NOT') and 'planning' default mode are server-availability fallbacks for when canon is unreachable, not domain governance. Test currently runs against the feature branch via KLAPPYDEV_RAW env override. After klappy.dev#100 merges, the override comes off and the test reads from main with no further changes. Verification: - npm run typecheck: clean - workers/test/governance-parser.test.mjs (vs feature branch): 97/97 pass - tests/smoke.sh: 6/6 pass - grep CHALLENGE_STOP_WORDS in workers/ and src/: zero matches Refs: - Caught in: this PR review by Klappy - Depends on: klappy/klappy.dev#100 (Detection Noise section) - Lesson: 'is this the right architectural shape' is a category the current gauntlet does not catch — the tools verify governance content, not whether new code is creating new ungoverned content. 
Possible future tool: a vodka-audit that flags non-trivial Sets/Maps/lists in worker source and asks 'should this be in canon?' --- workers/src/orchestrate.ts | 106 +++++++++++++++--------- workers/test/governance-parser.test.mjs | 42 +++++++--- 2 files changed, 99 insertions(+), 49 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index d885e31..6775f77 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -98,6 +98,12 @@ interface NormativeVocabulary { caseSensitiveRegex: RegExp | null; caseInsensitiveRegex: RegExp | null; directiveTypes: Map; + /** Stop words for user-input matching against per-type detection text. + * Sourced from the `## Detection Noise` section of normative-vocabulary.md. + * Empty Set = no filtering (server falls back to BM25 IDF only). Modal + * verbs and negation are deliberately absent from canon's default list + * because they are signal for strong-claim, proposal, and assumption types. */ + stopWords: Set; } interface StakesModeConfig { @@ -110,21 +116,6 @@ interface StakesCalibration { byMode: Map; } -// Stop word set for challenge-type detection. Filters general filler -// (the, of, in, etc.) but deliberately preserves modal verbs, "do/does/did", -// and negation — those are load-bearing signal for strong-claim, proposal, -// and assumption types. Using the default bm25 STOP_WORDS would silently -// strip "must", "should", "shall", "may", "not" and break detection. 
-const CHALLENGE_STOP_WORDS = new Set([ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "of", "in", "to", "for", "with", "on", "at", "by", "from", "as", "into", "through", - "and", "but", "or", "nor", "if", "then", "than", - "that", "this", "it", "its", "we", "you", "he", "she", "they", - // intentionally NOT in this list (kept as signal): - // must, should, shall, will, would, may, might, can, could, do, does, did, - // have, has, had, not, no, never, always -]); - let cachedChallengeTypes: ChallengeTypeDef[] | null = null; let cachedChallengeTypesCanonUrl: string | undefined = undefined; let cachedChallengeTypeIndex: BM25Index | null = null; @@ -537,23 +528,37 @@ async function discoverChallengeTypes( return a.slug.localeCompare(b.slug); }); - // Build BM25 index over per-type detection text (triggers + blockquote). - // Stemming handles morphology; IDF weights distinctive trigger terms above filler. - // CHALLENGE_STOP_WORDS preserves modal verbs and negation as signal — the - // default bm25 STOP_WORDS would silently strip "must", "should", "not" etc. - const bm25Docs = types.map((t) => ({ id: t.slug, text: t.detectionText })); - const bm25Index = buildBM25Index(bm25Docs, CHALLENGE_STOP_WORDS); - cachedChallengeTypes = types; cachedChallengeTypesCanonUrl = canonUrl; - cachedChallengeTypeIndex = bm25Index; - cachedChallengeTypeIndexCanonUrl = canonUrl; + // Index build deferred — needs vocab.stopWords from fetchNormativeVocabulary, + // assembled lazily by getOrBuildChallengeTypeIndex below. Both types and the + // index are deterministic functions of canonUrl, so caching by canonUrl + // remains safe. 
return types; } -function getChallengeTypeIndex(canonUrl?: string): BM25Index | null { - if (cachedChallengeTypeIndexCanonUrl !== canonUrl) return null; - return cachedChallengeTypeIndex; +/** Lazily build (or return cached) per-canonUrl BM25 index over the per-type + * detection text, using governance-sourced stop words from normative-vocabulary.md. + * The cache is keyed on canonUrl so different canon sources do not contaminate + * each other's indexes. */ +function getOrBuildChallengeTypeIndex( + types: ChallengeTypeDef[], + vocab: NormativeVocabulary, + canonUrl?: string, +): BM25Index { + if (cachedChallengeTypeIndex && cachedChallengeTypeIndexCanonUrl === canonUrl) { + return cachedChallengeTypeIndex; + } + // Build BM25 index over per-type detection text (triggers + blockquote). + // Stemming handles morphology; IDF weights distinctive trigger terms above filler. + // vocab.stopWords comes from `## Detection Noise` in normative-vocabulary.md; + // it deliberately preserves modal verbs and negation as signal. An empty + // Set means no filtering (governance opted into IDF-only scoring). + const bm25Docs = types.map((t) => ({ id: t.slug, text: t.detectionText })); + const bm25Index = buildBM25Index(bm25Docs, vocab.stopWords); + cachedChallengeTypeIndex = bm25Index; + cachedChallengeTypeIndexCanonUrl = canonUrl; + return bm25Index; } async function fetchBasePrerequisites( @@ -605,13 +610,16 @@ async function fetchNormativeVocabulary( const caseSensitiveWords: string[] = []; const caseInsensitiveWords: string[] = []; const directiveTypes = new Map(); + const stopWords = new Set(); try { const content = await fetcher.getFile("odd/challenge/normative-vocabulary.md", canonUrl); if (content) { - // Two sections: one under "RFC 2119" heading (case-sensitive), - // one under "Architectural Writing" heading (case-insensitive). - // Each is a markdown table with (Word | Directive type). 
+ // ── Surface 1: Normative Vocabulary (signal in canon quotes) ── + // Two subsections under "## Normative Vocabulary": one keyed by "RFC 2119" + // or "Directive Language" (case-sensitive), one for architectural-writing + // load-bearing phrases (case-insensitive). Each is a markdown table with + // (Word | Directive type). const sections = content.split(/###\s+/); for (const section of sections) { const isCaseSensitive = /RFC 2119|Directive Language/i.test(section.split("\n")[0] || ""); @@ -631,6 +639,21 @@ async function fetchNormativeVocabulary( } } } + + // ── Surface 2: Detection Noise (filler in user input) ── + // A code block of comma-and-newline separated words under "## Detection + // Noise". The set is passed to the BM25 indexer as the custom stop-word + // filter. Modal verbs and negation are deliberately absent — they are + // signal for strong-claim, proposal, and assumption type detection. + // If the section is missing, stopWords stays empty and BM25 falls back + // to IDF-only filtering — an explicit governance choice in the article. + const noiseMatch = content.match(/## Detection Noise[\s\S]*?```\n([\s\S]*?)\n```/); + if (noiseMatch) { + for (const word of noiseMatch[1].split(/[,\n]/)) { + const w = word.trim().toLowerCase(); + if (w.length > 0) stopWords.add(w); + } + } } } catch { // Graceful degradation below @@ -663,7 +686,12 @@ async function fetchNormativeVocabulary( ) : null; - const vocab = { caseSensitiveRegex, caseInsensitiveRegex, directiveTypes }; + const vocab: NormativeVocabulary = { + caseSensitiveRegex, + caseInsensitiveRegex, + directiveTypes, + stopWords, + }; cachedNormativeVocabulary = vocab; cachedNormativeVocabularyCanonUrl = canonUrl; return vocab; @@ -1552,15 +1580,15 @@ async function runChallengeAction( // Detection runs BEFORE the voice-dump suppression check so the SUPPRESSED // response can still expose `governance` — the model sees what would have // fired without surfacing the pressure-test questions. 
- const typeIndex = getChallengeTypeIndex(canonUrl); + // Stop words come from `## Detection Noise` in normative-vocabulary.md + // (governance), not a hardcoded constant in this file. + const typeIndex = getOrBuildChallengeTypeIndex(types, vocab, canonUrl); const matchedTypes: ChallengeTypeDef[] = []; - if (typeIndex) { - const hits = searchBM25(typeIndex, input, types.length); - const typeBySlug = new Map(types.map((t) => [t.slug, t])); - for (const hit of hits) { - const t = typeBySlug.get(hit.id); - if (t) matchedTypes.push(t); - } + const hits = searchBM25(typeIndex, input, types.length); + const typeBySlug = new Map(types.map((t) => [t.slug, t])); + for (const hit of hits) { + const t = typeBySlug.get(hit.id); + if (t) matchedTypes.push(t); } // Fallback resolution when no type scored above zero diff --git a/workers/test/governance-parser.test.mjs b/workers/test/governance-parser.test.mjs index 518379e..c6378c5 100644 --- a/workers/test/governance-parser.test.mjs +++ b/workers/test/governance-parser.test.mjs @@ -20,7 +20,10 @@ const REPO_ROOT = join(__dirname, "..", ".."); // Articles to test against — these MUST exist in the local clone of klappy.dev // or we fetch from raw.githubusercontent.com -const KLAPPYDEV_RAW = "https://raw.githubusercontent.com/klappy/klappy.dev/main"; +// Default to main; override via KLAPPYDEV_RAW env var when testing against +// an unmerged feature branch (e.g. while klappy.dev#100 is still open). +const KLAPPYDEV_RAW = + process.env.KLAPPYDEV_RAW || "https://raw.githubusercontent.com/klappy/klappy.dev/main"; const ARTICLE_PATHS = { meta: "odd/challenge-types/how-to-write-challenge-types.md", strongClaim: "odd/challenge-types/strong-claim.md", @@ -220,15 +223,34 @@ async function run() { id: t.slug, text: [t.triggerWords.join(" "), t.blockquote].filter((s) => s.length > 0).join(" "), })); - // Same stop word set the worker uses — preserves modals as signal, - // filters general filler so irrelevant input doesn't over-match. 
- const CHALLENGE_STOP_WORDS = new Set([ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "of", "in", "to", "for", "with", "on", "at", "by", "from", "as", "into", "through", - "and", "but", "or", "nor", "if", "then", "than", - "that", "this", "it", "its", "we", "you", "he", "she", "they", - ]); - const bm25 = buildBM25Index(detectionDocs, CHALLENGE_STOP_WORDS); + // Stop words come from the `## Detection Noise` section of normative-vocabulary.md + // (governance), exactly the same way the worker reads them. No hardcoded + // duplicate in this test — drift would mean the test passes while production fails. + const noiseMatch = articles.normativeVocabulary.match( + /## Detection Noise[\s\S]*?```\n([\s\S]*?)\n```/, + ); + const stopWords = new Set(); + if (noiseMatch) { + for (const word of noiseMatch[1].split(/[,\n]/)) { + const w = word.trim().toLowerCase(); + if (w.length > 0) stopWords.add(w); + } + } + ok( + "Detection Noise section parses non-empty stop word set", + stopWords.size > 0, + `parsed ${stopWords.size} stop words`, + ); + ok( + "Detection Noise excludes modal verbs (signal preservation)", + !stopWords.has("must") && !stopWords.has("should") && !stopWords.has("not"), + `must=${stopWords.has("must")} should=${stopWords.has("should")} not=${stopWords.has("not")}`, + ); + ok( + "Detection Noise includes common filler", + stopWords.has("the") && stopWords.has("of") && stopWords.has("in"), + ); + const bm25 = buildBM25Index(detectionDocs, stopWords); // Each type's first trigger word should still match its own type for (const t of types) { From 6f5a7cc536f5dff2aa3e0c699fca707603a87cd3 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 13:58:50 +0000 Subject: [PATCH 14/17] fix(challenge): prefer prohibitions and lowercase mode keys in test parser - Use matchAll and prefer prohibition directive type over leftmost requirement match so excerpts like 'You MUST X and MUST NOT Y' surface the prohibition. 
Regex switched to global flag to support matchAll. - Port fetchStakesCalibration toLowerCase fix to the fidelity test parser so byMode keys stay lowercase even if governance introduces capitalized mode names. --- workers/src/orchestrate.ts | 54 ++++++++++++++++++------- workers/test/governance-parser.test.mjs | 2 +- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 6775f77..f01f75b 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -674,6 +674,7 @@ async function fetchNormativeVocabulary( "\\b(" + [...caseSensitiveWords].sort((a, b) => b.length - a.length).map(escape).join("|") + ")\\b", + "g", ) : null; const caseInsensitiveRegex = @@ -682,7 +683,7 @@ async function fetchNormativeVocabulary( "(" + [...caseInsensitiveWords].sort((a, b) => b.length - a.length).map(escape).join("|") + ")", - "i", + "gi", ) : null; @@ -1723,13 +1724,38 @@ async function runChallengeAction( canonConstraints.push({ citation, quote: excerpt }); // Governance-driven tension detection - if (vocab.caseSensitiveRegex) { - const m = excerpt.match(vocab.caseSensitiveRegex); - if (m) { + // + // `.match()` with a combined alternation returns the *leftmost* hit, so + // "You MUST do X and MUST NOT do Y" would resolve to "MUST" (requirement) + // even though a prohibition is present later in the excerpt. Collect all + // matches via `matchAll` and prefer a prohibition over any other + // directive type, falling back to the leftmost match otherwise. This + // preserves the prior two-test priority (MUST NOT before MUST) without + // coupling to a hard-coded vocabulary. 
+ const pickStrongest = ( + matches: IterableIterator, + lookup: (phrase: string) => string | undefined, + ): { phrase: string; dtype: string } | null => { + let first: { phrase: string; dtype: string } | null = null; + let prohibition: { phrase: string; dtype: string } | null = null; + for (const m of matches) { const phrase = m[1]; + const dtype = lookup(phrase) || "directive"; + if (!first) first = { phrase, dtype }; + if (!prohibition && dtype === "prohibition") prohibition = { phrase, dtype }; + } + return prohibition || first; + }; + + if (vocab.caseSensitiveRegex) { + const hit = pickStrongest( + excerpt.matchAll(vocab.caseSensitiveRegex), + (p) => vocab.directiveTypes.get(p), + ); + if (hit) { tensions.push({ - type: vocab.directiveTypes.get(phrase) || "directive", - message: `Canon ${vocab.directiveTypes.get(phrase) || "directive"} (${phrase}) found in ${entry.path}`, + type: hit.dtype, + message: `Canon ${hit.dtype} (${hit.phrase}) found in ${entry.path}`, citation, quote: excerpt, }); @@ -1737,16 +1763,14 @@ async function runChallengeAction( } } if (vocab.caseInsensitiveRegex) { - const m = excerpt.match(vocab.caseInsensitiveRegex); - if (m) { - const phrase = m[1]; - const dtype = - vocab.directiveTypes.get(phrase) || - vocab.directiveTypes.get(phrase.toLowerCase()) || - "load-bearing-claim"; + const hit = pickStrongest( + excerpt.matchAll(vocab.caseInsensitiveRegex), + (p) => vocab.directiveTypes.get(p) || vocab.directiveTypes.get(p.toLowerCase()) || "load-bearing-claim", + ); + if (hit) { tensions.push({ - type: dtype, - message: `Canon ${dtype} (${phrase}) found in ${entry.path}`, + type: hit.dtype, + message: `Canon ${hit.dtype} (${hit.phrase}) found in ${entry.path}`, citation, quote: excerpt, }); diff --git a/workers/test/governance-parser.test.mjs b/workers/test/governance-parser.test.mjs index c6378c5..27c5c4b 100644 --- a/workers/test/governance-parser.test.mjs +++ b/workers/test/governance-parser.test.mjs @@ -161,7 +161,7 @@ function 
parseStakesCalibration(content) { const tiersRaw = cols[1].toLowerCase().trim(); const isNone = tiersRaw === "none" || tiersRaw.startsWith("none ") || tiersRaw.startsWith("none("); const tiers = isNone ? [] : tiersRaw.split(",").map((t) => t.trim()).filter((t) => t); - byMode.set(cols[0], { tiers, strictness: cols[2], surfacing: cols[3] }); + byMode.set(cols[0].toLowerCase(), { tiers, strictness: cols[2], surfacing: cols[3] }); } } } From bd462fcf34ea49b25ee6d8359eaf8660968bd7f4 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 14:12:58 +0000 Subject: [PATCH 15/17] fix(orchestrate): preserve empty middle cells in table parser and hoist pickStrongest Two fixes: 1. Table row parser (6 call sites) was using .split('|').map(trim).filter(c => c.length > 0) which also drops legitimately-empty interior cells, silently collapsing the column count. In fetchStakesCalibration this would silently drop a voice-dump row with an empty tiers cell, breaking the suppression invariant with no error signal. Introduce parseTableRow helper that only strips the leading/trailing empties produced by surrounding pipes, preserving empty middle cells. 2. Hoist pickStrongest (now pickStrongestDirective) out of the per-entry loop in runChallengeAction. It captures no loop-scoped state, so defining it inside the loop needlessly re-allocated the closure on each iteration and misled readers about its scope. Matches the placement of evaluatePrerequisiteCheck. 
--- workers/src/orchestrate.ts | 100 ++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index f01f75b..77351cc 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -154,6 +154,27 @@ export interface OrchestrateOptions { canonUrl?: string; } +// ────────────────────────────────────────────────────────────────────────────── +// Markdown table helpers +// ────────────────────────────────────────────────────────────────────────────── + +/** + * Parse a single markdown table row into trimmed cell values, preserving + * legitimately-empty middle cells. Only the leading and trailing empty strings + * produced by splitting a `| a | b |`-style row are stripped — a prior + * `.filter(c => c.length > 0)` approach also dropped empty interior cells, + * which silently collapsed the column count and caused `cols.length >= N` + * guards to misfire (e.g. a voice-dump row with an empty tiers cell). 
+ */ +function parseTableRow(row: string): string[] { + const parts = row.split("|"); + // Strip the leading empty produced by a leading `|`, if present + if (parts.length > 0 && parts[0].trim() === "") parts.shift(); + // Strip the trailing empty produced by a trailing `|`, if present + if (parts.length > 0 && parts[parts.length - 1].trim() === "") parts.pop(); + return parts.map((c) => c.trim()); +} + // ────────────────────────────────────────────────────────────────────────────── // BM25 Index Cache (per-request, lazy) // ────────────────────────────────────────────────────────────────────────────── @@ -353,7 +374,7 @@ async function discoverEncodingTypes( const qualityCriteria: Array<{ criterion: string; check: string; gapMessage: string }> = []; if (criteriaSection) { for (const row of criteriaSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row.split("|").map((c: string) => c.trim()).filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 3) { qualityCriteria.push({ criterion: cols[0], @@ -452,10 +473,7 @@ async function discoverChallengeTypes( const questions: Array<{ question: string; tier: string }> = []; if (questionsSection) { for (const row of questionsSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row - .split("|") - .map((c: string) => c.trim()) - .filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 2) { questions.push({ question: cols[0], tier: cols[1].toLowerCase() }); } @@ -473,10 +491,7 @@ async function discoverChallengeTypes( }> = []; if (prereqSection) { for (const row of prereqSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row - .split("|") - .map((c: string) => c.trim()) - .filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 3) { // Substitute {name} placeholder in gap messages const gap = cols[2].replace(/^"|"$/g, 
"").replace(/\{name\}/g, name); @@ -577,10 +592,7 @@ async function fetchBasePrerequisites( ); if (prereqSection) { for (const row of prereqSection[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row - .split("|") - .map((c: string) => c.trim()) - .filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 3) { result.push({ prerequisite: cols[0], @@ -626,10 +638,7 @@ async function fetchNormativeVocabulary( const tableMatch = section.match(/\|\s*(?:Word|Phrase)\s*\|[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/); if (!tableMatch) continue; for (const row of tableMatch[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row - .split("|") - .map((c: string) => c.trim()) - .filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 2) { const phrase = cols[0]; const dtype = cols[1]; @@ -716,10 +725,7 @@ async function fetchStakesCalibration( ); if (tableMatch) { for (const row of tableMatch[1].split("\n").filter((r: string) => r.includes("|"))) { - const cols = row - .split("|") - .map((c: string) => c.trim()) - .filter((c: string) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 4) { const mode = cols[0].toLowerCase(); const tiersRaw = cols[1].toLowerCase().trim(); @@ -1554,6 +1560,30 @@ async function runOrientAction( }; } +// Governance-driven tension detection helper. +// +// `.match()` with a combined alternation returns the *leftmost* hit, so +// "You MUST do X and MUST NOT do Y" would resolve to "MUST" (requirement) +// even though a prohibition is present later in the excerpt. Collect all +// matches via `matchAll` and prefer a prohibition over any other directive +// type, falling back to the leftmost match otherwise. This preserves the +// prior two-test priority (MUST NOT before MUST) without coupling to a +// hard-coded vocabulary. 
+function pickStrongestDirective( + matches: IterableIterator, + lookup: (phrase: string) => string | undefined, +): { phrase: string; dtype: string } | null { + let first: { phrase: string; dtype: string } | null = null; + let prohibition: { phrase: string; dtype: string } | null = null; + for (const m of matches) { + const phrase = m[1]; + const dtype = lookup(phrase) || "directive"; + if (!first) first = { phrase, dtype }; + if (!prohibition && dtype === "prohibition") prohibition = { phrase, dtype }; + } + return prohibition || first; +} + async function runChallengeAction( input: string, modeHint: string | undefined, @@ -1723,32 +1753,8 @@ async function runChallengeAction( const citation = `${entry.path}#${entry.title}`; canonConstraints.push({ citation, quote: excerpt }); - // Governance-driven tension detection - // - // `.match()` with a combined alternation returns the *leftmost* hit, so - // "You MUST do X and MUST NOT do Y" would resolve to "MUST" (requirement) - // even though a prohibition is present later in the excerpt. Collect all - // matches via `matchAll` and prefer a prohibition over any other - // directive type, falling back to the leftmost match otherwise. This - // preserves the prior two-test priority (MUST NOT before MUST) without - // coupling to a hard-coded vocabulary. 
- const pickStrongest = ( - matches: IterableIterator, - lookup: (phrase: string) => string | undefined, - ): { phrase: string; dtype: string } | null => { - let first: { phrase: string; dtype: string } | null = null; - let prohibition: { phrase: string; dtype: string } | null = null; - for (const m of matches) { - const phrase = m[1]; - const dtype = lookup(phrase) || "directive"; - if (!first) first = { phrase, dtype }; - if (!prohibition && dtype === "prohibition") prohibition = { phrase, dtype }; - } - return prohibition || first; - }; - if (vocab.caseSensitiveRegex) { - const hit = pickStrongest( + const hit = pickStrongestDirective( excerpt.matchAll(vocab.caseSensitiveRegex), (p) => vocab.directiveTypes.get(p), ); @@ -1763,7 +1769,7 @@ async function runChallengeAction( } } if (vocab.caseInsensitiveRegex) { - const hit = pickStrongest( + const hit = pickStrongestDirective( excerpt.matchAll(vocab.caseInsensitiveRegex), (p) => vocab.directiveTypes.get(p) || vocab.directiveTypes.get(p.toLowerCase()) || "load-bearing-claim", ); From 477a213954ce0ed13fb43ec31ccd9699885a9eb9 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 17 Apr 2026 14:21:49 +0000 Subject: [PATCH 16/17] fix(tests): use parseTableRow in governance parser test to preserve empty cells --- workers/test/governance-parser.test.mjs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/workers/test/governance-parser.test.mjs b/workers/test/governance-parser.test.mjs index 27c5c4b..4ae40c5 100644 --- a/workers/test/governance-parser.test.mjs +++ b/workers/test/governance-parser.test.mjs @@ -49,6 +49,16 @@ async function fetchArticle(path) { // Parser logic — verbatim copies of the regexes in workers/src/orchestrate.ts // ────────────────────────────────────────────────────────────────────────── +// Mirror of `parseTableRow` in workers/src/orchestrate.ts. 
Preserves +// legitimately-empty interior cells (a prior `.filter(c => c.length > 0)` +// approach dropped them and silently collapsed column indexes). +function parseTableRow(row) { + const parts = row.split("|"); + if (parts.length > 0 && parts[0].trim() === "") parts.shift(); + if (parts.length > 0 && parts[parts.length - 1].trim() === "") parts.pop(); + return parts.map((c) => c.trim()); +} + function parseChallengeType(content) { const slugMatch = content.match(/\|\s*Slug\s*\|\s*([^|]+)\s*\|/); const nameMatch = content.match(/\|\s*Name\s*\|\s*([^|]+)\s*\|/); @@ -74,7 +84,7 @@ function parseChallengeType(content) { const questions = []; if (questionsSection) { for (const row of questionsSection[1].split("\n").filter((r) => r.includes("|"))) { - const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 2) questions.push({ question: cols[0], tier: cols[1] }); } } @@ -85,7 +95,7 @@ function parseChallengeType(content) { const prerequisiteOverlays = []; if (prereqSection) { for (const row of prereqSection[1].split("\n").filter((r) => r.includes("|"))) { - const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 3) { prerequisiteOverlays.push({ prerequisite: cols[0], @@ -121,7 +131,7 @@ function parseBasePrereqs(content) { const result = []; if (section) { for (const row of section[1].split("\n").filter((r) => r.includes("|"))) { - const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 3) { result.push({ prerequisite: cols[0], check: cols[1], gapMessage: cols[2].replace(/^"|"$/g, "") }); } @@ -139,7 +149,7 @@ function parseNormativeVocab(content) { const tableMatch = section.match(/\|\s*(?:Word|Phrase)\s*\|[\s\S]*?\|[-|\s]+\|\n([\s\S]*?)(?=\n\n|\n##|$)/); if (!tableMatch) continue; for (const row of tableMatch[1].split("\n").filter((r) => 
r.includes("|"))) { - const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 2) { if (isCS) caseSensitive.push(cols[0]); else caseInsensitive.push(cols[0]); @@ -156,7 +166,7 @@ function parseStakesCalibration(content) { const byMode = new Map(); if (tableMatch) { for (const row of tableMatch[1].split("\n").filter((r) => r.includes("|"))) { - const cols = row.split("|").map((c) => c.trim()).filter((c) => c.length > 0); + const cols = parseTableRow(row); if (cols.length >= 4) { const tiersRaw = cols[1].toLowerCase().trim(); const isNone = tiersRaw === "none" || tiersRaw.startsWith("none ") || tiersRaw.startsWith("none("); From 80416ac60682416436677b8f94ce982bd75b1ad4 Mon Sep 17 00:00:00 2001 From: Klappy Date: Fri, 17 Apr 2026 15:09:01 +0000 Subject: [PATCH 17/17] =?UTF-8?q?fix(challenge):=20close=20two=20open=20bu?= =?UTF-8?q?gbot=20issues=20=E2=80=94=20none-prefix=20surfacing=20and=20dea?= =?UTF-8?q?d=20branch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues from bugbot's 14:29 review: 1. Reframing 'none' check applies same defensive pattern as the tiersRaw fix in fetchStakesCalibration. The cell may be 'none' or 'none (parenthetical reason)' — strict equality would silently surface all reframings via the 'all' fallback when authors include explanatory text. Same defect class as bug #3 in the evidence note; sweep applied. 2. Remove unreachable questionTiers.length === 0 branch in the question- surfacing condition. The SUPPRESSED early-return at line 1635 already handles that case, so the branch was dead code that misleadingly suggested 'surface all questions for empty tiers' semantics — the actual semantic is full suppression. Verified: typecheck clean, parser test 97/97 against main, smoke 6/6. Defect-class sweep on governance cell strict-equality checks: only two sites (tiersRaw, surfacing), both now defensive. 
--- workers/src/orchestrate.ts | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/workers/src/orchestrate.ts b/workers/src/orchestrate.ts index 77351cc..401529a 100644 --- a/workers/src/orchestrate.ts +++ b/workers/src/orchestrate.ts @@ -1695,9 +1695,12 @@ async function runChallengeAction( // Apply stakes calibration: filter questions by tier, evaluate prerequisites by strictness, // surface reframings by the surfacing rule. When modeConfig is absent (no calibration // article or mode not in table), surface everything — "uniformly loud" fallback. + // Note: the questionTiers.length === 0 case is impossible here because the + // SUPPRESSED early-return above already handled it. We branch only on + // modeConfig presence and tier-membership. const surfacedQuestions: string[] = []; for (const q of questionMap.values()) { - if (!modeConfig || modeConfig.questionTiers.length === 0 || modeConfig.questionTiers.includes(q.tier)) { + if (!modeConfig || modeConfig.questionTiers.includes(q.tier)) { surfacedQuestions.push(q.question); } } @@ -1721,7 +1724,14 @@ async function runChallengeAction( allReframings.push(...typeReframings); } let surfacedReframings: string[] = []; - if (surfacing === "none") { + // Same defensive shape as the tiersRaw "none" check in fetchStakesCalibration. + // The cell may be "none" or "none (parenthetical reason)" — both mean suppress + // all reframings. Strict equality would let the parenthetical fall through to + // the "all" branch and silently surface every reframing for a mode that opted + // out of them. + const surfaceNone = + surfacing === "none" || surfacing.startsWith("none ") || surfacing.startsWith("none("); + if (surfaceNone) { surfacedReframings = []; } else if ( surfacing.includes("first 1") ||