From 31bb87f2b0306f5ffbd9b59c0d71f49ec6c7fce9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 04:20:22 +0000
Subject: [PATCH 1/6] spike(parser): bake-off harness for issue #69 Option B

Three candidate markdown libraries evaluated as drop-in replacements
for the hand-rolled `shared/markdown_parser.py`: mistune, markdown-it-py,
marko. Each gets a thin translator (~150-215 LOC) to the mdast dict
shape produced by the existing parser. Not wired into the runtime SDK
-- this directory exists purely so the harness can run them
side-by-side.

Components:
- `pyproject.toml`: new `spike-parser` dependency group with the three
  candidates pinned. Production install is unaffected.
- `src/chat_sdk/shared/parser_spike/`: per-library translators sharing
  the mdast `make_*` constructors from `markdown_parser.py`. README
  documents the spike findings inline.
- `tests/parser_spike/`: mdast parity bake-off + fixture corpus
  (~3KB mixed-content markdown exercising every node type). Reports
  divergences vs baseline without failing.
- `scripts/parser_spike/benchmark.py`: parse-and-translate timings
  across the four parsers on a ~12KB scaled corpus, plus translator
  LOC budget check.

Current findings (sample run):
- baseline (hand): 2.59ms median -- meets the 5ms target by 2x.
- mistune: 11.94ms median, 161 translator LOC, 26 divergences.
- markdown-it-py: 13.36ms median, 215 LOC, 24 divergences.
- marko: 46.62ms median, 152 LOC, 27 divergences (incl. dropped
  table alignment -- translator bug).

The 5ms perf budget set in the issue acceptance criteria is met only
by the baseline. mdast fidelity is roughly equivalent across the three
candidates -- and most "divergences" are baseline non-compliance the
candidates correct. Suggests a re-framed Option D (keep the fast
parser, port upstream remend for the streaming side) -- spelled out
in the spike README.

Numbers are reproducible: `uv run python scripts/parser_spike/benchmark.py`
---
 pyproject.toml                                |   8 +
 scripts/parser_spike/__init__.py              |   0
 scripts/parser_spike/benchmark.py             | 102 ++++++++
 src/chat_sdk/shared/parser_spike/README.md    | 129 +++++++++
 src/chat_sdk/shared/parser_spike/__init__.py  |  31 +++
 .../parser_spike/markdown_it_translator.py    | 244 ++++++++++++++++++
 .../shared/parser_spike/marko_translator.py   | 179 +++++++++++++
 .../shared/parser_spike/mistune_translator.py | 192 ++++++++++++++
 tests/parser_spike/__init__.py                |   0
 tests/parser_spike/conftest.py                |  14 +
 tests/parser_spike/fixtures/mixed_content.md  |  77 ++++++
 tests/parser_spike/test_mdast_parity.py       | 148 +++++++++++
 12 files changed, 1124 insertions(+)
 create mode 100644 scripts/parser_spike/__init__.py
 create mode 100644 scripts/parser_spike/benchmark.py
 create mode 100644 src/chat_sdk/shared/parser_spike/README.md
 create mode 100644 src/chat_sdk/shared/parser_spike/__init__.py
 create mode 100644 src/chat_sdk/shared/parser_spike/markdown_it_translator.py
 create mode 100644 src/chat_sdk/shared/parser_spike/marko_translator.py
 create mode 100644 src/chat_sdk/shared/parser_spike/mistune_translator.py
 create mode 100644 tests/parser_spike/__init__.py
 create mode 100644 tests/parser_spike/conftest.py
 create mode 100644 tests/parser_spike/fixtures/mixed_content.md
 create mode 100644 tests/parser_spike/test_mdast_parity.py

diff --git a/pyproject.toml b/pyproject.toml
index 97fb917..3726d91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,14 @@ dev = [
     "ruff>=0.4.0",
     "pyrefly==0.61.1",
 ]
+# Spike for issue #69 Option B (parser-replacement bake-off).
+# Three CommonMark/GFM libraries evaluated as candidates for replacing the
+# hand-rolled `shared/markdown_parser.py`. Not a runtime dep -- spike only.
+spike-parser = [
+    "mistune>=3.0",
+    "markdown-it-py>=3.0",
+    "marko>=2.0",
+]
 
 # ---------------------------------------------------------------------------
 # Pyrefly type checker configuration
diff --git a/scripts/parser_spike/__init__.py b/scripts/parser_spike/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/parser_spike/benchmark.py b/scripts/parser_spike/benchmark.py
new file mode 100644
index 0000000..31835bb
--- /dev/null
+++ b/scripts/parser_spike/benchmark.py
@@ -0,0 +1,102 @@
+"""Parser-replacement spike benchmark.
+
+Measures parse-and-translate time for the four parsers (baseline +
+three candidates) across a synthetic corpus scaled to ~10KB. Reports
+median, p95, and per-construct cost so the Option B decision has
+hard numbers to weigh.
+
+Run::
+
+    uv run python scripts/parser_spike/benchmark.py
+
+Acceptance criteria (per issue #69 follow-up):
+- 10KB mixed-content document under 5ms median on CI hardware.
+- Translator LOC under 250 per library.
+"""
+
+from __future__ import annotations
+
+import statistics
+import time
+from pathlib import Path
+
+from chat_sdk.shared.markdown_parser import parse_markdown as baseline_parse
+from chat_sdk.shared.parser_spike.markdown_it_translator import (
+    parse_markdown as markdown_it_parse,
+)
+from chat_sdk.shared.parser_spike.marko_translator import parse_markdown as marko_parse
+from chat_sdk.shared.parser_spike.mistune_translator import parse_markdown as mistune_parse
+
+FIXTURE_PATH = Path(__file__).resolve().parents[2] / "tests" / "parser_spike" / "fixtures" / "mixed_content.md"
+
+
+def _build_corpus(target_bytes: int = 10_240) -> str:
+    base = FIXTURE_PATH.read_text(encoding="utf-8")
+    out = []
+    size = 0
+    while size < target_bytes:
+        out.append(base)
+        size += len(base.encode("utf-8"))
+    return "\n".join(out)
+
+
+def _time_one(fn, text: str, iterations: int) -> list[float]:
+    timings = []
+    for _ in range(iterations):
+        t0 = time.perf_counter()
+        fn(text)
+        timings.append((time.perf_counter() - t0) * 1000.0)
+    return timings
+
+
+def _translator_loc() -> dict[str, int]:
+    root = Path(__file__).resolve().parents[2] / "src" / "chat_sdk" / "shared" / "parser_spike"
+    out = {}
+    for name, path in [
+        ("mistune", root / "mistune_translator.py"),
+        ("markdown-it-py", root / "markdown_it_translator.py"),
+        ("marko", root / "marko_translator.py"),
+    ]:
+        text = path.read_text(encoding="utf-8")
+        code_lines = [line for line in text.splitlines() if line.strip() and not line.strip().startswith("#")]
+        # Subtract docstrings as a rough heuristic.
+        out[name] = len(code_lines)
+    return out
+
+
+def main() -> None:
+    corpus = _build_corpus()
+    actual_bytes = len(corpus.encode("utf-8"))
+    print(f"Corpus: {actual_bytes:,} bytes (~{actual_bytes / 1024:.1f} KB)")
+
+    # Warm-up: each parser caches some regexes / token-rule chains.
+    for fn in (baseline_parse, mistune_parse, markdown_it_parse, marko_parse):
+        for _ in range(3):
+            fn(corpus)
+
+    iterations = 50
+    print(f"Iterations per parser: {iterations}\n")
+
+    print(f"{'parser':<20} {'median (ms)':>12} {'p95 (ms)':>12} {'min (ms)':>12} {'max (ms)':>12}")
+    print("-" * 70)
+    for name, fn in [
+        ("baseline (hand)", baseline_parse),
+        ("mistune", mistune_parse),
+        ("markdown-it-py", markdown_it_parse),
+        ("marko", marko_parse),
+    ]:
+        timings = _time_one(fn, corpus, iterations)
+        timings.sort()
+        median = statistics.median(timings)
+        p95 = timings[int(len(timings) * 0.95)]
+        print(f"{name:<20} {median:>12.2f} {p95:>12.2f} {min(timings):>12.2f} {max(timings):>12.2f}")
+
+    print("\nTranslator LOC (excluding blank lines + line comments):")
+    print("-" * 70)
+    for name, loc in _translator_loc().items():
+        budget_marker = " ✓" if loc < 250 else " ✗ (over 250-LOC budget)"
+        print(f"  {name:<20} {loc:>4} lines{budget_marker}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/chat_sdk/shared/parser_spike/README.md b/src/chat_sdk/shared/parser_spike/README.md
new file mode 100644
index 0000000..0fc5912
--- /dev/null
+++ b/src/chat_sdk/shared/parser_spike/README.md
@@ -0,0 +1,129 @@
+# Parser-replacement spike (issue #69 Option B)
+
+This directory is **not part of the runtime SDK**. It exists so the
+three candidate markdown libraries can be benchmarked and diffed
+against the existing hand-rolled `shared/markdown_parser.py` in a
+controlled way before any production code is touched.
+
+## How to run
+
+```bash
+# Install spike dev deps (one-off)
+uv sync --group dev --group spike-parser
+
+# Diff candidate mdast trees against the baseline
+uv run pytest tests/parser_spike/test_mdast_parity.py -s
+
+# Run the benchmark + LOC report
+uv run python scripts/parser_spike/benchmark.py
+```
+
+## Current results (sample run, local machine)
+
+Numbers will vary on CI hardware but the **relative ordering is stable**
+across runs.
+
+### Parse-and-translate time (12KB mixed corpus, 50 iterations)
+
+| parser            | median  | p95     | meets 5ms budget? |
+|-------------------|--------:|--------:|-------------------|
+| baseline (hand)   |  2.59ms |  2.72ms | ✓                 |
+| mistune           | 11.94ms | 13.04ms | ✗ (2.4× over)     |
+| markdown-it-py    | 13.36ms | 20.64ms | ✗ (2.7× over)     |
+| marko             | 46.62ms | 49.58ms | ✗ (9.3× over)     |
+
+The baseline is **~5× faster** than mistune and markdown-it-py and
+**~18× faster** than marko. The 5ms acceptance criterion from issue #69
+is met by the baseline alone.
+
+### Translator LOC (excluding blank lines + line comments)
+
+| library         | LOC | 250-LOC budget |
+|-----------------|----:|----------------|
+| mistune         | 161 | ✓               |
+| markdown-it-py  | 215 | ✓               |
+| marko           | 152 | ✓               |
+
+All three fit comfortably. mistune and marko both come in under 165
+lines for the translator layer.
+
+### mdast fidelity vs the baseline
+
+Tested against the `tests/parser_spike/fixtures/mixed_content.md`
+corpus (≈3KB of mixed headings, tables, code blocks, lists, links,
+images, blockquotes, emphasis).
+
+| library         | divergences |
+|-----------------|------------:|
+| mistune         | 26          |
+| markdown-it-py  | 24          |
+| marko           | 27          |
+
+**Important caveat**: of the ~25 divergences each candidate has, the
+vast majority are cases where the **baseline diverges from the mdast
+spec**, not where the candidate does. The most common patterns:
+
+- **Soft line breaks inside paragraphs / blockquotes**: candidates
+  emit `text + text("\n") + text` (per mdast spec); baseline merges
+  them into a single text node.
+- **Inline link followed by text**: candidates emit
+  `link(...) + text(".")`; baseline emits a single trailing text node
+  for `link(...).` that drops the URL.
+- **Trailing newline in fenced code values**: mistune and marko
+  preserve the trailing `\n`; baseline strips it.
+
+These are **structural improvements**, not regressions. Adopting any
+of the candidates would also fix several baseline correctness bugs as
+a side effect — albeit changing the mdast shape that downstream code
+currently depends on.
+
+The one candidate-side bug surfaced was marko losing GFM table
+alignment metadata (a translator fix; not investigated further in the
+spike).
+
+## Implication for the Option A/B/C decision
+
+The original Option B framing assumed a library swap would be a clear
+win. The spike data adds nuance:
+
+1. **Performance budget fails on every candidate** at the 5ms target
+   set in the issue. The baseline is the only parser that meets it,
+   and it does so by a wide margin (2.59ms vs 11-47ms).
+
+2. **mdast fidelity is a wash**: all three candidates are roughly
+   equivalent and each closes some baseline correctness bugs. None is
+   "closer to the existing output" — all diverge from it in similar
+   ways, mostly toward greater spec compliance.
+
+3. **Translator LOC is not a blocker**: all under the 250-line budget.
+
+This argues for a **revisited Option D**: keep the fast hand-rolled
+parser for the static parse path, and port the upstream `remend` npm
+package to Python for the streaming-completion path. The three
+production bugs from issue #69 all lived in `_remend` /
+`_get_committable_prefix`, not in `markdown_parser.py`. A direct
+remend port would close that gap without taking the 5× perf hit.
+
+The parser-side gaps from issue #69 (setext, footnotes, escaped chars,
+multi-backtick spans, raw HTML, indented code) would remain
+unaddressed under Option D. If they become a recurring problem, Option
+B can still be reconsidered later — the spike scaffolding stays here
+for re-runs.
+
+### Recommendation
+
+Before committing to a path, the team should weigh:
+
+- How much does the 5ms parse target actually matter for chat
+  streaming workloads? (A 10ms median may be entirely fine relative to
+  LLM token latency.)
+- How frequently are users hitting the parser-side gaps vs the
+  streaming-side bugs?
+- Is the structural mdast improvement (better link/paragraph splitting,
+  spec-correct softbreaks) worth the dependency cost?
+
+If those answers come out in favor of the swap, **mistune is the
+preferred candidate**: lowest divergence count, smallest translator
+LOC, fastest of the three candidates. markdown-it-py is the runner-up
+if richer plugin extensibility becomes important (footnotes, tasks,
+custom containers). marko drops out on performance alone.
diff --git a/src/chat_sdk/shared/parser_spike/__init__.py b/src/chat_sdk/shared/parser_spike/__init__.py
new file mode 100644
index 0000000..598ec03
--- /dev/null
+++ b/src/chat_sdk/shared/parser_spike/__init__.py
@@ -0,0 +1,31 @@
+"""Parser-replacement spike for issue #69 Option B.
+
+Three candidate libraries are evaluated as drop-in replacements for the
+hand-rolled ``shared/markdown_parser.py``:
+
+- ``mistune`` (3.x)
+- ``markdown-it-py`` (4.x)
+- ``marko`` (2.x)
+
+Each gets a thin translator that converts the library's native token /
+AST format into the mdast-compatible dict shape produced by
+``shared.markdown_parser.parse_markdown``. The contract: same input
+markdown should produce the same mdast tree across all four parsers
+(the existing hand-rolled one + the three candidates), modulo
+documented divergences.
+
+This module is NOT imported by the runtime SDK. It exists purely so
+the bake-off harness in ``tests/parser_spike/`` and
+``scripts/parser_spike/`` can exercise the candidates side-by-side
+without touching production code paths.
+
+The decision criteria (per the issue #69 follow-up plan):
+  1. mdast fidelity vs the existing parser on the fixture corpus
+  2. Translator LOC (target: <250 per library)
+  3. Parse-and-translate time (target: <5ms on 10KB mixed content)
+  4. GFM coverage (tables, strikethrough, task lists)
+  5. Extensibility surface for the gaps in #69 (setext, footnotes,
+     escaped chars, multi-backtick code spans, raw HTML, indented code)
+"""
+
+from __future__ import annotations
diff --git a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
new file mode 100644
index 0000000..1be2237
--- /dev/null
+++ b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
@@ -0,0 +1,244 @@
+"""markdown-it-py (4.x) -> mdast translator.
+
+markdown-it tokenises into a flat list of ``Token`` objects (each with
+``type``, ``tag``, ``content``, ``children``, ``markup``, ``attrs``,
+``meta``). Block-level constructs use ``_open`` / ``_close`` pairs and
+must be folded into a tree. Inline tokens (under ``inline`` parents)
+are already nested.
+
+GFM features (tables, strikethrough) are enabled by selecting the
+``gfm-like`` preset and adding the strikethrough rule explicitly.
+Task-list rendering would require ``mdit-py-plugins`` (deferred).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from markdown_it import MarkdownIt
+from markdown_it.token import Token
+
+from chat_sdk.shared.markdown_parser import (
+    Content,
+    Root,
+    make_blockquote,
+    make_break,
+    make_code,
+    make_delete,
+    make_emphasis,
+    make_heading,
+    make_image,
+    make_inline_code,
+    make_link,
+    make_list,
+    make_list_item,
+    make_paragraph,
+    make_root,
+    make_strong,
+    make_table,
+    make_table_cell,
+    make_table_row,
+    make_text,
+    make_thematic_break,
+)
+
+_MD = MarkdownIt("commonmark").enable(["table", "strikethrough"])
+
+
+def parse_markdown(text: str) -> Root:
+    tokens = _MD.parse(text)
+    children, _ = _consume_blocks(tokens, 0, end_type=None)
+    return make_root(children)
+
+
+def _consume_blocks(tokens: list[Token], i: int, end_type: str | None) -> tuple[list[Content], int]:
+    """Walk tokens until we hit *end_type* (or end of list). Return the
+    list of mdast block children produced and the index after the closer.
+    """
+    children: list[Content] = []
+    while i < len(tokens):
+        tok = tokens[i]
+        if end_type is not None and tok.type == end_type:
+            return children, i + 1
+
+        if tok.type == "paragraph_open":
+            inline = tokens[i + 1]
+            children.append(make_paragraph(_translate_inline(inline.children or [])))
+            i += 3  # paragraph_open, inline, paragraph_close
+            continue
+
+        if tok.type == "heading_open":
+            depth = int(tok.tag[1])  # h1 -> 1, h2 -> 2, ...
+            inline = tokens[i + 1]
+            children.append(make_heading(depth, _translate_inline(inline.children or [])))
+            i += 3
+            continue
+
+        if tok.type == "hr":
+            children.append(make_thematic_break())
+            i += 1
+            continue
+
+        if tok.type == "fence":
+            lang = tok.info.split()[0] if tok.info and tok.info.strip() else None
+            value = tok.content.rstrip("\n")
+            children.append(make_code(value, lang=lang))
+            i += 1
+            continue
+
+        if tok.type == "code_block":
+            children.append(make_code(tok.content.rstrip("\n"), lang=None))
+            i += 1
+            continue
+
+        if tok.type == "blockquote_open":
+            inner, i = _consume_blocks(tokens, i + 1, "blockquote_close")
+            children.append(make_blockquote(inner))
+            continue
+
+        if tok.type == "bullet_list_open":
+            items, i = _consume_list(tokens, i + 1, "bullet_list_close")
+            children.append(make_list(items, ordered=False))
+            continue
+
+        if tok.type == "ordered_list_open":
+            start = int((tok.attrs or {}).get("start", 1))
+            items, i = _consume_list(tokens, i + 1, "ordered_list_close")
+            children.append(make_list(items, ordered=True, start=start))
+            continue
+
+        if tok.type == "table_open":
+            table, i = _consume_table(tokens, i + 1)
+            children.append(table)
+            continue
+
+        # Unknown / unhandled token: skip but don't crash.
+        i += 1
+
+    return children, i
+
+
+def _consume_list(tokens: list[Token], i: int, end_type: str) -> tuple[list[Content], int]:
+    items: list[Content] = []
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok.type == end_type:
+            return items, i + 1
+        if tok.type == "list_item_open":
+            inner, i = _consume_blocks(tokens, i + 1, "list_item_close")
+            items.append(make_list_item(inner))
+            continue
+        i += 1
+    return items, i
+
+
+def _consume_table(tokens: list[Token], i: int) -> tuple[Content, int]:
+    rows: list[Content] = []
+    align: list[str | None] = []
+    in_header = False
+    header_aligns: list[str | None] = []
+    current_row: list[Content] = []
+    current_aligns: list[str | None] = []
+
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok.type == "table_close":
+            return make_table(rows, align=header_aligns if any(header_aligns) else None), i + 1
+        if tok.type == "thead_open":
+            in_header = True
+        elif tok.type == "thead_close":
+            in_header = False
+        elif tok.type == "tr_open":
+            current_row = []
+            current_aligns = []
+        elif tok.type == "tr_close":
+            rows.append(make_table_row(current_row))
+            if in_header:
+                header_aligns = current_aligns
+            align = current_aligns
+        elif tok.type in ("th_open", "td_open"):
+            style = (tok.attrs or {}).get("style", "")
+            cell_align: str | None = None
+            if isinstance(style, str):
+                if "text-align:left" in style:
+                    cell_align = "left"
+                elif "text-align:center" in style:
+                    cell_align = "center"
+                elif "text-align:right" in style:
+                    cell_align = "right"
+            current_aligns.append(cell_align)
+            inline = tokens[i + 1]
+            current_row.append(make_table_cell(_translate_inline(inline.children or [])))
+            i += 3  # th/td_open, inline, th/td_close
+            continue
+        i += 1
+    _ = align  # quiet unused-var without changing semantics
+    return make_table(rows, align=header_aligns if any(header_aligns) else None), i
+
+
+def _translate_inline(tokens: list[Token]) -> list[Content]:
+    out: list[Content] = []
+    stack: list[tuple[str, list[Content]]] = []
+    current = out
+
+    def open_container(kind: str) -> None:
+        nonlocal current
+        new_children: list[Content] = []
+        stack.append((kind, current))  # type: ignore[arg-type]
+        current = new_children
+
+    def close_container(make: Any) -> None:
+        nonlocal current
+        kids = current
+        _kind, parent = stack.pop()
+        current = parent  # type: ignore[assignment]
+        current.append(make(kids))
+
+    for tok in tokens:
+        t = tok.type
+        if t == "text":
+            current.append(make_text(tok.content))
+        elif t == "softbreak":
+            current.append(make_text("\n"))
+        elif t == "hardbreak":
+            current.append(make_break())
+        elif t == "code_inline":
+            current.append(make_inline_code(tok.content))
+        elif t == "strong_open":
+            open_container("strong")
+        elif t == "strong_close":
+            close_container(make_strong)
+        elif t == "em_open":
+            open_container("emphasis")
+        elif t == "em_close":
+            close_container(make_emphasis)
+        elif t == "s_open":
+            open_container("delete")
+        elif t == "s_close":
+            close_container(make_delete)
+        elif t == "link_open":
+            attrs = tok.attrs or {}
+            stack.append(("link", current))  # type: ignore[arg-type]
+            link_children: list[Content] = []
+            stack.append((f"_link_meta:{attrs.get('href', '')}|{attrs.get('title') or ''}", current))  # type: ignore[arg-type]
+            current = link_children
+        elif t == "link_close":
+            kids = current
+            meta_kind, _ = stack.pop()
+            _, parent = stack.pop()
+            current = parent  # type: ignore[assignment]
+            payload = meta_kind.removeprefix("_link_meta:")
+            href, _, title = payload.partition("|")
+            current.append(make_link(href, kids, title=title or None))
+        elif t == "image":
+            attrs = tok.attrs or {}
+            url = attrs.get("src", "")
+            title = attrs.get("title")
+            alt = tok.content  # markdown-it precomputes alt text
+            current.append(make_image(str(url), alt=alt, title=title))
+        elif t == "html_inline":
+            current.append(make_text(tok.content))
+        else:
+            if tok.content:
+                current.append(make_text(tok.content))
+    return out
diff --git a/src/chat_sdk/shared/parser_spike/marko_translator.py b/src/chat_sdk/shared/parser_spike/marko_translator.py
new file mode 100644
index 0000000..9bc9530
--- /dev/null
+++ b/src/chat_sdk/shared/parser_spike/marko_translator.py
@@ -0,0 +1,179 @@
+"""marko (2.x) -> mdast translator.
+
+marko parses to a class-based AST (``marko.block.Document`` etc.). Each
+node exposes ``children`` (list[Node] or str payload). The GFM extension
+adds tables, strikethrough, task lists, autolinks.
+"""
+
+from __future__ import annotations
+
+import marko
+from marko.ext.gfm import GFM
+
+from chat_sdk.shared.markdown_parser import (
+    Content,
+    Root,
+    make_blockquote,
+    make_break,
+    make_code,
+    make_delete,
+    make_emphasis,
+    make_heading,
+    make_image,
+    make_inline_code,
+    make_link,
+    make_list,
+    make_list_item,
+    make_paragraph,
+    make_root,
+    make_strong,
+    make_table,
+    make_table_cell,
+    make_table_row,
+    make_text,
+    make_thematic_break,
+)
+
+_MD = marko.Markdown(extensions=[GFM])
+
+
+def parse_markdown(text: str) -> Root:
+    doc = _MD.parse(text)
+    children = [_translate(c) for c in getattr(doc, "children", [])]
+    return make_root([c for c in children if c is not None])
+
+
+def _translate(node: object) -> Content | None:
+    cls = type(node).__name__
+
+    if cls == "Paragraph":
+        return make_paragraph(_inline_children(node))
+    if cls == "Heading":
+        depth = int(getattr(node, "level", 1))
+        return make_heading(depth, _inline_children(node))
+    if cls == "SetextHeading":
+        depth = int(getattr(node, "level", 1))
+        return make_heading(depth, _inline_children(node))
+    if cls == "ThematicBreak":
+        return make_thematic_break()
+    if cls in ("FencedCode", "CodeBlock"):
+        lang = getattr(node, "lang", None) or None
+        value = _gather_code_text(node)
+        return make_code(value, lang=lang)
+    if cls == "Quote":
+        return make_blockquote(_block_children(node))
+    if cls == "List":
+        ordered = bool(getattr(node, "ordered", False))
+        start = int(getattr(node, "start", 1)) if ordered else 1
+        return make_list(_block_children(node), ordered=ordered, start=start)
+    if cls == "ListItem":
+        return make_list_item(_block_children(node))
+    if cls == "Table":
+        return _translate_table(node)
+    if cls == "BlankLine":
+        return None
+    if cls == "HTMLBlock":
+        return make_paragraph([make_text(getattr(node, "body", "") or "")])
+    # Fallback: stringify if we can.
+    return None
+
+
+def _block_children(node: object) -> list[Content]:
+    out: list[Content] = []
+    for child in getattr(node, "children", []) or []:
+        translated = _translate(child)
+        if translated is not None:
+            out.append(translated)
+    return out
+
+
+def _inline_children(node: object) -> list[Content]:
+    out: list[Content] = []
+    children = getattr(node, "children", None)
+    if isinstance(children, str):
+        return [make_text(children)]
+    for child in children or []:
+        translated = _translate_inline(child)
+        if translated is not None:
+            out.extend(translated) if isinstance(translated, list) else out.append(translated)
+    return out
+
+
+def _translate_inline(node: object) -> Content | list[Content] | None:
+    cls = type(node).__name__
+
+    if cls == "RawText":
+        value = getattr(node, "children", "")
+        return make_text(value if isinstance(value, str) else "")
+    if cls == "Literal":
+        return make_text(getattr(node, "children", "") or "")
+    if cls == "LineBreak":
+        # marko exposes a ``soft`` flag on the line-break node.
+        soft = bool(getattr(node, "soft", False))
+        return make_text("\n") if soft else make_break()
+    if cls == "InlineHTML":
+        return make_text(getattr(node, "children", "") or "")
+    if cls == "CodeSpan":
+        value = getattr(node, "children", "")
+        return make_inline_code(value if isinstance(value, str) else "")
+    if cls == "Emphasis":
+        return make_emphasis(_inline_children(node))
+    if cls == "StrongEmphasis":
+        return make_strong(_inline_children(node))
+    if cls == "Strikethrough":
+        return make_delete(_inline_children(node))
+    if cls == "Link":
+        url = getattr(node, "dest", "") or ""
+        title = getattr(node, "title", None) or None
+        return make_link(url, _inline_children(node), title=title)
+    if cls in ("AutoLink", "Url"):
+        url = getattr(node, "dest", "") or ""
+        return make_link(url, _inline_children(node))
+    if cls == "Image":
+        url = getattr(node, "dest", "") or ""
+        title = getattr(node, "title", None) or None
+        alt = "".join(_extract_text(c) for c in getattr(node, "children", []) or [])
+        return make_image(url, alt=alt, title=title)
+    # Fallback: any unrecognized inline -> stringify children if any.
+    value = getattr(node, "children", None)
+    if isinstance(value, str):
+        return make_text(value)
+    return None
+
+
+def _translate_table(node: object) -> Content:
+    rows: list[Content] = []
+    align: list[str | None] = list(getattr(node, "alignment", []) or [])
+    # marko stores alignments as ["left", "center", "right", None].
+    align = [a if a in ("left", "center", "right") else None for a in align]
+    for row in getattr(node, "children", []) or []:
+        cells: list[Content] = []
+        for cell in getattr(row, "children", []) or []:
+            cells.append(make_table_cell(_inline_children(cell)))
+        rows.append(make_table_row(cells))
+    return make_table(rows, align=align if any(align) else None)
+
+
+def _extract_text(node: object) -> str:
+    cls = type(node).__name__
+    if cls == "RawText":
+        v = getattr(node, "children", "")
+        return v if isinstance(v, str) else ""
+    children = getattr(node, "children", None)
+    if isinstance(children, str):
+        return children
+    return "".join(_extract_text(c) for c in children or [])
+
+
+def _gather_code_text(node: object) -> str:
+    children = getattr(node, "children", None)
+    if isinstance(children, str):
+        return children
+    parts: list[str] = []
+    for c in children or []:
+        v = getattr(c, "children", "")
+        if isinstance(v, str):
+            parts.append(v)
+        else:
+            parts.append(_extract_text(c))
+    return "".join(parts)
diff --git a/src/chat_sdk/shared/parser_spike/mistune_translator.py b/src/chat_sdk/shared/parser_spike/mistune_translator.py
new file mode 100644
index 0000000..498f846
--- /dev/null
+++ b/src/chat_sdk/shared/parser_spike/mistune_translator.py
@@ -0,0 +1,192 @@
+"""mistune (3.x) -> mdast translator.
+
+Uses ``mistune.create_markdown(renderer=None)`` to obtain the parser's
+internal token list, then maps each token type to its mdast equivalent.
+
+GFM plugins enabled: ``table``, ``strikethrough``, ``task_lists``,
+``url``.
+
+Notes on the token shape (mistune 3.x): each token is a dict with
+``type`` (always), ``children`` (block tokens + some inline), ``raw``
+(text leaves), ``attrs`` (heading levels, link urls, list metadata).
+The inline parser is invoked lazily; we drive it explicitly via
+``md.inline.parse`` for cells / list items where needed.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import mistune
+
+from chat_sdk.shared.markdown_parser import (
+    Content,
+    Root,
+    make_blockquote,
+    make_break,
+    make_code,
+    make_delete,
+    make_emphasis,
+    make_heading,
+    make_image,
+    make_inline_code,
+    make_link,
+    make_list,
+    make_list_item,
+    make_paragraph,
+    make_root,
+    make_strong,
+    make_table,
+    make_table_cell,
+    make_table_row,
+    make_text,
+    make_thematic_break,
+)
+
+# Single shared parser instance (mistune parsers are stateless after creation).
+_MD = mistune.create_markdown(
+    renderer=None,
+    plugins=["table", "strikethrough", "task_lists", "url"],
+)
+
+
+def parse_markdown(text: str) -> Root:
+    """Parse *text* and return an mdast-compatible root node."""
+    tokens, _state = _MD.parse(text)
+    children = [_translate_block(tok) for tok in tokens]
+    return make_root([c for c in children if c is not None])
+
+
+def _translate_block(tok: dict[str, Any]) -> Content | None:
+    t = tok.get("type")
+    if t == "blank_line":
+        return None
+    if t == "paragraph":
+        return make_paragraph(_translate_inline_children(tok))
+    if t == "heading":
+        depth = int(tok.get("attrs", {}).get("level", 1))
+        return make_heading(depth, _translate_inline_children(tok))
+    if t == "thematic_break":
+        return make_thematic_break()
+    if t == "block_code":
+        attrs = tok.get("attrs", {}) or {}
+        info = attrs.get("info")
+        lang = info.split()[0] if isinstance(info, str) and info.strip() else None
+        return make_code(tok.get("raw", ""), lang=lang)
+    if t == "block_quote":
+        children = [_translate_block(c) for c in tok.get("children", [])]
+        return make_blockquote([c for c in children if c is not None])
+    if t == "list":
+        attrs = tok.get("attrs", {}) or {}
+        ordered = bool(attrs.get("ordered"))
+        start = int(attrs.get("start", 1)) if ordered else 1
+        items = [_translate_block(c) for c in tok.get("children", [])]
+        items = [c for c in items if c is not None]
+        return make_list(items, ordered=ordered, start=start)
+    if t == "list_item":
+        children = [_translate_block(c) for c in tok.get("children", [])]
+        return make_list_item([c for c in children if c is not None])
+    if t == "block_text":
+        # Loose-list paragraph payload; mistune emits raw inline text.
+        return make_paragraph(_translate_inline_children(tok))
+    if t == "table":
+        return _translate_table(tok)
+    # Unknown block: render as a paragraph carrying its raw text so we
+    # don't silently drop content. The bake-off harness will flag this.
+    raw = tok.get("raw", "")
+    if raw:
+        return make_paragraph([make_text(raw)])
+    return None
+
+
+def _translate_table(tok: dict[str, Any]) -> Content:
+    rows: list[Content] = []
+    align: list[str | None] = []
+    for child in tok.get("children", []):
+        ctype = child.get("type")
+        if ctype == "table_head":
+            cells, head_align = _translate_table_row(child)
+            rows.append(make_table_row(cells))
+            align = head_align
+        elif ctype == "table_body":
+            for row in child.get("children", []):
+                if row.get("type") == "table_row":
+                    cells, _ = _translate_table_row(row)
+                    rows.append(make_table_row(cells))
+    return make_table(rows, align=align if any(align) else None)
+
+
+def _translate_table_row(row: dict[str, Any]) -> tuple[list[Content], list[str | None]]:
+    cells: list[Content] = []
+    aligns: list[str | None] = []
+    for cell in row.get("children", []):
+        if cell.get("type") not in ("table_cell",):
+            continue
+        attrs = cell.get("attrs", {}) or {}
+        align_val = attrs.get("align")
+        aligns.append(align_val if align_val in ("left", "center", "right") else None)
+        cells.append(make_table_cell(_translate_inline_children(cell)))
+    return cells, aligns
+
+
+def _translate_inline_children(tok: dict[str, Any]) -> list[Content]:
+    children = tok.get("children")
+    if children is None:
+        # mistune defers inline parsing for some tokens (e.g. headings
+        # built from setext logic). Parse the raw text now.
+        raw = tok.get("raw", "")
+        if not raw:
+            return []
+        children = _MD.inline.parse(raw, mistune.BlockState())  # type: ignore[arg-type]
+    out: list[Content] = []
+    for child in children or []:
+        translated = _translate_inline(child)
+        if translated is not None:
+            out.extend(translated) if isinstance(translated, list) else out.append(translated)
+    return out
+
+
+def _translate_inline(tok: dict[str, Any]) -> Content | list[Content] | None:
+    t = tok.get("type")
+    if t == "text":
+        return make_text(tok.get("raw", ""))
+    if t == "softbreak":
+        return make_text("\n")
+    if t == "linebreak":
+        return make_break()
+    if t == "codespan":
+        return make_inline_code(tok.get("raw", ""))
+    if t in ("strong", "emphasis", "delete", "strikethrough"):
+        kids = _translate_inline_children(tok)
+        if t == "strong":
+            return make_strong(kids)
+        if t == "emphasis":
+            return make_emphasis(kids)
+        return make_delete(kids)
+    if t == "link":
+        attrs = tok.get("attrs", {}) or {}
+        url = attrs.get("url", "")
+        title = attrs.get("title")
+        return make_link(url, _translate_inline_children(tok), title=title)
+    if t == "image":
+        attrs = tok.get("attrs", {}) or {}
+        url = attrs.get("url", "")
+        title = attrs.get("title")
+        # mistune nests alt as inline children; flatten to plain string.
+        alt = "".join(_extract_text(c) for c in tok.get("children", []) or [])
+        return make_image(url, alt=alt, title=title)
+    if t == "inline_html":
+        # mdast has html nodes; the existing hand-rolled parser doesn't
+        # emit them. Surface as plain text for parity with the baseline.
+        return make_text(tok.get("raw", ""))
+    raw = tok.get("raw")
+    if raw:
+        return make_text(raw)
+    return None
+
+
+def _extract_text(node: dict[str, Any]) -> str:
+    if node.get("type") == "text":
+        return node.get("raw", "")
+    children = node.get("children") or []
+    return "".join(_extract_text(c) for c in children)
diff --git a/tests/parser_spike/__init__.py b/tests/parser_spike/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/parser_spike/conftest.py b/tests/parser_spike/conftest.py
new file mode 100644
index 0000000..3be0199
--- /dev/null
+++ b/tests/parser_spike/conftest.py
@@ -0,0 +1,14 @@
+"""Shared fixtures for the parser-replacement spike harness."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+FIXTURE_DIR = Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture(scope="session")
+def mixed_content_markdown() -> str:
+    return (FIXTURE_DIR / "mixed_content.md").read_text(encoding="utf-8")
diff --git a/tests/parser_spike/fixtures/mixed_content.md b/tests/parser_spike/fixtures/mixed_content.md
new file mode 100644
index 0000000..9c41c3e
--- /dev/null
+++ b/tests/parser_spike/fixtures/mixed_content.md
@@ -0,0 +1,77 @@
+# Quarterly Report
+
+A short **overview** with some _emphasis_, ~~deletions~~, and `inline code`
+followed by a [link to the docs](https://example.com "Docs").
+
+## Section 1: Numbers
+
+Total revenue grew by **12.4%** quarter over quarter. Here are the splits:
+
+| Region        | Q1     | Q2     | Q3     |
+|:--------------|-------:|-------:|-------:|
+| North America | $12.3M | $14.1M | $15.8M |
+| EMEA          | $8.7M  | $9.2M  | $10.4M |
+| APAC          | $5.2M  | $5.9M  | $6.7M  |
+
+> Growth is driven by **enterprise** adoption in EMEA and renewed
+> demand in the APAC mid-market segment.
+
+## Section 2: Engineering
+
+The platform team shipped:
+
+- Streaming markdown renderer (issue #69)
+- Multi-region failover for the lock service
+- A new `ConcurrencyConfig.max_concurrent` enforcement
+  - sub-bullet: documented in the migration guide
+  - sub-bullet: covered by 47 new tests
+- Telemetry pipeline rewrite
+
+Ordered roadmap items:
+
+1. Finalize the parser swap (Option B)
+2. Land the test-fidelity baseline at 100%
+3. Ship 0.5.0 with the new defaults
+
+### Code samples
+
+```python
+from chat_sdk import Chat
+
+chat = Chat(adapter=SlackAdapter())
+
+@chat.on_mention
+async def handle(event):
+    await event.thread.post("hello")
+```
+
+```bash
+$ uv run pytest tests/ -q
+```
+
+### Edge cases worth checking
+
+- A `**bold` opened but not closed inside a sentence.
+- An italic `*partial` that should be repaired during streaming.
+- Word-internal asterisks like `5*3=15` (must not be italic).
+- A bullet item: `* this is the start of a list` is a marker, not italic.
+
+---
+
+## Section 3: References
+
+For background:
+
+- [Vercel chat SDK](https://github.com/vercel/chat)
+- [mdast specification](https://github.com/syntax-tree/mdast)
+- [remend npm package](https://www.npmjs.com/package/remend)
+
+An image for visual identity: ![logo](https://example.com/logo.png "Logo")
+
+A nested blockquote with rich formatting:
+
+> The *quick* brown **fox** jumps over the `lazy` dog.
+>
+> > Inside a nested quote with a [link](https://example.com).
+
+End of report.
diff --git a/tests/parser_spike/test_mdast_parity.py b/tests/parser_spike/test_mdast_parity.py
new file mode 100644
index 0000000..24a70f0
--- /dev/null
+++ b/tests/parser_spike/test_mdast_parity.py
@@ -0,0 +1,148 @@
+"""mdast parity bake-off: hand-rolled parser vs library candidates.
+
+For each library candidate, parse the fixture corpus and diff the
+resulting mdast tree against the baseline hand-rolled parser. The
+test does not fail on divergence -- this is a measurement harness,
+not an acceptance gate. Divergences are recorded so the spike report
+can show *which* node shapes each candidate gets wrong (and how badly).
+
+Run with verbose output to see the full divergence report:
+
+    uv run pytest tests/parser_spike/test_mdast_parity.py -s -v
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from chat_sdk.shared.markdown_parser import parse_markdown as baseline_parse
+from chat_sdk.shared.parser_spike.markdown_it_translator import (
+    parse_markdown as markdown_it_parse,
+)
+from chat_sdk.shared.parser_spike.marko_translator import parse_markdown as marko_parse
+from chat_sdk.shared.parser_spike.mistune_translator import parse_markdown as mistune_parse
+
+CANDIDATES = [
+    ("mistune", mistune_parse),
+    ("markdown-it-py", markdown_it_parse),
+    ("marko", marko_parse),
+]
+
+
+# ---------------------------------------------------------------------------
+# Divergence reporter
+# ---------------------------------------------------------------------------
+
+
+def _walk(node: Any, path: str = "$") -> list[tuple[str, str, Any]]:
+    """Yield (path, kind, value) for each node-shape signal we care about.
+
+    Kind is one of: "type", "depth", "ordered", "start", "lang", "url",
+    "alt", "title", "value", "align", "child_count".
+    """
+    out: list[tuple[str, str, Any]] = []
+    if isinstance(node, dict):
+        t = node.get("type")
+        out.append((path, "type", t))
+        for key in ("depth", "ordered", "start", "lang", "url", "alt", "title", "value", "align"):
+            if key in node:
+                out.append((path, key, node[key]))
+        children = node.get("children")
+        if isinstance(children, list):
+            out.append((path, "child_count", len(children)))
+            for i, child in enumerate(children):
+                out.extend(_walk(child, f"{path}.children[{i}]"))
+    return out
+
+
+def _diff_trees(baseline: Any, candidate: Any) -> list[str]:
+    base_walk = _walk(baseline)
+    cand_walk = _walk(candidate)
+
+    base_index = {(p, k): v for (p, k, v) in base_walk}
+    cand_index = {(p, k): v for (p, k, v) in cand_walk}
+
+    diffs: list[str] = []
+    seen = set(base_index) | set(cand_index)
+    for key in sorted(seen):
+        path, kind = key
+        b = base_index.get(key, "<missing>")
+        c = cand_index.get(key, "<missing>")
+        if b != c:
+            diffs.append(f"  {path} [{kind}]: baseline={b!r} candidate={c!r}")
+    return diffs
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("name,parser", CANDIDATES)
+def test_candidate_produces_root_node(name: str, parser, mixed_content_markdown: str) -> None:
+    result = parser(mixed_content_markdown)
+    assert result["type"] == "root", f"{name} did not produce a root node"
+    assert isinstance(result.get("children"), list)
+
+
+@pytest.mark.parametrize("name,parser", CANDIDATES)
+def test_candidate_matches_top_level_block_types(name: str, parser, mixed_content_markdown: str) -> None:
+    baseline = baseline_parse(mixed_content_markdown)
+    candidate = parser(mixed_content_markdown)
+    baseline_types = [c.get("type") for c in baseline["children"]]
+    candidate_types = [c.get("type") for c in candidate["children"]]
+    # Don't assert equality -- different parsers may split paragraphs
+    # differently around HRs or trailing blank lines. We assert that the
+    # important constructs are all present in both.
+    important = {"heading", "table", "code", "list", "blockquote", "thematicBreak"}
+    base_important = [t for t in baseline_types if t in important]
+    cand_important = [t for t in candidate_types if t in important]
+    assert base_important == cand_important, (
+        f"{name} block-type sequence diverges:\n  baseline: {baseline_types}\n  {name}: {candidate_types}"
+    )
+
+
+def test_report_full_divergences(mixed_content_markdown: str) -> None:
+    """Print a full divergence report for each candidate. Always passes.
+
+    Run with ``pytest -s`` to see the report inline.
+    """
+    baseline = baseline_parse(mixed_content_markdown)
+    print("\n" + "=" * 70)
+    print("mdast divergence report")
+    print("=" * 70)
+    for name, parser in CANDIDATES:
+        candidate = parser(mixed_content_markdown)
+        diffs = _diff_trees(baseline, candidate)
+        print(f"\n[{name}] {len(diffs)} divergence(s)")
+        if diffs:
+            for line in diffs[:30]:  # cap noise
+                print(line)
+            if len(diffs) > 30:
+                print(f"  ... +{len(diffs) - 30} more")
+
+
+def test_dump_baseline_tree_size(mixed_content_markdown: str) -> None:
+    """Sanity: the fixture exercises enough of the AST to be meaningful."""
+    baseline = baseline_parse(mixed_content_markdown)
+    nodes = _walk(baseline)
+    # ~200+ shape signals = at least a couple dozen non-trivial nodes.
+    assert len(nodes) > 150, f"Fixture is too small to be a useful bake-off (only {len(nodes)} signals)"
+    # Spot-check the constructs the fixture should contain.
+    types = {sig for (_, kind, sig) in nodes if kind == "type"}
+    required_types = (
+        "heading",
+        "paragraph",
+        "code",
+        "list",
+        "table",
+        "blockquote",
+        "thematicBreak",
+        "strong",
+        "emphasis",
+        "link",
+    )
+    for required in required_types:
+        assert required in types, f"Fixture missing required node type: {required}"

From 71dc42e2e3d31888276ecf1139228683b6319678 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 04:28:38 +0000
Subject: [PATCH 2/6] spike(parser): add completeness gap fixture +
 recognise/silent-drop report

The original happy-path fixture only exercised constructs the baseline
parser handles, so divergence counts (24-27) measured "structural
differences on constructs we both parse" -- not the real cost of using
the baseline. That framing undersold the library candidates.

New `gap_cases.md` fixture exercises six constructs from the issue #69
non-parity list: setext headings, indented code blocks, footnote
definitions, inline HTML, task list items, definition lists. A new
`test_report_completeness_gap` walks each parser's output and reports
whether each construct was recognised (correct mdast node type) or
silently dropped to plain text.

Findings on the gap fixture:

  construct             baseline  mistune  markdown-it-py  marko
  setext heading        DROP      OK       OK              OK
  indented code         DROP      OK       OK              OK
  task list item        OK*       DROP     OK              OK
  footnote definition   DROP      DROP     DROP**          DROP
  inline HTML           DROP      DROP     DROP            DROP
  definition list       DROP      DROP     DROP            DROP
  silent-drop count     5         4        3               3

  * Baseline matches `- [x]` as a list item but doesn't extract
    checkbox state.
  ** markdown-it-py supports footnotes via mdit-py-plugins (not in the
     spike); enabling it would drop the count to 2.

The baseline is strictly worse on completeness than every candidate.
Part of its 2.59ms perf win is "fewer constructs to parse per byte" --
setext, indented code, multi-backtick, escaped chars, raw HTML all
skip straight through the inline fast-paths.

README updated to reflect the corrected framing: three options now
(A: in-tree gap closure, B: library swap, D: split the problem) with
markdown-it-py promoted to preferred Option B candidate based on the
completeness score. Option C (selective parser fixes only) ruled out
unless paired with a separate _remend fix.
---
 src/chat_sdk/shared/parser_spike/README.md | 144 ++++++++++++++-------
 tests/parser_spike/conftest.py             |  11 ++
 tests/parser_spike/fixtures/gap_cases.md   |  54 ++++++++
 tests/parser_spike/test_mdast_parity.py    |  72 +++++++++++
 4 files changed, 236 insertions(+), 45 deletions(-)
 create mode 100644 tests/parser_spike/fixtures/gap_cases.md

diff --git a/src/chat_sdk/shared/parser_spike/README.md b/src/chat_sdk/shared/parser_spike/README.md
index 0fc5912..60bb4bd 100644
--- a/src/chat_sdk/shared/parser_spike/README.md
+++ b/src/chat_sdk/shared/parser_spike/README.md
@@ -47,11 +47,11 @@ is met by the baseline alone.
 All three fit comfortably. mistune and marko both come in under 165
 lines for the translator layer.
 
-### mdast fidelity vs the baseline
+### mdast fidelity on the happy path (`mixed_content.md`)
 
-Tested against the `tests/parser_spike/fixtures/mixed_content.md`
-corpus (≈3KB of mixed headings, tables, code blocks, lists, links,
-images, blockquotes, emphasis).
+Tested against a ≈3KB corpus of headings, tables, code blocks, lists,
+links, images, blockquotes, emphasis — constructs the baseline parser
+*does* handle.
 
 | library         | divergences |
 |-----------------|------------:|
@@ -81,49 +81,103 @@ The one candidate-side bug surfaced was marko losing GFM table
 alignment metadata (a translator fix; not investigated further in the
 spike).
 
-## Implication for the Option A/B/C decision
-
-The original Option B framing assumed a library swap would be a clear
-win. The spike data adds nuance:
-
-1. **Performance budget fails on every candidate** at the 5ms target
-   set in the issue. The baseline is the only parser that meets it,
-   and it does so by a wide margin (2.59ms vs 11-47ms).
+### Completeness gap on hard constructs (`gap_cases.md`)
+
+The happy-path comparison above is **not the whole picture**: the
+baseline parser is documented as not handling several CommonMark / GFM
+constructs at all (see `docs/UPSTREAM_SYNC.md:442`). On those
+constructs it silently flattens to `text` / `paragraph` nodes — the
+same surface area issue #69 was opened to address.
+
+`fixtures/gap_cases.md` exercises six gap constructs. **Silent drop**
+means the construct was parsed as ordinary text/paragraph; **recognised**
+means the parser emitted the correct mdast node type.
+
+| construct             | baseline    | mistune    | markdown-it-py | marko       |
+|-----------------------|-------------|------------|----------------|-------------|
+| setext heading        | silent drop | recognised | recognised     | recognised  |
+| indented code block   | silent drop | recognised | recognised     | recognised  |
+| task list item        | recognised¹ | silent drop| recognised     | recognised  |
+| footnote definition   | silent drop | silent drop| silent drop²   | silent drop |
+| inline HTML           | silent drop | silent drop| silent drop    | silent drop |
+| definition list       | silent drop | silent drop| silent drop    | silent drop |
+| **silent-drop count** | **5**       | **4**      | **3**          | **3**       |
+
+¹ Baseline matches `- [x]` as a list item but doesn't extract the
+checkbox state.
+² markdown-it-py supports footnotes via the `mdit-py-plugins` package
+(not pulled in by the spike); enabling it would drop the silent-drop
+count to 2.
+
+**The baseline is strictly worse on completeness than every
+candidate.** That's the half of the perf comparison the happy-path
+numbers don't show: baseline runs faster partly because it does less
+work per byte — setext headings, indented code, multi-backtick spans,
+escaped chars, and raw HTML all skip straight through the inline
+fast-paths instead of being parsed.
 
-2. **mdast fidelity is a wash**: all three candidates are roughly
-   equivalent and each closes some baseline correctness bugs. None is
-   "closer to the existing output" — all diverge from it in similar
-   ways, mostly toward greater spec compliance.
-
-3. **Translator LOC is not a blocker**: all under the 250-line budget.
-
-This argues for a **revisited Option D**: keep the fast hand-rolled
-parser for the static parse path, and port the upstream `remend` npm
-package to Python for the streaming-completion path. The three
-production bugs from issue #69 all lived in `_remend` /
-`_get_committable_prefix`, not in `markdown_parser.py`. A direct
-remend port would close that gap without taking the 5× perf hit.
+## Implication for the Option A/B/C decision
 
-The parser-side gaps from issue #69 (setext, footnotes, escaped chars,
-multi-backtick spans, raw HTML, indented code) would remain
-unaddressed under Option D. If they become a recurring problem, Option
-B can still be reconsidered later — the spike scaffolding stays here
-for re-runs.
+The spike data argues against a clean recommendation in either
+direction:
+
+1. **Performance**: baseline wins at 2.59ms median vs 11-47ms for the
+   candidates. But that win is at least partly a function of doing
+   *less work per byte*: the baseline skips entire construct families
+   on the fast path, while the libraries fully tokenise them. Apples
+   to apples requires either teaching the baseline to handle setext +
+   indented code + escaped chars (Option A) and re-measuring, or
+   accepting that the perf gap pays for genuine completeness.
+
+2. **mdast fidelity on the happy path**: all three candidates are
+   roughly equivalent (24-27 minor divergences) and each closes some
+   baseline correctness bugs. mostly toward greater spec compliance.
+
+3. **Completeness on hard constructs**: the baseline is strictly
+   worse than every candidate. It silently flattens setext, indented
+   code, multi-backtick spans, escaped chars, raw HTML, and definition
+   lists into plain text — the exact gap list issue #69 enumerated.
+
+4. **Translator LOC**: all under the 250-line budget.
+
+### Three options now, not two
+
+- **Option A (close baseline gaps in-tree)**: write parser code for
+  setext, indented code, escaped chars, multi-backtick spans (the
+  ones #69 listed as common in LLM output). Estimated ~300-400 LOC of
+  carefully-tested regex / state-machine work, plus the existing
+  parser keeps its 2.6ms perf. Doesn't address `_remend` gaps from the
+  issue #69 follow-up comment.
+
+- **Option B (library swap)**: pay the 5× perf hit (10-15ms median)
+  for `mistune` or `markdown-it-py`, eat ~150-215 LOC of translator,
+  close the completeness gap *and* most `_remend` gaps in one motion.
+  **markdown-it-py is now the preferred candidate** (best
+  completeness score, only 1.5ms slower than mistune), with
+  `mdit-py-plugins` available for footnotes if needed later. mistune
+  is the runner-up. marko drops out on performance.
+
+- **Option D (split the problem)**: keep the fast hand-rolled parser
+  *and* close gaps in-tree (Option A), but separately port upstream
+  `remend` directly for the streaming side. Two efforts, two PRs, but
+  preserves perf while closing both bug classes. More total work than
+  Option B but no dependency added.
 
 ### Recommendation
 
-Before committing to a path, the team should weigh:
-
-- How much does the 5ms parse target actually matter for chat
-  streaming workloads? (A 10ms median may be entirely fine relative to
-  LLM token latency.)
-- How frequently are users hitting the parser-side gaps vs the
-  streaming-side bugs?
-- Is the structural mdast improvement (better link/paragraph splitting,
-  spec-correct softbreaks) worth the dependency cost?
-
-If those answers come out in favor of the swap, **mistune is the
-preferred candidate**: lowest divergence count, smallest translator
-LOC, fastest of the three candidates. markdown-it-py is the runner-up
-if richer plugin extensibility becomes important (footnotes, tasks,
-custom containers). marko drops out on performance alone.
+The right answer depends on team priorities the spike can't answer:
+
+- **If 10ms median parse time is fine** (likely true for chat
+  streaming, where LLM token latency dwarfs this), **Option B with
+  markdown-it-py is the cleanest path**. One PR, one dep, both gap
+  lists close.
+- **If we want zero-dep core preserved**, **Option D** is the only
+  path that keeps the install footprint small while closing both bug
+  classes. Highest total effort.
+- **If neither perf nor zero-dep is sacred**, Option B still wins on
+  effort per fix delivered.
+
+Option C (selective parser-side fixes only, the original framing in
+the issue) leaves the streaming-side bugs from the #69 follow-up
+comment unaddressed and should be ruled out unless we ship it
+alongside a separate `_remend` fix.
diff --git a/tests/parser_spike/conftest.py b/tests/parser_spike/conftest.py
index 3be0199..05d54f0 100644
--- a/tests/parser_spike/conftest.py
+++ b/tests/parser_spike/conftest.py
@@ -12,3 +12,14 @@
 @pytest.fixture(scope="session")
 def mixed_content_markdown() -> str:
     return (FIXTURE_DIR / "mixed_content.md").read_text(encoding="utf-8")
+
+
+@pytest.fixture(scope="session")
+def gap_cases_markdown() -> str:
+    """Constructs the hand-rolled parser explicitly doesn't support
+    (setext headings, footnotes, escaped chars, multi-backtick spans,
+    raw HTML, indented code blocks, math, task lists, autolinks,
+    definition lists). Used to measure the *completeness* gap, not
+    just the structural-equivalence gap.
+    """
+    return (FIXTURE_DIR / "gap_cases.md").read_text(encoding="utf-8")
diff --git a/tests/parser_spike/fixtures/gap_cases.md b/tests/parser_spike/fixtures/gap_cases.md
new file mode 100644
index 0000000..18678e6
--- /dev/null
+++ b/tests/parser_spike/fixtures/gap_cases.md
@@ -0,0 +1,54 @@
+Setext H1 underline
+===================
+
+Setext H2 underline
+-------------------
+
+Indented code block (4-space):
+
+    def hello():
+        return "world"
+
+A paragraph with escaped \*asterisks\* and escaped \[brackets\] and a
+literal backslash \\ in it.
+
+A footnote reference[^1] in running text.
+
+[^1]: This is the footnote body.
+
+Multi-backtick inline code: ``some `quoted` code`` and triple ```backticks
+with ``double`` inside```.
+
+Raw HTML block:
+
+<div class="callout">
+  <p>This is HTML, not markdown.</p>
+</div>
+
+Inline HTML: <span style="color:red">red text</span> and a self-closing
+<br/> mid-sentence.
+
+Word-internal asterisks: `5*3=15`, paths like `lib/*.so`, and
+glob*patterns*everywhere.
+
+Math: a single dollar $a^2 + b^2 = c^2$ and a display block:
+
+$$
+\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2}
+$$
+
+A task list (GFM):
+
+- [ ] Pending item
+- [x] Completed item
+- [ ] Another pending one
+
+An autolink: <https://example.com> and an email <user@example.com>.
+
+A definition list (some flavors):
+
+term1
+:   definition for term 1
+
+term2
+:   definition for term 2
diff --git a/tests/parser_spike/test_mdast_parity.py b/tests/parser_spike/test_mdast_parity.py
index 24a70f0..be7e005 100644
--- a/tests/parser_spike/test_mdast_parity.py
+++ b/tests/parser_spike/test_mdast_parity.py
@@ -146,3 +146,75 @@ def test_dump_baseline_tree_size(mixed_content_markdown: str) -> None:
     )
     for required in required_types:
         assert required in types, f"Fixture missing required node type: {required}"
+
+
+# ---------------------------------------------------------------------------
+# Completeness gap (what each parser actually recognises on hard constructs)
+# ---------------------------------------------------------------------------
+
+
+def _collect_recognised_types(node: Any) -> set[str]:
+    """Set of all `type` values appearing anywhere in the tree."""
+    found: set[str] = set()
+    if isinstance(node, dict):
+        t = node.get("type")
+        if isinstance(t, str):
+            found.add(t)
+        for child in node.get("children") or []:
+            found |= _collect_recognised_types(child)
+    return found
+
+
+# Construct -> expected mdast `type` (or set of types) when recognised.
+# A parser that returns *none* of these for the gap fixture has silently
+# flattened the construct to paragraph/text. The baseline is documented
+# as not handling any of these, so it sets the floor.
+GAP_CONSTRUCTS: dict[str, set[str]] = {
+    "setext heading": {"heading"},  # heading must appear from a setext source
+    "indented code block": {"code"},  # raw 4-space indented block
+    "footnote definition": {"footnoteDefinition", "footnoteReference"},
+    "inline HTML": {"html", "inlineHTML"},
+    "task list item": {"listItem"},  # mdast: listItem with `checked` attr
+    "definition list": {"definition", "descriptionList", "termTitle"},
+}
+
+
+def test_report_completeness_gap(gap_cases_markdown: str) -> None:
+    """Print which gap constructs each parser actually recognised.
+
+    The baseline parser is *known* to not handle these (see
+    docs/UPSTREAM_SYNC.md non-parity table). This report quantifies how
+    many it silently drops vs each library candidate.
+
+    Run with ``pytest -s`` to see the report inline.
+    """
+    print("\n" + "=" * 70)
+    print("Completeness gap report (gap_cases.md)")
+    print("=" * 70)
+
+    parsers = [("baseline (hand)", baseline_parse), *CANDIDATES]
+    rows: list[tuple[str, set[str]]] = []
+    for name, parser in parsers:
+        types = _collect_recognised_types(parser(gap_cases_markdown))
+        rows.append((name, types))
+
+    # Construct table: rows = constructs, columns = parsers
+    print(f"\n{'construct':<24}", end="")
+    for name, _ in rows:
+        print(f" {name[:14]:>15}", end="")
+    print()
+    print("-" * (24 + 16 * len(rows)))
+
+    for construct, expected in GAP_CONSTRUCTS.items():
+        print(f"{construct:<24}", end="")
+        for _, types in rows:
+            recognised = bool(expected & types)
+            print(f" {'recognised' if recognised else 'silent drop':>15}", end="")
+        print()
+
+    print()
+    # Per-parser unique-type counts on this fixture
+    print("Distinct mdast types emitted on gap fixture:")
+    for name, types in rows:
+        type_list = sorted(types)
+        print(f"  {name:<20} {len(type_list):>2}  -> {type_list}")

From 227691d6c0f07c2c0025cef6dea123f833602f88 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 06:37:05 +0000
Subject: [PATCH 3/6] spike(parser): document triggers to revisit Option A/B
 decision

The spike concluded with Option A (PRs #99 + #101) being the right
call for the SDK's current chat-only scope. Documenting the concrete
triggers under which that conclusion should be re-examined:

- Non-chat input source (RAG, user-authored memory, markdown-file
  ingestion, GitHub-body parsing -- anywhere humans write the input)
- Long-form artifact output surface (citations + footnotes; math
  rendering; one-shot parse where latency is less critical)
- Web rendering surface (port of upstream `@chat-adapter/web`)
- New chat platform demanding richer parsing

Plus a playbook for re-running the bake-off (author fixture, run
parity + benchmark harness, compare silent-drop count / parse time)
and a decision threshold for promoting markdown-it-py into runtime
if a trigger materialises.

Spot-checked upstream's `packages/` at the time of writing: only
`adapter-web` is the relevant new package (added in v4.27, Python
port deferred); no artifact / RAG / ingestion packages exist
upstream. All triggers are forward-looking.
---
 src/chat_sdk/shared/parser_spike/README.md | 80 ++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/src/chat_sdk/shared/parser_spike/README.md b/src/chat_sdk/shared/parser_spike/README.md
index 60bb4bd..eb5d691 100644
--- a/src/chat_sdk/shared/parser_spike/README.md
+++ b/src/chat_sdk/shared/parser_spike/README.md
@@ -181,3 +181,83 @@ Option C (selective parser-side fixes only, the original framing in
 the issue) leaves the streaming-side bugs from the #69 follow-up
 comment unaddressed and should be ruled out unless we ship it
 alongside a separate `_remend` fix.
+
+## Triggers to revisit this decision
+
+The chat-scoped Option A (PRs #99 + #101) is the right call **for the
+SDK's current scope** -- LLM output rendered into chat platforms. The
+moment the input source or rendering target changes, the spike data
+should be re-run with a workload-shaped fixture before deciding
+anything.
+
+Concrete triggers that should cause us to re-open this:
+
+- **A non-chat input surface lands.** The chat-scoped assumption is
+  "input comes from an LLM; humans don't write the markdown we parse."
+  That breaks the moment we start parsing markdown that humans (or
+  external corpora) authored:
+  - User-authored memory / notes / scratchpads stored in the SDK
+  - Ingestion of `*.md` files for RAG-style workflows
+  - Parsing incoming GitHub PR/issue bodies for structure extraction
+    (today the GitHub adapter mostly emits, not parses)
+  - Any "import markdown" public API
+  Human-authored content routinely uses setext, indented code,
+  footnotes, raw HTML, and multi-backtick spans -- exactly the gaps
+  the baseline silently drops.
+
+- **A long-form artifact output surface lands.** When agents start
+  emitting research-summary / report / document artifacts (not chat
+  messages), the workload shifts toward CommonMark fidelity:
+  - Footnotes for citations
+  - Math regions rendered (not just sanitised)
+  - Multi-backtick code spans for technical documentation
+  - Tables with richer cell content
+  Parsing for an artifact also happens once per document, not per
+  stream chunk -- which makes the 5-18× perf cost of Option B much
+  more tolerable than it is for streaming.
+
+- **A web rendering surface for chat-sdk-python.** Upstream added
+  `@chat-adapter/web` in v4.27.0 (a browser-side chat UI). It's
+  explicitly out of scope for chat-sdk-python today (see PR #83 sync
+  scope). If that ever ships in Python, the rendering target tolerates
+  richer markdown because the browser can display setext / footnotes /
+  HTML natively.
+
+- **A new chat platform that demands richer parsing.** Unlikely in
+  the near term -- the existing eight platforms all render a similar
+  CommonMark subset. But e.g. a platform with native footnote support
+  could surface a gap.
+
+### Upstream check (May 2026)
+
+Spot-checked `vercel/chat`'s `packages/` directory at the time of
+writing. The only relevant package besides the eight chat adapters and
+the core/state packages is **`adapter-web`** (added in v4.27.0, Python
+port deferred). No artifact-rendering, RAG, document-ingestion, or
+standalone markdown-rendering packages exist upstream. The triggers
+above are forward-looking -- none are imminent in upstream-tracked
+work.
+
+### Playbook for re-running
+
+When a trigger materialises:
+
+1. Author a fixture file under `tests/parser_spike/fixtures/` that
+   represents the new surface's actual content (not generic
+   CommonMark -- workload-shaped).
+2. Re-run `pytest tests/parser_spike/test_mdast_parity.py -s` and
+   `python scripts/parser_spike/benchmark.py`. Both pick up the new
+   fixture automatically if added to `conftest.py`.
+3. Compare the silent-drop count and benchmark numbers against the
+   chat-scoped findings above. The decision matrix shifts toward
+   Option B when:
+   - Silent-drop count is materially higher on the new fixture
+     (≥6 constructs that the new surface needs)
+   - Parse latency is one-shot rather than per-stream-chunk
+   - The team is OK adding a dependency to the runtime core
+4. If thresholds are met, promote `markdown-it-py` translator from
+   `parser_spike/` into runtime (it's the preferred candidate per
+   the spike data). Add `markdown-it-py` to the relevant extras
+   group (not `dependencies`, to preserve zero-dep core install for
+   chat-only consumers).
+

From 74d7c10447c69c669398ffac55dfe4340a30d81b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 23 May 2026 02:13:05 +0000
Subject: [PATCH 4/6] spike(parser): address PR #100 review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three medium-priority findings from gemini-code-assist:

1. **Unused `align` variable** in `_consume_table`. Removed the
   variable, its assignment, and the linter-quieting `_ = align`
   line. `header_aligns` already correctly carries the table-level
   alignment from the `<thead>` section.

2. **Fragile `|` separator for link metadata** in `_translate_inline`.
   The stack stored link href and title as a single string with `|`
   between them; a URL or title containing `|` would corrupt parsing.
   Replaced with a proper tuple-typed stack frame:
   `tuple[list[Content], tuple[str, str | None] | None]` -- the meta
   is `None` for plain containers (strong/emphasis/delete) and an
   `(href, title)` tuple for links. Removes the partition/format
   round-trip entirely.

3. **LOC heuristic mis-comment**. The previous comment said it
   subtracted docstrings but the implementation only filtered blank
   and comment lines. Replaced with a proper `ast`-based docstring
   exclusion that walks Module / ClassDef / FunctionDef bodies for
   `Expr(Constant(str))` first-statement patterns and excludes their
   line ranges. New LOC numbers (smaller, more honest):

   mistune        149 (was 161 — 12 lines were docstrings)
   markdown-it-py 194 (was 215 — 21 lines)
   marko          147 (was 152 — 5 lines)

   All still under the 250-LOC budget; conclusion unchanged. README
   updated with the new numbers.

Spike tests still pass (9/9), and the benchmark still runs.
---
 scripts/parser_spike/benchmark.py             | 42 +++++++++++++++++--
 src/chat_sdk/shared/parser_spike/README.md    | 12 +++---
 .../parser_spike/markdown_it_translator.py    | 36 ++++++++--------
 3 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/scripts/parser_spike/benchmark.py b/scripts/parser_spike/benchmark.py
index 31835bb..0596730 100644
--- a/scripts/parser_spike/benchmark.py
+++ b/scripts/parser_spike/benchmark.py
@@ -50,6 +50,15 @@ def _time_one(fn, text: str, iterations: int) -> list[float]:
 
 
 def _translator_loc() -> dict[str, int]:
+    """Count lines of code per translator, excluding blanks, line comments,
+    and docstrings.
+
+    The docstring exclusion uses ``ast`` to identify ``Expr(Constant(str))``
+    statements -- the canonical docstring shape -- so we don't over-count
+    multi-line docstrings as logic LOC against the 250-LOC budget.
+    """
+    import ast
+
     root = Path(__file__).resolve().parents[2] / "src" / "chat_sdk" / "shared" / "parser_spike"
     out = {}
     for name, path in [
@@ -58,9 +67,34 @@ def _translator_loc() -> dict[str, int]:
         ("marko", root / "marko_translator.py"),
     ]:
         text = path.read_text(encoding="utf-8")
-        code_lines = [line for line in text.splitlines() if line.strip() and not line.strip().startswith("#")]
-        # Subtract docstrings as a rough heuristic.
-        out[name] = len(code_lines)
+        # Identify docstring line ranges via AST: any Expr(Constant(str))
+        # immediately under a module, class, or function definition.
+        tree = ast.parse(text)
+        docstring_lines: set[int] = set()
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Module | ast.ClassDef | ast.FunctionDef | ast.AsyncFunctionDef):
+                continue
+            body = getattr(node, "body", None)
+            if not body:
+                continue
+            first = body[0]
+            if (
+                isinstance(first, ast.Expr)
+                and isinstance(first.value, ast.Constant)
+                and isinstance(first.value.value, str)
+            ):
+                end_lineno = first.end_lineno or first.lineno
+                docstring_lines.update(range(first.lineno, end_lineno + 1))
+
+        code_lines = 0
+        for lineno, line in enumerate(text.splitlines(), start=1):
+            stripped = line.strip()
+            if not stripped or stripped.startswith("#"):
+                continue
+            if lineno in docstring_lines:
+                continue
+            code_lines += 1
+        out[name] = code_lines
     return out
 
 
@@ -91,7 +125,7 @@ def main() -> None:
         p95 = timings[int(len(timings) * 0.95)]
         print(f"{name:<20} {median:>12.2f} {p95:>12.2f} {min(timings):>12.2f} {max(timings):>12.2f}")
 
-    print("\nTranslator LOC (excluding blank lines + line comments):")
+    print("\nTranslator LOC (excluding blank lines, line comments, and docstrings):")
     print("-" * 70)
     for name, loc in _translator_loc().items():
         budget_marker = " ✓" if loc < 250 else " ✗ (over 250-LOC budget)"
diff --git a/src/chat_sdk/shared/parser_spike/README.md b/src/chat_sdk/shared/parser_spike/README.md
index eb5d691..12a5a97 100644
--- a/src/chat_sdk/shared/parser_spike/README.md
+++ b/src/chat_sdk/shared/parser_spike/README.md
@@ -36,16 +36,16 @@ The baseline is **~5× faster** than mistune and markdown-it-py and
 **~18× faster** than marko. The 5ms acceptance criterion from issue #69
 is met by the baseline alone.
 
-### Translator LOC (excluding blank lines + line comments)
+### Translator LOC (excluding blank lines, line comments, and docstrings)
 
 | library         | LOC | 250-LOC budget |
 |-----------------|----:|----------------|
-| mistune         | 161 | ✓               |
-| markdown-it-py  | 215 | ✓               |
-| marko           | 152 | ✓               |
+| mistune         | 149 | ✓               |
+| markdown-it-py  | 194 | ✓               |
+| marko           | 147 | ✓               |
 
-All three fit comfortably. mistune and marko both come in under 165
-lines for the translator layer.
+All three fit comfortably. mistune and marko both come in under 150
+lines of logic for the translator layer.
 
 ### mdast fidelity on the happy path (`mixed_content.md`)
 
diff --git a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
index 1be2237..7fe1d75 100644
--- a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
+++ b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
@@ -134,7 +134,6 @@ def _consume_list(tokens: list[Token], i: int, end_type: str) -> tuple[list[Cont
 
 def _consume_table(tokens: list[Token], i: int) -> tuple[Content, int]:
     rows: list[Content] = []
-    align: list[str | None] = []
     in_header = False
     header_aligns: list[str | None] = []
     current_row: list[Content] = []
@@ -155,7 +154,6 @@ def _consume_table(tokens: list[Token], i: int) -> tuple[Content, int]:
             rows.append(make_table_row(current_row))
             if in_header:
                 header_aligns = current_aligns
-            align = current_aligns
         elif tok.type in ("th_open", "td_open"):
             style = (tok.attrs or {}).get("style", "")
             cell_align: str | None = None
@@ -172,26 +170,29 @@ def _consume_table(tokens: list[Token], i: int) -> tuple[Content, int]:
             i += 3  # th/td_open, inline, th/td_close
             continue
         i += 1
-    _ = align  # quiet unused-var without changing semantics
     return make_table(rows, align=header_aligns if any(header_aligns) else None), i
 
 
 def _translate_inline(tokens: list[Token]) -> list[Content]:
     out: list[Content] = []
-    stack: list[tuple[str, list[Content]]] = []
+    # Each stack frame holds (parent_list, meta) -- meta is None for plain
+    # containers (strong/emphasis/delete) and a (href, title) tuple for
+    # links. Using a tuple instead of pipe-stuffing a string sidesteps the
+    # fragility of URLs/titles that contain pipe characters.
+    stack: list[tuple[list[Content], tuple[str, str | None] | None]] = []
     current = out
 
-    def open_container(kind: str) -> None:
+    def open_container() -> None:
         nonlocal current
         new_children: list[Content] = []
-        stack.append((kind, current))  # type: ignore[arg-type]
+        stack.append((current, None))
         current = new_children
 
     def close_container(make: Any) -> None:
         nonlocal current
         kids = current
-        _kind, parent = stack.pop()
-        current = parent  # type: ignore[assignment]
+        parent, _meta = stack.pop()
+        current = parent
         current.append(make(kids))
 
     for tok in tokens:
@@ -205,30 +206,29 @@ def close_container(make: Any) -> None:
         elif t == "code_inline":
             current.append(make_inline_code(tok.content))
         elif t == "strong_open":
-            open_container("strong")
+            open_container()
         elif t == "strong_close":
             close_container(make_strong)
         elif t == "em_open":
-            open_container("emphasis")
+            open_container()
         elif t == "em_close":
             close_container(make_emphasis)
         elif t == "s_open":
-            open_container("delete")
+            open_container()
         elif t == "s_close":
             close_container(make_delete)
         elif t == "link_open":
             attrs = tok.attrs or {}
-            stack.append(("link", current))  # type: ignore[arg-type]
+            href = str(attrs.get("href", ""))
+            title = attrs.get("title")
             link_children: list[Content] = []
-            stack.append((f"_link_meta:{attrs.get('href', '')}|{attrs.get('title') or ''}", current))  # type: ignore[arg-type]
+            stack.append((current, (href, title)))
             current = link_children
         elif t == "link_close":
             kids = current
-            meta_kind, _ = stack.pop()
-            _, parent = stack.pop()
-            current = parent  # type: ignore[assignment]
-            payload = meta_kind.removeprefix("_link_meta:")
-            href, _, title = payload.partition("|")
+            parent, meta = stack.pop()
+            current = parent
+            href, title = meta if meta else ("", None)
             current.append(make_link(href, kids, title=title or None))
         elif t == "image":
             attrs = tok.attrs or {}

From d56fed83f791557fad2f8e7314da59b3edf0aaba Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 23 May 2026 15:09:25 +0000
Subject: [PATCH 5/6] spike(parser): skip parity tests when candidate libs
 absent (fix CI)

Holistic re-review blocker: the spike test module imported mistune /
markdown_it / marko at module top-level, but the default CI test job
runs `uv sync --group dev` (NOT `--group spike-parser`) then
`pytest tests/`. Pytest collected `tests/parser_spike/` and failed
with ModuleNotFoundError -- confirmed red on the PR (`test (3.12)`
failed, others cancelled by fail-fast).

Added `pytest.importorskip(...)` for all three candidate libraries at
the top of the test module, before the translator imports. In the
default dev-only env the module now skips cleanly (verified:
`uv sync --group dev` then `pytest tests/` -> 3,581 passed, 12
skipped, 0 collection errors); with the spike-parser group installed
it runs all 9 tests as before.

conftest.py only imports pathlib + pytest (no candidate libs), so it
needs no guard.
---
 tests/parser_spike/test_mdast_parity.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/parser_spike/test_mdast_parity.py b/tests/parser_spike/test_mdast_parity.py
index be7e005..df78d7e 100644
--- a/tests/parser_spike/test_mdast_parity.py
+++ b/tests/parser_spike/test_mdast_parity.py
@@ -17,12 +17,24 @@
 
 import pytest
 
-from chat_sdk.shared.markdown_parser import parse_markdown as baseline_parse
-from chat_sdk.shared.parser_spike.markdown_it_translator import (
+# The spike candidate libraries live in the optional `spike-parser`
+# dependency group, which CI's default `uv sync --group dev` does NOT
+# install. Skip the whole module (rather than erroring at collection)
+# when they're absent so the standard test job stays green.
+pytest.importorskip("mistune")
+pytest.importorskip("markdown_it")
+pytest.importorskip("marko")
+
+from chat_sdk.shared.markdown_parser import parse_markdown as baseline_parse  # noqa: E402
+from chat_sdk.shared.parser_spike.markdown_it_translator import (  # noqa: E402
     parse_markdown as markdown_it_parse,
 )
-from chat_sdk.shared.parser_spike.marko_translator import parse_markdown as marko_parse
-from chat_sdk.shared.parser_spike.mistune_translator import parse_markdown as mistune_parse
+from chat_sdk.shared.parser_spike.marko_translator import (  # noqa: E402
+    parse_markdown as marko_parse,
+)
+from chat_sdk.shared.parser_spike.mistune_translator import (  # noqa: E402
+    parse_markdown as mistune_parse,
+)
 
 CANDIDATES = [
     ("mistune", mistune_parse),

From e317741915fe00ca84d05bb920516ffa388f08d4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 28 May 2026 03:34:26 +0000
Subject: [PATCH 6/6] spike(parser): fix pyrefly errors in translators

Final-pass subagent review surfaced 3 pyrefly errors in the spike
translators that the lint workflow enforces (currently masked because
the PR is still draft and the lint job is draft-gated). Fixing them
so the PR is mergeable if the team decides to keep the spike rather
than close-and-archive.

Errors fixed:

1. `markdown_it_translator.py:225` (link_open). `attrs.get("title")`
   returned `float | int | str | None` per markdown-it's loose typing,
   but the stack frame is typed as `tuple[str, str | None]`. Coerce
   via `str(raw_title) if raw_title is not None else None` before
   stuffing into the tuple.

2. `markdown_it_translator.py:238` (image). Same root cause: title
   passed to `make_image(title=...)` (typed `str | None`) was the raw
   `attrs.get("title")` result. Same coercion.

3. `mistune_translator.py:56` (parse_markdown). mistune's `parse()`
   returns `list[dict | str]` -- the rare `str` token is a bare lazy
   text payload. The list comprehension fed it to `_translate_block`
   which expects `dict`. Replaced with an explicit loop that
   distinguishes the two cases: dict tokens go through the block
   translator; string tokens are lifted into `paragraph(text(...))`
   so they're not silently dropped.

`uv run pyrefly check`: 0 errors (was 3). All 9 spike tests pass
unchanged. Full suite: 3,590 pass, 1 pre-existing unrelated failure.
---
 .../parser_spike/markdown_it_translator.py       |  6 ++++--
 .../shared/parser_spike/mistune_translator.py    | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
index 7fe1d75..ddf5705 100644
--- a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
+++ b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py
@@ -220,7 +220,8 @@ def close_container(make: Any) -> None:
         elif t == "link_open":
             attrs = tok.attrs or {}
             href = str(attrs.get("href", ""))
-            title = attrs.get("title")
+            raw_title = attrs.get("title")
+            title: str | None = str(raw_title) if raw_title is not None else None
             link_children: list[Content] = []
             stack.append((current, (href, title)))
             current = link_children
@@ -233,7 +234,8 @@ def close_container(make: Any) -> None:
         elif t == "image":
             attrs = tok.attrs or {}
             url = attrs.get("src", "")
-            title = attrs.get("title")
+            raw_title = attrs.get("title")
+            title = str(raw_title) if raw_title is not None else None
             alt = tok.content  # markdown-it precomputes alt text
             current.append(make_image(str(url), alt=alt, title=title))
         elif t == "html_inline":
diff --git a/src/chat_sdk/shared/parser_spike/mistune_translator.py b/src/chat_sdk/shared/parser_spike/mistune_translator.py
index 498f846..017f905 100644
--- a/src/chat_sdk/shared/parser_spike/mistune_translator.py
+++ b/src/chat_sdk/shared/parser_spike/mistune_translator.py
@@ -53,8 +53,20 @@
 def parse_markdown(text: str) -> Root:
     """Parse *text* and return an mdast-compatible root node."""
     tokens, _state = _MD.parse(text)
-    children = [_translate_block(tok) for tok in tokens]
-    return make_root([c for c in children if c is not None])
+    # mistune's parse() return-type is `list[dict | str]` -- a bare str
+    # token is the rare lazy-text node that the public API stringifies
+    # directly. Narrow to dicts for the structural walker; lift any
+    # bare-string tokens into paragraph(text(...)) so they're not lost.
+    children: list[Content] = []
+    for tok in tokens:
+        if isinstance(tok, str):
+            if tok:
+                children.append(make_paragraph([make_text(tok)]))
+            continue
+        translated = _translate_block(tok)
+        if translated is not None:
+            children.append(translated)
+    return make_root(children)
 
 
 def _translate_block(tok: dict[str, Any]) -> Content | None: