diff --git a/pyproject.toml b/pyproject.toml index 97fb917..3726d91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,6 +112,14 @@ dev = [ "ruff>=0.4.0", "pyrefly==0.61.1", ] +# Spike for issue #69 Option B (parser-replacement bake-off). +# Three CommonMark/GFM libraries evaluated as candidates for replacing the +# hand-rolled `shared/markdown_parser.py`. Not a runtime dep -- spike only. +spike-parser = [ + "mistune>=3.0", + "markdown-it-py>=3.0", + "marko>=2.0", +] # --------------------------------------------------------------------------- # Pyrefly type checker configuration diff --git a/scripts/parser_spike/__init__.py b/scripts/parser_spike/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/parser_spike/benchmark.py b/scripts/parser_spike/benchmark.py new file mode 100644 index 0000000..0596730 --- /dev/null +++ b/scripts/parser_spike/benchmark.py @@ -0,0 +1,136 @@ +"""Parser-replacement spike benchmark. + +Measures parse-and-translate time for the four parsers (baseline + +three candidates) across a synthetic corpus scaled to ~10KB. Reports +median, p95, and per-construct cost so the Option B decision has +hard numbers to weigh. + +Run:: + + uv run python scripts/parser_spike/benchmark.py + +Acceptance criteria (per issue #69 follow-up): +- 10KB mixed-content document under 5ms median on CI hardware. +- Translator LOC under 250 per library. 
def _build_corpus(target_bytes: int = 10_240) -> str:
    """Return the fixture text repeated until it spans at least *target_bytes*.

    The fixture at ``FIXTURE_PATH`` is read once as UTF-8; whole copies are
    joined with ``"\\n"``, so the result usually overshoots the target by up
    to one fixture length (plus the separators).

    Raises:
        ValueError: if the fixture is empty while *target_bytes* is positive.
            Previously that combination looped forever because the running
            size never grew.
    """
    base = FIXTURE_PATH.read_text(encoding="utf-8")
    if target_bytes <= 0:
        return ""
    # Encode once: the per-copy size is loop-invariant.
    base_size = len(base.encode("utf-8"))
    if base_size == 0:
        raise ValueError(f"fixture {FIXTURE_PATH} is empty; cannot build a corpus")
    copies = -(-target_bytes // base_size)  # ceiling division
    return "\n".join([base] * copies)


def _time_one(fn, text: str, iterations: int) -> list[float]:
    """Call ``fn(text)`` *iterations* times and return each duration in ms.

    Durations are wall-clock deltas of ``time.perf_counter`` in call order;
    the return value of *fn* is discarded.
    """
    clock = time.perf_counter
    timings = []
    for _ in range(iterations):
        started = clock()
        fn(text)
        timings.append((clock() - started) * 1000.0)
    return timings


def _translator_loc() -> dict[str, int]:
    """Count lines of code per translator, excluding blanks, line comments,
    and docstrings.

    Docstrings are located with ``ast``: an ``Expr(Constant(str))`` that is
    the first statement of a module, class, or (async) function body. Their
    full line range is excluded so multi-line docstrings are not counted as
    logic against the 250-LOC budget.

    Returns:
        Mapping of library name to counted lines, in the fixed order
        mistune, markdown-it-py, marko.
    """
    import ast

    root = Path(__file__).resolve().parents[2] / "src" / "chat_sdk" / "shared" / "parser_spike"
    translators = [
        ("mistune", root / "mistune_translator.py"),
        ("markdown-it-py", root / "markdown_it_translator.py"),
        ("marko", root / "marko_translator.py"),
    ]
    # A tuple works with isinstance() on every supported Python; the ``A | B``
    # form needs 3.10+.
    scope_types = (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)

    out = {}
    for name, path in translators:
        text = path.read_text(encoding="utf-8")
        docstring_lines: set[int] = set()
        for node in ast.walk(ast.parse(text)):
            if not isinstance(node, scope_types):
                continue
            body = getattr(node, "body", None)
            if not body:
                continue
            first = body[0]
            if (
                isinstance(first, ast.Expr)
                and isinstance(first.value, ast.Constant)
                and isinstance(first.value.value, str)
            ):
                end_lineno = first.end_lineno or first.lineno
                docstring_lines.update(range(first.lineno, end_lineno + 1))

        code_lines = 0
        for lineno, line in enumerate(text.splitlines(), start=1):
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            if lineno in docstring_lines:
                continue
            code_lines += 1
        out[name] = code_lines
    return out
+ for fn in (baseline_parse, mistune_parse, markdown_it_parse, marko_parse): + for _ in range(3): + fn(corpus) + + iterations = 50 + print(f"Iterations per parser: {iterations}\n") + + print(f"{'parser':<20} {'median (ms)':>12} {'p95 (ms)':>12} {'min (ms)':>12} {'max (ms)':>12}") + print("-" * 70) + for name, fn in [ + ("baseline (hand)", baseline_parse), + ("mistune", mistune_parse), + ("markdown-it-py", markdown_it_parse), + ("marko", marko_parse), + ]: + timings = _time_one(fn, corpus, iterations) + timings.sort() + median = statistics.median(timings) + p95 = timings[int(len(timings) * 0.95)] + print(f"{name:<20} {median:>12.2f} {p95:>12.2f} {min(timings):>12.2f} {max(timings):>12.2f}") + + print("\nTranslator LOC (excluding blank lines, line comments, and docstrings):") + print("-" * 70) + for name, loc in _translator_loc().items(): + budget_marker = " ✓" if loc < 250 else " ✗ (over 250-LOC budget)" + print(f" {name:<20} {loc:>4} lines{budget_marker}") + + +if __name__ == "__main__": + main() diff --git a/src/chat_sdk/shared/parser_spike/README.md b/src/chat_sdk/shared/parser_spike/README.md new file mode 100644 index 0000000..12a5a97 --- /dev/null +++ b/src/chat_sdk/shared/parser_spike/README.md @@ -0,0 +1,263 @@ +# Parser-replacement spike (issue #69 Option B) + +This directory is **not part of the runtime SDK**. It exists so the +three candidate markdown libraries can be benchmarked and diffed +against the existing hand-rolled `shared/markdown_parser.py` in a +controlled way before any production code is touched. + +## How to run + +```bash +# Install spike dev deps (one-off) +uv sync --group dev --group spike-parser + +# Diff candidate mdast trees against the baseline +uv run pytest tests/parser_spike/test_mdast_parity.py -s + +# Run the benchmark + LOC report +uv run python scripts/parser_spike/benchmark.py +``` + +## Current results (sample run, local machine) + +Numbers will vary on CI hardware but the **relative ordering is stable** +across runs. 
+ +### Parse-and-translate time (12KB mixed corpus, 50 iterations) + +| parser | median | p95 | meets 5ms budget? | +|-------------------|--------:|--------:|-------------------| +| baseline (hand) | 2.59ms | 2.72ms | ✓ | +| mistune | 11.94ms | 13.04ms | ✗ (2.4× over) | +| markdown-it-py | 13.36ms | 20.64ms | ✗ (2.7× over) | +| marko | 46.62ms | 49.58ms | ✗ (9.3× over) | + +The baseline is **~5× faster** than mistune and markdown-it-py and +**~18× faster** than marko. The 5ms acceptance criterion from issue #69 +is met by the baseline alone. + +### Translator LOC (excluding blank lines, line comments, and docstrings) + +| library | LOC | 250-LOC budget | +|-----------------|----:|----------------| +| mistune | 149 | ✓ | +| markdown-it-py | 194 | ✓ | +| marko | 147 | ✓ | + +All three fit comfortably. mistune and marko both come in under 150 +lines of logic for the translator layer. + +### mdast fidelity on the happy path (`mixed_content.md`) + +Tested against a ≈3KB corpus of headings, tables, code blocks, lists, +links, images, blockquotes, emphasis — constructs the baseline parser +*does* handle. + +| library | divergences | +|-----------------|------------:| +| mistune | 26 | +| markdown-it-py | 24 | +| marko | 27 | + +**Important caveat**: of the ~25 divergences each candidate has, the +vast majority are cases where the **baseline diverges from the mdast +spec**, not where the candidate does. The most common patterns: + +- **Soft line breaks inside paragraphs / blockquotes**: candidates + emit `text + text("\n") + text` (per mdast spec); baseline merges + them into a single text node. +- **Inline link followed by text**: candidates emit + `link(...) + text(".")`; baseline emits a single trailing text node + for `link(...).` that drops the URL. +- **Trailing newline in fenced code values**: mistune and marko + preserve the trailing `\n`; baseline strips it. + +These are **structural improvements**, not regressions. 
Adopting any +of the candidates would also fix several baseline correctness bugs as +a side effect — albeit changing the mdast shape that downstream code +currently depends on. + +The one candidate-side bug surfaced was marko losing GFM table +alignment metadata (a translator fix; not investigated further in the +spike). + +### Completeness gap on hard constructs (`gap_cases.md`) + +The happy-path comparison above is **not the whole picture**: the +baseline parser is documented as not handling several CommonMark / GFM +constructs at all (see `docs/UPSTREAM_SYNC.md:442`). On those +constructs it silently flattens to `text` / `paragraph` nodes — the +same surface area issue #69 was opened to address. + +`fixtures/gap_cases.md` exercises six gap constructs. **Silent drop** +means the construct was parsed as ordinary text/paragraph; **recognised** +means the parser emitted the correct mdast node type. + +| construct | baseline | mistune | markdown-it-py | marko | +|-----------------------|-------------|------------|----------------|-------------| +| setext heading | silent drop | recognised | recognised | recognised | +| indented code block | silent drop | recognised | recognised | recognised | +| task list item | recognised¹ | silent drop| recognised | recognised | +| footnote definition | silent drop | silent drop| silent drop² | silent drop | +| inline HTML | silent drop | silent drop| silent drop | silent drop | +| definition list | silent drop | silent drop| silent drop | silent drop | +| **silent-drop count** | **5** | **4** | **3** | **3** | + +¹ Baseline matches `- [x]` as a list item but doesn't extract the +checkbox state. +² markdown-it-py supports footnotes via the `mdit-py-plugins` package +(not pulled in by the spike); enabling it would drop the silent-drop +count to 2. 
+ +**The baseline is strictly worse on completeness than every +candidate.** That's the half of the perf comparison the happy-path +numbers don't show: baseline runs faster partly because it does less +work per byte — setext headings, indented code, multi-backtick spans, +escaped chars, and raw HTML all skip straight through the inline +fast-paths instead of being parsed. + +## Implication for the Option A/B/C decision + +The spike data argues against a clean recommendation in either +direction: + +1. **Performance**: baseline wins at 2.59ms median vs 11-47ms for the + candidates. But that win is at least partly a function of doing + *less work per byte*: the baseline skips entire construct families + on the fast path, while the libraries fully tokenise them. Apples + to apples requires either teaching the baseline to handle setext + + indented code + escaped chars (Option A) and re-measuring, or + accepting that the perf gap pays for genuine completeness. + +2. **mdast fidelity on the happy path**: all three candidates are + roughly equivalent (24-27 minor divergences) and each closes some + baseline correctness bugs, mostly by moving toward spec compliance. + +3. **Completeness on hard constructs**: the baseline is strictly + worse than every candidate. It silently flattens setext, indented + code, multi-backtick spans, escaped chars, raw HTML, and definition + lists into plain text — the exact gap list issue #69 enumerated. + +4. **Translator LOC**: all under the 250-line budget. + +### Three options now, not two + +- **Option A (close baseline gaps in-tree)**: write parser code for + setext, indented code, escaped chars, multi-backtick spans (the + ones #69 listed as common in LLM output). Estimated ~300-400 LOC of + carefully-tested regex / state-machine work, plus the existing + parser keeps its 2.6ms perf. Doesn't address `_remend` gaps from the + issue #69 follow-up comment. 
+ +- **Option B (library swap)**: pay the 5× perf hit (10-15ms median) + for `mistune` or `markdown-it-py`, eat ~150-215 LOC of translator, + close the completeness gap *and* most `_remend` gaps in one motion. + **markdown-it-py is now the preferred candidate** (best + completeness score, only 1.5ms slower than mistune), with + `mdit-py-plugins` available for footnotes if needed later. mistune + is the runner-up. marko drops out on performance. + +- **Option D (split the problem)**: keep the fast hand-rolled parser + *and* close gaps in-tree (Option A), but separately port upstream + `remend` directly for the streaming side. Two efforts, two PRs, but + preserves perf while closing both bug classes. More total work than + Option B but no dependency added. + +### Recommendation + +The right answer depends on team priorities the spike can't answer: + +- **If 10ms median parse time is fine** (likely true for chat + streaming, where LLM token latency dwarfs this), **Option B with + markdown-it-py is the cleanest path**. One PR, one dep, both gap + lists close. +- **If we want zero-dep core preserved**, **Option D** is the only + path that keeps the install footprint small while closing both bug + classes. Highest total effort. +- **If neither perf nor zero-dep is sacred**, Option B still wins on + effort per fix delivered. + +Option C (selective parser-side fixes only, the original framing in +the issue) leaves the streaming-side bugs from the #69 follow-up +comment unaddressed and should be ruled out unless we ship it +alongside a separate `_remend` fix. + +## Triggers to revisit this decision + +The chat-scoped Option A (PRs #99 + #101) is the right call **for the +SDK's current scope** -- LLM output rendered into chat platforms. The +moment the input source or rendering target changes, the spike data +should be re-run with a workload-shaped fixture before deciding +anything. 
+ +Concrete triggers that should cause us to re-open this: + +- **A non-chat input surface lands.** The chat-scoped assumption is + "input comes from an LLM; humans don't write the markdown we parse." + That breaks the moment we start parsing markdown that humans (or + external corpora) authored: + - User-authored memory / notes / scratchpads stored in the SDK + - Ingestion of `*.md` files for RAG-style workflows + - Parsing incoming GitHub PR/issue bodies for structure extraction + (today the GitHub adapter mostly emits, not parses) + - Any "import markdown" public API + Human-authored content routinely uses setext, indented code, + footnotes, raw HTML, and multi-backtick spans -- exactly the gaps + the baseline silently drops. + +- **A long-form artifact output surface lands.** When agents start + emitting research-summary / report / document artifacts (not chat + messages), the workload shifts toward CommonMark fidelity: + - Footnotes for citations + - Math regions rendered (not just sanitised) + - Multi-backtick code spans for technical documentation + - Tables with richer cell content + Parsing for an artifact also happens once per document, not per + stream chunk -- which makes the 5-18× perf cost of Option B much + more tolerable than it is for streaming. + +- **A web rendering surface for chat-sdk-python.** Upstream added + `@chat-adapter/web` in v4.27.0 (a browser-side chat UI). It's + explicitly out of scope for chat-sdk-python today (see PR #83 sync + scope). If that ever ships in Python, the rendering target tolerates + richer markdown because the browser can display setext / footnotes / + HTML natively. + +- **A new chat platform that demands richer parsing.** Unlikely in + the near term -- the existing eight platforms all render a similar + CommonMark subset. But e.g. a platform with native footnote support + could surface a gap. + +### Upstream check (May 2026) + +Spot-checked `vercel/chat`'s `packages/` directory at the time of +writing. 
The only relevant package besides the eight chat adapters and +the core/state packages is **`adapter-web`** (added in v4.27.0, Python +port deferred). No artifact-rendering, RAG, document-ingestion, or +standalone markdown-rendering packages exist upstream. The triggers +above are forward-looking -- none are imminent in upstream-tracked +work. + +### Playbook for re-running + +When a trigger materialises: + +1. Author a fixture file under `tests/parser_spike/fixtures/` that + represents the new surface's actual content (not generic + CommonMark -- workload-shaped). +2. Re-run `pytest tests/parser_spike/test_mdast_parity.py -s` and + `python scripts/parser_spike/benchmark.py`. Add the fixture to `conftest.py` + for the parity test; repoint the benchmark's `FIXTURE_PATH` by hand. +3. Compare the silent-drop count and benchmark numbers against the + chat-scoped findings above. The decision matrix shifts toward + Option B when: + - Silent-drop count is materially higher on the new fixture + (≥6 constructs that the new surface needs) + - Parse latency is one-shot rather than per-stream-chunk + - The team is OK adding a dependency to the runtime core +4. If thresholds are met, promote the `markdown-it-py` translator from + `parser_spike/` into runtime (it's the preferred candidate per + the spike data). Add `markdown-it-py` to the relevant extras + group (not `dependencies`, to preserve zero-dep core install for + chat-only consumers). + diff --git a/src/chat_sdk/shared/parser_spike/__init__.py b/src/chat_sdk/shared/parser_spike/__init__.py new file mode 100644 index 0000000..598ec03 --- /dev/null +++ b/src/chat_sdk/shared/parser_spike/__init__.py @@ -0,0 +1,31 @@ +"""Parser-replacement spike for issue #69 Option B. 
+ +Three candidate libraries are evaluated as drop-in replacements for the +hand-rolled ``shared/markdown_parser.py``: + +- ``mistune`` (3.x) +- ``markdown-it-py`` (4.x) +- ``marko`` (2.x) + +Each gets a thin translator that converts the library's native token / +AST format into the mdast-compatible dict shape produced by +``shared.markdown_parser.parse_markdown``. The contract: same input +markdown should produce the same mdast tree across all four parsers +(the existing hand-rolled one + the three candidates), modulo +documented divergences. + +This module is NOT imported by the runtime SDK. It exists purely so +the bake-off harness in ``tests/parser_spike/`` and +``scripts/parser_spike/`` can exercise the candidates side-by-side +without touching production code paths. + +The decision criteria (per the issue #69 follow-up plan): + 1. mdast fidelity vs the existing parser on the fixture corpus + 2. Translator LOC (target: <250 per library) + 3. Parse-and-translate time (target: <5ms on 10KB mixed content) + 4. GFM coverage (tables, strikethrough, task lists) + 5. Extensibility surface for the gaps in #69 (setext, footnotes, + escaped chars, multi-backtick code spans, raw HTML, indented code) +""" + +from __future__ import annotations diff --git a/src/chat_sdk/shared/parser_spike/markdown_it_translator.py b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py new file mode 100644 index 0000000..ddf5705 --- /dev/null +++ b/src/chat_sdk/shared/parser_spike/markdown_it_translator.py @@ -0,0 +1,246 @@ +"""markdown-it-py (4.x) -> mdast translator. + +markdown-it tokenises into a flat list of ``Token`` objects (each with +``type``, ``tag``, ``content``, ``children``, ``markup``, ``attrs``, +``meta``). Block-level constructs use ``_open`` / ``_close`` pairs and +must be folded into a tree. Inline tokens (under ``inline`` parents) +are already nested. 
def parse_markdown(text: str) -> Root:
    """Parse *text* with markdown-it-py and return an mdast-compatible root.

    Uses the module-level ``_MD`` parser, which this file builds from the
    ``commonmark`` preset with the ``table`` and ``strikethrough`` rules
    enabled explicitly (not the ``gfm-like`` preset).
    """
    children, _ = _consume_blocks(_MD.parse(text), 0, end_type=None)
    return make_root(children)


def _consume_blocks(tokens: list[Token], i: int, end_type: str | None) -> tuple[list[Content], int]:
    """Fold the flat block-token stream starting at index *i* into mdast nodes.

    Walks until a token whose type equals *end_type* is met, or until the
    list is exhausted (always the case when *end_type* is ``None``).

    Returns:
        ``(children, next_index)``. *next_index* is the index just past the
        closing token, or ``len(tokens)`` if no closer was found.
    """
    children: list[Content] = []
    while i < len(tokens):
        tok = tokens[i]
        kind = tok.type
        if end_type is not None and kind == end_type:
            return children, i + 1

        if kind == "paragraph_open":
            # Consumed as an (open, inline, close) triple.
            children.append(make_paragraph(_translate_inline(tokens[i + 1].children or [])))
            i += 3
        elif kind == "heading_open":
            depth = int(tok.tag[1])  # "h1" -> 1, "h2" -> 2, ...
            children.append(make_heading(depth, _translate_inline(tokens[i + 1].children or [])))
            i += 3
        elif kind == "hr":
            children.append(make_thematic_break())
            i += 1
        elif kind == "fence":
            info = tok.info
            lang = info.split()[0] if info and info.strip() else None
            children.append(make_code(tok.content.rstrip("\n"), lang=lang))
            i += 1
        elif kind == "code_block":
            children.append(make_code(tok.content.rstrip("\n"), lang=None))
            i += 1
        elif kind == "html_block":
            # Keep raw block HTML visible as paragraph text instead of
            # dropping it: the inline walker already keeps ``html_inline``
            # as text, so dropping only the block form lost content.
            children.append(make_paragraph([make_text(tok.content.rstrip("\n"))]))
            i += 1
        elif kind == "blockquote_open":
            inner, i = _consume_blocks(tokens, i + 1, "blockquote_close")
            children.append(make_blockquote(inner))
        elif kind == "bullet_list_open":
            items, i = _consume_list(tokens, i + 1, "bullet_list_close")
            children.append(make_list(items, ordered=False))
        elif kind == "ordered_list_open":
            start = int((tok.attrs or {}).get("start", 1))
            items, i = _consume_list(tokens, i + 1, "ordered_list_close")
            children.append(make_list(items, ordered=True, start=start))
        elif kind == "table_open":
            table, i = _consume_table(tokens, i + 1)
            children.append(table)
        else:
            # Unknown / unhandled token: skip but don't crash.
            i += 1

    return children, i


def _consume_list(tokens: list[Token], i: int, end_type: str) -> tuple[list[Content], int]:
    """Collect the list items between index *i* and the *end_type* closer.

    Each ``list_item_open`` is delegated to ``_consume_blocks``; any other
    token before the closer is skipped.

    Returns:
        ``(items, next_index)`` with the same index convention as
        ``_consume_blocks``.
    """
    items: list[Content] = []
    while i < len(tokens):
        kind = tokens[i].type
        if kind == end_type:
            return items, i + 1
        if kind == "list_item_open":
            inner, i = _consume_blocks(tokens, i + 1, "list_item_close")
            items.append(make_list_item(inner))
        else:
            i += 1
    return items, i


def _consume_table(tokens: list[Token], i: int) -> tuple[Content, int]:
    """Build an mdast table from the tokens after ``table_open``.

    Column alignment is taken from the header row: each cell's ``style``
    attribute is searched for ``text-align:left|center|right``. ``align`` is
    passed as ``None`` when no header cell declares an alignment.

    Returns:
        ``(table, next_index)``. *next_index* is just past ``table_close``,
        or ``len(tokens)`` if the closer is missing.
    """
    rows: list[Content] = []
    in_header = False
    header_aligns: list[str | None] = []
    current_row: list[Content] = []
    current_aligns: list[str | None] = []

    while i < len(tokens):
        tok = tokens[i]
        kind = tok.type
        if kind == "table_close":
            i += 1
            break
        if kind in ("th_open", "td_open"):
            style = (tok.attrs or {}).get("style", "")
            cell_align: str | None = None
            if isinstance(style, str):
                cell_align = next(
                    (side for side in ("left", "center", "right") if f"text-align:{side}" in style),
                    None,
                )
            current_aligns.append(cell_align)
            current_row.append(make_table_cell(_translate_inline(tokens[i + 1].children or [])))
            i += 3  # th/td_open, inline, th/td_close
            continue
        if kind == "thead_open":
            in_header = True
        elif kind == "thead_close":
            in_header = False
        elif kind == "tr_open":
            current_row = []
            current_aligns = []
        elif kind == "tr_close":
            rows.append(make_table_row(current_row))
            if in_header:
                header_aligns = current_aligns
        i += 1

    return make_table(rows, align=header_aligns if any(header_aligns) else None), i
Using a tuple instead of pipe-stuffing a string sidesteps the + # fragility of URLs/titles that contain pipe characters. + stack: list[tuple[list[Content], tuple[str, str | None] | None]] = [] + current = out + + def open_container() -> None: + nonlocal current + new_children: list[Content] = [] + stack.append((current, None)) + current = new_children + + def close_container(make: Any) -> None: + nonlocal current + kids = current + parent, _meta = stack.pop() + current = parent + current.append(make(kids)) + + for tok in tokens: + t = tok.type + if t == "text": + current.append(make_text(tok.content)) + elif t == "softbreak": + current.append(make_text("\n")) + elif t == "hardbreak": + current.append(make_break()) + elif t == "code_inline": + current.append(make_inline_code(tok.content)) + elif t == "strong_open": + open_container() + elif t == "strong_close": + close_container(make_strong) + elif t == "em_open": + open_container() + elif t == "em_close": + close_container(make_emphasis) + elif t == "s_open": + open_container() + elif t == "s_close": + close_container(make_delete) + elif t == "link_open": + attrs = tok.attrs or {} + href = str(attrs.get("href", "")) + raw_title = attrs.get("title") + title: str | None = str(raw_title) if raw_title is not None else None + link_children: list[Content] = [] + stack.append((current, (href, title))) + current = link_children + elif t == "link_close": + kids = current + parent, meta = stack.pop() + current = parent + href, title = meta if meta else ("", None) + current.append(make_link(href, kids, title=title or None)) + elif t == "image": + attrs = tok.attrs or {} + url = attrs.get("src", "") + raw_title = attrs.get("title") + title = str(raw_title) if raw_title is not None else None + alt = tok.content # markdown-it precomputes alt text + current.append(make_image(str(url), alt=alt, title=title)) + elif t == "html_inline": + current.append(make_text(tok.content)) + else: + if tok.content: + 
def parse_markdown(text: str) -> Root:
    """Parse *text* with the module-level marko parser and return an mdast root.

    Top-level nodes that ``_translate`` maps to ``None`` are omitted.
    """
    return make_root(_block_children(_MD.parse(text)))


def _translate(node: object) -> Content | None:
    """Map one marko block node to its mdast equivalent.

    Dispatch is by class name, so no marko classes need importing here.

    Returns:
        The mdast node, or ``None`` for blank lines and for any block type
        this translator does not recognise (those are dropped, not
        stringified).
    """
    cls = type(node).__name__

    if cls == "Paragraph":
        return make_paragraph(_inline_children(node))
    if cls in ("Heading", "SetextHeading"):
        # ATX and setext headings both map to the same mdast heading.
        return make_heading(int(getattr(node, "level", 1)), _inline_children(node))
    if cls == "ThematicBreak":
        return make_thematic_break()
    if cls in ("FencedCode", "CodeBlock"):
        lang = getattr(node, "lang", None) or None
        return make_code(_gather_code_text(node), lang=lang)
    if cls == "Quote":
        return make_blockquote(_block_children(node))
    if cls == "List":
        ordered = bool(getattr(node, "ordered", False))
        start = int(getattr(node, "start", 1)) if ordered else 1
        return make_list(_block_children(node), ordered=ordered, start=start)
    if cls == "ListItem":
        return make_list_item(_block_children(node))
    if cls == "Table":
        return _translate_table(node)
    if cls == "BlankLine":
        return None
    if cls == "HTMLBlock":
        # Raw HTML is kept as plain paragraph text rather than dropped.
        return make_paragraph([make_text(getattr(node, "body", "") or "")])
    # Unrecognised block type: dropped.
    return None


def _block_children(node: object) -> list[Content]:
    """Translate *node*'s block children, omitting those that map to ``None``."""
    translated = (_translate(child) for child in getattr(node, "children", []) or [])
    return [item for item in translated if item is not None]


def _inline_children(node: object) -> list[Content]:
    """Translate *node*'s inline children into a flat list of mdast nodes.

    A plain-string ``children`` payload becomes a single text node. Inline
    translators may return one node, a list of nodes, or ``None``; lists are
    spliced in and ``None`` is skipped.
    """
    children = getattr(node, "children", None)
    if isinstance(children, str):
        return [make_text(children)]

    out: list[Content] = []
    for child in children or []:
        translated = _translate_inline(child)
        if translated is None:
            continue
        if isinstance(translated, list):
            out.extend(translated)
        else:
            out.append(translated)
    return out
def _table_alignment(node: object) -> list[str | None]:
    """Return per-column alignment for a marko table node.

    A table-level ``alignment`` list is used when present. Otherwise the
    alignment is read from the ``align`` attribute of the first row's cells.
    NOTE(review): this assumes marko's GFM ``TableCell`` exposes ``align`` --
    confirm against the installed marko version.

    Values other than ``"left"``, ``"center"``, ``"right"`` become ``None``.
    """
    declared = list(getattr(node, "alignment", []) or [])
    if not declared:
        rows = getattr(node, "children", []) or []
        if rows:
            header_cells = getattr(rows[0], "children", []) or []
            declared = [getattr(cell, "align", None) for cell in header_cells]
    return [a if a in ("left", "center", "right") else None for a in declared]


def _translate_table(node: object) -> Content:
    """Translate a marko GFM table node into an mdast table.

    Every child of *node* is treated as a row and every child of a row as a
    cell. ``align`` is passed as ``None`` when no column declares an
    alignment.
    """
    align = _table_alignment(node)
    rows: list[Content] = []
    for row in getattr(node, "children", []) or []:
        cells: list[Content] = [
            make_table_cell(_inline_children(cell)) for cell in getattr(row, "children", []) or []
        ]
        rows.append(make_table_row(cells))
    return make_table(rows, align=align if any(align) else None)


def _extract_text(node: object) -> str:
    """Return the concatenated plain text beneath *node*.

    A ``RawText`` node yields its string payload (or ``""`` if the payload is
    not a string). Any other node yields its string ``children`` directly, or
    the recursive concatenation of its child nodes.
    """
    children = getattr(node, "children", None)
    if type(node).__name__ == "RawText":
        return children if isinstance(children, str) else ""
    if isinstance(children, str):
        return children
    return "".join(_extract_text(child) for child in children or [])


def _gather_code_text(node: object) -> str:
    """Return the source text of a code node.

    The payload is either a plain string or a list of child nodes whose text
    is concatenated in order.
    """
    children = getattr(node, "children", None)
    if isinstance(children, str):
        return children
    # Per child this gives the same result as the old hand-rolled loop:
    # string payloads are returned as-is, everything else is flattened.
    return "".join(_extract_text(child) for child in children or [])
+""" + +from __future__ import annotations + +from typing import Any + +import mistune + +from chat_sdk.shared.markdown_parser import ( + Content, + Root, + make_blockquote, + make_break, + make_code, + make_delete, + make_emphasis, + make_heading, + make_image, + make_inline_code, + make_link, + make_list, + make_list_item, + make_paragraph, + make_root, + make_strong, + make_table, + make_table_cell, + make_table_row, + make_text, + make_thematic_break, +) + +# Single shared parser instance (mistune parsers are stateless after creation). +_MD = mistune.create_markdown( + renderer=None, + plugins=["table", "strikethrough", "task_lists", "url"], +) + + +def parse_markdown(text: str) -> Root: + """Parse *text* and return an mdast-compatible root node.""" + tokens, _state = _MD.parse(text) + # mistune's parse() return-type is `list[dict | str]` -- a bare str + # token is the rare lazy-text node that the public API stringifies + # directly. Narrow to dicts for the structural walker; lift any + # bare-string tokens into paragraph(text(...)) so they're not lost. 
+ children: list[Content] = [] + for tok in tokens: + if isinstance(tok, str): + if tok: + children.append(make_paragraph([make_text(tok)])) + continue + translated = _translate_block(tok) + if translated is not None: + children.append(translated) + return make_root(children) + + +def _translate_block(tok: dict[str, Any]) -> Content | None: + t = tok.get("type") + if t == "blank_line": + return None + if t == "paragraph": + return make_paragraph(_translate_inline_children(tok)) + if t == "heading": + depth = int(tok.get("attrs", {}).get("level", 1)) + return make_heading(depth, _translate_inline_children(tok)) + if t == "thematic_break": + return make_thematic_break() + if t == "block_code": + attrs = tok.get("attrs", {}) or {} + info = attrs.get("info") + lang = info.split()[0] if isinstance(info, str) and info.strip() else None + return make_code(tok.get("raw", ""), lang=lang) + if t == "block_quote": + children = [_translate_block(c) for c in tok.get("children", [])] + return make_blockquote([c for c in children if c is not None]) + if t == "list": + attrs = tok.get("attrs", {}) or {} + ordered = bool(attrs.get("ordered")) + start = int(attrs.get("start", 1)) if ordered else 1 + items = [_translate_block(c) for c in tok.get("children", [])] + items = [c for c in items if c is not None] + return make_list(items, ordered=ordered, start=start) + if t == "list_item": + children = [_translate_block(c) for c in tok.get("children", [])] + return make_list_item([c for c in children if c is not None]) + if t == "block_text": + # Loose-list paragraph payload; mistune emits raw inline text. + return make_paragraph(_translate_inline_children(tok)) + if t == "table": + return _translate_table(tok) + # Unknown block: render as a paragraph carrying its raw text so we + # don't silently drop content. The bake-off harness will flag this. 
+ raw = tok.get("raw", "") + if raw: + return make_paragraph([make_text(raw)]) + return None + + +def _translate_table(tok: dict[str, Any]) -> Content: + rows: list[Content] = [] + align: list[str | None] = [] + for child in tok.get("children", []): + ctype = child.get("type") + if ctype == "table_head": + cells, head_align = _translate_table_row(child) + rows.append(make_table_row(cells)) + align = head_align + elif ctype == "table_body": + for row in child.get("children", []): + if row.get("type") == "table_row": + cells, _ = _translate_table_row(row) + rows.append(make_table_row(cells)) + return make_table(rows, align=align if any(align) else None) + + +def _translate_table_row(row: dict[str, Any]) -> tuple[list[Content], list[str | None]]: + cells: list[Content] = [] + aligns: list[str | None] = [] + for cell in row.get("children", []): + if cell.get("type") not in ("table_cell",): + continue + attrs = cell.get("attrs", {}) or {} + align_val = attrs.get("align") + aligns.append(align_val if align_val in ("left", "center", "right") else None) + cells.append(make_table_cell(_translate_inline_children(cell))) + return cells, aligns + + +def _translate_inline_children(tok: dict[str, Any]) -> list[Content]: + children = tok.get("children") + if children is None: + # mistune defers inline parsing for some tokens (e.g. headings + # built from setext logic). Parse the raw text now. 
+ raw = tok.get("raw", "") + if not raw: + return [] + children = _MD.inline.parse(raw, mistune.BlockState()) # type: ignore[arg-type] + out: list[Content] = [] + for child in children or []: + translated = _translate_inline(child) + if translated is not None: + out.extend(translated) if isinstance(translated, list) else out.append(translated) + return out + + +def _translate_inline(tok: dict[str, Any]) -> Content | list[Content] | None: + t = tok.get("type") + if t == "text": + return make_text(tok.get("raw", "")) + if t == "softbreak": + return make_text("\n") + if t == "linebreak": + return make_break() + if t == "codespan": + return make_inline_code(tok.get("raw", "")) + if t in ("strong", "emphasis", "delete", "strikethrough"): + kids = _translate_inline_children(tok) + if t == "strong": + return make_strong(kids) + if t == "emphasis": + return make_emphasis(kids) + return make_delete(kids) + if t == "link": + attrs = tok.get("attrs", {}) or {} + url = attrs.get("url", "") + title = attrs.get("title") + return make_link(url, _translate_inline_children(tok), title=title) + if t == "image": + attrs = tok.get("attrs", {}) or {} + url = attrs.get("url", "") + title = attrs.get("title") + # mistune nests alt as inline children; flatten to plain string. + alt = "".join(_extract_text(c) for c in tok.get("children", []) or []) + return make_image(url, alt=alt, title=title) + if t == "inline_html": + # mdast has html nodes; the existing hand-rolled parser doesn't + # emit them. Surface as plain text for parity with the baseline. 
+ return make_text(tok.get("raw", "")) + raw = tok.get("raw") + if raw: + return make_text(raw) + return None + + +def _extract_text(node: dict[str, Any]) -> str: + if node.get("type") == "text": + return node.get("raw", "") + children = node.get("children") or [] + return "".join(_extract_text(c) for c in children) diff --git a/tests/parser_spike/__init__.py b/tests/parser_spike/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/parser_spike/conftest.py b/tests/parser_spike/conftest.py new file mode 100644 index 0000000..05d54f0 --- /dev/null +++ b/tests/parser_spike/conftest.py @@ -0,0 +1,25 @@ +"""Shared fixtures for the parser-replacement spike harness.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +FIXTURE_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture(scope="session") +def mixed_content_markdown() -> str: + return (FIXTURE_DIR / "mixed_content.md").read_text(encoding="utf-8") + + +@pytest.fixture(scope="session") +def gap_cases_markdown() -> str: + """Constructs the hand-rolled parser explicitly doesn't support + (setext headings, footnotes, escaped chars, multi-backtick spans, + raw HTML, indented code blocks, math, task lists, autolinks, + definition lists). Used to measure the *completeness* gap, not + just the structural-equivalence gap. + """ + return (FIXTURE_DIR / "gap_cases.md").read_text(encoding="utf-8") diff --git a/tests/parser_spike/fixtures/gap_cases.md b/tests/parser_spike/fixtures/gap_cases.md new file mode 100644 index 0000000..18678e6 --- /dev/null +++ b/tests/parser_spike/fixtures/gap_cases.md @@ -0,0 +1,54 @@ +Setext H1 underline +=================== + +Setext H2 underline +------------------- + +Indented code block (4-space): + + def hello(): + return "world" + +A paragraph with escaped \*asterisks\* and escaped \[brackets\] and a +literal backslash \\ in it. + +A footnote reference[^1] in running text. + +[^1]: This is the footnote body. 
+ +Multi-backtick inline code: ``some `quoted` code`` and triple ```backticks +with ``double`` inside```. + +Raw HTML block: + +
This is HTML, not markdown.
+