diff --git a/Cargo.lock b/Cargo.lock index 190a163b..00f5ccd2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -421,7 +421,8 @@ dependencies = [ "regex", "rstest", "tempfile", - "unicode-width", + "textwrap", + "unicode-width 0.1.14", ] [[package]] @@ -785,6 +786,12 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smawk" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" + [[package]] name = "string_cache" version = "0.8.9" @@ -857,6 +864,17 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" +[[package]] +name = "textwrap" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width 0.2.2", +] + [[package]] name = "toml_datetime" version = "0.6.11" @@ -880,12 +898,24 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-linebreak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" + [[package]] name = "unicode-width" version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = 
"utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index bbb81b3e..89255a92 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ once_cell = "1" rayon = "1.11" html5ever = "0.27" markup5ever_rcdom = "0.3" +textwrap = "0.16.2" unicode-width = "0.1" diff --git a/README.md b/README.md index 23ad0f67..e9812d2d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ list items to 80 columns. Hyphenated words are treated as indivisible during wrapping, so `very-long-word` will move to the next line intact rather than split at the -hyphen. The tool ignores fenced code blocks and respects escaped pipes (`\|`), +hyphen. The wrap engine now delegates line fitting to the `textwrap` crate +while preserving Markdown-aware token grouping for inline code, links, and hard +breaks. The tool ignores fenced code blocks and respects escaped pipes (`\|`), making it safe to use on Markdown with mixed content. ## Installation diff --git a/docs/adrs/0002-textwrap-wrapping-engine.md b/docs/adrs/0002-textwrap-wrapping-engine.md new file mode 100644 index 00000000..e149a5d9 --- /dev/null +++ b/docs/adrs/0002-textwrap-wrapping-engine.md @@ -0,0 +1,64 @@ +# Architecture Decision Record (ADR) 0002: Delegate line fitting to `textwrap` + +- Status: Accepted +- Date: 2026-04-22 + +## Context + +The previous wrapping engine in `src/wrap/line_buffer.rs` implemented a bespoke +`LineBuffer` struct that accumulated tokens, tracked a split-point cursor, and +flushed completed lines one at a time. This approach had three compounding +problems: + +- Width measurement was byte-based in early versions, producing incorrect splits + for non-ASCII characters such as CJK glyphs and emoji. +- The split-with-carry logic required carefully coordinated state between + `push_span`, `split_with_span`, and `flush_trailing_whitespace`, making the + code difficult to reason about and extend. 
+- Each fragment addition triggered a full re-evaluation of the buffer, risking + quadratic behaviour on long paragraphs. + +## Decision + +Replace `LineBuffer` with `textwrap::wrap_algorithms::wrap_first_fit` and a +fragment model built on the `textwrap::core::Fragment` trait. Each token group +becomes an `InlineFragment` that carries pre-computed display width (via +`unicode-width`) and a `FragmentKind` tag. `wrap_first_fit` performs greedy +line fitting over the fragment slice; post-processing in +`src/wrap/inline/postprocess.rs` normalizes whitespace-only lines and +rebalances atomic tails. Prefix handling is centralized in +`ParagraphWriter::wrap_with_prefix`, which computes available width once and +prepends the correct prefix to every wrapped output line. + +The greedy first-fit algorithm is chosen over `textwrap`'s optimal-fit +algorithm because the optimal algorithm may produce non-local changes to +earlier lines when a later fragment is added, which conflicts with the +incremental buffer model and produces surprising diffs. + +## Consequences + +Positive: + +- Line fitting is delegated to a well-tested upstream crate; the bespoke split + logic and `LineBuffer` state machine are removed entirely. +- Display widths are computed by `unicode-width` according to Unicode Standard + Annex `#11`, giving correct column counts for non-ASCII text. +- `InlineFragment::kind` centralizes token classification, so post-processing + predicates (`is_whitespace`, `is_atomic`, `is_plain`) do not repeat + classification logic. + +Negative: + +- Greedy first-fit produces wider first lines than optimal-fit would in some + cases, though this difference is not visible in standard Markdown prose. +- The project now depends on `textwrap 0.16` in addition to `unicode-width`. 
+ +## Alternatives considered + +- **Optimal-fit algorithm** (`textwrap::wrap_algorithms::wrap_optimal_fit`): + rejected because it requires the complete fragment list upfront and may + redistribute earlier lines when later fragments are added, which conflicts + with the streaming model. +- **Patching `LineBuffer` for Unicode correctness**: rejected because the + split-point cursor and carry semantics remained inherently fragile; the + maintenance burden outweighed the risk of introducing a new dependency. diff --git a/docs/architecture.md b/docs/architecture.md index a25c6cfc..daca369f 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,8 +36,9 @@ The function combines several helpers documented in `docs/`: - `html::convert_html_tables` transforms basic HTML tables into Markdown so \ they can be reflowed like regular tables. See \ [HTML table support](#html-table-support-in-mdtablefix). -- `wrap::wrap_text` applies optional line wrapping. It relies on the - `unicode-width` crate for accurate character widths. +- `wrap::wrap_text` applies optional line wrapping. It classifies Markdown + block structure locally and delegates greedy line fitting to the `textwrap` + crate over Markdown-aware fragments measured with `unicode-width`. - `wrap::tokenize_markdown` emits `Token` values for custom processing. - `headings::convert_setext_headings` rewrites Setext headings with underline markers into ATX headings when the CLI `--headings` flag is provided. The @@ -374,35 +375,113 @@ module handles filesystem operations, delegating the text processing to ### Tokenizer flow -The inline tokenizer iterates over the source string lazily, so no duplicate -`Vec` representation is required. The following diagram summarizes the -control flow, highlighting the helpers touched during whitespace, code span, -and link handling. +The inline tokenizer still iterates over the source string lazily, so no +duplicate `Vec` representation is required. 
The resulting tokens are then +grouped into Markdown-aware fragments and passed to +`textwrap::wrap_algorithms::wrap_first_fit`, which chooses the breakpoints +without splitting code spans, links, or punctuation groups. ```mermaid flowchart TD - A["Input text (&str)"] --> B["Initialize tokens Vec"] - B --> C["Iterate over text by byte index"] - C --> D{"Current char is whitespace?"} - D -- Yes --> E["scan_while for whitespace"] - E --> F["collect_range and push token"] - D -- No --> G{"Current char is '`'?"} - G -- Yes --> H["Check backslash escape (has_odd_backslash_escape_bytes)"] - H -- Escaped --> I["Push '`' as token"] - H -- Not escaped --> J["scan_while for code fence"] - J --> K["Find closing fence, collect_range and push token"] - G -- No --> L{"Current char is '[' or '!['?"} - L -- Yes --> M["parse_link_or_image"] - M --> N["Push link/image token"] - N --> O["scan_while for trailing punctuation"] - O --> P["collect_range and push punctuation token"] - L -- No --> Q["scan_while for non-whitespace/non-` chars"] - Q --> R["collect_range and push token"] - F & I & K & P & R --> S["Continue iteration"] - S --> C - C -->|End| T["Return tokens Vec"] + A["Input text (&str)"] --> B["Tokenize into whitespace and inline Markdown tokens"] + B --> C["Group tokens into Markdown-aware fragments"] + C --> D["Measure fragment widths with unicode-width"] + D --> E["Run textwrap wrap_first_fit over current fragments"] + E --> F["Merge whitespace-only continuation lines forward"] + F --> G["Render wrapped lines, trimming only a single trailing separator space"] ``` +Figure: Wrap-tokenizer flow. Starting from an input string, the wrapper emits +whitespace and inline Markdown tokens, groups them into fragments, measures +their display widths with `unicode-width`, feeds them through +`textwrap::wrap_algorithms::wrap_first_fit`, and then reconstructs wrapped +lines while preserving Markdown-aware spacing rules. 
+ +### Wrap flow + +The higher-level `wrap_text` entry point combines block classification, +paragraph buffering, prefix-aware wrapping, and inline line fitting. The +following flow shows how a line moves through those stages before it is either +preserved verbatim or emitted as wrapped output. + +```mermaid +flowchart TD + A[Start: wrap_text called with lines and width] --> B{Classify line} + + B -->|Fenced or indented code block| C[Preserve line verbatim] + B -->|Table or heading or directive| C + B -->|Blank line| D[Flush active paragraph and emit blank] + B -->|Paragraph or prefixed line| E[Send to ParagraphWriter] + + E --> F{Has prefix such as bullet, blockquote, footnote} + F -->|Yes| G[wrap_with_prefix computes display width using unicode-width] + F -->|No| H[wrap_preserving_code wraps inline content] + + G --> I[fragment-building / post-process helpers] + H --> I + + I --> J[textwrap::wrap_algorithms::wrap_first_fit performs line breaking] + J --> K[Reconstruct wrapped lines with prefixes and preserved spans] + K --> L[Emit wrapped lines to wrap_text] + + C --> M[Append line to output] + D --> M + L --> M + + M --> N{More input lines?} + N -->|Yes| B + N -->|No| O[Flush remaining paragraph and finish] +``` + +Figure: `wrap_text` control flow. The wrapper classifies each incoming line, +passes fenced blocks, tables, headings, directives, and indented code through +unchanged, flushes paragraphs on blanks, routes prose and prefixed lines +through `ParagraphWriter`, computes visible widths with `unicode-width`, and +delegates inline line fitting to `textwrap` before reconstructing the emitted +Markdown lines. + +### Wrap sequence + +The following sequence diagram focuses on the runtime collaboration between the +CLI entry point, `wrap_text`, `ParagraphWriter`, the inline wrapper, and +`textwrap` while a paragraph is being processed. 
+ +```mermaid +sequenceDiagram + participant CLI as mdtablefix_CLI + participant WT as wrap_text + participant PW as ParagraphWriter + participant WP as wrap_preserving_code + participant IH as inline.rs_helpers + participant TW as textwrap::wrap_first_fit + + CLI->>WT: wrap_text(lines, width) + loop For each classified paragraph line + WT->>PW: handle_prefix_line / flush_paragraph + alt Prefixed or plain paragraph content + PW->>WP: wrap_preserving_code(text, width) + WP->>IH: build_fragments + merge/rebalance + IH->>TW: wrap_first_fit(fragments, line_widths) + TW-->>IH: wrapped_fragment_groups + IH-->>WP: wrapped_lines_with_spans + WP-->>PW: wrapped_lines_with_prefixes + PW-->>WT: wrapped_lines + WT-->>CLI: append wrapped output + else Nonwrappable line + PW-->>WT: push_verbatim / original_line + WT-->>CLI: append original output + end + end + WT-->>CLI: return final wrapped text +``` + +Figure: `wrap_text` sequence flow. The CLI calls `wrap_text`, which delegates +paragraph handling to `ParagraphWriter`; wrappable paragraph content then flows +through `wrap_preserving_code`, the fragment-building and post-processing +helpers in `src/wrap/inline.rs`, and the underlying `textwrap` engine before +wrapped lines return through the same stack to the CLI, while nonwrappable +lines bypass the inline wrapping path and are emitted unchanged. + The helper `html_table_to_markdown` is retained for backward compatibility but is deprecated. New code should call `convert_html_tables` instead. @@ -444,8 +523,9 @@ sequenceDiagram `mdtablefix` wraps paragraphs and list items while respecting the display width of Unicode characters. The `unicode-width` crate is used to compute the width -of strings when deciding where to break lines. This prevents emojis or other -multibyte characters from causing unexpected wraps or truncation. +of prefixes and Markdown-aware wrapping fragments before `textwrap` performs +line fitting. 
This prevents emojis or other multibyte characters from causing +unexpected wraps or truncation. Whenever wrapping logic examines the length of a token, it relies on `UnicodeWidthStr::width` to measure visible columns rather than byte length. diff --git a/docs/developers-guide.md b/docs/developers-guide.md index d4ebe807..1c939aa8 100644 --- a/docs/developers-guide.md +++ b/docs/developers-guide.md @@ -88,3 +88,71 @@ The rationale for the staged table reflow pipeline is recorded in `docs/adrs/0001-table-reflow-pipeline.md`. Refer to that ADR when changing the parse, width-calculation, or separator-handling flow so implementation changes stay aligned with the documented design constraints. + +## Wrap module architecture + +The wrapping pipeline for `--wrap` is: + +1. **Block classification.** `classify_block` in `src/wrap.rs` inspects each + input line and decides whether it should pass through verbatim or enter the + paragraph wrapper. Fenced code blocks, indented code blocks, headings, + tables, directives, and blank lines stop paragraph accumulation. + +2. **Prefix-aware paragraph handling.** `ParagraphWriter` in + `src/wrap/paragraph.rs` is the single entry point for prefix-aware wrapping. + `wrap_with_prefix` computes the available content width once from the + Unicode display width of the first-line prefix, then feeds the paragraph + text into `wrap_preserving_code`. + +3. **Fragment construction and line fitting.** `wrap_preserving_code` in + `src/wrap/inline.rs` tokenizes prose with `tokenize::segment_inline`, groups + the tokens into `InlineFragment` values, and calls + `textwrap::wrap_algorithms::wrap_first_fit` over the accumulated fragment + buffer. + +4. **Post-processing and rendering.** The `postprocess` module applies + `merge_whitespace_only_lines` and then `rebalance_atomic_tails` so + whitespace-only wrap artefacts and isolated tails are normalized before the + fragments are rendered back into output lines. 
+ +`InlineFragment` carries the rendered fragment text, its precomputed display +width, and a `FragmentKind` tag. That construction-time classification lets the +`is_whitespace`, `is_atomic`, and `is_plain` predicates answer all later +questions without repeating ad hoc string inspection in the post-processing +passes. + +The `postprocess` module exists because greedy line fitting alone does not +reproduce the repository's historical whitespace semantics. The first pass +merges whitespace-only wrap lines into adjacent content, and the second pass +rebalances a trailing atomic or plain fragment only when the destination line +still fits within the configured width. + +### Key types and functions + +Table: Key types and functions. + +| Symbol | File | +| ------------------------------------------------------- | -------------------------------- | +| `FragmentKind`, `InlineFragment`, `classify_fragment` | `src/wrap/inline.rs` | +| `build_fragments`, `wrap_preserving_code` | `src/wrap/inline.rs` | +| `merge_whitespace_only_lines`, `rebalance_atomic_tails` | `src/wrap/inline/postprocess.rs` | +| `ParagraphWriter`, `wrap_with_prefix` | `src/wrap/paragraph.rs` | +| `ParagraphState`, `PrefixLine` | `src/wrap/paragraph.rs` | + +### Design constraints + +- **Public API stability.** `mdtablefix::wrap::wrap_text`, `Token`, and + `tokenize_markdown` must not change their signatures or observable behaviour. +- **Atomic fragments.** Inline code spans and Markdown links are never split + across lines; they move as a unit when they would overflow the target width. +- **Hard breaks.** Trailing two-space hard breaks must survive on the emitted + line where they occur. +- **Verbatim blocks.** Fenced code blocks must pass through unchanged, along + with the other non-paragraph block kinds detected by `classify_block`. 
+- **Prefix width.** The visual width of every prefix string is measured with + `UnicodeWidthStr::width` before the available text width is computed, so + non-ASCII prefix characters (e.g. `「` in CJK blockquotes) are accounted for + correctly. + +Refer to `docs/adrs/0002-textwrap-wrapping-engine.md` for the rationale behind +replacing `LineBuffer` with `textwrap`. diff --git a/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md b/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md new file mode 100644 index 00000000..fa06bd53 --- /dev/null +++ b/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md @@ -0,0 +1,469 @@ +# Replace bespoke wrapping with `textwrap` and `unicode-width` + +This ExecPlan (execution plan) is a living document. The sections +`Constraints`, `Tolerances`, `Risks`, `Progress`, `Surprises & Discoveries`, +`Decision Log`, and `Outcomes & Retrospective` must be kept up to date as work +proceeds. + +Status: COMPLETED + +## Purpose / big picture + +After this change, `mdtablefix --wrap` must keep the same observable wrapping +behaviour that users already rely on for paragraphs, list items, blockquotes, +footnotes, hard line breaks, and fenced or indented code blocks, but the +line-breaking engine should be driven by the `textwrap` crate instead of the +current bespoke span buffer in `src/wrap/inline.rs` and the duplicated prefix +logic in `src/wrap/paragraph.rs`. + +Observable success means three things. First, the existing active wrap-related +tests still pass unchanged. Second, the implementation no longer depends on the +custom `LineBuffer`-driven wrapping loop or the duplicated +`append_wrapped_with_prefix` and `handle_prefix_line` split. Third, the +documentation explains that wrapping now delegates width-sensitive line +breaking to `textwrap` together with `unicode-width`. + +This issue is a maintainability refactor, not a user-facing bug fix. 
The plan +therefore prioritizes behaviour preservation over aggressive deletion. If a +proposed simplification would change a public API or visible wrapping result, +the plan prefers a smaller, safer first delivery and a follow-up issue. + +## Constraints + +- Keep public CLI flags and the public `mdtablefix::wrap::wrap_text` entry + point stable. +- Treat `mdtablefix::wrap::Token` and + `mdtablefix::wrap::tokenize_markdown` as stable unless the user explicitly + approves a public API change. They are re-exported from `src/lib.rs` and are + used by `src/code_emphasis.rs`, `src/footnotes/mod.rs`, + `src/footnotes/renumber.rs`, and `src/textproc.rs`. +- Preserve the current observable behaviour covered by active tests in + `src/wrap/tests.rs`, `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, + `tests/wrap_renumber.rs`, `tests/cli.rs`, `tests/cli_frontmatter.rs`, + `tests/code_emphasis.rs`, and `tests/markdownlint.rs`. +- Do not rely on `tests/wrap/*.rs` for safety. Those files are currently + orphaned from Cargo test discovery and do not run unless they are promoted + into an active top-level target. +- Keep Unicode display-width handling based on `unicode-width`. Any helper that + computes indentation or available columns must use display width rather than + byte length. +- Add the `textwrap` dependency to `Cargo.toml` using implicit semver (no + leading `^`) that is compatible with the repository's Rust `1.89` minimum + version and existing 2024 edition build. +- Keep touched source files below the repository's 400-line soft ceiling. If + the refactor would make a file exceed that limit, split the code into a small + new module instead of extending the file. +- Update documentation that currently describes the bespoke wrapping engine: + at minimum `README.md`, `docs/architecture.md`, and `docs/trailing-spaces.md` + if the named helper or explanation changes. 
+ +## Tolerances (exception triggers) + +- Scope: if the implementation requires changes to more than 12 files or + roughly 500 net lines of code after cleanup, stop and re-evaluate whether the + work has grown beyond a maintainability refactor. +- Interfaces: if removing bespoke wrapping would require changing the public + `Token` type, `tokenize_markdown`, or the signature of `wrap_text`, stop and + escalate rather than smuggling in an API break. +- Behaviour: if a naive `textwrap` prototype fails more than three existing + active wrap regressions that cannot be fixed with a small shielding layer, + stop and document the cases before proceeding. +- Dependencies: if the chosen `textwrap` release requires a Rust version newer + than `1.89`, stop and choose a compatible release or ask for approval to + raise the minimum supported version. +- Complexity: if preserving inline code spans and Markdown links appears to + require rebuilding a custom tokenizer and buffer that is materially similar + to the current one, stop and reframe the work as a narrower prefix-handling + simplification plus a separate design review for the inline engine. +- Validation: if `make lint` or `make test` still fail after three focused fix + cycles, stop and capture the failing cases in `Decision Log`. + +## Risks + +- Risk: plain `textwrap::wrap` is likely to split Markdown inline code spans or + links that contain spaces, which would regress tests such as + `wrap_text_preserves_code_spans`, `wrap_text_multiple_code_spans`, and + `wrap_text_preserves_links`. Severity: high Likelihood: high Mitigation: + begin with a red test baseline and a deliberate prototype. If the prototype + fails, add a minimal shielding layer for inline code spans and links before + invoking `textwrap`, rather than reviving the full custom `LineBuffer` loop. + +- Risk: the issue text suggests removing `tokenize_markdown`, but that helper + is public and is used outside wrapping. 
Severity: high Likelihood: high + Mitigation: treat public tokenization as out of scope for the first delivery. + Remove only wrapping-specific internals that become dead after the refactor. + Record any remaining token API cleanup as a follow-up issue. + +- Risk: `src/wrap/paragraph.rs` currently mixes byte-length and display-width + calculations. A rushed refactor could accidentally preserve that bug or + introduce a different indentation drift for bullets, blockquotes, or + footnotes. Severity: medium Likelihood: medium Mitigation: centralize prefix + wrapping in one helper that computes prefix and indent widths once with + `UnicodeWidthStr::width`, then add focused tests for repeated prefixes and + continuation indentation. + +- Risk: the repository contains many wrap-focused tests under `tests/wrap/`, + but they are not active Cargo targets. Severity: medium Likelihood: high + Mitigation: do not assume they provide safety. Promote any critical cases + needed for the refactor into `src/wrap/tests.rs`, `tests/wrap_unit.rs`, or + another active top-level `tests/*.rs` target before changing the engine. + +- Risk: the architecture docs currently describe the bespoke tokenizer flow and + module relationships in detail. Severity: low Likelihood: high Mitigation: + update the docs as part of the same change, so the repository does not + advertise an implementation that no longer exists. + +## Progress + +- [x] (2026-04-22 00:00Z) Reviewed the current wrapping pipeline in + `src/wrap.rs`, `src/wrap/inline.rs`, `src/wrap/paragraph.rs`, and + `src/wrap/tokenize/`. +- [x] (2026-04-22 00:00Z) Confirmed that `unicode-width` is already present in + `Cargo.toml` and that `textwrap` is not yet a dependency. +- [x] (2026-04-22 00:00Z) Confirmed that `tokenize_markdown` is public and is + used by footnotes, text processing, and code-emphasis logic outside the wrap + engine. 
+- [x] (2026-04-22 00:00Z) Confirmed that `tests/wrap/*.rs` are orphaned from + Cargo integration discovery and must not be relied upon as active coverage. +- [x] (2026-04-22 00:00Z) Added active regressions for nested blockquote + prefix repetition, footnote continuation alignment, and checkbox indentation + in `src/wrap/tests.rs`. +- [x] (2026-04-22 00:00Z) Introduced `textwrap` and routed inline line fitting + through `textwrap::wrap_algorithms::wrap_first_fit`. +- [x] (2026-04-22 00:00Z) Replaced the bespoke line-buffer loop with + fragment-based wrapping and consolidated prefix handling in + `src/wrap/paragraph.rs`. +- [x] (2026-04-22 00:00Z) Removed the dead `src/wrap/line_buffer.rs` module + from the active wrap path. +- [x] (2026-04-22 00:00Z) Updated the affected documentation and passed + `make fmt`, `make check-fmt`, `make lint`, `make test`, `make markdownlint`, + and `make nixie`. +- [x] (2026-04-23 00:00Z) Tightened `rebalance_atomic_tails` so post-wrap + fragment moves no longer create overflow lines, reused the trailing buffer in + `wrap_preserving_code` instead of cloning the full prefix per fragment, and + added an active regression for the `a four five` / width `6` case. + +## Surprises & Discoveries + +- Observation: the user-facing issue text names `tokenize_markdown` as a + candidate for removal, but in this repository it is not just an internal wrap + helper. It is re-exported publicly from `src/lib.rs` and also feeds + `src/textproc.rs`, `src/footnotes/mod.rs`, `src/footnotes/renumber.rs`, and + `src/code_emphasis.rs`. Evidence: `src/lib.rs`, `src/textproc.rs`, + `src/footnotes/mod.rs`, `src/footnotes/renumber.rs`, and + `src/code_emphasis.rs`. Impact: the first implementation milestone should + replace the bespoke wrap engine without promising removal of the public token + API. + +- Observation: `tests/wrap/mod.rs` and the other files under `tests/wrap/` are + not active integration tests. 
Evidence: Cargo only discovers top-level + `tests/*.rs` files, and the active wrap targets are instead + `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, and the library tests in + `src/wrap/tests.rs`. Impact: any critical regression that currently exists + only in the nested directory must be promoted to an active harness before the + refactor. + +- Observation: `docs/architecture.md` explicitly documents the bespoke + tokenizer flow and module relationships, and `docs/trailing-spaces.md` + mentions `wrap_preserving_code` by name. Evidence: `docs/architecture.md` + sections "Module relationships", "Tokenizer flow", and "Unicode width + handling", plus `docs/trailing-spaces.md`. Impact: documentation updates are + part of the required work, not optional cleanup. + +- Observation: `textwrap`'s low-level `wrap_first_fit` API works better for + this migration than `textwrap::wrap` because it accepts pre-grouped fragments + with caller-defined widths. Evidence: `textwrap::core::Fragment` and + `textwrap::wrap_algorithms::wrap_first_fit`. Impact: the repository can keep + its existing Markdown-aware token grouping and whitespace preservation + without reviving the old mutable `LineBuffer`. + +- Observation: the post-processing pass that rebalances trailing atomic or + plain fragments can invalidate `wrap_first_fit`'s width guarantee if it moves + a fragment without rechecking the destination line width. Evidence: + `rebalance_atomic_tails` on 2026-04-23 could turn `a four` / `five` into `a` + / `four five` at width `6`. Impact: any heuristic that mutates fitted lines + after wrapping must be width-aware, or it can regress downstream layout + assumptions. + +## Decision Log + +- Decision: scope the first delivery to replacing the bespoke wrapping engine, + not the public token API. Rationale: `tokenize_markdown` is public and has + non-wrap consumers. Keeping that boundary stable preserves safety and lets + this issue land as the low-risk refactor it was labelled to be. 
Date/Author: + 2026-04-22 / Codex. + +- Decision: use `textwrap` only for line-breaking and keep the existing + high-level block classification in `wrap_text`. Rationale: fences, indented + code blocks, headings, tables, and hard line breaks are repository-specific + policy decisions that already live in `src/wrap.rs` and do not need to move + into a third-party crate. Date/Author: 2026-04-22 / Codex. + +- Decision: prove behaviour with active tests before deleting old internals. + Rationale: the repository contains inactive wrap tests, so moving directly to + clean up would create a false sense of safety. Date/Author: 2026-04-22 / + Codex. + +- Decision: use `textwrap::wrap_algorithms::wrap_first_fit` with custom + Markdown-aware fragments instead of `textwrap::wrap`. Rationale: + `wrap_first_fit` delegates line fitting to `textwrap` while allowing + `mdtablefix` to preserve leading carry whitespace, atomic code spans, and + display-width measurements on grouped fragments. That keeps the refactor + small without losing the behaviours guarded by the current tests. + Date/Author: 2026-04-22 / Codex. + +- Decision: treat post-wrap fragment rebalancing as width-constrained, and keep + fragment classification explicit on `InlineFragment`. Rationale: the + migration's shielding layer is still part of the line-fitting contract, so it + must not create lines that `wrap_first_fit` would have rejected. Recording + fragment kind once also makes the whitespace-carry and tail-rebalance passes + easier to audit than repeating ad hoc string classification in each branch. + Date/Author: 2026-04-23 / Codex. + +## Outcomes & Retrospective + +The implementation kept `tokenize_markdown` and the existing block +classification intact, but replaced the bespoke `LineBuffer` loop with a +fragment-based adapter over `textwrap::wrap_algorithms::wrap_first_fit`. 
Prefix +handling now flows through one helper in `src/wrap/paragraph.rs`, and active +regressions cover nested blockquotes, footnote continuation alignment, and +checkbox indentation. + +The key lesson from the migration is that `textwrap` alone is not a drop-in +replacement for the repository's historic whitespace semantics. Preserving the +old behaviour required a thin adaptation layer that merges whitespace-only +overflow lines forward and sometimes rebalances the trailing fragment when the +correct break only becomes obvious after the following separator arrives. + +That adaptation layer still needs the same width discipline as the underlying +wrapper. The 2026-04-23 follow-up fix confirmed that a seemingly harmless +tail-rebalancing heuristic can reintroduce overflow unless it rechecks the +destination line's display width before moving a fragment. + +## Context and orientation + +The top-level wrapping entry point is `wrap_text(lines, width)` in +`src/wrap.rs`. It streams input line by line, preserving fenced code blocks, +indented code blocks, tables, headings, Markdownlint directives, and blank +lines. Only ordinary paragraph text and prefixed lines such as bullets, +blockquotes, and footnotes reach the wrapping helpers. + +Prefix-aware wrapping currently lives in `src/wrap/paragraph.rs`. +`ParagraphWriter::handle_prefix_line` flushes any active paragraph and +delegates to `append_wrapped_with_prefix`. That helper computes the available +width for the content after the prefix, wraps the content, and then prefixes +the first and continuation lines manually. This is the best place to +consolidate the prefix logic into one helper such as `wrap_with_prefix`. + +Inline line-breaking currently lives in `src/wrap/inline.rs`. +`wrap_preserving_code` tokenizes the paragraph into inline fragments, groups +them into spans, and then walks them through a custom `LineBuffer`. 
The custom +logic exists to preserve inline code spans, links, adjacent punctuation, +consecutive whitespace, and trailing spaces. This is the core complexity that +the refactor is meant to remove. + +The `src/wrap/tokenize/` directory serves two separate roles today. One role is +the inline segmentation used only by `wrap_preserving_code`. The other role is +the public `tokenize_markdown` API used by other modules. These roles must be +treated separately during implementation; deleting the wrapping-specific +segmenter does not automatically justify removing the public token API. + +Active tests that currently exercise wrapping behaviour live in +`src/wrap/tests.rs`, `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, +`tests/wrap_renumber.rs`, and wrap-related cases inside `tests/cli.rs`, +`tests/cli_frontmatter.rs`, `tests/code_emphasis.rs`, and +`tests/markdownlint.rs`. The nested `tests/wrap/*.rs` tree is useful as a +source of examples, but not as active protection until its cases are promoted +to top-level targets. + +The documentation that will need review lives in `README.md`, +`docs/architecture.md`, and `docs/trailing-spaces.md`. `docs/architecture.md` +currently includes a tokenizer-flow diagram that will become misleading once +the bespoke span buffer is removed. + +## Plan of work + +Stage A is a baseline and red-test stage. Start by identifying which current +behaviours the refactor must preserve. The minimum list is: inline code spans +remain intact, Markdown links remain intact, hyphenated words are not split, +trailing punctuation stays attached to code spans or links, trailing spaces on +the final wrapped line survive, bullets and footnotes indent continuation lines +correctly, blockquotes repeat the quote prefix, and fenced or indented code +blocks remain byte-identical. Promote any missing critical cases from the +inactive `tests/wrap/*.rs` tree into active harnesses before changing the wrap +engine. 
Prefer `src/wrap/tests.rs` for unit-level behaviour and +`tests/wrap_cli.rs` or `tests/wrap_unit.rs` for end-to-end regressions. + +Stage B is a prototype that proves whether `textwrap` can replace the custom +buffer without hidden regressions. Add `textwrap` to `Cargo.toml` and create a +temporary adapter in `src/wrap/inline.rs` that keeps the current +`wrap_preserving_code` call site stable while routing line breaking through +`textwrap::wrap` plus explicit `Options`. Start with `break_words(false)` and +Unicode-aware word separation. Run only the active wrap tests added in Stage A. +If the naive adapter breaks code spans or links, do not continue deleting old +logic. Instead, implement a small shielding layer that marks inline code spans, +links, and images as atomic fragments before calling `textwrap`, then restore +them after wrapping. The key goal of this stage is to prove that the new engine +can satisfy current tests without recreating the whole old buffer. + +Stage C simplifies prefix handling. Replace the current +`append_wrapped_with_prefix` plus `handle_prefix_line` split with a single +helper in `src/wrap/paragraph.rs` that receives a prefix, the remaining text, +the available width, and whether the prefix repeats on continuation lines. +Compute the display width of the prefix once with `UnicodeWidthStr::width`, +compute the wrapped continuation prefix once, and reuse that value for all +continuation lines. Apply the same display-width approach to ordinary paragraph +indentation so the module no longer mixes byte length and visible column width. + +Stage D removes dead internals. After the `textwrap`-backed path passes the +full active test suite, delete any no-longer-used custom buffering helpers such +as `src/wrap/line_buffer.rs` and any wrapping-only inline tokenization helpers +that are left unused. 
Do not remove `tokenize_markdown`, `Token`, or the +modules that still serve non-wrap consumers unless the user explicitly expands +the scope and the public API question is resolved. + +Stage E updates the docs. Rewrite the wrapping description in `README.md` only +where user-visible semantics changed or need clarification. In +`docs/architecture.md`, replace the bespoke tokenizer-flow explanation with a +short description of the new split: repository-owned block classification and +prefix handling on one side, `textwrap` plus `unicode-width` for line breaking +on the other. Update `docs/trailing-spaces.md` if `wrap_preserving_code` +changes name, visibility, or implementation details. + +Stage F is final validation and cleanup. Run formatting, lint, tests, and the +documentation gates through `tee` so the logs remain inspectable in this +environment. Review the diff to confirm that the result is a real +simplification and not a hidden rewrite of the bespoke engine under new names. + +## Concrete steps + +Work from the repository root: + +```bash +pwd +``` + +Expected: + +```plaintext +/home/user/project +``` + +List the currently active wrap-related tests before editing so the red/green +cycle is grounded in real targets: + +```bash +cargo test -- --list | rg "wrap|tokenize_markdown|frontmatter|markdownlint" +``` + +Expected: + +```plaintext +...wrap_text_preserves_code_spans +...wrap_text_multiple_code_spans +...cli_wrap_in_place_preserves_shell_block_verbatim +... +``` + +After adding any missing active regressions, prove they fail against a naive +`textwrap` prototype before investing in cleanup. For example, run the focused +unit and CLI suites that exercise the wrapping path: + +```bash +cargo test wrap_text_preserves_code_spans --lib +cargo test wrap_text_multiple_code_spans --lib +cargo test --test wrap_unit +cargo test --test wrap_cli +``` + +Expected: + +```plaintext +running ... +test ... 
ok +``` + +Once the prototype is passing, run the wider wrap-adjacent suites that can +catch unintended interactions: + +```bash +cargo test --test wrap_renumber +cargo test --test cli +cargo test --test cli_frontmatter +cargo test --test code_emphasis +cargo test --test markdownlint +``` + +Manually verify an end-to-end wrap example that covers a blockquote, inline +code, and a Markdown link: + +```bash +printf '%s\n' \ + '> A deliberately long blockquote line that mentions' \ + '`cargo test --test wrap_cli` and the' \ + '[project README](https://example.invalid/readme) so the wrap' \ + 'engine must preserve both atomic spans while continuing the quote' \ + 'prefix correctly.' \ + | cargo run -- --wrap +``` + +Expected: + +```plaintext +> A deliberately long blockquote line that mentions +> `cargo test --test wrap_cli` and the +> [project README](https://example.invalid/readme) so the wrap +> engine must preserve both atomic spans while continuing the quote +> prefix correctly. +``` + +Finish with the full repository quality gates. Use `tee` so the logs survive +output truncation and can be inspected afterward: + +```bash +set -o pipefail +make fmt 2>&1 | tee /tmp/issue-80-fmt.log +set -o pipefail +make check-fmt 2>&1 | tee /tmp/issue-80-check-fmt.log +set -o pipefail +make lint 2>&1 | tee /tmp/issue-80-lint.log +set -o pipefail +make test 2>&1 | tee /tmp/issue-80-test.log +set -o pipefail +make markdownlint 2>&1 | tee /tmp/issue-80-markdownlint.log +set -o pipefail +make nixie 2>&1 | tee /tmp/issue-80-nixie.log +``` + +Expected: + +```plaintext +... finished successfully ... +``` + +If any command fails, inspect the tail of the corresponding log before making +another change: + +```bash +tail -n 40 /tmp/issue-80-test.log +tail -n 40 /tmp/issue-80-lint.log +``` + +## Acceptance criteria + +The change is complete when all of the following statements are true: + +1. 
`wrap_text` still preserves the behaviour covered by the active wrap tests, + including code spans, links, prefixes, fences, hard breaks, and trailing + spaces. +2. `src/wrap/paragraph.rs` uses one display-width-aware helper for prefix + wrapping instead of the current two-step helper split. +3. The bespoke `LineBuffer`-driven wrapping loop is gone from the active wrap + path, and the implementation delegates line breaking to `textwrap`. +4. `tokenize_markdown` either remains intact and documented as out of scope, or + any broader removal is explicitly approved and covered by updated tests and + documentation. +5. `README.md`, `docs/architecture.md`, and any other affected docs describe + the new implementation accurately. +6. `make check-fmt`, `make lint`, `make test`, `make markdownlint`, and + `make nixie` all pass. diff --git a/docs/execplans/yaml-frontmatter.md b/docs/execplans/yaml-frontmatter.md index 91b2a76c..2092567c 100644 --- a/docs/execplans/yaml-frontmatter.md +++ b/docs/execplans/yaml-frontmatter.md @@ -96,7 +96,7 @@ according to the selected options. - [x] (2026-04-09) Update `README.md` and `docs/architecture.md`. - [x] (2026-04-09) Run `make check-fmt`, `make lint`, `make test`, `make markdownlint`, - and `make nixie`. + and `make nixie` if Mermaid content changes. ## Surprises & discoveries @@ -261,7 +261,8 @@ make markdownlint make nixie ``` -`make nixie` is required. +If `docs/architecture.md` does not change any Mermaid content, `make nixie` may +be skipped. ## Validation and acceptance diff --git a/docs/trailing-spaces.md b/docs/trailing-spaces.md index abc72226..b0f096bb 100644 --- a/docs/trailing-spaces.md +++ b/docs/trailing-spaces.md @@ -1,11 +1,11 @@ # Trailing spaces -`wrap_preserving_code` keeps trailing spaces on the final line. +The `textwrap`-backed inline wrapper keeps trailing spaces on the final line. Markdown treats two spaces at the end of a line as a hard break. 
Earlier versions trimmed those spaces during the final flush, turning hard breaks into -soft ones. The final line is now pushed as-is so trailing whitespace survives -wrapping. +soft ones. The current wrapper still renders the final line as-is after +`textwrap` has chosen the line breaks, so trailing whitespace survives wrapping. ## Example diff --git a/docs/users-guide.md b/docs/users-guide.md index 1d78ba52..2ee85568 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -22,3 +22,31 @@ The `--ellipsis` flag replaces `...` inside table cells with the Unicode ellipsis character `…` before the table is reflowed. This ensures column widths are computed from the final emitted glyph rather than from the three-dot source sequence. + +## Paragraph wrapping + +Pass `--wrap <N>` to reflow prose paragraphs so that every output line fits +within `<N>` display columns. The width argument is measured in terminal +columns, not bytes, so it accounts correctly for CJK glyphs, emoji, and +accented characters. + +Line fitting is delegated to the `textwrap` crate using a greedy first-fit +algorithm: each word is placed on the current line if it fits, and a new line +is started otherwise. This produces predictable, diff-friendly output. + +Inline code spans (`` `…` ``) and Markdown links (`[text](url)`) are treated as +unbreakable units. A span is never split across lines; it moves as a whole to +the next line when it would otherwise exceed the target width. + +Blockquote prefixes (`>`), task-list item markers (`- [ ]`, `- [x]`), and +footnote definition labels (`[^n]:`) are detected automatically. The first +wrapped line carries the original prefix; subsequent wrapped lines are indented +to the same visual column, so the text stays aligned. + +Fenced code blocks, HTML blocks, indented code blocks (four or more leading +spaces or a leading tab), and table rows are passed through unchanged. Wrapping +is applied only to prose paragraphs and prefixed lines. 
+ +Two trailing spaces at the end of a line produce a hard line break in rendered +Markdown. `mdtablefix --wrap` preserves those trailing spaces on the final +wrapped line, so hard-break semantics are not lost after reformatting. diff --git a/src/wrap.rs b/src/wrap.rs index 4499eb62..d447a5f6 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -13,11 +13,16 @@ use std::borrow::Cow; mod block; mod fence; mod inline; -mod line_buffer; mod paragraph; mod tokenize; use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; pub(crate) use block::{BlockKind, classify_block}; +/// Fence-detection utilities re-exported for downstream callers. +/// +/// [`FenceTracker`] maintains fenced code-block state across lines, which is +/// useful for callers that process Markdown incrementally. [`is_fence`] +/// inspects one line and returns the fence components (indentation, marker, +/// info string) when the line opens a fenced code block, or `None` otherwise. pub use fence::{FenceTracker, is_fence}; use paragraph::{ParagraphState, ParagraphWriter, PrefixLine}; /// Token emitted by the `tokenize::segment_inline` parser and used by diff --git a/src/wrap/inline.rs b/src/wrap/inline.rs index 8b8027fd..2f464fad 100644 --- a/src/wrap/inline.rs +++ b/src/wrap/inline.rs @@ -4,22 +4,36 @@ //! inline code, links, and trailing punctuation without reimplementing the //! grouping logic in multiple places. +mod postprocess; +#[cfg(test)] +mod test_support; +#[cfg(test)] +mod tests; use std::ops::Range; +use postprocess::{merge_whitespace_only_lines, rebalance_atomic_tails}; +use textwrap::{core::Fragment, wrap_algorithms::wrap_first_fit}; use unicode_width::UnicodeWidthStr; -use super::{ - line_buffer::{LineBuffer, SplitContext}, - tokenize, -}; +use super::tokenize; +/// Marks how a grouped token span should behave during wrapping. #[derive(Copy, Clone, PartialEq, Eq)] enum SpanKind { + /// Treat the span as ordinary prose. General, + /// Treat the span as an inline code sequence. 
Code, + /// Treat the span as a Markdown link or image link. Link, } +/// Returns whether a character should stay attached as trailing punctuation. +/// +/// The `c` parameter is the candidate trailing character. The return value is +/// `true` only for punctuation that should remain coupled with a preceding +/// atomic span. This helper has no panics and assumes `c` is a single scalar +/// value from an already-tokenised fragment. #[inline] fn is_trailing_punct(c: char) -> bool { // ASCII closers + common Unicode closers and word-final punctuation @@ -29,16 +43,37 @@ fn is_trailing_punct(c: char) -> bool { ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c) } +/// Returns whether `token` already looks like a complete Markdown link. +/// +/// The `token` parameter is the rendered fragment text to inspect. The return +/// value is `true` only for complete inline links or image links, and this +/// helper never panics. fn looks_like_link(token: &str) -> bool { (token.starts_with('[') || token.starts_with("![")) && token.contains("](") && token.ends_with(')') } +/// Returns whether `token` contains only Unicode whitespace. +/// +/// The `token` parameter is the rendered fragment text to inspect. The return +/// value is `true` when every character is whitespace, and this helper never +/// panics. fn is_whitespace_token(token: &str) -> bool { token.chars().all(char::is_whitespace) } +/// Returns whether `token` is a complete inline code span. +/// +/// The `token` parameter is the rendered fragment text to inspect. The return +/// value is `true` only for complete backtick-delimited spans, and this helper +/// never panics. fn is_inline_code_token(token: &str) -> bool { token.starts_with('`') && token.ends_with('`') } +/// Extends a grouped span over trailing punctuation tokens and updates `width`. +/// +/// `tokens` supplies the token stream, `j` is the next token index to inspect, +/// and `width` accumulates the display width of the current span. 
The return +/// value is the exclusive end index after any attached punctuation, and the +/// caller must pass a valid starting index within `tokens`. fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { *width += UnicodeWidthStr::width(tokens[j].as_str()); @@ -69,6 +104,14 @@ fn should_couple_whitespace(kind: SpanKind, next_token: Option<&String>) -> bool } } +/// Merges a backtick-opened code span into one grouped span and updates +/// `width`. +/// +/// `tokens` is the token stream, `i` is the index of a lone backtick opener, +/// and `width` accumulates the grouped display width. The return value is the +/// exclusive end index after the closing backtick and any attached +/// punctuation. This helper relies on the invariant that `tokens[i]` is a lone +/// backtick token. #[inline] fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { debug_assert!( @@ -88,6 +131,14 @@ fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { j } +/// Finds the next logical token group starting at `start`. +/// +/// `tokens` is the segmented inline token stream and `start` is the first +/// token in the next candidate group. The return value is `(end, width)`, +/// where `end` is the exclusive end index of the grouped inline code span, +/// link, or plain fragment, and `width` is its Unicode display width. This +/// helper assumes `start < tokens.len()` and will panic if called out of +/// bounds. 
pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { let mut end = start + 1; let mut width = UnicodeWidthStr::width(tokens[start].as_str()); @@ -148,120 +199,200 @@ pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, u (end, width) } -pub(super) fn attach_punctuation_to_previous_line( - lines: &mut [String], - current: &str, - token: &str, -) -> bool { - if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { - return false; - } +/// Re-exports the test-only helper that joins punctuation onto a prior code +/// line when `current` is empty. +#[cfg(test)] +pub(super) use test_support::attach_punctuation_to_previous_line; + +/// Classifies an inline fragment for post-wrap heuristics. +#[derive(Debug, Clone, PartialEq, Eq)] +enum FragmentKind { + /// Marks a fragment that contains only whitespace. + Whitespace, + /// Marks a fragment that contains inline code. + InlineCode, + /// Marks a fragment that contains a Markdown link. + Link, + /// Marks a fragment that contains ordinary prose. + Plain, +} - let Some(last_line) = lines.last_mut() else { - return false; - }; +/// Stores rendered fragment text, width, and classification for wrapping. +#[derive(Debug, Clone, PartialEq, Eq)] +struct InlineFragment { + /// Holds the rendered fragment text that will be emitted unchanged. + text: String, + /// Stores the precomputed Unicode display width for `text`. + width: usize, + /// Records the fragment classification used by post-processing predicates. + kind: FragmentKind, +} - if last_line.trim_end().ends_with('`') { - last_line.push_str(token); - return true; +impl InlineFragment { + /// Builds a fragment from rendered `text`. + /// + /// The parameter is stored verbatim, while the return value carries the + /// same text together with its Unicode display width and `FragmentKind`. 
+ /// This constructor never panics and preserves the invariant that `width` + /// and `kind` are derived from `text` exactly once. + fn new(text: String) -> Self { + let width = UnicodeWidthStr::width(text.as_str()); + let kind = classify_fragment(text.as_str()); + Self { text, width, kind } + } + /// Returns whether this fragment contains only whitespace. + /// + /// The return value is `true` only for `FragmentKind::Whitespace`. This + /// query never panics and does not inspect `text` again. + fn is_whitespace(&self) -> bool { self.kind == FragmentKind::Whitespace } + /// Returns whether this fragment must move as an atomic unit. + /// + /// The return value is `true` for inline code spans and links. This query + /// never panics and relies on the invariant that `kind` was set at + /// construction time. + fn is_atomic(&self) -> bool { + matches!(self.kind, FragmentKind::InlineCode | FragmentKind::Link) } + /// Returns whether this fragment is ordinary prose. + /// + /// The return value is `true` only for `FragmentKind::Plain`. This query + /// never panics and does not inspect `text` again. + fn is_plain(&self) -> bool { self.kind == FragmentKind::Plain } +} - false +impl Fragment for InlineFragment { + fn width(&self) -> f64 { width_as_f64(self.width) } + fn whitespace_width(&self) -> f64 { 0.0 } + fn penalty_width(&self) -> f64 { 0.0 } } -fn push_span_with_carry( - buffer: &mut LineBuffer, - tokens: &[String], - span: Range, - carried_whitespace: &mut String, -) { - if span.start >= span.end { - return; - } +/// Converts a display width into the `f64` representation required by +/// `textwrap`. +/// +/// `width` is the precomputed display width of one fragment or line. The +/// return value is a saturating `f64` conversion for `wrap_first_fit`; values +/// above `u32::MAX` clamp to that limit. This helper never panics. 
+fn width_as_f64(width: usize) -> f64 { f64::from(u32::try_from(width).unwrap_or(u32::MAX)) } - if carried_whitespace.is_empty() { - buffer.push_span(tokens, span.start, span.end); - return; +/// Classifies rendered fragment `text` for later post-processing. +/// +/// The parameter is the rendered fragment string, and the return value is a +/// `FragmentKind` used by cheap predicate helpers. This helper never panics +/// and keeps the invariant that classification is centralised in one place. +fn classify_fragment(text: &str) -> FragmentKind { + if is_whitespace_token(text) { + return FragmentKind::Whitespace; } - - let mut first_token = std::mem::take(carried_whitespace); - first_token.push_str(tokens[span.start].as_str()); - buffer.push_token(first_token.as_str()); - if span.start + 1 < span.end { - buffer.push_span(tokens, span.start + 1, span.end); + let trimmed = text.trim_end_matches(is_trailing_punct); + if is_inline_code_token(text) || is_inline_code_token(trimmed) { + FragmentKind::InlineCode + } else if looks_like_link(text) || looks_like_link(trimmed) { + FragmentKind::Link + } else { + FragmentKind::Plain } } -pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec { - let tokens = tokenize::segment_inline(text); - if tokens.is_empty() { - return Vec::new(); +/// Appends the token span into the rendered fragment buffer `text`. +/// +/// `tokens` supplies the source tokens and `span` identifies the grouped range +/// to copy. This helper mutates `text` in place and preserves the invariant +/// that punctuation after code spans keeps its original Markdown spacing. 
+fn push_span_text(text: &mut String, tokens: &[String], span: Range<usize>) { +    for token in &tokens[span] { +        if token.len() == 1 && ".?!,:;".contains(token) && text.trim_end().ends_with('`') { +            text.truncate(text.trim_end_matches(char::is_whitespace).len()); +        } +        text.push_str(token); }  +} -    let mut lines = Vec::new(); -    let mut buffer = LineBuffer::new(); -    let mut carried_whitespace = String::new(); +/// Builds Markdown-aware fragments from the segmented token stream `tokens`. +/// +/// The return value preserves token order while grouping inline code, links, +/// and whitespace runs into `InlineFragment` values with precomputed widths. +/// This helper never panics when `tokens` is well-formed. +fn build_fragments(tokens: &[String]) -> Vec<InlineFragment> { +    let mut fragments: Vec<InlineFragment> = Vec::new();     let mut i = 0;     while i < tokens.len() { -        let (group_end, group_width) = determine_token_span(&tokens, i); +        let (group_end, _) = determine_token_span(tokens, i);         let span = i..group_end; -        let span_is_whitespace = tokens[span.clone()] +        let text = if tokens[span.clone()]             .iter() -            .all(|tok| is_whitespace_token(tok)); +            .all(|token| is_whitespace_token(token)) +        { +            tokens[span].join("") +        } else { +            let mut text = String::new(); +            push_span_text(&mut text, tokens, span); +            text +        }; +        fragments.push(InlineFragment::new(text)); +        i = group_end; +    }  -        if span_is_whitespace && !carried_whitespace.is_empty() && group_end != tokens.len() { -            for tok in &tokens[span.clone()] { -                carried_whitespace.push_str(tok); -            } -            i = group_end; -            continue; -        } +    fragments +}  -        if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) { -            carried_whitespace.clear(); -            i += 1; -            continue; -        } +/// Renders one wrapped fragment line back into Markdown text. +/// +/// `line` supplies the fragments to render and `is_final_output_line` +/// determines whether a single trailing space may be trimmed. 
The return value +/// is the emitted text for that line, and this helper preserves the invariant +/// that hard-break double spaces survive on the final output line. +fn render_line(line: &[InlineFragment], is_final_output_line: bool) -> String { +    let mut text = line +        .iter() +        .map(|fragment| fragment.text.as_str()) +        .collect::<String>(); + +    if !is_final_output_line && text.ends_with(' ') && !text.ends_with("  ") { +        text.pop(); +    }  -        if buffer.width() + group_width <= width { -            push_span_with_carry(&mut buffer, &tokens, span.clone(), &mut carried_whitespace); -            i = group_end; -            continue; -        } +    text +}  -        let mut split = SplitContext { -            lines: &mut lines, -            width, -        }; -        if buffer.split_with_span(&mut split, &tokens, span.clone()) { -            i = group_end; -            continue; -        } +/// Wraps inline Markdown `text` without splitting code spans or links. +/// +/// `text` is tokenised into `InlineFragment`s, fitted with +/// `textwrap::wrap_algorithms::wrap_first_fit`, normalised with +/// `merge_whitespace_only_lines` plus `rebalance_atomic_tails`, and then +/// rendered back into `Vec<String>` output lines. `width` is measured in +/// Unicode display columns and must be at least one effective column after any +/// caller prefix handling. This helper never panics for valid input. 
+pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> { +    let tokens = tokenize::segment_inline(text); +    if tokens.is_empty() { +        return Vec::new(); +    }  -        if buffer.flush_trailing_whitespace(&mut lines, &tokens, span.clone()) { -            i = group_end; -            continue; -        } +    let fragments = build_fragments(&tokens); +    let mut lines = Vec::new(); +    let mut buffer: Vec<InlineFragment> = Vec::new();  -        buffer.flush_into(&mut lines); -        if span_is_whitespace { -            for tok in &tokens[span] { -                carried_whitespace.push_str(tok); -            } -            i = group_end; +    for fragment in fragments { +        buffer.push(fragment); +        let wrapped = wrap_first_fit(&buffer, &[width_as_f64(width)]); +        let raw_lines = wrapped.iter().map(|line| line.to_vec()).collect::<Vec<_>>(); +        let mut grouped_lines = merge_whitespace_only_lines(&raw_lines); +        rebalance_atomic_tails(&mut grouped_lines, width); + +        if grouped_lines.len() == 1 {             continue;         }  -        push_span_with_carry(&mut buffer, &tokens, i..group_end, &mut carried_whitespace); -        i = group_end; +        for line in &grouped_lines[..grouped_lines.len() - 1] { +            lines.push(render_line(line, false)); +        } +        buffer = grouped_lines.pop().unwrap_or_default();     }  -    if !carried_whitespace.is_empty() { -        buffer.push_token(carried_whitespace.as_str()); -        carried_whitespace.clear(); +    if !buffer.is_empty() { +        lines.push(render_line(&buffer, true));     }  -    buffer.flush_into(&mut lines);     lines } diff --git a/src/wrap/inline/postprocess.rs b/src/wrap/inline/postprocess.rs new file mode 100644 index 00000000..7bfc5e24 --- /dev/null +++ b/src/wrap/inline/postprocess.rs @@ -0,0 +1,367 @@ +//! Post-wrap normalization helpers for inline fragment lines. + +use super::{FragmentKind, InlineFragment}; + +/// Returns whether every fragment on the line is whitespace-only. +fn is_whitespace_only_line(line: &[InlineFragment]) -> bool { +    line.iter().all(InlineFragment::is_whitespace) +} + +/// Returns whether the line consists of one literal space fragment. 
+fn is_single_space_line(line: &[InlineFragment]) -> bool { line.len() == 1 && line[0].text == " " } + +/// Returns the total display width of the rendered line. +fn line_width(line: &[InlineFragment]) -> usize { line.iter().map(|fragment| fragment.width).sum() } + +/// Returns whether the line begins with an atomic fragment. +fn line_starts_with_atomic(line: &[InlineFragment]) -> bool { + line.first().is_some_and(InlineFragment::is_atomic) +} + +/// Returns whether the line begins with a single-space carry and plain prose. +fn line_starts_with_single_space_then_plain(line: &[InlineFragment]) -> bool { + line.first() + .is_some_and(|fragment| fragment.is_whitespace() && fragment.text == " ") + && line.get(1).is_some_and(InlineFragment::is_plain) +} + +/// Returns whether the line ends with a fragment worth rebalancing. +fn line_has_rebalanceable_tail(line: &[InlineFragment]) -> bool { + line.last().is_some_and(InlineFragment::is_atomic) + || (line.len() > 1 && line.last().is_some_and(InlineFragment::is_plain)) +} + +/// Merges whitespace-only wrap artefacts into neighbouring content lines. +/// +/// `lines` is the provisional fragment layout from `wrap_first_fit`, and the +/// return value folds whitespace-only lines into adjacent content. A single +/// space after an inline-code tail is carried forward with that atomic +/// fragment instead of being merged backward. This helper never panics. 
+pub(super) fn merge_whitespace_only_lines( +    lines: &[Vec<InlineFragment>], +) -> Vec<Vec<InlineFragment>> { +    let mut merged: Vec<Vec<InlineFragment>> = Vec::with_capacity(lines.len()); +    let mut pending_whitespace: Vec<InlineFragment> = Vec::new(); + +    for (index, mut line) in lines.iter().cloned().enumerate() { +        if is_whitespace_only_line(&line) { +            let next_starts_atomic = lines +                .get(index + 1) +                .is_some_and(|next_line| line_starts_with_atomic(next_line)); +            let line_is_single_space = is_single_space_line(&line); +            let previous_line_has_single_fragment = merged +                .last() +                .is_some_and(|previous_line| previous_line.len() == 1); +            let mut should_carry_whitespace = !line_is_single_space; + +            if line_is_single_space +                && !next_starts_atomic +                && let Some(previous_line) = merged.last_mut() +                && previous_line +                    .last() +                    .is_some_and(|fragment| fragment.kind == FragmentKind::InlineCode) +            { +                let Some(previous_atomic) = previous_line.pop() else { +                    continue; +                }; +                pending_whitespace.push(previous_atomic); +                if previous_line.is_empty() { +                    merged.pop(); +                } +                should_carry_whitespace = true; +            } + +            if line_is_single_space && previous_line_has_single_fragment { +                should_carry_whitespace = true; +            } + +            if should_carry_whitespace { +                pending_whitespace.extend(line); +            } +            continue; +        } + +        if pending_whitespace.is_empty() { +            merged.push(line); +        } else { +            pending_whitespace.append(&mut line); +            merged.push(std::mem::take(&mut pending_whitespace)); +        } +    } + +    if !pending_whitespace.is_empty() { +        if let Some(last_line) = merged.last_mut() { +            last_line.append(&mut pending_whitespace); +        } else { +            merged.push(pending_whitespace); +        } +    } + +    merged +} + +/// Moves an eligible tail fragment onto the following line when it still fits. +/// +/// `lines` is mutated in place after whitespace-line merging, and `width` is +/// the maximum display width allowed for the destination line. The move is +/// applied only when the next line starts with a carried single space plus +/// plain text and the new width stays within `width`. 
This helper never +/// panics. +pub(super) fn rebalance_atomic_tails(lines: &mut [Vec<InlineFragment>], width: usize) { +    for index in 0..lines.len().saturating_sub(1) { +        if !line_starts_with_single_space_then_plain(&lines[index + 1]) +            || !line_has_rebalanceable_tail(&lines[index]) +        { +            continue; +        } + +        let Some(trailing_width) = lines[index].last().map(|fragment| fragment.width) else { +            continue; +        }; +        if line_width(&lines[index + 1]) + trailing_width > width { +            continue; +        } + +        let Some(trailing_fragment) = lines[index].pop() else { +            continue; +        }; +        lines[index + 1].insert(0, trailing_fragment); +    } +} + +#[cfg(test)] +mod tests { +    use super::{ +        super::{FragmentKind, InlineFragment}, +        *, +    }; + +    fn fragment(text: &str) -> InlineFragment { InlineFragment::new(text.into()) } + +    #[test] +    fn inline_fragment_whitespace_space() { +        let fragment = InlineFragment::new(" ".into()); +        assert_eq!(fragment.kind, FragmentKind::Whitespace); +        assert!(fragment.is_whitespace()); +        assert!(!fragment.is_atomic()); +        assert_eq!(fragment.width, 1); +    } + +    #[test] +    fn inline_fragment_whitespace_tab() { +        let fragment = InlineFragment::new("\t".into()); +        assert_eq!(fragment.kind, FragmentKind::Whitespace); +    } + +    #[test] +    fn inline_fragment_inline_code() { +        let fragment = InlineFragment::new("`foo`".into()); +        assert_eq!(fragment.kind, FragmentKind::InlineCode); +        assert!(fragment.is_atomic()); +        assert!(!fragment.is_whitespace()); +        assert!(!fragment.is_plain()); +    } + +    #[test] +    fn inline_fragment_link() { +        let fragment = InlineFragment::new("[text](url)".into()); +        assert_eq!(fragment.kind, FragmentKind::Link); +        assert!(fragment.is_atomic()); +    } + +    #[test] +    fn inline_fragment_plain() { +        let fragment = InlineFragment::new("word".into()); +        assert_eq!(fragment.kind, FragmentKind::Plain); +        assert!(fragment.is_plain()); +        assert!(!fragment.is_atomic()); +    } + +    #[test] +    fn merge_keeps_content_lines_unchanged() { +        let lines = vec![vec![fragment("hello")], 
vec![fragment("world")]]; + assert_eq!(merge_whitespace_only_lines(&lines), lines); + } + + #[test] + fn merge_carries_whitespace_forward() { + let lines = vec![ + vec![fragment("hello")], + vec![fragment(" ")], + vec![fragment("world")], + ]; + assert_eq!( + merge_whitespace_only_lines(&lines), + vec![ + vec![fragment("hello")], + vec![fragment(" "), fragment("world")], + ] + ); + } + + #[test] + fn merge_moves_inline_code_tail_before_single_space() { + let lines = vec![ + vec![fragment("plain"), fragment("`code`")], + vec![fragment(" ")], + vec![fragment("tail")], + ]; + let merged = merge_whitespace_only_lines(&lines); + + assert_eq!(merged[0], vec![fragment("plain")]); + assert_eq!( + merged[1], + vec![fragment("`code`"), fragment(" "), fragment("tail")] + ); + } + + #[test] + fn merge_trailing_whitespace_appended_to_last_line() { + let lines = vec![vec![fragment("hello")], vec![fragment(" ")]]; + assert_eq!( + merge_whitespace_only_lines(&lines), + vec![vec![fragment("hello"), fragment(" ")]] + ); + } + + #[test] + fn merge_carries_multiple_consecutive_whitespace_lines_forward() { + let lines = vec![ + vec![fragment("hello")], + vec![fragment(" ")], + vec![fragment("\t")], + vec![fragment("world")], + ]; + assert_eq!( + merge_whitespace_only_lines(&lines), + vec![ + vec![fragment("hello")], + vec![fragment(" "), fragment("\t"), fragment("world")] + ] + ); + } + + #[test] + fn merge_drops_single_space_before_atomic_starting_line() { + let lines = vec![ + vec![fragment("alpha"), fragment("beta")], + vec![fragment(" ")], + vec![fragment("`code`")], + ]; + assert_eq!( + merge_whitespace_only_lines(&lines), + vec![ + vec![fragment("alpha"), fragment("beta")], + vec![fragment("`code`")] + ] + ); + } + + #[test] + fn merge_empty_input_returns_empty_output() { + assert!(merge_whitespace_only_lines(&[]).is_empty()); + } + + #[test] + fn rebalance_moves_atomic_tail_when_fits() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("`code`")], + vec![fragment(" "), 
fragment("tail")], + ]; + rebalance_atomic_tails(&mut lines, 80); + assert_eq!(lines[0], vec![fragment("alpha")]); + assert_eq!( + lines[1], + vec![fragment("`code`"), fragment(" "), fragment("tail")] + ); + } + + #[test] + fn rebalance_does_not_move_when_overflow() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("`code`")], + vec![fragment(" "), fragment("plain")], + ]; + let original = lines.clone(); + let width = line_width(&lines[1]) + lines[0].last().expect("tail exists").width - 1; + rebalance_atomic_tails(&mut lines, width); + assert_eq!(lines, original); + } + + #[test] + fn rebalance_skips_when_next_does_not_start_with_space_then_plain() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("`code`")], + vec![fragment("plain")], + ]; + let original = lines.clone(); + rebalance_atomic_tails(&mut lines, 80); + assert_eq!(lines, original); + } + + #[test] + fn rebalance_moves_atomic_tail_at_exact_width_boundary() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("`tail`")], + vec![fragment(" "), fragment("beta")], + ]; + let width = line_width(&lines[1]) + lines[0].last().expect("tail exists").width; + rebalance_atomic_tails(&mut lines, width); + assert_eq!(lines[0], vec![fragment("alpha")]); + assert_eq!( + lines[1], + vec![fragment("`tail`"), fragment(" "), fragment("beta")] + ); + } + + #[test] + fn rebalance_moves_plain_tail_when_fits() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("tail")], + vec![fragment(" "), fragment("beta")], + ]; + rebalance_atomic_tails(&mut lines, 20); + assert_eq!(lines[0], vec![fragment("alpha")]); + assert_eq!( + lines[1], + vec![fragment("tail"), fragment(" "), fragment("beta")] + ); + } + + #[test] + fn rebalance_leaves_single_plain_fragment_line_unchanged() { + let mut lines = vec![ + vec![fragment("tail")], + vec![fragment(" "), fragment("beta")], + ]; + let original = lines.clone(); + rebalance_atomic_tails(&mut lines, 20); + assert_eq!(lines, original); + } + + #[test] + fn 
rebalance_ignores_empty_input() { + let mut lines: Vec<Vec<InlineFragment>> = Vec::new(); + rebalance_atomic_tails(&mut lines, 10); + assert!(lines.is_empty()); + } + + #[test] + fn rebalance_leaves_single_line_input_unchanged() { + let mut lines = vec![vec![fragment("alpha"), fragment("tail")]]; + let original = lines.clone(); + rebalance_atomic_tails(&mut lines, 10); + assert_eq!(lines, original); + } + + #[test] + fn rebalance_skips_when_next_line_starts_with_space_then_atomic() { + let mut lines = vec![ + vec![fragment("alpha"), fragment("tail")], + vec![fragment(" "), fragment("`code`")], + ]; + let original = lines.clone(); + rebalance_atomic_tails(&mut lines, 20); + assert_eq!(lines, original); + } +} diff --git a/src/wrap/inline/test_support.rs b/src/wrap/inline/test_support.rs new file mode 100644 index 00000000..608dbd3b --- /dev/null +++ b/src/wrap/inline/test_support.rs @@ -0,0 +1,29 @@ +//! Test-only inline helpers that keep `inline.rs` under the file-size ceiling. + +/// Attaches punctuation-only `token` text to the previous wrapped code line. +/// +/// `lines` holds the previously rendered lines, `current` is the pending +/// current-line buffer, and `token` is the punctuation fragment under test. +/// The return value is `true` when the punctuation was appended to the last +/// line. This test-only helper assumes `token` is already isolated as a single +/// rendered token and never panics. +pub(crate) fn attach_punctuation_to_previous_line( + lines: &mut [String], + current: &str, + token: &str, +) -> bool { + if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { + return false; + } + + let Some(last_line) = lines.last_mut() else { + return false; + }; + + if last_line.trim_end().ends_with('`') { + last_line.push_str(token); + return true; + } + + false +} diff --git a/src/wrap/inline/tests.rs b/src/wrap/inline/tests.rs new file mode 100644 index 00000000..c7772213 --- /dev/null +++ b/src/wrap/inline/tests.rs @@ -0,0 +1,36 @@ +//! 
Unit tests for inline fragment classification helpers. + +use unicode_width::UnicodeWidthStr; + +use super::{FragmentKind, InlineFragment}; + +#[test] +fn inline_fragment_new_marks_spaces_as_whitespace() { + let fragment = InlineFragment::new(" ".into()); + assert_eq!(fragment.kind, FragmentKind::Whitespace); +} + +#[test] +fn inline_fragment_new_marks_backtick_spans_as_inline_code() { + let fragment = InlineFragment::new("`code`".into()); + assert_eq!(fragment.kind, FragmentKind::InlineCode); +} + +#[test] +fn inline_fragment_new_marks_markdown_links_as_links() { + let fragment = InlineFragment::new("[text](url)".into()); + assert_eq!(fragment.kind, FragmentKind::Link); +} + +#[test] +fn inline_fragment_new_marks_plain_words_as_plain() { + let fragment = InlineFragment::new("word".into()); + assert_eq!(fragment.kind, FragmentKind::Plain); +} + +#[test] +fn inline_fragment_new_records_unicode_display_width() { + let text = "表🙂"; + let fragment = InlineFragment::new(text.into()); + assert_eq!(fragment.width, UnicodeWidthStr::width(text)); +} diff --git a/src/wrap/line_buffer.rs b/src/wrap/line_buffer.rs deleted file mode 100644 index 0eafa3b2..00000000 --- a/src/wrap/line_buffer.rs +++ /dev/null @@ -1,154 +0,0 @@ -//! Line buffer utilities for `wrap_preserving_code`. -//! -//! This module encapsulates the mutable state required to accumulate tokens into -//! wrapped lines while reusing allocations between iterations. 
- -use std::ops::Range; - -use unicode_width::UnicodeWidthStr; - -#[derive(Default)] -pub(crate) struct LineBuffer { - text: String, - width: usize, - last_split: Option<usize>, -} - -pub(crate) struct SplitContext<'a> { - pub(crate) lines: &'a mut Vec<String>, - pub(crate) width: usize, -} - -impl LineBuffer { - pub(crate) fn new() -> Self { Self::default() } - - pub(crate) fn text(&self) -> &str { self.text.as_str() } - - pub(crate) fn width(&self) -> usize { self.width } - - pub(crate) fn push_token(&mut self, token: &str) { - if token.len() == 1 && ".?!,:;".contains(token) && self.text.trim_end().ends_with('`') { - let trimmed_len = self.text.trim_end_matches(char::is_whitespace).len(); - if trimmed_len < self.text.len() { - let removed = &self.text[trimmed_len..]; - let removed_width = UnicodeWidthStr::width(removed); - self.text.truncate(trimmed_len); - self.width = self.width.saturating_sub(removed_width); - self.last_split = self - .text - .char_indices() - .rev() - .find(|(_, ch)| ch.is_whitespace()) - .map(|(idx, ch)| idx + ch.len_utf8()); - } - } - - self.text.push_str(token); - self.width += UnicodeWidthStr::width(token); - if token.chars().all(char::is_whitespace) { - self.last_split = Some(self.text.len()); - } - } - - pub(crate) fn push_span(&mut self, tokens: &[String], start: usize, end: usize) { - for tok in &tokens[start..end] { - self.push_token(tok.as_str()); - } - } - - pub(crate) fn flush_into(&mut self, lines: &mut Vec<String>) { - if self.text.is_empty() { - return; - } - lines.push(std::mem::take(&mut self.text)); - self.width = 0; - self.last_split = None; - } - - pub(crate) fn split_with_span( - &mut self, - ctx: &mut SplitContext<'_>, - tokens: &[String], - span: Range<usize>, - ) -> bool { - let Some(pos) = self.last_split else { - return false; - }; - - let (head_bounds, trimmed_tail_start) = { - let (head, tail) = self.text.split_at(pos); - let trimmed_head = head.trim_end(); - let trimmed_head_len = trimmed_head.len(); - let trailing_ws = 
&head[trimmed_head_len..]; - let head_bounds = if trimmed_head_len == 0 { - None - } else if trailing_ws.chars().count() > 1 { - Some((0, pos)) - } else { - Some((0, trimmed_head_len)) - }; - - let trimmed_tail = tail.trim_start(); - let trimmed_tail_start = pos + (tail.len() - trimmed_tail.len()); - (head_bounds, trimmed_tail_start) - }; - - if let Some((start_idx, end_idx)) = head_bounds { - ctx.lines.push(self.text[start_idx..end_idx].to_owned()); - } - - self.text.drain(..trimmed_tail_start); - for tok in &tokens[span.clone()] { - self.text.push_str(tok); - } - - self.width = UnicodeWidthStr::width(self.text.as_str()); - if span.end > span.start - && tokens[span.end - 1].chars().all(char::is_whitespace) - && !self.text.is_empty() - { - self.last_split = Some(self.text.len()); - } else { - self.last_split = None; - } - - if self.width > ctx.width { - ctx.lines.push(self.text.trim_end().to_string()); - self.text.clear(); - self.width = 0; - self.last_split = None; - } - - true - } - - pub(crate) fn flush_trailing_whitespace( - &mut self, - lines: &mut Vec, - tokens: &[String], - span: Range, - ) -> bool { - if span.end != tokens.len() { - return false; - } - if !tokens[span.clone()] - .iter() - .all(|tok| tok.chars().all(char::is_whitespace)) - { - return false; - } - - if self.text.is_empty() { - self.last_split = None; - return true; - } - - for tok in &tokens[span] { - self.text.push_str(tok); - } - lines.push(std::mem::take(&mut self.text)); - self.width = 0; - self.last_split = None; - true - } -} diff --git a/src/wrap/paragraph.rs b/src/wrap/paragraph.rs index 358c946f..11e09170 100644 --- a/src/wrap/paragraph.rs +++ b/src/wrap/paragraph.rs @@ -9,56 +9,85 @@ use unicode_width::UnicodeWidthStr; use super::inline::wrap_preserving_code; +/// Carries the parsed prefix metadata for a line that should be wrapped. pub(super) struct PrefixLine<'a> { + /// Stores the literal prefix emitted on the first wrapped line. 
pub(super) prefix: Cow<'a, str>, + /// Stores the text that follows the prefix and should be wrapped. pub(super) rest: &'a str, + /// Marks whether continuation lines should repeat the full prefix. pub(super) repeat_prefix: bool, } #[derive(Default)] +/// Tracks buffered paragraph content and its shared indentation. pub(super) struct ParagraphState { + /// Stores buffered paragraph segments and whether each ends with a hard break. buf: Vec<(String, bool)>, + /// Stores the leading indentation reused for wrapped continuation lines. indent: String, } impl ParagraphState { + /// Clears the buffered paragraph state. + /// + /// This resets both the accumulated segments and the remembered indent. + /// It returns no value and preserves the invariant that an empty state has + /// no buffered text. This method never panics. pub(super) fn clear(&mut self) { self.buf.clear(); self.indent.clear(); } + /// Records the paragraph indent from `line` when the buffer is still empty. + /// + /// The `line` parameter is the original input line whose leading + /// whitespace may become the continuation prefix. This method returns no + /// value, updates `indent` only for the first buffered segment, and never + /// panics. pub(super) fn note_indent(&mut self, line: &str) { if self.buf.is_empty() { self.indent = line.chars().take_while(|c| c.is_whitespace()).collect(); } } + /// Appends one paragraph segment and its hard-break marker. + /// + /// `text` is stored verbatim and `hard_break` records whether the source + /// line ended with Markdown hard-break spacing. This method returns no + /// value and keeps buffered segments in input order without panicking. pub(super) fn push(&mut self, text: String, hard_break: bool) { self.buf.push((text, hard_break)); } } +/// Emits wrapped paragraph lines into the caller-provided output buffer. pub(super) struct ParagraphWriter<'a> { + /// Borrows the caller-owned output buffer that receives emitted lines. 
out: &'a mut Vec<String>, + /// Stores the target wrap width in Unicode display columns. width: usize, } impl<'a> ParagraphWriter<'a> { + /// Creates a writer over `out` with the target wrap `width`. + /// + /// The returned writer borrows `out` for subsequent emission. `width` is + /// interpreted in Unicode display columns, and the constructor never + /// panics. pub(super) fn new(out: &'a mut Vec<String>, width: usize) -> Self { Self { out, width } } - fn append_wrapped_with_prefix(&mut self, line: &PrefixLine<'_>) { - let prefix = line.prefix.as_ref(); + /// Wraps `text` with `prefix` on the first line and `continuation_prefix` + /// on later lines. + /// + /// The available content width is computed once from the Unicode display + /// width of `prefix`. This method returns no value, emits directly into + /// `out`, and preserves the invariant that every continuation line uses + /// the supplied continuation prefix. + fn wrap_with_prefix(&mut self, prefix: &str, continuation_prefix: &str, text: &str) { let prefix_width = UnicodeWidthStr::width(prefix); let available = self.width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if line.repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(line.rest, available); + let lines = wrap_preserving_code(text, available); if lines.is_empty() { self.out.push(prefix.to_string()); return; @@ -68,11 +97,36 @@ impl<'a> ParagraphWriter<'a> { if index == 0 { self.out.push(format!("{prefix}{wrapped_line}")); } else { - self.out.push(format!("{wrapped_indent}{wrapped_line}")); + self.out + .push(format!("{continuation_prefix}{wrapped_line}")); } } } + /// Wraps one parsed prefixed line using the correct continuation prefix. 
+ /// + /// `line` supplies the first-line prefix, the text to wrap, and whether + /// the full prefix should repeat on continuations. This method returns no + /// value and keeps continuation alignment in the same visual column. + fn append_wrapped_with_prefix(&mut self, line: &PrefixLine<'_>) { + let prefix = line.prefix.as_ref(); + let prefix_width = UnicodeWidthStr::width(prefix); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let continuation_prefix = if line.repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + self.wrap_with_prefix(prefix, continuation_prefix.as_str(), line.rest); + } + + /// Flushes the buffered paragraph into wrapped output lines. + /// + /// `state` supplies the buffered segments and remembered indent. This + /// method returns no value, clears the state when flushing completes, and + /// preserves hard-break segments as distinct wrapped emissions. pub(super) fn flush_paragraph(&mut self, state: &mut ParagraphState) { if state.buf.is_empty() { return; @@ -97,17 +151,30 @@ impl<'a> ParagraphWriter<'a> { state.clear(); } + /// Wraps one buffered `segment` using the shared indentation `indent`. + /// + /// Both parameters are emitted through `wrap_with_prefix`, and the method + /// returns no value. It assumes `indent` already reflects the paragraph's + /// visual indent and never panics. fn push_wrapped_segment(&mut self, indent: &str, segment: &str) { - for line in wrap_preserving_code(segment, self.width - indent.len()) { - self.out.push(format!("{indent}{line}")); - } + self.wrap_with_prefix(indent, indent, segment); } + /// Flushes any active paragraph and then emits `line` verbatim. + /// + /// `state` is flushed before `line` is appended unchanged to `out`. 
This + /// method returns no value and preserves the invariant that verbatim lines + /// break paragraph accumulation boundaries. pub(super) fn push_verbatim(&mut self, state: &mut ParagraphState, line: &str) { self.flush_paragraph(state); self.out.push(line.to_string()); } + /// Flushes any active paragraph and wraps `prefix_line`. + /// + /// `state` is flushed first so prefixed lines never join the preceding + /// paragraph buffer. This method returns no value and emits the wrapped + /// output directly into `out`. pub(super) fn handle_prefix_line( &mut self, state: &mut ParagraphState, @@ -117,3 +184,53 @@ impl<'a> ParagraphWriter<'a> { self.append_wrapped_with_prefix(prefix_line); } } + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + + use super::{ParagraphState, ParagraphWriter, PrefixLine}; + + #[test] + fn wrap_with_prefix_emits_single_line_when_text_fits() { + let mut out = Vec::new(); + let mut writer = ParagraphWriter::new(&mut out, 80); + writer.wrap_with_prefix("> ", "> ", "hello world"); + assert_eq!(out, vec!["> hello world".to_string()]); + } + + #[test] + fn wrap_with_prefix_uses_continuation_prefix_on_wrapped_lines() { + let mut out = Vec::new(); + let mut writer = ParagraphWriter::new(&mut out, 14); + writer.wrap_with_prefix("> ", " ", "alpha beta gamma"); + assert_eq!(out, vec!["> alpha beta".to_string(), " gamma".to_string()]); + } + + #[test] + fn handle_prefix_line_can_repeat_or_change_the_continuation_prefix() { + let mut out = Vec::new(); + let mut writer = ParagraphWriter::new(&mut out, 14); + let mut state = ParagraphState::default(); + writer.handle_prefix_line( + &mut state, + &PrefixLine { + prefix: Cow::Borrowed("- [ ] "), + rest: "alpha beta", + repeat_prefix: false, + }, + ); + assert_eq!( + out, + vec!["- [ ] alpha".to_string(), " beta".to_string()] + ); + } + + #[test] + fn wrap_with_prefix_accounts_for_unicode_wide_prefixes() { + let mut out = Vec::new(); + let mut writer = ParagraphWriter::new(&mut out, 7); + 
writer.wrap_with_prefix("「 ", " ", "ab cd"); + assert_eq!(out, vec!["「 ab".to_string(), " cd".to_string()]); + } +} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 344c2990..80a19a9c 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -1,13 +1,10 @@ //! Unit tests for text wrapping functionality. -//! -//! This module contains tests for the `wrap_text` function, verifying correct -//! behaviour with code spans, links, hyphenated words, and various line widths. +//! Verifies `wrap_text` with code spans, links, hyphenated words, and widths. use rstest::rstest; use super::{ inline::{attach_punctuation_to_previous_line, determine_token_span, wrap_preserving_code}, - line_buffer::{LineBuffer, SplitContext}, tokenize::segment_inline, }; use crate::wrap::{BlockKind, classify_block, wrap_text}; @@ -94,106 +91,6 @@ fn attach_punctuation_ignores_non_code_suffix() { assert_eq!(lines, vec!["plain text".to_string()]); } -#[test] -fn line_buffer_trims_trailing_whitespace_before_punctuation() { - let tokens = vec![ - "wrap".to_string(), - " ".to_string(), - "`code`".to_string(), - " ".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, tokens.len()); - assert_eq!(buffer.text(), "wrap `code` "); - - let punct = vec!["!".to_string()]; - buffer.push_span(&punct, 0, punct.len()); - assert_eq!(buffer.text(), "wrap `code`!"); -} - -#[test] -fn line_buffer_split_preserves_multi_space_lines() { - let tokens = vec![ - "alpha".to_string(), - " ".to_string(), - "beta".to_string(), - " ".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, 2); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 8, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 2..4)); - assert_eq!(lines, vec!["alpha ".to_string()]); - assert_eq!(buffer.text(), "beta "); - assert_eq!( - buffer.width(), - unicode_width::UnicodeWidthStr::width(buffer.text()) - ); -} - -#[test] -fn 
line_buffer_split_trims_single_trailing_space() { - let tokens = vec!["alpha".to_string(), " ".to_string(), "beta".to_string()]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, 2); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 5, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 2..3)); - assert_eq!(lines, vec!["alpha".to_string()]); - assert_eq!(buffer.text(), "beta"); - assert_eq!( - buffer.width(), - unicode_width::UnicodeWidthStr::width(buffer.text()) - ); -} - -#[test] -fn line_buffer_split_tracks_multiple_whitespace_tokens() { - let tokens = vec![ - "foo".to_string(), - " ".to_string(), - " ".to_string(), - "bar".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, 3); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 4, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 3..4)); - assert_eq!(lines, vec!["foo ".to_string()]); - assert_eq!(buffer.text(), "bar"); -} - -#[test] -fn line_buffer_trailing_whitespace_flushes_line() { - let mut buffer = LineBuffer::new(); - let words = vec!["foo".to_string()]; - buffer.push_span(&words, 0, words.len()); - - let whitespace_tokens = vec![" ".to_string()]; - let mut lines = Vec::new(); - assert!(buffer.flush_trailing_whitespace( - &mut lines, - &whitespace_tokens, - 0..whitespace_tokens.len() - )); - assert_eq!(lines, vec!["foo ".to_string()]); - assert!(buffer.text().is_empty()); -} - #[test] fn wrap_preserving_code_splits_after_consecutive_whitespace() { let lines = wrap_preserving_code("alpha beta gamma", 8); @@ -500,3 +397,4 @@ fn classify_block_detects_markdown_prefixes(line: &str, expected: Option> hello world".to_string()]; + let wrapped = wrap_text(&input, 80); + assert_eq!(wrapped, vec![">> hello world".to_string()]); +} + +#[test] +fn wrap_with_prefix_multiline_uses_continuation() { + let input = vec!["> alpha beta gamma".to_string()]; + let 
wrapped = wrap_text(&input, 10); + assert_eq!( + wrapped, + vec![ + "> alpha".to_string(), + "> beta".to_string(), + "> gamma".to_string(), + ] + ); +} + +#[test] +fn wrap_text_repeats_nested_blockquote_prefix() { + let prefix = "> > "; + let input = vec![ + concat!( + "> > This nested quote contains enough text to require wrapping so that we can verify ", + "multi-level handling." + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert!(wrapped.len() > 1); + assert!(wrapped.iter().all(|line| line.starts_with("> > "))); + let wrapped_payload = wrapped + .iter() + .map(|line| { + line.strip_prefix(prefix) + .expect("nested blockquote line keeps its prefix") + }) + .collect::<Vec<_>>() + .join(" "); + let normalized_payload = wrapped_payload + .split_whitespace() + .collect::<Vec<_>>() + .join(" "); + assert_eq!( + normalized_payload, + input[0] + .strip_prefix(prefix) + .expect("original test input uses the nested blockquote prefix") + ); +} + +#[test] +fn wrap_with_prefix_plain_indent_both_lines() { + let input = vec![" alpha beta gamma".to_string()]; + let wrapped = wrap_text(&input, 10); + assert_eq!( + wrapped, + vec![ + " alpha".to_string(), + " beta".to_string(), + " gamma".to_string(), + ] + ); +} + +#[rstest( + input, + width, + expected, + case( + vec![ + concat!( + "[^5]: Given When Then - Martin Fowler, accessed on 14 July 2025, ", + "" + ) + .to_string(), + ], + 80, + vec![ + "[^5]: Given When Then - Martin Fowler, accessed on 14 July 2025," + .to_string(), + " " + .to_string(), + ] + ), + case( + vec![ + concat!( + "- [ ] Create a `HttpTravelTimeProvider` struct that implements the ", + "`TravelTimeProvider` trait." 
+ ) + .to_string(), + ], + 70, + vec![ + "- [ ] Create a `HttpTravelTimeProvider` struct that implements the" + .to_string(), + " `TravelTimeProvider` trait.".to_string(), + ] + ) +)] +fn wrap_text_preserves_prefixed_continuation_alignment( + input: Vec, + width: usize, + expected: Vec, +) { + let wrapped = wrap_text(&input, width); + assert_eq!(wrapped, expected); +} diff --git a/src/wrap/tokenize/mod.rs b/src/wrap/tokenize/mod.rs index c6e80041..d00f407a 100644 --- a/src/wrap/tokenize/mod.rs +++ b/src/wrap/tokenize/mod.rs @@ -229,15 +229,30 @@ where } } -/// Tokenize a Markdown snippet using backtick-delimited code spans. +fn push_newline_if_needed( + tokens: &mut Vec>, + lines: &mut std::iter::Peekable, + had_trailing_newline: bool, +) where + I: Iterator, +{ + // Emit a newline token if another line follows or when the + // original input ended with a trailing newline. The peek avoids + // prematurely allocating for the final newline when it isn't + // necessary. + if lines.peek().is_some() || (had_trailing_newline && lines.peek().is_none()) { + tokens.push(Token::Newline); + } +} + +#[must_use] +/// Tokenizes inline Markdown `source` into an ordered list of [`Token`] values. /// -/// The function scans the input line by line. Lines matching [`FENCE_RE`] -/// produce [`Token::Fence`] tokens and toggle fenced mode. Lines inside a -/// fence are yielded verbatim. Outside fenced regions the scanner searches for -/// backtick sequences. Text before a backtick becomes [`Token::Text`]. When a -/// closing backtick follows, the enclosed portion forms a [`Token::Code`] -/// span. If no closing backtick is found the delimiter and remaining text are -/// returned as [`Token::Text`]. Whitespace is preserved exactly as it appears. +/// The `source` parameter is the inline Markdown text to tokenize. The return +/// value is `Vec>`, preserving the token order found in `source`. 
+/// Inline code spans, links, and whitespace runs are emitted as distinct token +/// variants or text slices so callers can perform width-aware or +/// format-aware processing. /// /// ```rust /// use mdtablefix::wrap::{Token, tokenize_markdown}; @@ -255,23 +270,6 @@ where /// ] /// ); /// ``` -fn push_newline_if_needed( - tokens: &mut Vec>, - lines: &mut std::iter::Peekable, - had_trailing_newline: bool, -) where - I: Iterator, -{ - // Emit a newline token if another line follows or when the - // original input ended with a trailing newline. The peek avoids - // prematurely allocating for the final newline when it isn't - // necessary. - if lines.peek().is_some() || (had_trailing_newline && lines.peek().is_none()) { - tokens.push(Token::Newline); - } -} - -#[must_use] pub fn tokenize_markdown(source: &str) -> Vec> { if source.is_empty() { return Vec::new(); diff --git a/tests/wrap_unit.rs b/tests/wrap_unit.rs index 3445975a..a403afd3 100644 --- a/tests/wrap_unit.rs +++ b/tests/wrap_unit.rs @@ -5,6 +5,7 @@ use mdtablefix::wrap::wrap_text; use rstest::rstest; +use unicode_width::UnicodeWidthStr; #[test] fn wrap_text_preserves_hyphenated_words() { @@ -153,3 +154,15 @@ fn wrap_text_preserves_fenced_shell_block_without_blank_line_after_heading() { assert_eq!(wrap_text(&input, 80), input); } + +#[test] +fn wrap_text_does_not_overflow_after_tail_rebalancing() { + let wrapped = wrap_text(&["a four five".to_string()], 6); + + assert_eq!(wrapped.join(""), "a four five"); + assert!( + wrapped + .iter() + .all(|line| UnicodeWidthStr::width(line.as_str()) <= 6) + ); +}