diff --git a/Cargo.lock b/Cargo.lock index 190a163..00f5ccd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -421,7 +421,8 @@ dependencies = [ "regex", "rstest", "tempfile", - "unicode-width", + "textwrap", + "unicode-width 0.1.14", ] [[package]] @@ -785,6 +786,12 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smawk" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" + [[package]] name = "string_cache" version = "0.8.9" @@ -857,6 +864,17 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" +[[package]] +name = "textwrap" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width 0.2.2", +] + [[package]] name = "toml_datetime" version = "0.6.11" @@ -880,12 +898,24 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-linebreak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" + [[package]] name = "unicode-width" version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = 
"utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index bbb81b3..89255a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ once_cell = "1" rayon = "1.11" html5ever = "0.27" markup5ever_rcdom = "0.3" +textwrap = "0.16.2" unicode-width = "0.1" diff --git a/README.md b/README.md index 23ad0f6..e9812d2 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ list items to 80 columns. Hyphenated words are treated as indivisible during wrapping, so `very-long-word` will move to the next line intact rather than split at the -hyphen. The tool ignores fenced code blocks and respects escaped pipes (`\|`), +hyphen. The wrap engine now delegates line fitting to the `textwrap` crate +while preserving Markdown-aware token grouping for inline code, links, and hard +breaks. The tool ignores fenced code blocks and respects escaped pipes (`\|`), making it safe to use on Markdown with mixed content. ## Installation diff --git a/docs/adrs/0002-textwrap-inline-wrapping.md b/docs/adrs/0002-textwrap-inline-wrapping.md new file mode 100644 index 0000000..1bfa2a7 --- /dev/null +++ b/docs/adrs/0002-textwrap-inline-wrapping.md @@ -0,0 +1,86 @@ +# Architecture Decision Record (ADR) 0002: Delegate inline line fitting to `textwrap` + +- Status: Accepted +- Date: 2026-04-22 + +## Context + +The `--wrap` pipeline in `mdtablefix` previously relied on a bespoke +`LineBuffer`-driven loop in `src/wrap/inline.rs` to accumulate tokens into +output lines while preserving inline code spans, Markdown links, and trailing +punctuation. The same module also duplicated prefix-handling logic across +`append_wrapped_with_prefix` and `handle_prefix_line` in +`src/wrap/paragraph.rs`. + +Several problems motivated a replacement: + +- The `LineBuffer` implementation mixed display-width and byte-length + calculations in a way that was difficult to audit. 
+- The whitespace-carry and split-boundary logic was tightly coupled to + individual token types, making it hard to extend without introducing + regressions. +- Prefix width was computed twice (once for the first line, once for + continuation lines) using subtly different code paths, risking drift between + the two calculations. +- The approach did not use any battle-tested line-breaking library; every edge + case had to be handled bespoke. + +## Decision + +The inline line-fitting step now delegates to +`textwrap::wrap_algorithms::wrap_first_fit`, accepting pre-grouped +`InlineFragment` values that implement `textwrap::core::Fragment`. + +Key design choices: + +1. **Fragment model** — tokens are grouped into `InlineFragment` values before + wrapping. Each fragment stores its rendered text, precomputed + display-column width (`UnicodeWidthStr::width`), and a `FragmentKind` + discriminant (`Whitespace`, `InlineCode`, `Link`, `Plain`). This keeps + Markdown-aware grouping under repository control while delegating the + actual line-fitting arithmetic to `textwrap`. + +2. **Greedy algorithm** — `wrap_first_fit` was chosen over the optimal-fit + algorithm because Markdown wrapping must produce deterministic, line-by-line + output that matches the existing tests. Optimal fit would require look-ahead + that changes the wrapping of earlier lines based on later content. + +3. **Post-processing passes** — two passes normalise the raw fit output: + `merge_whitespace_only_lines` absorbs whitespace-only separator lines back + into adjacent content lines, and `rebalance_atomic_tails` moves trailing + atomic or plain fragments to the following line when the destination line + can accommodate them within the target width. Both passes are + width-constrained so they cannot create lines that `wrap_first_fit` would + have rejected. + +4. 
**Unified prefix helper** — `ParagraphWriter::wrap_with_prefix` computes + available content width once from the display width of the prefix string, + then emits first-line and continuation-line prefixes from the same code + path. `append_wrapped_with_prefix` and `push_wrapped_segment` both delegate + to this helper. + +5. **Public API stability** — `wrap_text`, `Token`, and `tokenize_markdown` + remain unchanged. The `tokenize_markdown` public API is explicitly out of + scope for this change because it is used by `src/code_emphasis.rs`, + `src/footnotes/mod.rs`, `src/footnotes/renumber.rs`, and `src/textproc.rs`. + +6. **Dead code removal** — `src/wrap/line_buffer.rs` is deleted because it is + no longer reachable from the active wrap path after the fragment-based + implementation is complete. + +## Consequences + +- Line-fitting arithmetic for inline text is handled by `textwrap`, a + well-tested dependency, rather than a custom accumulation loop. +- Display-width measurements are centralised through `unicode-width` and passed + to `textwrap` via the `Fragment` trait, eliminating the earlier mixed + byte-length / display-width inconsistency. +- The `textwrap` crate (v0.16.2) is added as a dependency. It transitively + introduces `smawk`, `unicode-linebreak`, and a newer `unicode-width` (v0.2) + that coexists with the direct `unicode-width` v0.1 dependency. +- The post-processing passes add complexity that did not exist in the + `LineBuffer` approach. That complexity is warranted because it is narrowly + scoped, independently testable, and avoids re-implementing the full greedy + algorithm. +- All existing active wrap tests continue to pass, confirming observable + behaviour is preserved. 
diff --git a/docs/architecture.md b/docs/architecture.md index a25c6cf..daca369 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -36,8 +36,9 @@ The function combines several helpers documented in `docs/`: - `html::convert_html_tables` transforms basic HTML tables into Markdown so \ they can be reflowed like regular tables. See \ [HTML table support](#html-table-support-in-mdtablefix). -- `wrap::wrap_text` applies optional line wrapping. It relies on the - `unicode-width` crate for accurate character widths. +- `wrap::wrap_text` applies optional line wrapping. It classifies Markdown + block structure locally and delegates greedy line fitting to the `textwrap` + crate over Markdown-aware fragments measured with `unicode-width`. - `wrap::tokenize_markdown` emits `Token` values for custom processing. - `headings::convert_setext_headings` rewrites Setext headings with underline markers into ATX headings when the CLI `--headings` flag is provided. The @@ -374,35 +375,113 @@ module handles filesystem operations, delegating the text processing to ### Tokenizer flow -The inline tokenizer iterates over the source string lazily, so no duplicate -`Vec` representation is required. The following diagram summarizes the -control flow, highlighting the helpers touched during whitespace, code span, -and link handling. +The inline tokenizer still iterates over the source string lazily, so no +duplicate `Vec` representation is required. The resulting tokens are then +grouped into Markdown-aware fragments and passed to +`textwrap::wrap_algorithms::wrap_first_fit`, which chooses the breakpoints +without splitting code spans, links, or punctuation groups. 
```mermaid flowchart TD - A["Input text (&str)"] --> B["Initialize tokens Vec"] - B --> C["Iterate over text by byte index"] - C --> D{"Current char is whitespace?"} - D -- Yes --> E["scan_while for whitespace"] - E --> F["collect_range and push token"] - D -- No --> G{"Current char is '`'?"} - G -- Yes --> H["Check backslash escape (has_odd_backslash_escape_bytes)"] - H -- Escaped --> I["Push '`' as token"] - H -- Not escaped --> J["scan_while for code fence"] - J --> K["Find closing fence, collect_range and push token"] - G -- No --> L{"Current char is '[' or '!['?"} - L -- Yes --> M["parse_link_or_image"] - M --> N["Push link/image token"] - N --> O["scan_while for trailing punctuation"] - O --> P["collect_range and push punctuation token"] - L -- No --> Q["scan_while for non-whitespace/non-` chars"] - Q --> R["collect_range and push token"] - F & I & K & P & R --> S["Continue iteration"] - S --> C - C -->|End| T["Return tokens Vec"] + A["Input text (&str)"] --> B["Tokenize into whitespace and inline Markdown tokens"] + B --> C["Group tokens into Markdown-aware fragments"] + C --> D["Measure fragment widths with unicode-width"] + D --> E["Run textwrap wrap_first_fit over current fragments"] + E --> F["Merge whitespace-only continuation lines forward"] + F --> G["Render wrapped lines, trimming only a single trailing separator space"] ``` +Figure: Wrap-tokenizer flow. Starting from an input string, the wrapper emits +whitespace and inline Markdown tokens, groups them into fragments, measures +their display widths with `unicode-width`, feeds them through +`textwrap::wrap_algorithms::wrap_first_fit`, and then reconstructs wrapped +lines while preserving Markdown-aware spacing rules. + +### Wrap flow + +The higher-level `wrap_text` entry point combines block classification, +paragraph buffering, prefix-aware wrapping, and inline line fitting. 
The +following flow shows how a line moves through those stages before it is either +preserved verbatim or emitted as wrapped output. + +```mermaid +flowchart TD + A[Start: wrap_text called with lines and width] --> B{Classify line} + + B -->|Fenced or indented code block| C[Preserve line verbatim] + B -->|Table or heading or directive| C + B -->|Blank line| D[Flush active paragraph and emit blank] + B -->|Paragraph or prefixed line| E[Send to ParagraphWriter] + + E --> F{Has prefix such as bullet, blockquote, footnote} + F -->|Yes| G[wrap_with_prefix computes display width using unicode-width] + F -->|No| H[wrap_preserving_code wraps inline content] + + G --> I[fragment-building / post-process helpers] + H --> I + + I --> J[textwrap::wrap_algorithms::wrap_first_fit performs line breaking] + J --> K[Reconstruct wrapped lines with prefixes and preserved spans] + K --> L[Emit wrapped lines to wrap_text] + + C --> M[Append line to output] + D --> M + L --> M + + M --> N{More input lines?} + N -->|Yes| B + N -->|No| O[Flush remaining paragraph and finish] +``` + +Figure: `wrap_text` control flow. The wrapper classifies each incoming line, +passes fenced blocks, tables, headings, directives, and indented code through +unchanged, flushes paragraphs on blanks, routes prose and prefixed lines +through `ParagraphWriter`, computes visible widths with `unicode-width`, and +delegates inline line fitting to `textwrap` before reconstructing the emitted +Markdown lines. + +### Wrap sequence + +The following sequence diagram focuses on the runtime collaboration between the +CLI entry point, `wrap_text`, `ParagraphWriter`, the inline wrapper, and +`textwrap` while a paragraph is being processed. 
+ +```mermaid +sequenceDiagram + participant CLI as mdtablefix_CLI + participant WT as wrap_text + participant PW as ParagraphWriter + participant WP as wrap_preserving_code + participant IH as inline.rs_helpers + participant TW as textwrap::wrap_first_fit + + CLI->>WT: wrap_text(lines, width) + loop For each classified paragraph line + WT->>PW: handle_prefix_line / flush_paragraph + alt Prefixed or plain paragraph content + PW->>WP: wrap_preserving_code(text, width) + WP->>IH: build_fragments + merge/rebalance + IH->>TW: wrap_first_fit(fragments, line_widths) + TW-->>IH: wrapped_fragment_groups + IH-->>WP: wrapped_lines_with_spans + WP-->>PW: wrapped_lines_with_prefixes + PW-->>WT: wrapped_lines + WT-->>CLI: append wrapped output + else Nonwrappable line + PW-->>WT: push_verbatim / original_line + WT-->>CLI: append original output + end + end + WT-->>CLI: return final wrapped text +``` + +Figure: `wrap_text` sequence flow. The CLI calls `wrap_text`, which delegates +paragraph handling to `ParagraphWriter`; wrappable paragraph content then flows +through `wrap_preserving_code`, the fragment-building and post-processing +helpers in `src/wrap/inline.rs`, and the underlying `textwrap` engine before +wrapped lines return through the same stack to the CLI, while nonwrappable +lines bypass the inline wrapping path and are emitted unchanged. + The helper `html_table_to_markdown` is retained for backward compatibility but is deprecated. New code should call `convert_html_tables` instead. @@ -444,8 +523,9 @@ sequenceDiagram `mdtablefix` wraps paragraphs and list items while respecting the display width of Unicode characters. The `unicode-width` crate is used to compute the width -of strings when deciding where to break lines. This prevents emojis or other -multibyte characters from causing unexpected wraps or truncation. +of prefixes and Markdown-aware wrapping fragments before `textwrap` performs +line fitting. 
This prevents emojis or other multibyte characters from causing +unexpected wraps or truncation. Whenever wrapping logic examines the length of a token, it relies on `UnicodeWidthStr::width` to measure visible columns rather than byte length. diff --git a/docs/developers-guide.md b/docs/developers-guide.md index d4ebe80..51adfec 100644 --- a/docs/developers-guide.md +++ b/docs/developers-guide.md @@ -82,9 +82,84 @@ restores the separator row with widths derived from the final table body. - `format_separator_cells`: Expands separator cells to the target widths while preserving Markdown alignment markers. +## Wrap module architecture + +The `--wrap` pipeline is spread across four modules under `src/wrap/`: + +- `src/wrap.rs` — entry point; classifies each incoming line (fenced code, + table, heading, blank, or paragraph) and routes it to `ParagraphWriter`. +- `src/wrap/paragraph.rs` — `ParagraphWriter` and `ParagraphState`; buffers + paragraph lines and applies prefix-aware wrapping via `wrap_with_prefix`. +- `src/wrap/inline.rs` — `wrap_preserving_code`; converts a paragraph string + into `InlineFragment` values, feeds them to `textwrap::wrap_first_fit`, and + post-processes the result. +- `src/wrap/inline/postprocess.rs` — `merge_whitespace_only_lines` and + `rebalance_atomic_tails`; normalises the raw fit output before rendering. + +### Key types + +`InlineFragment` (in `src/wrap/inline.rs`) implements +`textwrap::core::Fragment`. Each fragment stores the rendered text, its +precomputed display-column width, and a `FragmentKind` discriminant: + +| `FragmentKind` | Description | +| :-- | :-- | +| `Whitespace` | Fragment composed entirely of whitespace characters. | +| `InlineCode` | Backtick-delimited code span, optionally with trailing punctuation. | +| `Link` | Markdown inline link or image reference. | +| `Plain` | Ordinary prose text. | + +`PrefixLine` (in `src/wrap/paragraph.rs`) carries a prefix string (e.g. 
`"> "` +for blockquotes, `"- "` for bullets), the content after the prefix, and a +`repeat_prefix` flag that controls whether continuation lines repeat the prefix +verbatim (blockquotes) or use a space-padded equivalent (bullets, footnotes, +checkboxes). + +### Width accounting + +All display-width measurements use `UnicodeWidthStr::width` from the +`unicode-width` crate. The `textwrap` call receives widths expressed as `f64` +values through `InlineFragment::width()` so that `wrap_first_fit` never +measures the byte length of a fragment. + +Prefix width is computed once in `wrap_with_prefix` before calling +`wrap_preserving_code`, ensuring that available content width is derived from +the rendered prefix width rather than its byte count. + +### Post-processing passes + +After `wrap_first_fit` assigns fragments to lines, two passes correct edge +cases that greedy fitting cannot anticipate: + +1. `merge_whitespace_only_lines` — absorbs whitespace-only separator lines back + into adjacent content lines so that rendered output does not gain spurious + blank entries. +2. `rebalance_atomic_tails` — moves a trailing atomic or plain fragment from + one line to the start of the next when the destination line can accommodate + it within `width`, preventing orphaned code spans or punctuation. + +Both passes are width-constrained: `rebalance_atomic_tails` rechecks the +destination line width before moving a fragment, so it cannot create a line +that `wrap_first_fit` would have rejected. + +### Public API stability + +`wrap_text`, `Token`, and `tokenize_markdown` are re-exported from `src/lib.rs` +and are used by `src/code_emphasis.rs`, `src/footnotes/mod.rs`, +`src/footnotes/renumber.rs`, and `src/textproc.rs`. These symbols must remain +stable across internal refactoring. Any change to their signatures or +observable behaviour requires an explicit approval step and updated tests. 
+ +### Design decisions + +The rationale for adopting `textwrap` and the fragment-adapter approach is +recorded in `docs/adrs/0002-textwrap-inline-wrapping.md`. Refer to that ADR +when evaluating changes to the fragment model, post-processing passes, or the +`textwrap` dependency. + ## Design decisions The rationale for the staged table reflow pipeline is recorded in `docs/adrs/0001-table-reflow-pipeline.md`. Refer to that ADR when changing the parse, width-calculation, or separator-handling flow so implementation changes -stay aligned with the documented design constraints. +stay aligned with the documented design constraints. \ No newline at end of file diff --git a/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md b/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md new file mode 100644 index 0000000..ba25f01 --- /dev/null +++ b/docs/execplans/replace-bespoke-wrapping-with-textwrap-and-unicode-width.md @@ -0,0 +1,468 @@ +# Replace bespoke wrapping with `textwrap` and `unicode-width` + +This ExecPlan (execution plan) is a living document. The sections +`Constraints`, `Tolerances`, `Risks`, `Progress`, `Surprises & Discoveries`, +`Decision Log`, and `Outcomes & Retrospective` must be kept up to date as work +proceeds. + +Status: COMPLETED + +## Purpose / big picture + +After this change, `mdtablefix --wrap` must keep the same observable wrapping +behaviour that users already rely on for paragraphs, list items, blockquotes, +footnotes, hard line breaks, and fenced or indented code blocks, but the +line-breaking engine should be driven by the `textwrap` crate instead of the +current bespoke span buffer in `src/wrap/inline.rs` and the duplicated prefix +logic in `src/wrap/paragraph.rs`. + +Observable success means three things. First, the existing active wrap-related +tests still pass unchanged. 
Second, the implementation no longer depends on the +custom `LineBuffer`-driven wrapping loop or the duplicated +`append_wrapped_with_prefix` and `handle_prefix_line` split. Third, the +documentation explains that wrapping now delegates width-sensitive line +breaking to `textwrap` together with `unicode-width`. + +This issue is a maintainability refactor, not a user-facing bug fix. The plan +therefore prioritises behaviour preservation over aggressive deletion. If a +proposed simplification would change a public API or visible wrapping result, +the plan prefers a smaller, safer first delivery and a follow-up issue. + +## Constraints + +- Keep public CLI flags and the public `mdtablefix::wrap::wrap_text` entry + point stable. +- Treat `mdtablefix::wrap::Token` and + `mdtablefix::wrap::tokenize_markdown` as stable unless the user explicitly + approves a public API change. They are re-exported from `src/lib.rs` and are + used by `src/code_emphasis.rs`, `src/footnotes/mod.rs`, + `src/footnotes/renumber.rs`, and `src/textproc.rs`. +- Preserve the current observable behaviour covered by active tests in + `src/wrap/tests.rs`, `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, + `tests/wrap_renumber.rs`, `tests/cli.rs`, `tests/cli_frontmatter.rs`, + `tests/code_emphasis.rs`, and `tests/markdownlint.rs`. +- Do not rely on `tests/wrap/*.rs` for safety. Those files are currently + orphaned from Cargo test discovery and do not run unless they are promoted + into an active top-level target. +- Keep Unicode display-width handling based on `unicode-width`. Any helper that + computes indentation or available columns must use display width rather than + byte length. +- Add the `textwrap` dependency to `Cargo.toml` using implicit semver (no + leading `^`) that is compatible with the repository's Rust `1.89` minimum + version and existing 2024 edition build. +- Keep touched source files below the repository's 400-line soft ceiling. 
If + the refactor would make a file exceed that limit, split the code into a small + new module instead of extending the file. +- Update documentation that currently describes the bespoke wrapping engine: + at minimum `README.md`, `docs/architecture.md`, and `docs/trailing-spaces.md` + if the named helper or explanation changes. + +## Tolerances (exception triggers) + +- Scope: if the implementation requires changes to more than 12 files or + roughly 500 net lines of code after cleanup, stop and re-evaluate whether the + work has grown beyond a maintainability refactor. +- Interfaces: if removing bespoke wrapping would require changing the public + `Token` type, `tokenize_markdown`, or the signature of `wrap_text`, stop and + escalate rather than smuggling in an API break. +- Behaviour: if a naive `textwrap` prototype fails more than three existing + active wrap regressions that cannot be fixed with a small shielding layer, + stop and document the cases before proceeding. +- Dependencies: if the chosen `textwrap` release requires a Rust version newer + than `1.89`, stop and choose a compatible release or ask for approval to + raise the minimum supported version. +- Complexity: if preserving inline code spans and Markdown links appears to + require rebuilding a custom tokenizer and buffer that is materially similar + to the current one, stop and reframe the work as a narrower prefix-handling + simplification plus a separate design review for the inline engine. +- Validation: if `make lint` or `make test` still fail after three focused fix + cycles, stop and capture the failing cases in `Decision Log`. + +## Risks + +- Risk: plain `textwrap::wrap` is likely to split Markdown inline code spans or + links that contain spaces, which would regress tests such as + `wrap_text_preserves_code_spans`, `wrap_text_multiple_code_spans`, and + `wrap_text_preserves_links`. Severity: high Likelihood: high Mitigation: + begin with a red test baseline and a deliberate prototype. 
If the prototype +fails, add a minimal shielding layer for inline code spans and links before +invoking `textwrap`, rather than reviving the full custom `LineBuffer` loop. + +- Risk: the issue text suggests removing `tokenize_markdown`, but that helper + is public and is used outside wrapping. Severity: high Likelihood: high + Mitigation: treat public tokenization as out of scope for the first delivery. + Remove only wrapping-specific internals that become dead after the refactor. + Record any remaining token API cleanup as a follow-up issue. + +- Risk: `src/wrap/paragraph.rs` currently mixes byte-length and display-width + calculations. A rushed refactor could accidentally preserve that bug or + introduce a different indentation drift for bullets, blockquotes, or + footnotes. Severity: medium Likelihood: medium Mitigation: centralise prefix + wrapping in one helper that computes prefix and indent widths once with + `UnicodeWidthStr::width`, then add focused tests for repeated prefixes and + continuation indentation. + +- Risk: the repository contains many wrap-focused tests under `tests/wrap/`, + but they are not active Cargo targets. Severity: medium Likelihood: high + Mitigation: do not assume they provide safety. Promote any critical cases + needed for the refactor into `src/wrap/tests.rs`, `tests/wrap_unit.rs`, or + another active top-level `tests/*.rs` target before changing the engine. + +- Risk: the architecture docs currently describe the bespoke tokenizer flow and + module relationships in detail. Severity: low Likelihood: high Mitigation: + update the docs as part of the same change so the repository does not + advertise an implementation that no longer exists. + +## Progress + +- [x] (2026-04-22 00:00Z) Reviewed the current wrapping pipeline in + `src/wrap.rs`, `src/wrap/inline.rs`, `src/wrap/paragraph.rs`, and + `src/wrap/tokenize/`. 
+- [x] (2026-04-22 00:00Z) Confirmed that `unicode-width` is already present in + `Cargo.toml` and that `textwrap` is not yet a dependency. +- [x] (2026-04-22 00:00Z) Confirmed that `tokenize_markdown` is public and is + used by footnotes, text processing, and code-emphasis logic outside the wrap + engine. +- [x] (2026-04-22 00:00Z) Confirmed that `tests/wrap/*.rs` are orphaned from + Cargo integration discovery and must not be relied upon as active coverage. +- [x] (2026-04-22 00:00Z) Added active regressions for nested blockquote + prefix repetition, footnote continuation alignment, and checkbox indentation + in `src/wrap/tests.rs`. +- [x] (2026-04-22 00:00Z) Introduced `textwrap` and routed inline line fitting + through `textwrap::wrap_algorithms::wrap_first_fit`. +- [x] (2026-04-22 00:00Z) Replaced the bespoke line-buffer loop with + fragment-based wrapping and consolidated prefix handling in + `src/wrap/paragraph.rs`. +- [x] (2026-04-22 00:00Z) Removed the dead `src/wrap/line_buffer.rs` module + from the active wrap path. +- [x] (2026-04-22 00:00Z) Updated the affected documentation and passed + `make fmt`, `make check-fmt`, `make lint`, `make test`, `make markdownlint`, + and `make nixie`. +- [x] (2026-04-23 00:00Z) Tightened `rebalance_atomic_tails` so post-wrap + fragment moves no longer create overflow lines, reused the trailing buffer in + `wrap_preserving_code` instead of cloning the full prefix per fragment, and + added an active regression for the `a four five` / width `6` case. + +## Surprises & Discoveries + +- Observation: the user-facing issue text names `tokenize_markdown` as a + candidate for removal, but in this repository it is not just an internal wrap + helper. It is re-exported publicly from `src/lib.rs` and also feeds + `src/textproc.rs`, `src/footnotes/mod.rs`, `src/footnotes/renumber.rs`, and + `src/code_emphasis.rs`. 
Evidence: `src/lib.rs`, `src/textproc.rs`, + `src/footnotes/mod.rs`, `src/footnotes/renumber.rs`, and + `src/code_emphasis.rs`. Impact: the first implementation milestone should + replace the bespoke wrap engine without promising removal of the public token + API. + +- Observation: `tests/wrap/mod.rs` and the other files under `tests/wrap/` are + not active integration tests. Evidence: Cargo only discovers top-level + `tests/*.rs` files, and the active wrap targets are instead + `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, and the library tests in + `src/wrap/tests.rs`. Impact: any critical regression that currently exists + only in the nested directory must be promoted to an active harness before the + refactor. + +- Observation: `docs/architecture.md` explicitly documents the bespoke + tokenizer flow and module relationships, and `docs/trailing-spaces.md` + mentions `wrap_preserving_code` by name. Evidence: `docs/architecture.md` + sections "Module relationships", "Tokenizer flow", and "Unicode width + handling", plus `docs/trailing-spaces.md`. Impact: documentation updates are + part of the required work, not optional cleanup. + +- Observation: `textwrap`'s low-level `wrap_first_fit` API works better for + this migration than `textwrap::wrap` because it accepts pre-grouped fragments + with caller-defined widths. Evidence: `textwrap::core::Fragment` and + `textwrap::wrap_algorithms::wrap_first_fit`. Impact: the repository can keep + its existing Markdown-aware token grouping and whitespace preservation + without reviving the old mutable `LineBuffer`. + +- Observation: the post-processing pass that rebalances trailing atomic or + plain fragments can invalidate `wrap_first_fit`'s width guarantee if it moves + a fragment without rechecking the destination line width. Evidence: + `rebalance_atomic_tails` on 2026-04-23 could turn `a four` / `five` into `a` + / `four five` at width `6`. 
Impact: any heuristic that mutates fitted lines + after wrapping must be width-aware or it can regress downstream layout + assumptions. + +## Decision Log + +- Decision: scope the first delivery to replacing the bespoke wrapping engine, + not the public token API. Rationale: `tokenize_markdown` is public and has + non-wrap consumers. Keeping that boundary stable preserves safety and lets + this issue land as the low-risk refactor it was labelled to be. Date/Author: + 2026-04-22 / Codex + +- Decision: use `textwrap` only for line-breaking and keep the existing + high-level block classification in `wrap_text`. Rationale: fences, indented + code blocks, headings, tables, and hard line breaks are repository-specific + policy decisions that already live in `src/wrap.rs` and do not need to move + into a third-party crate. Date/Author: 2026-04-22 / Codex + +- Decision: prove behaviour with active tests before deleting old internals. + Rationale: the repository contains inactive wrap tests, so moving directly to + clean up would create a false sense of safety. Date/Author: 2026-04-22 / Codex + +- Decision: use `textwrap::wrap_algorithms::wrap_first_fit` with custom + Markdown-aware fragments instead of `textwrap::wrap`. Rationale: + `wrap_first_fit` delegates line fitting to `textwrap` while allowing + `mdtablefix` to preserve leading carry whitespace, atomic code spans, and + display-width measurements on grouped fragments. That keeps the refactor + small without losing the behaviours guarded by the current tests. + Date/Author: 2026-04-22 / Codex + +- Decision: treat post-wrap fragment rebalancing as width-constrained, and keep + fragment classification explicit on `InlineFragment`. Rationale: the + migration's shielding layer is still part of the line-fitting contract, so it + must not create lines that `wrap_first_fit` would have rejected. 
Recording + fragment kind once also makes the whitespace-carry and tail-rebalance passes + easier to audit than repeating ad hoc string classification in each branch. + Date/Author: 2026-04-23 / Codex + +## Outcomes & Retrospective + +The implementation kept `tokenize_markdown` and the existing block +classification intact, but replaced the bespoke `LineBuffer` loop with a +fragment-based adapter over `textwrap::wrap_algorithms::wrap_first_fit`. Prefix +handling now flows through one helper in `src/wrap/paragraph.rs`, and active +regressions cover nested blockquotes, footnote continuation alignment, and +checkbox indentation. + +The key lesson from the migration is that `textwrap` alone is not a drop-in +replacement for the repository's historic whitespace semantics. Preserving the +old behaviour required a thin adaptation layer that merges whitespace-only +overflow lines forward and sometimes rebalances the trailing fragment when the +correct break only becomes obvious after the following separator arrives. + +That adaptation layer still needs the same width discipline as the underlying +wrapper. The 2026-04-23 follow-up fix confirmed that a seemingly harmless +tail-rebalancing heuristic can reintroduce overflow unless it rechecks the +destination line's display width before moving a fragment. + +## Context and orientation + +The top-level wrapping entry point is `wrap_text(lines, width)` in +`src/wrap.rs`. It streams input line by line, preserving fenced code blocks, +indented code blocks, tables, headings, Markdownlint directives, and blank +lines. Only ordinary paragraph text and prefixed lines such as bullets, +blockquotes, and footnotes reach the wrapping helpers. + +Prefix-aware wrapping currently lives in `src/wrap/paragraph.rs`. +`ParagraphWriter::handle_prefix_line` flushes any active paragraph and +delegates to `append_wrapped_with_prefix`. 
That helper computes the available +width for the content after the prefix, wraps the content, and then prefixes +the first and continuation lines manually. This is the best place to +consolidate the prefix logic into one helper such as `wrap_with_prefix`. + +Inline line-breaking currently lives in `src/wrap/inline.rs`. +`wrap_preserving_code` tokenizes the paragraph into inline fragments, groups +them into spans, and then walks them through a custom `LineBuffer`. The custom +logic exists to preserve inline code spans, links, adjacent punctuation, +consecutive whitespace, and trailing spaces. This is the core complexity that +the refactor is meant to remove. + +The `src/wrap/tokenize/` directory serves two separate roles today. One role is +the inline segmentation used only by `wrap_preserving_code`. The other role is +the public `tokenize_markdown` API used by other modules. These roles must be +treated separately during implementation; deleting the wrapping-specific +segmenter does not automatically justify removing the public token API. + +Active tests that currently exercise wrapping behaviour live in +`src/wrap/tests.rs`, `tests/wrap_unit.rs`, `tests/wrap_cli.rs`, +`tests/wrap_renumber.rs`, and wrap-related cases inside `tests/cli.rs`, +`tests/cli_frontmatter.rs`, `tests/code_emphasis.rs`, and +`tests/markdownlint.rs`. The nested `tests/wrap/*.rs` tree is useful as a +source of examples, but not as active protection until its cases are promoted +to top-level targets. + +The documentation that will need review lives in `README.md`, +`docs/architecture.md`, and `docs/trailing-spaces.md`. `docs/architecture.md` +currently includes a tokenizer-flow diagram that will become misleading once +the bespoke span buffer is removed. + +## Plan of work + +Stage A is a baseline and red-test stage. Start by identifying which current +behaviours the refactor must preserve. 
The minimum list is: inline code spans +remain intact, Markdown links remain intact, hyphenated words are not split, +trailing punctuation stays attached to code spans or links, trailing spaces on +the final wrapped line survive, bullets and footnotes indent continuation lines +correctly, blockquotes repeat the quote prefix, and fenced or indented code +blocks remain byte-identical. Promote any missing critical cases from the +inactive `tests/wrap/*.rs` tree into active harnesses before changing the wrap +engine. Prefer `src/wrap/tests.rs` for unit-level behaviour and +`tests/wrap_cli.rs` or `tests/wrap_unit.rs` for end-to-end regressions. + +Stage B is a prototype that proves whether `textwrap` can replace the custom +buffer without hidden regressions. Add `textwrap` to `Cargo.toml` and create a +temporary adapter in `src/wrap/inline.rs` that keeps the current +`wrap_preserving_code` call site stable while routing line breaking through +`textwrap::wrap` plus explicit `Options`. Start with `break_words(false)` and +Unicode-aware word separation. Run only the active wrap tests added in Stage A. +If the naive adapter breaks code spans or links, do not continue deleting old +logic. Instead, implement a small shielding layer that marks inline code spans, +links, and images as atomic fragments before calling `textwrap`, then restore +them after wrapping. The key goal of this stage is to prove that the new engine +can satisfy current tests without recreating the whole old buffer. + +Stage C simplifies prefix handling. Replace the current +`append_wrapped_with_prefix` plus `handle_prefix_line` split with a single +helper in `src/wrap/paragraph.rs` that receives a prefix, the remaining text, +the available width, and whether the prefix repeats on continuation lines. +Compute the display width of the prefix once with `UnicodeWidthStr::width`, +compute the wrapped continuation prefix once, and reuse that value for all +continuation lines. 
Apply the same display-width approach to ordinary paragraph +indentation so the module no longer mixes byte length and visible column width. + +Stage D removes dead internals. After the `textwrap`-backed path passes the +full active test suite, delete any no-longer-used custom buffering helpers such +as `src/wrap/line_buffer.rs` and any wrapping-only inline tokenization helpers +that are left unused. Do not remove `tokenize_markdown`, `Token`, or the +modules that still serve non-wrap consumers unless the user explicitly expands +the scope and the public API question is resolved. + +Stage E updates the docs. Rewrite the wrapping description in `README.md` only +where user-visible semantics changed or need clarification. In +`docs/architecture.md`, replace the bespoke tokenizer-flow explanation with a +short description of the new split: repository-owned block classification and +prefix handling on one side, `textwrap` plus `unicode-width` for line breaking +on the other. Update `docs/trailing-spaces.md` if `wrap_preserving_code` +changes name, visibility, or implementation details. + +Stage F is final validation and cleanup. Run formatting, lint, tests, and the +documentation gates through `tee` so the logs remain inspectable in this +environment. Review the diff to confirm that the result is a real +simplification and not a hidden rewrite of the bespoke engine under new names. + +## Concrete steps + +Work from the repository root: + +```bash +pwd +``` + +Expected: + +```plaintext +/home/user/project +``` + +List the currently active wrap-related tests before editing so the red/green +cycle is grounded in real targets: + +```bash +cargo test -- --list | rg "wrap|tokenize_markdown|frontmatter|markdownlint" +``` + +Expected: + +```plaintext +...wrap_text_preserves_code_spans +...wrap_text_multiple_code_spans +...cli_wrap_in_place_preserves_shell_block_verbatim +... 
+``` + +After adding any missing active regressions, prove they fail against a naive +`textwrap` prototype before investing in cleanup. For example, run the focused +unit and CLI suites that exercise the wrapping path: + +```bash +cargo test wrap_text_preserves_code_spans --lib +cargo test wrap_text_multiple_code_spans --lib +cargo test --test wrap_unit +cargo test --test wrap_cli +``` + +Expected: + +```plaintext +running ... +test ... ok +``` + +Once the prototype is passing, run the wider wrap-adjacent suites that can +catch unintended interactions: + +```bash +cargo test --test wrap_renumber +cargo test --test cli +cargo test --test cli_frontmatter +cargo test --test code_emphasis +cargo test --test markdownlint +``` + +Manually verify an end-to-end wrap example that covers a blockquote, inline +code, and a Markdown link: + +```bash +printf '%s\n' \ + '> A deliberately long blockquote line that mentions' \ + '`cargo test --test wrap_cli` and the' \ + '[project README](https://example.invalid/readme) so the wrap' \ + 'engine must preserve both atomic spans while continuing the quote' \ + 'prefix correctly.' \ + | cargo run -- --wrap +``` + +Expected: + +```plaintext +> A deliberately long blockquote line that mentions +> `cargo test --test wrap_cli` and the +> [project README](https://example.invalid/readme) so the wrap +> engine must preserve both atomic spans while continuing the quote +> prefix correctly. +``` + +Finish with the full repository quality gates. 
Use `tee` so the logs survive +output truncation and can be inspected afterward: + +```bash +set -o pipefail +make fmt 2>&1 | tee /tmp/issue-80-fmt.log +set -o pipefail +make check-fmt 2>&1 | tee /tmp/issue-80-check-fmt.log +set -o pipefail +make lint 2>&1 | tee /tmp/issue-80-lint.log +set -o pipefail +make test 2>&1 | tee /tmp/issue-80-test.log +set -o pipefail +make markdownlint 2>&1 | tee /tmp/issue-80-markdownlint.log +set -o pipefail +make nixie 2>&1 | tee /tmp/issue-80-nixie.log +``` + +Expected: + +```plaintext +... finished successfully ... +``` + +If any command fails, inspect the tail of the corresponding log before making +another change: + +```bash +tail -n 40 /tmp/issue-80-test.log +tail -n 40 /tmp/issue-80-lint.log +``` + +## Acceptance criteria + +The change is complete when all of the following statements are true: + +1. `wrap_text` still preserves the behaviour covered by the active wrap tests, + including code spans, links, prefixes, fences, hard breaks, and trailing + spaces. +2. `src/wrap/paragraph.rs` uses one display-width-aware helper for prefix + wrapping instead of the current two-step helper split. +3. The bespoke `LineBuffer`-driven wrapping loop is gone from the active wrap + path, and the implementation delegates line breaking to `textwrap`. +4. `tokenize_markdown` either remains intact and documented as out of scope, or + any broader removal is explicitly approved and covered by updated tests and + documentation. +5. `README.md`, `docs/architecture.md`, and any other affected docs describe + the new implementation accurately. +6. `make check-fmt`, `make lint`, `make test`, `make markdownlint`, and + `make nixie` all pass. diff --git a/docs/execplans/yaml-frontmatter.md b/docs/execplans/yaml-frontmatter.md index 91b2a76..2092567 100644 --- a/docs/execplans/yaml-frontmatter.md +++ b/docs/execplans/yaml-frontmatter.md @@ -96,7 +96,7 @@ according to the selected options. - [x] (2026-04-09) Update `README.md` and `docs/architecture.md`. 
- [x] (2026-04-09) Run `make check-fmt`, `make lint`, `make test`, `make markdownlint`, - and `make nixie`. + and `make nixie` if Mermaid content changes. ## Surprises & discoveries @@ -261,7 +261,8 @@ make markdownlint make nixie ``` -`make nixie` is required. +If `docs/architecture.md` does not change any Mermaid content, `make nixie` may +be skipped. ## Validation and acceptance diff --git a/docs/trailing-spaces.md b/docs/trailing-spaces.md index abc7222..b0f096b 100644 --- a/docs/trailing-spaces.md +++ b/docs/trailing-spaces.md @@ -1,11 +1,11 @@ # Trailing spaces -`wrap_preserving_code` keeps trailing spaces on the final line. +The `textwrap`-backed inline wrapper keeps trailing spaces on the final line. Markdown treats two spaces at the end of a line as a hard break. Earlier versions trimmed those spaces during the final flush, turning hard breaks into -soft ones. The final line is now pushed as-is so trailing whitespace survives -wrapping. +soft ones. The current wrapper still renders the final line as-is after +`textwrap` has chosen the line breaks, so trailing whitespace survives wrapping. ## Example diff --git a/docs/users-guide.md b/docs/users-guide.md index 1d78ba5..1905650 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -16,9 +16,47 @@ Literal pipe characters inside cells must be written as `\|`. `mdtablefix` preserves that escaping during reflow, so a literal pipe remains part of the cell content rather than being interpreted as a column boundary. +## Paragraph wrapping + +The `--wrap` flag reflows paragraphs and list items to a target column width +(default 80). The wrap engine uses the `textwrap` crate for greedy line fitting +and `unicode-width` for display-column measurements, so accented characters, +CJK glyphs, and emoji are counted by their rendered width rather than their +byte length. + +### Hyphenation + +Hyphenated words are treated as indivisible during wrapping. 
A compound such as +`very-long-word` moves to the next line intact rather than being split at the +hyphen. + +### Inline code and links + +Inline code spans (`` `like this` ``) and Markdown links (`[text](url)`) are +kept as atomic units and are never broken across lines. Trailing punctuation +such as `.` or `,` that immediately follows a code span stays attached to it on +the same line. + +### Task list indentation + +Checkbox list items (`- [ ]` and `- [x]`) keep their continuation lines aligned +with the text column that follows the checkbox marker, not the leading dash. + +### Fenced code blocks + +Lines inside fenced code blocks (`` ``` `` or `~~~` delimiters) are always +preserved verbatim. The wrap engine does not attempt to reflow indented code +blocks either. + +### Unicode display-width awareness + +All width calculations use the Unicode display width of each character, which +means multi-column characters such as full-width CJK glyphs and emoji are +measured correctly when deciding where to break a line. + ## Ellipsis handling The `--ellipsis` flag replaces `...` inside table cells with the Unicode ellipsis character `…` before the table is reflowed. This ensures column widths are computed from the final emitted glyph rather than from the three-dot source -sequence. +sequence. \ No newline at end of file diff --git a/src/wrap.rs b/src/wrap.rs index 4499eb6..42281f7 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -13,7 +13,6 @@ use std::borrow::Cow; mod block; mod fence; mod inline; -mod line_buffer; mod paragraph; mod tokenize; use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; diff --git a/src/wrap/inline.rs b/src/wrap/inline.rs index 8b8027f..bf5b394 100644 --- a/src/wrap/inline.rs +++ b/src/wrap/inline.rs @@ -4,41 +4,56 @@ //! inline code, links, and trailing punctuation without reimplementing the //! grouping logic in multiple places. 
+mod postprocess;
+
 use std::ops::Range;
 
+use postprocess::{merge_whitespace_only_lines, rebalance_atomic_tails};
+use textwrap::{core::Fragment, wrap_algorithms::wrap_first_fit};
 use unicode_width::UnicodeWidthStr;
 
-use super::{
-    line_buffer::{LineBuffer, SplitContext},
-    tokenize,
-};
+use super::tokenize;
 
+/// Describes the Markdown role of a grouped token span used during
+/// span-coupling decisions.
 #[derive(Copy, Clone, PartialEq, Eq)]
 enum SpanKind {
+    /// Ordinary prose text with no special Markdown role.
     General,
+    /// An inline code span delimited by backticks.
     Code,
+    /// A Markdown inline link or image.
     Link,
 }
 
+/// Returns `true` when `c` is trailing punctuation that should stay attached
+/// to the preceding code span or link rather than breaking to a new line.
 #[inline]
 fn is_trailing_punct(c: char) -> bool {
     // ASCII closers + common Unicode closers and word-final punctuation
     matches!(
         c,
         '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\''
     ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c)
 }
 
+/// Returns `true` when `token` looks like a complete Markdown inline link or
+/// image reference (`[text](url)` or `![alt](url)`).
 fn looks_like_link(token: &str) -> bool {
     (token.starts_with('[') || token.starts_with("!["))
         && token.contains("](")
         && token.ends_with(')')
 }
 
+/// Returns `true` when every character in `token` is a Unicode whitespace
+/// character.
 fn is_whitespace_token(token: &str) -> bool {
     token.chars().all(char::is_whitespace)
 }
 
+/// Returns `true` when `token` is a complete backtick-delimited inline code span.
 fn is_inline_code_token(token: &str) -> bool {
     token.starts_with('`') && token.ends_with('`')
 }
 
+/// Advances `j` past any trailing-punctuation tokens, accumulating widths into
+/// `*width`, and returns the updated index.
fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { *width += UnicodeWidthStr::width(tokens[j].as_str()); @@ -69,6 +84,9 @@ fn should_couple_whitespace(kind: SpanKind, next_token: Option<&String>) -> bool } } +/// Merges a backtick-delimited code span starting at `i` into a single span, +/// accumulating display width and returning the index beyond the span. +/// Unmatched opening backticks extend to the end of the token list. #[inline] fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { debug_assert!( @@ -88,6 +106,17 @@ fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { j } +/// Groups consecutive tokens starting at `start` into a single logical span +/// whose fragments should not be broken across lines. +/// +/// Returns `(end, width)` where `end` is the exclusive index of the last token +/// in the span and `width` is the total display-column width of all tokens +/// from `start` to `end`. +/// +/// Inline code spans (including single-backtick openers matched to a closer), +/// Markdown links, and images are kept atomic. Trailing punctuation is absorbed +/// into the preceding span. Whitespace is absorbed only when +/// `should_couple_whitespace` indicates the surrounding context requires it. pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { let mut end = start + 1; let mut width = UnicodeWidthStr::width(tokens[start].as_str()); @@ -148,6 +177,16 @@ pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, u (end, width) } +/// Appends a single-character punctuation token to the last line in `lines` +/// when that line ends with an inline code span and the current buffer is +/// empty. +/// +/// Returns `true` when the token was consumed, `false` when the caller should +/// continue processing it normally. 
+/// +/// This function is only compiled for tests because the production path now +/// uses fragment-level classification instead of string-level post-processing. +#[cfg(test)] pub(super) fn attach_punctuation_to_previous_line( lines: &mut [String], current: &str, @@ -169,99 +208,192 @@ pub(super) fn attach_punctuation_to_previous_line( false } -fn push_span_with_carry( - buffer: &mut LineBuffer, - tokens: &[String], - span: Range, - carried_whitespace: &mut String, -) { - if span.start >= span.end { - return; +/// Classifies an inline fragment for post-wrap normalisation and +/// `textwrap::core::Fragment` width reporting. +#[derive(Debug, Clone, PartialEq, Eq)] +enum FragmentKind { + /// A fragment composed entirely of whitespace characters. + Whitespace, + /// A backtick-delimited inline code span, optionally followed by trailing + /// punctuation. + InlineCode, + /// A Markdown inline link or image reference. + Link, + /// Ordinary prose text that does not match any of the above categories. + Plain, +} + +/// A Markdown-aware inline fragment that implements +/// `textwrap::core::Fragment`. +/// +/// Each fragment holds the rendered text, its precomputed display-column +/// width, and a [`FragmentKind`] that drives post-wrap normalisation decisions +/// in `merge_whitespace_only_lines` and `rebalance_atomic_tails`. +#[derive(Debug, Clone, PartialEq, Eq)] +struct InlineFragment { + /// The rendered text of this fragment as it will appear in output. + text: String, + /// Display-column width of `text` computed with `unicode-width`. + width: usize, + /// The Markdown role of this fragment. + kind: FragmentKind, +} + +impl InlineFragment { + /// Constructs a new `InlineFragment` from `text`, computing the display + /// width and classifying the kind automatically. 
+ fn new(text: String) -> Self { + let width = UnicodeWidthStr::width(text.as_str()); + let kind = classify_fragment(text.as_str()); + Self { text, width, kind } } - if carried_whitespace.is_empty() { - buffer.push_span(tokens, span.start, span.end); - return; + /// Returns `true` when this fragment is a whitespace-only fragment. + fn is_whitespace(&self) -> bool { self.kind == FragmentKind::Whitespace } + + /// Returns `true` when this fragment is an atomic inline element (inline + /// code span or Markdown link) that must not be split across lines. + fn is_atomic(&self) -> bool { + matches!(self.kind, FragmentKind::InlineCode | FragmentKind::Link) } - let mut first_token = std::mem::take(carried_whitespace); - first_token.push_str(tokens[span.start].as_str()); - buffer.push_token(first_token.as_str()); - if span.start + 1 < span.end { - buffer.push_span(tokens, span.start + 1, span.end); + /// Returns `true` when this fragment is ordinary plain-text prose. + fn is_plain(&self) -> bool { self.kind == FragmentKind::Plain } +} + +impl Fragment for InlineFragment { + /// Display-column width as `f64` for `textwrap::wrap_algorithms`. + fn width(&self) -> f64 { width_as_f64(self.width) } + + /// Zero — whitespace is handled by fragment classification, not `textwrap`. + fn whitespace_width(&self) -> f64 { 0.0 } + + /// Zero — no penalty is applied at any fragment boundary. + fn penalty_width(&self) -> f64 { 0.0 } +} + +/// Converts a `usize` display width to `f64`, clamping at `u32::MAX` to avoid +/// precision loss for pathologically long tokens. +fn width_as_f64(width: usize) -> f64 { f64::from(u32::try_from(width).unwrap_or(u32::MAX)) } + +/// Determines the [`FragmentKind`] for a rendered fragment text. +/// +/// Whitespace-only text maps to `Whitespace`. Text whose non-punctuation core +/// is a backtick-delimited code span maps to `InlineCode`. Text whose core is +/// a Markdown link or image maps to `Link`. Everything else maps to `Plain`. 
+fn classify_fragment(text: &str) -> FragmentKind {
+    if is_whitespace_token(text) {
+        return FragmentKind::Whitespace;
+    }
+    let trimmed = text.trim_end_matches(is_trailing_punct);
+    if is_inline_code_token(trimmed) {
+        FragmentKind::InlineCode
+    } else if looks_like_link(trimmed) {
+        FragmentKind::Link
+    } else {
+        FragmentKind::Plain
     }
 }
 
-pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
-    let tokens = tokenize::segment_inline(text);
-    if tokens.is_empty() {
-        return Vec::new();
+/// Appends tokens in `span` to `text`. When a punctuation token immediately
+/// follows a backtick-terminated code span, trailing whitespace is trimmed from
+/// `text` first so the punctuation attaches directly to the closing backtick.
+fn push_span_text(text: &mut String, tokens: &[String], span: Range<usize>) {
+    for token in &tokens[span] {
+        if token.len() == 1 && ".?!,:;".contains(token) && text.trim_end().ends_with('`') {
+            text.truncate(text.trim_end_matches(char::is_whitespace).len());
+        }
+        text.push_str(token);
     }
+}
 
-    let mut lines = Vec::new();
-    let mut buffer = LineBuffer::new();
-    let mut carried_whitespace = String::new();
+/// Converts a token list into a `Vec<InlineFragment>` by grouping consecutive
+/// tokens into logical spans via `determine_token_span`. Whitespace-only spans
+/// are joined directly; other spans pass through `push_span_text`.
+fn build_fragments(tokens: &[String]) -> Vec<InlineFragment> {
+    let mut fragments: Vec<InlineFragment> = Vec::new();
     let mut i = 0;
     while i < tokens.len() {
-        let (group_end, group_width) = determine_token_span(&tokens, i);
+        let (group_end, _) = determine_token_span(tokens, i);
         let span = i..group_end;
-        let span_is_whitespace = tokens[span.clone()]
+        let text = if tokens[span.clone()]
             .iter()
-            .all(|tok| is_whitespace_token(tok));
+            .all(|token| is_whitespace_token(token))
+        {
+            tokens[span].join("")
+        } else {
+            let mut text = String::new();
+            push_span_text(&mut text, tokens, span);
+            text
+        };
+        fragments.push(InlineFragment::new(text));
+        i = group_end;
+    }
 
-        if span_is_whitespace && !carried_whitespace.is_empty() && group_end != tokens.len() {
-            for tok in &tokens[span.clone()] {
-                carried_whitespace.push_str(tok);
-            }
-            i = group_end;
-            continue;
-        }
+    fragments
+}
 
-        if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) {
-            carried_whitespace.clear();
-            i += 1;
-            continue;
-        }
+/// Renders a line of `InlineFragment` slices into a `String`.
+///
+/// Non-final lines have a single trailing space removed to avoid unintended
+/// hard line breaks (two trailing spaces are preserved). The final line is
+/// always rendered as-is so intentional trailing spaces survive wrapping.
+fn render_line(line: &[InlineFragment], is_final_output_line: bool) -> String {
+    let mut text = line
+        .iter()
+        .map(|fragment| fragment.text.as_str())
+        .collect::<String>();
+
+    if !is_final_output_line && text.ends_with(' ') && !text.ends_with("  ") {
+        text.pop();
+    }
 
-        if buffer.width() + group_width <= width {
-            push_span_with_carry(&mut buffer, &tokens, span.clone(), &mut carried_whitespace);
-            i = group_end;
-            continue;
-        }
+    text
+}
 
-        let mut split = SplitContext {
-            lines: &mut lines,
-            width,
-        };
-        if buffer.split_with_span(&mut split, &tokens, span.clone()) {
-            i = group_end;
-            continue;
-        }
+/// Wraps `text` to at most `width` display columns while preserving inline
+/// code spans, Markdown links, and trailing punctuation as atomic units.
+///
+/// The function tokenises `text` with `tokenize::segment_inline`, groups the
+/// tokens into `InlineFragment` values, and feeds them incrementally to
+/// `textwrap::wrap_algorithms::wrap_first_fit`. After each fragment is added,
+/// the current fit is post-processed by `merge_whitespace_only_lines` and
+/// `rebalance_atomic_tails` to normalise separator lines and balance trailing
+/// fragments. Completed lines are emitted and only the trailing incomplete
+/// line is retained in the buffer for the next iteration.
+///
+/// Returns an empty `Vec` when `text` is empty.
+pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
+    let tokens = tokenize::segment_inline(text);
+    if tokens.is_empty() {
+        return Vec::new();
+    }
 
-        if buffer.flush_trailing_whitespace(&mut lines, &tokens, span.clone()) {
-            i = group_end;
-            continue;
-        }
+    let fragments = build_fragments(&tokens);
+    let mut lines = Vec::new();
+    let mut buffer: Vec<InlineFragment> = Vec::new();
 
-        buffer.flush_into(&mut lines);
-        if span_is_whitespace {
-            for tok in &tokens[span] {
-                carried_whitespace.push_str(tok);
-            }
-            i = group_end;
+    for fragment in fragments {
+        buffer.push(fragment);
+        let wrapped = wrap_first_fit(&buffer, &[width_as_f64(width)]);
+        let raw_lines = wrapped.iter().map(|line| line.to_vec()).collect::<Vec<_>>();
+        let mut grouped_lines = merge_whitespace_only_lines(&raw_lines);
+        rebalance_atomic_tails(&mut grouped_lines, width);
+
+        if grouped_lines.len() == 1 {
             continue;
         }
 
-        push_span_with_carry(&mut buffer, &tokens, i..group_end, &mut carried_whitespace);
-        i = group_end;
+        for line in &grouped_lines[..grouped_lines.len() - 1] {
+            lines.push(render_line(line, false));
+        }
+        buffer = grouped_lines.pop().unwrap_or_default();
     }
 
-    if !carried_whitespace.is_empty() {
-        buffer.push_token(carried_whitespace.as_str());
-        carried_whitespace.clear();
+    if !buffer.is_empty() {
+        lines.push(render_line(&buffer, true));
     }
-    buffer.flush_into(&mut lines);
     lines
 }
diff --git a/src/wrap/inline/postprocess.rs b/src/wrap/inline/postprocess.rs
new file mode 100644
index 0000000..e8505a1
--- /dev/null
+++ b/src/wrap/inline/postprocess.rs
@@ -0,0 +1,344 @@
+//! Post-wrap normalization helpers for inline fragment lines.
+//!
+//! After `textwrap::wrap_algorithms::wrap_first_fit` assigns fragments to
+//! lines, two passes correct edge cases that the greedy algorithm cannot
+//! anticipate:
+//!
+//! 1. `merge_whitespace_only_lines` absorbs separator lines that consist
+//!    entirely of whitespace fragments back into adjacent content lines so
+//!    
that rendered output does not gain spurious blank entries. +//! 2. `rebalance_atomic_tails` moves a trailing atomic or plain fragment from +//! one line to the beginning of the next when doing so keeps both lines +//! within the target width, preventing orphaned punctuation or code spans. + +use super::{FragmentKind, InlineFragment}; + +/// Returns `true` when every fragment on `line` is a whitespace fragment. +fn is_whitespace_only_line(line: &[InlineFragment]) -> bool { + line.iter().all(InlineFragment::is_whitespace) +} + +/// Returns `true` when `line` contains exactly one fragment whose text is a +/// single ASCII space character. +fn is_single_space_line(line: &[InlineFragment]) -> bool { line.len() == 1 && line[0].text == " " } + +/// Returns the total display-column width of all fragments on `line`. +fn line_width(line: &[InlineFragment]) -> usize { line.iter().map(|fragment| fragment.width).sum() } + +/// Returns `true` when the first fragment on `line` is an atomic fragment +/// (inline code span or Markdown link). +fn line_starts_with_atomic(line: &[InlineFragment]) -> bool { + line.first().is_some_and(InlineFragment::is_atomic) +} + +/// Returns `true` when `line` starts with a single-space whitespace fragment +/// followed by at least one plain-text fragment. +/// +/// This pattern identifies lines that are candidates for receiving a +/// rebalanced tail fragment from the previous line. +fn line_starts_with_single_space_then_plain(line: &[InlineFragment]) -> bool { + line.first() + .is_some_and(|fragment| fragment.is_whitespace() && fragment.text == " ") + && line.get(1).is_some_and(InlineFragment::is_plain) +} + +/// Returns `true` when `line` ends with an atomic fragment, or ends with a +/// plain-text fragment that is not the only fragment on the line. +/// +/// Both cases are eligible for tail rebalancing: moving the last fragment to +/// the following line when width permits. 
+fn line_has_rebalanceable_tail(line: &[InlineFragment]) -> bool { + line.last().is_some_and(InlineFragment::is_atomic) + || (line.len() > 1 && line.last().is_some_and(InlineFragment::is_plain)) +} + +/// Merges whitespace-only lines produced by `wrap_first_fit` back into +/// adjacent content lines. +/// +/// `textwrap` may split a separator whitespace fragment onto its own line when +/// it occurs at a break boundary. Such lines would render as spurious blank +/// output entries. This function accumulates pending whitespace and prepends +/// it to the next non-whitespace line, with two special cases: +/// +/// - When a single-space line follows a line whose last fragment is an inline +/// code span and the next line does not start with an atomic fragment, the +/// code span is popped back into the pending buffer so it will flow together +/// with the following content. +/// - When a single-space line follows a line that contains only one fragment, +/// the space is carried forward rather than being dropped, preserving the +/// spacing relationship between the two fragments. +/// +/// Any pending whitespace that remains after all lines have been processed is +/// appended to the last merged line. 
+pub(super) fn merge_whitespace_only_lines(
+    lines: &[Vec<InlineFragment>],
+) -> Vec<Vec<InlineFragment>> {
+    let mut merged: Vec<Vec<InlineFragment>> = Vec::with_capacity(lines.len());
+    let mut pending_whitespace: Vec<InlineFragment> = Vec::new();
+
+    for (index, mut line) in lines.iter().cloned().enumerate() {
+        if is_whitespace_only_line(&line) {
+            let next_starts_atomic = lines
+                .get(index + 1)
+                .is_some_and(|next_line| line_starts_with_atomic(next_line));
+            let line_is_single_space = is_single_space_line(&line);
+            let previous_line_has_single_fragment = merged
+                .last()
+                .is_some_and(|previous_line| previous_line.len() == 1);
+            let mut should_carry_whitespace = !line_is_single_space;
+
+            if line_is_single_space
+                && !next_starts_atomic
+                && let Some(previous_line) = merged.last_mut()
+                && previous_line
+                    .last()
+                    .is_some_and(|fragment| fragment.kind == FragmentKind::InlineCode)
+            {
+                let previous_atomic = previous_line
+                    .pop()
+                    .expect("line with an atomic tail contains that fragment");
+                pending_whitespace.push(previous_atomic);
+                if previous_line.is_empty() {
+                    merged.pop();
+                }
+                should_carry_whitespace = true;
+            }
+
+            if line_is_single_space && previous_line_has_single_fragment {
+                should_carry_whitespace = true;
+            }
+
+            if should_carry_whitespace {
+                pending_whitespace.extend(line);
+            }
+            continue;
+        }
+
+        if pending_whitespace.is_empty() {
+            merged.push(line);
+        } else {
+            pending_whitespace.append(&mut line);
+            merged.push(std::mem::take(&mut pending_whitespace));
+        }
+    }
+
+    if !pending_whitespace.is_empty() {
+        if let Some(last_line) = merged.last_mut() {
+            last_line.append(&mut pending_whitespace);
+        } else {
+            merged.push(pending_whitespace);
+        }
+    }
+
+    merged
+}
+
+/// Moves trailing atomic or plain fragments to the following line when doing
+/// so keeps both lines within `width` display columns.
+///
+/// After `merge_whitespace_only_lines` normalises separator lines, a wrapped
+/// line may still end with an isolated code span or word that would read more
+/// naturally at the start of the next line. This function iterates over
+/// adjacent line pairs and, when the next line starts with a single-space
+/// separator followed by a plain fragment, moves the current line's last
+/// fragment forward if the resulting next-line width remains within `width`.
+///
+/// The width guard ensures that this heuristic never creates a line that
+/// `wrap_first_fit` would itself have rejected.
+pub(super) fn rebalance_atomic_tails(lines: &mut [Vec<InlineFragment>], width: usize) {
+    for index in 0..lines.len().saturating_sub(1) {
+        if !line_starts_with_single_space_then_plain(&lines[index + 1])
+            || !line_has_rebalanceable_tail(&lines[index])
+        {
+            continue;
+        }
+
+        let trailing_width = lines[index]
+            .last()
+            .map(|fragment| fragment.width)
+            .expect("line selected for tail rebalancing contains a trailing fragment");
+        if line_width(&lines[index + 1]) + trailing_width > width {
+            continue;
+        }
+
+        let trailing_fragment = lines[index]
+            .pop()
+            .expect("line selected for tail rebalancing contains a trailing fragment");
+        lines[index + 1].insert(0, trailing_fragment);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::{FragmentKind, InlineFragment};
+    use super::{merge_whitespace_only_lines, rebalance_atomic_tails};
+
+    #[test]
+    fn inline_fragment_new_classifies_whitespace() {
+        let fragment = InlineFragment::new(" ".to_string());
+        assert_eq!(fragment.kind, FragmentKind::Whitespace);
+        assert_eq!(fragment.width, 1);
+        assert!(fragment.is_whitespace());
+        assert!(!fragment.is_atomic());
+        assert!(!fragment.is_plain());
+    }
+
+    #[test]
+    fn inline_fragment_new_classifies_inline_code() {
+        let fragment = InlineFragment::new("`code`".to_string());
+        assert_eq!(fragment.kind, FragmentKind::InlineCode);
+        assert_eq!(fragment.width, 6);
+        assert!(!fragment.is_whitespace());
+        assert!(fragment.is_atomic());
+        assert!(!fragment.is_plain());
+    }
+
+    #[test]
+    fn inline_fragment_new_classifies_link() {
+        let fragment = InlineFragment::new("[text](url)".to_string());
+        
assert_eq!(fragment.kind, FragmentKind::Link); + assert_eq!(fragment.width, 11); + assert!(!fragment.is_whitespace()); + assert!(fragment.is_atomic()); + assert!(!fragment.is_plain()); + } + + #[test] + fn inline_fragment_new_classifies_plain_text() { + let fragment = InlineFragment::new("hello".to_string()); + assert_eq!(fragment.kind, FragmentKind::Plain); + assert_eq!(fragment.width, 5); + assert!(!fragment.is_whitespace()); + assert!(!fragment.is_atomic()); + assert!(fragment.is_plain()); + } + + #[test] + fn inline_fragment_new_classifies_code_with_trailing_punct() { + let fragment = InlineFragment::new("`code`.".to_string()); + assert_eq!(fragment.kind, FragmentKind::InlineCode); + assert!(fragment.is_atomic()); + } + + #[test] + fn inline_fragment_new_classifies_link_with_trailing_punct() { + let fragment = InlineFragment::new("[text](url).".to_string()); + assert_eq!(fragment.kind, FragmentKind::Link); + assert!(fragment.is_atomic()); + } + + #[test] + fn inline_fragment_new_computes_width_from_display_columns() { + let fragment = InlineFragment::new("abc".to_string()); + assert_eq!(fragment.width, 3); + } + + #[test] + fn inline_fragment_new_handles_empty_string() { + let fragment = InlineFragment::new(String::new()); + assert_eq!(fragment.width, 0); + assert_eq!(fragment.kind, FragmentKind::Plain); + } + + fn plain(text: &str) -> InlineFragment { + InlineFragment { text: text.to_string(), width: text.len(), kind: FragmentKind::Plain } + } + + fn ws(text: &str) -> InlineFragment { + InlineFragment { + text: text.to_string(), + width: text.len(), + kind: FragmentKind::Whitespace, + } + } + + fn code(text: &str) -> InlineFragment { + InlineFragment { + text: text.to_string(), + width: text.len(), + kind: FragmentKind::InlineCode, + } + } + + fn link(text: &str) -> InlineFragment { + InlineFragment { text: text.to_string(), width: text.len(), kind: FragmentKind::Link } + } + + #[test] + fn merge_whitespace_only_lines_absorbs_whitespace_line_into_next() { + 
let lines = vec![vec![plain("foo")], vec![ws(" ")], vec![plain("bar")]]; + let merged = merge_whitespace_only_lines(&lines); + assert_eq!(merged.len(), 2); + assert_eq!(merged[0], vec![plain("foo")]); + assert_eq!(merged[1], vec![ws(" "), plain("bar")]); + } + + #[test] + fn merge_whitespace_only_lines_appends_trailing_whitespace_to_last_line() { + let lines = vec![vec![plain("foo")], vec![ws(" ")]]; + let merged = merge_whitespace_only_lines(&lines); + assert_eq!(merged.len(), 1); + assert_eq!(merged[0], vec![plain("foo"), ws(" ")]); + } + + #[test] + fn merge_whitespace_only_lines_pops_inline_code_before_single_space() { + // A single-space whitespace line following a line ending with InlineCode + // and not preceding an atomic fragment should pop the code span back + // into pending so it flows with the next line. + let next_plain = vec![plain("baz")]; + let lines = vec![vec![plain("foo"), code("`x`")], vec![ws(" ")], next_plain.clone()]; + let merged = merge_whitespace_only_lines(&lines); + // The code span and space should be prepended to the next content line. + assert_eq!(merged.len(), 2); + assert_eq!(merged[0], vec![plain("foo")]); + assert!(merged[1].contains(&code("`x`"))); + assert!(merged[1].contains(&plain("baz"))); + } + + #[test] + fn merge_whitespace_only_lines_preserves_non_whitespace_lines_unchanged() { + let lines = vec![vec![plain("hello")], vec![plain("world")]]; + let merged = merge_whitespace_only_lines(&lines); + assert_eq!(merged, lines); + } + + #[test] + fn rebalance_atomic_tails_moves_code_span_to_next_line_when_it_fits() { + // Line 0: ["word", `code`] widths 4 + 6 = 10 + // Line 1: [" ", "next"] widths 1 + 4 = 5 -> after move: 6 + 1 + 4 = 11 > 12? 
No: 6+5=11 <= 12 + let mut lines = vec![ + vec![plain("word"), code("`code`")], + vec![ws(" "), plain("next")], + ]; + rebalance_atomic_tails(&mut lines, 12); + // `code` should move to line 1 because 6 + 1 + 4 = 11 <= 12 + assert_eq!(lines[0], vec![plain("word")]); + assert_eq!(lines[1][0], code("`code`")); + } + + #[test] + fn rebalance_atomic_tails_does_not_move_when_overflow_would_result() { + // Line 0: ["word", `code`] total widths for moved fragment = 6 + // Line 1: [" ", "verylongword"] width 1 + 12 = 13; with fragment 6+13=19 > 10 + let mut lines = vec![ + vec![plain("word"), code("`code`")], + vec![ws(" "), plain("verylongword")], + ]; + rebalance_atomic_tails(&mut lines, 10); + // Should not move because 6 + 1 + 12 = 19 > 10 + assert_eq!(lines[0], vec![plain("word"), code("`code`")]); + } + + #[test] + fn rebalance_atomic_tails_moves_link_fragment_when_fits() { + let mut lines = vec![ + vec![plain("See"), link("[doc](u)")], + vec![ws(" "), plain("here")], + ]; + // [doc](u) width = 8, " here" width = 5, total = 13 <= 15 + rebalance_atomic_tails(&mut lines, 15); + assert_eq!(lines[0], vec![plain("See")]); + assert_eq!(lines[1][0], link("[doc](u)")); + } +} \ No newline at end of file diff --git a/src/wrap/line_buffer.rs b/src/wrap/line_buffer.rs deleted file mode 100644 index 0eafa3b..0000000 --- a/src/wrap/line_buffer.rs +++ /dev/null @@ -1,154 +0,0 @@ -//! Line buffer utilities for `wrap_preserving_code`. -//! -//! This module encapsulates the mutable state required to accumulate tokens into -//! wrapped lines while reusing allocations between iterations. 
- -use std::ops::Range; - -use unicode_width::UnicodeWidthStr; - -#[derive(Default)] -pub(crate) struct LineBuffer { - text: String, - width: usize, - last_split: Option, -} - -pub(crate) struct SplitContext<'a> { - pub(crate) lines: &'a mut Vec, - pub(crate) width: usize, -} - -impl LineBuffer { - pub(crate) fn new() -> Self { Self::default() } - - pub(crate) fn text(&self) -> &str { self.text.as_str() } - - pub(crate) fn width(&self) -> usize { self.width } - - pub(crate) fn push_token(&mut self, token: &str) { - if token.len() == 1 && ".?!,:;".contains(token) && self.text.trim_end().ends_with('`') { - let trimmed_len = self.text.trim_end_matches(char::is_whitespace).len(); - if trimmed_len < self.text.len() { - let removed = &self.text[trimmed_len..]; - let removed_width = UnicodeWidthStr::width(removed); - self.text.truncate(trimmed_len); - self.width = self.width.saturating_sub(removed_width); - self.last_split = self - .text - .char_indices() - .rev() - .find(|(_, ch)| ch.is_whitespace()) - .map(|(idx, ch)| idx + ch.len_utf8()); - } - } - - self.text.push_str(token); - self.width += UnicodeWidthStr::width(token); - if token.chars().all(char::is_whitespace) { - self.last_split = Some(self.text.len()); - } - } - - pub(crate) fn push_span(&mut self, tokens: &[String], start: usize, end: usize) { - for tok in &tokens[start..end] { - self.push_token(tok.as_str()); - } - } - - pub(crate) fn flush_into(&mut self, lines: &mut Vec) { - if self.text.is_empty() { - return; - } - lines.push(std::mem::take(&mut self.text)); - self.width = 0; - self.last_split = None; - } - - pub(crate) fn split_with_span( - &mut self, - ctx: &mut SplitContext<'_>, - tokens: &[String], - span: Range, - ) -> bool { - let Some(pos) = self.last_split else { - return false; - }; - - let (head_bounds, trimmed_tail_start) = { - let (head, tail) = self.text.split_at(pos); - let trimmed_head = head.trim_end(); - let trimmed_head_len = trimmed_head.len(); - let trailing_ws = 
&head[trimmed_head_len..]; - let head_bounds = if trimmed_head_len == 0 { - None - } else if trailing_ws.chars().count() > 1 { - Some((0, pos)) - } else { - Some((0, trimmed_head_len)) - }; - - let trimmed_tail = tail.trim_start(); - let trimmed_tail_start = pos + (tail.len() - trimmed_tail.len()); - (head_bounds, trimmed_tail_start) - }; - - if let Some((start_idx, end_idx)) = head_bounds { - ctx.lines.push(self.text[start_idx..end_idx].to_owned()); - } - - self.text.drain(..trimmed_tail_start); - for tok in &tokens[span.clone()] { - self.text.push_str(tok); - } - - self.width = UnicodeWidthStr::width(self.text.as_str()); - if span.end > span.start - && tokens[span.end - 1].chars().all(char::is_whitespace) - && !self.text.is_empty() - { - self.last_split = Some(self.text.len()); - } else { - self.last_split = None; - } - - if self.width > ctx.width { - ctx.lines.push(self.text.trim_end().to_string()); - self.text.clear(); - self.width = 0; - self.last_split = None; - } - - true - } - - pub(crate) fn flush_trailing_whitespace( - &mut self, - lines: &mut Vec, - tokens: &[String], - span: Range, - ) -> bool { - if span.end != tokens.len() { - return false; - } - if !tokens[span.clone()] - .iter() - .all(|tok| tok.chars().all(char::is_whitespace)) - { - return false; - } - - if self.text.is_empty() { - self.last_split = None; - return true; - } - - for tok in &tokens[span] { - self.text.push_str(tok); - } - lines.push(std::mem::take(&mut self.text)); - self.width = 0; - self.last_split = None; - true - } -} diff --git a/src/wrap/paragraph.rs b/src/wrap/paragraph.rs index 358c946..4f8fcf3 100644 --- a/src/wrap/paragraph.rs +++ b/src/wrap/paragraph.rs @@ -9,56 +9,89 @@ use unicode_width::UnicodeWidthStr; use super::inline::wrap_preserving_code; +/// A prefixed line from the input stream such as a bullet, blockquote, +/// footnote, or checkbox entry. +/// +/// `prefix` holds the leading marker and any associated whitespace. +/// `rest` is the content after that prefix. 
`repeat_prefix` controls +/// whether continuation lines use the same prefix string (`true`) or a +/// space-padded equivalent of the same display width (`false`). pub(super) struct PrefixLine<'a> { + /// The leading marker string (e.g. `"> "`, `"- "`, `"[^1]: "`). pub(super) prefix: Cow<'a, str>, + /// The text content after the prefix. pub(super) rest: &'a str, + /// When `true`, continuation lines repeat `prefix` verbatim (blockquotes). + /// When `false`, continuation lines use a space-padded indent of the same + /// display width (bullets, footnotes). pub(super) repeat_prefix: bool, } +/// Accumulates buffered paragraph lines before they are flushed and wrapped. +/// +/// `ParagraphState` collects consecutive plain-text lines into a single +/// segment so that `ParagraphWriter::flush_paragraph` can wrap the entire +/// paragraph as one unit, respecting hard line breaks (two trailing spaces). #[derive(Default)] pub(super) struct ParagraphState { + /// Buffered text segments with a flag indicating whether each ends with a + /// Markdown hard line break. buf: Vec<(String, bool)>, + /// The leading whitespace indent detected from the first buffered line. indent: String, } impl ParagraphState { + /// Clears all buffered segments and resets the indent. pub(super) fn clear(&mut self) { self.buf.clear(); self.indent.clear(); } + /// Records the leading whitespace indent from `line` if this is the first + /// line pushed into the buffer. pub(super) fn note_indent(&mut self, line: &str) { if self.buf.is_empty() { self.indent = line.chars().take_while(|c| c.is_whitespace()).collect(); } } + /// Appends a text segment to the buffer. + /// + /// `hard_break` should be `true` when the source line ends with two or + /// more trailing spaces, signalling a Markdown hard line break. pub(super) fn push(&mut self, text: String, hard_break: bool) { self.buf.push((text, hard_break)); } } +/// Writes wrapped output lines to an output `Vec`. 
+/// +/// `ParagraphWriter` centralises prefix-aware and plain-paragraph wrapping so +/// that `wrap_text` does not need to manage display-width calculations or +/// continuation-prefix logic directly. pub(super) struct ParagraphWriter<'a> { + /// The output buffer that receives wrapped lines. out: &'a mut Vec, + /// Target line width in display columns. width: usize, } impl<'a> ParagraphWriter<'a> { + /// Creates a new `ParagraphWriter` that appends wrapped lines to `out` + /// with a target display width of `width` columns. pub(super) fn new(out: &'a mut Vec, width: usize) -> Self { Self { out, width } } - fn append_wrapped_with_prefix(&mut self, line: &PrefixLine<'_>) { - let prefix = line.prefix.as_ref(); + /// Wraps `text` to fit within `self.width` minus the display width of + /// `prefix`, then emits the first wrapped line as `{prefix}{line}` and + /// all subsequent lines as `{continuation_prefix}{line}`. + /// + /// When `wrap_preserving_code` returns no lines (empty `text`), a single + /// line consisting of `prefix` alone is emitted. 
+ fn wrap_with_prefix(&mut self, prefix: &str, continuation_prefix: &str, text: &str) { let prefix_width = UnicodeWidthStr::width(prefix); let available = self.width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if line.repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(line.rest, available); + let lines = wrap_preserving_code(text, available); if lines.is_empty() { self.out.push(prefix.to_string()); return; @@ -68,11 +101,39 @@ impl<'a> ParagraphWriter<'a> { if index == 0 { self.out.push(format!("{prefix}{wrapped_line}")); } else { - self.out.push(format!("{wrapped_indent}{wrapped_line}")); + self.out + .push(format!("{continuation_prefix}{wrapped_line}")); } } } + /// Derives the continuation prefix for a prefixed line and delegates to + /// `wrap_with_prefix`. + /// + /// When `line.repeat_prefix` is `true` (blockquotes), the same prefix is + /// used on all lines. Otherwise a space-padded string of the same display + /// width is used so that continuation content aligns with the first line's + /// text column. + fn append_wrapped_with_prefix(&mut self, line: &PrefixLine<'_>) { + let prefix = line.prefix.as_ref(); + let prefix_width = UnicodeWidthStr::width(prefix); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let continuation_prefix = if line.repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + self.wrap_with_prefix(prefix, continuation_prefix.as_str(), line.rest); + } + + /// Flushes all buffered paragraph segments in `state` as wrapped output. 
+ /// + /// Consecutive segments are joined with a single space before wrapping. + /// When a segment ends with a hard line break flag, it is wrapped + /// independently and the accumulation resets, preserving the Markdown + /// hard-break semantics. The state is cleared after flushing. pub(super) fn flush_paragraph(&mut self, state: &mut ParagraphState) { if state.buf.is_empty() { return; @@ -97,17 +158,27 @@ impl<'a> ParagraphWriter<'a> { state.clear(); } + /// Wraps a single `segment` with `indent` applied as both the first-line + /// prefix and the continuation prefix. fn push_wrapped_segment(&mut self, indent: &str, segment: &str) { - for line in wrap_preserving_code(segment, self.width - indent.len()) { - self.out.push(format!("{indent}{line}")); - } + self.wrap_with_prefix(indent, indent, segment); } + /// Flushes any buffered paragraph and then appends `line` verbatim to the + /// output without wrapping. + /// + /// Used for lines that must not be reflowed (fenced code blocks, tables, + /// headings, and Markdownlint directives). pub(super) fn push_verbatim(&mut self, state: &mut ParagraphState, line: &str) { self.flush_paragraph(state); self.out.push(line.to_string()); } + /// Flushes any buffered paragraph and then wraps `prefix_line` using + /// `append_wrapped_with_prefix`. + /// + /// Called by `wrap_text` when a prefixed line (bullet, blockquote, + /// footnote, or checkbox) is encountered. 
pub(super) fn handle_prefix_line( &mut self, state: &mut ParagraphState, @@ -116,4 +187,4 @@ impl<'a> ParagraphWriter<'a> { self.flush_paragraph(state); self.append_wrapped_with_prefix(prefix_line); } -} +} \ No newline at end of file diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 344c299..91d4255 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -7,7 +7,6 @@ use rstest::rstest; use super::{ inline::{attach_punctuation_to_previous_line, determine_token_span, wrap_preserving_code}, - line_buffer::{LineBuffer, SplitContext}, tokenize::segment_inline, }; use crate::wrap::{BlockKind, classify_block, wrap_text}; @@ -94,106 +93,6 @@ fn attach_punctuation_ignores_non_code_suffix() { assert_eq!(lines, vec!["plain text".to_string()]); } -#[test] -fn line_buffer_trims_trailing_whitespace_before_punctuation() { - let tokens = vec![ - "wrap".to_string(), - " ".to_string(), - "`code`".to_string(), - " ".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, tokens.len()); - assert_eq!(buffer.text(), "wrap `code` "); - - let punct = vec!["!".to_string()]; - buffer.push_span(&punct, 0, punct.len()); - assert_eq!(buffer.text(), "wrap `code`!"); -} - -#[test] -fn line_buffer_split_preserves_multi_space_lines() { - let tokens = vec![ - "alpha".to_string(), - " ".to_string(), - "beta".to_string(), - " ".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, 2); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 8, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 2..4)); - assert_eq!(lines, vec!["alpha ".to_string()]); - assert_eq!(buffer.text(), "beta "); - assert_eq!( - buffer.width(), - unicode_width::UnicodeWidthStr::width(buffer.text()) - ); -} - -#[test] -fn line_buffer_split_trims_single_trailing_space() { - let tokens = vec!["alpha".to_string(), " ".to_string(), "beta".to_string()]; - let mut buffer = LineBuffer::new(); - 
buffer.push_span(&tokens, 0, 2); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 5, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 2..3)); - assert_eq!(lines, vec!["alpha".to_string()]); - assert_eq!(buffer.text(), "beta"); - assert_eq!( - buffer.width(), - unicode_width::UnicodeWidthStr::width(buffer.text()) - ); -} - -#[test] -fn line_buffer_split_tracks_multiple_whitespace_tokens() { - let tokens = vec![ - "foo".to_string(), - " ".to_string(), - " ".to_string(), - "bar".to_string(), - ]; - let mut buffer = LineBuffer::new(); - buffer.push_span(&tokens, 0, 3); - - let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 4, - }; - assert!(buffer.split_with_span(&mut split, &tokens, 3..4)); - assert_eq!(lines, vec!["foo ".to_string()]); - assert_eq!(buffer.text(), "bar"); -} - -#[test] -fn line_buffer_trailing_whitespace_flushes_line() { - let mut buffer = LineBuffer::new(); - let words = vec!["foo".to_string()]; - buffer.push_span(&words, 0, words.len()); - - let whitespace_tokens = vec![" ".to_string()]; - let mut lines = Vec::new(); - assert!(buffer.flush_trailing_whitespace( - &mut lines, - &whitespace_tokens, - 0..whitespace_tokens.len() - )); - assert_eq!(lines, vec!["foo ".to_string()]); - assert!(buffer.text().is_empty()); -} - #[test] fn wrap_preserving_code_splits_after_consecutive_whitespace() { let lines = wrap_preserving_code("alpha beta gamma", 8); @@ -440,6 +339,84 @@ fn wrap_text_keeps_trailing_spaces_for_bullet_final_line() { ); } +#[test] +fn wrap_text_repeats_nested_blockquote_prefix() { + let prefix = "> > "; + let input = vec![ + concat!( + "> > This nested quote contains enough text to require wrapping so that we can verify ", + "multi-level handling." 
+ ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert!(wrapped.len() > 1); + assert!(wrapped.iter().all(|line| line.starts_with("> > "))); + let wrapped_payload = wrapped + .iter() + .map(|line| { + line.strip_prefix(prefix) + .expect("nested blockquote line keeps its prefix") + }) + .collect::>() + .join(" "); + let normalized_payload = wrapped_payload + .split_whitespace() + .collect::>() + .join(" "); + assert_eq!( + normalized_payload, + input[0] + .strip_prefix(prefix) + .expect("original test input uses the nested blockquote prefix") + ); +} + +#[rstest( + input, + width, + expected, + case( + vec![ + concat!( + "[^5]: Given When Then - Martin Fowler, accessed on 14 July 2025, ", + "" + ) + .to_string(), + ], + 80, + vec![ + "[^5]: Given When Then - Martin Fowler, accessed on 14 July 2025," + .to_string(), + " " + .to_string(), + ] + ), + case( + vec![ + concat!( + "- [ ] Create a `HttpTravelTimeProvider` struct that implements the ", + "`TravelTimeProvider` trait." 
+ ) + .to_string(), + ], + 70, + vec![ + "- [ ] Create a `HttpTravelTimeProvider` struct that implements the" + .to_string(), + " `TravelTimeProvider` trait.".to_string(), + ] + ) +)] +fn wrap_text_preserves_prefixed_continuation_alignment( + input: Vec, + width: usize, + expected: Vec, +) { + let wrapped = wrap_text(&input, width); + assert_eq!(wrapped, expected); +} + #[test] fn wrap_text_preserves_indented_hash_as_text() { let input = vec![ diff --git a/tests/wrap_unit.rs b/tests/wrap_unit.rs index 3445975..a403afd 100644 --- a/tests/wrap_unit.rs +++ b/tests/wrap_unit.rs @@ -5,6 +5,7 @@ use mdtablefix::wrap::wrap_text; use rstest::rstest; +use unicode_width::UnicodeWidthStr; #[test] fn wrap_text_preserves_hyphenated_words() { @@ -153,3 +154,15 @@ fn wrap_text_preserves_fenced_shell_block_without_blank_line_after_heading() { assert_eq!(wrap_text(&input, 80), input); } + +#[test] +fn wrap_text_does_not_overflow_after_tail_rebalancing() { + let wrapped = wrap_text(&["a four five".to_string()], 6); + + assert_eq!(wrapped.join(""), "a four five"); + assert!( + wrapped + .iter() + .all(|line| UnicodeWidthStr::width(line.as_str()) <= 6) + ); +}