From 069d71223ad05fe7e26463ff5f980ff0e31d740e Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 29 Sep 2025 09:20:57 +0100 Subject: [PATCH 01/11] Share Markdown block classifiers Expose wrap::classify_block so table detection reuses the same regex-backed logic for headings, bullets, and blockquotes. --- src/process.rs | 10 ++-------- src/wrap.rs | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/process.rs b/src/process.rs index 5cef3989..34b8e9d7 100644 --- a/src/process.rs +++ b/src/process.rs @@ -6,7 +6,7 @@ use crate::{ footnotes::convert_footnotes, html::convert_html_tables, table::reflow_table, - wrap::{FenceTracker, wrap_text}, + wrap::{FenceTracker, classify_block, wrap_text}, }; /// Column width used when wrapping text. @@ -102,13 +102,7 @@ fn handle_table_line( return true; } if *in_table { - let trimmed = line.trim_start(); - let new_block = trimmed.starts_with('#') - || trimmed.starts_with('*') - || trimmed.starts_with('-') - || trimmed.starts_with('>') - || trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()); - if new_block { + if classify_block(line).is_some() { flush_buffer(buf, in_table, out); return false; } diff --git a/src/wrap.rs b/src/wrap.rs index 8b204073..a086a295 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -244,6 +244,45 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { lines } +/// Describes the Markdown block prefix detected by [`classify_block`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BlockKind { + /// Lines that begin with `#`, `##`, and similar heading prefixes. + Heading, + /// Bullet or ordered list markers matched by [`BULLET_RE`]. + Bullet, + /// Lines that begin with one or more `>` markers. + Blockquote, + /// Footnote definitions recognised by [`FOOTNOTE_RE`]. + FootnoteDefinition, + /// HTML-style markdownlint directives recognised by [`is_markdownlint_directive`]. 
+ MarkdownlintDirective, +} + +/// Classifies block-level Markdown prefixes shared by wrapping and table detection. +/// +/// For example, passing `"> quote"` returns `Some(BlockKind::Blockquote)` while +/// `"| cell |"` yields `None` because the line is part of a table. +pub(crate) fn classify_block(line: &str) -> Option { + let trimmed = line.trim_start(); + if trimmed.starts_with('#') { + return Some(BlockKind::Heading); + } + if BULLET_RE.is_match(line) { + return Some(BlockKind::Bullet); + } + if BLOCKQUOTE_RE.is_match(line) { + return Some(BlockKind::Blockquote); + } + if FOOTNOTE_RE.is_match(line) { + return Some(BlockKind::FootnoteDefinition); + } + if is_markdownlint_directive(line) { + return Some(BlockKind::MarkdownlintDirective); + } + None +} + pub(crate) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) } From ff3529b250d9d06426da016c53fb106bd156d9a6 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 29 Sep 2025 16:56:51 +0100 Subject: [PATCH 02/11] Reuse classify_block for wrap flush Deduplicate heading and markdownlint detection in wrap_text by sharing classify_block and cover indented hash behaviour with a regression test. 
--- src/wrap.rs | 13 ++++--------- src/wrap/tests.rs | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index a086a295..1f2623c6 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -395,15 +395,10 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { continue; } - if line.trim_start().starts_with('#') { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - out.push(line.clone()); - continue; - } - - if is_markdownlint_directive(line) { + if matches!( + classify_block(line), + Some(BlockKind::Heading | BlockKind::MarkdownlintDirective) + ) { flush_paragraph(&mut out, &buf, &indent, width); buf.clear(); indent.clear(); diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 86453f19..8883feb6 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -333,4 +333,22 @@ fn wrap_text_keeps_trailing_spaces_for_bullet_final_line() { ); } +#[test] +fn wrap_text_preserves_indented_hash_as_text() { + let input = vec![ + "Paragraph intro.".to_string(), + " # code".to_string(), + "Continuation.".to_string(), + ]; + let wrapped = wrap_text(&input, 40); + assert_eq!( + wrapped, + vec![ + "Paragraph intro.".to_string(), + " # code".to_string(), + "Continuation.".to_string(), + ] + ); +} + mod fence_tracker; From 9b9c6df1f7a16cdafe5a4ae6ce74041c75434262 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 29 Sep 2025 16:56:57 +0100 Subject: [PATCH 03/11] Refine shared block classification - Document block precedence and extend the shared classifier with a digit prefix variant so wrapping and table detection stay in sync. - Guard heading detection against indented code, treat verbatim code lines without reflow, and reuse the helper when leaving tables. - Cover the classifier behaviour with new rstest cases and ensure numeric paragraphs no longer block table flushing. 
--- src/process.rs | 10 +++++++++- src/wrap.rs | 33 ++++++++++++++++++++++++++++++++- src/wrap/tests.rs | 23 ++++++++++++++++++++++- 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/src/process.rs b/src/process.rs index 34b8e9d7..6444b6fc 100644 --- a/src/process.rs +++ b/src/process.rs @@ -86,7 +86,9 @@ fn handle_table_line( in_table: &mut bool, out: &mut Vec, ) -> bool { - if line.trim_start().starts_with('|') { + let trimmed = line.trim_start(); + + if trimmed.starts_with('|') { *in_table = true; buf.push(line.to_string()); return true; @@ -106,6 +108,12 @@ fn handle_table_line( flush_buffer(buf, in_table, out); return false; } + let indent = line.len().saturating_sub(trimmed.len()); + if indent >= 4 && trimmed.starts_with('#') { + // Treat indented hash-prefixed lines as code, not table rows. + flush_buffer(buf, in_table, out); + return false; + } buf.push(line.to_string()); return true; } diff --git a/src/wrap.rs b/src/wrap.rs index 1f2623c6..8b5bcb30 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -257,15 +257,24 @@ pub(crate) enum BlockKind { FootnoteDefinition, /// HTML-style markdownlint directives recognised by [`is_markdownlint_directive`]. MarkdownlintDirective, + /// Lines whose first non-whitespace character is an ASCII digit. + DigitPrefix, } /// Classifies block-level Markdown prefixes shared by wrapping and table detection. /// +/// Detection order determines precedence when a line could match multiple prefixes. +/// The current precedence is: heading, bullet, blockquote, footnote definition, +/// markdownlint directive, digit prefix. Headings outrank bullets and blockquotes, +/// so inputs such as "# 1" remain headings rather than list items. Headings ignore +/// indentation of four or more spaces so indented code remains untouched. /// For example, passing `"> quote"` returns `Some(BlockKind::Blockquote)` while /// `"| cell |"` yields `None` because the line is part of a table. 
pub(crate) fn classify_block(line: &str) -> Option { let trimmed = line.trim_start(); - if trimmed.starts_with('#') { + let indent = line.len().saturating_sub(trimmed.len()); + + if indent < 4 && trimmed.starts_with('#') { return Some(BlockKind::Heading); } if BULLET_RE.is_match(line) { @@ -280,6 +289,9 @@ pub(crate) fn classify_block(line: &str) -> Option { if is_markdownlint_directive(line) { return Some(BlockKind::MarkdownlintDirective); } + if trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { + return Some(BlockKind::DigitPrefix); + } None } @@ -358,6 +370,16 @@ fn handle_prefix_line( append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); } +fn is_indented_code_line(line: &str) -> bool { + let indent_width = line + .as_bytes() + .iter() + .take_while(|b| **b == b' ' || **b == 0x09) + .fold(0_usize, |acc, &b| acc + if b == 0x09 { 4 } else { 1 }); + + indent_width >= 4 && line.chars().any(|c| !c.is_whitespace()) +} + /// Wrap text lines to the given width. /// /// # Panics @@ -441,6 +463,15 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { continue; } + if is_indented_code_line(line) { + // Preserve indented code blocks verbatim so wrapping does not merge them into paragraphs. 
+ flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + out.push(line.clone()); + continue; + } + if buf.is_empty() { indent = line.chars().take_while(|c| c.is_whitespace()).collect(); } diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 8883feb6..7e97bf22 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -9,7 +9,7 @@ use super::{ LineBuffer, attach_punctuation_to_previous_line, determine_token_span, tokenize::segment_inline, wrap_preserving_code, }; -use crate::wrap::wrap_text; +use crate::wrap::{BlockKind, classify_block, wrap_text}; #[rstest] #[case("`code`!", "`code`!")] @@ -351,4 +351,25 @@ fn wrap_text_preserves_indented_hash_as_text() { ); } +#[rstest( + line, + expected, + case("# Heading", Some(BlockKind::Heading)), + case(" # Heading", Some(BlockKind::Heading)), + case(" # Heading", None), + case("- item", Some(BlockKind::Bullet)), + case("1. item", Some(BlockKind::Bullet)), + case("> quote", Some(BlockKind::Blockquote)), + case("[^1]: footnote", Some(BlockKind::FootnoteDefinition)), + case( + "", + Some(BlockKind::MarkdownlintDirective) + ), + case("2024 revenue", Some(BlockKind::DigitPrefix)), + case("plain text", None) +)] +fn classify_block_detects_markdown_prefixes(line: &str, expected: Option) { + assert_eq!(classify_block(line), expected); +} + mod fence_tracker; From 65b77259ab7b760578628133ff2a635b2c5cec8f Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 29 Sep 2025 17:28:15 +0100 Subject: [PATCH 04/11] Add regression test for numeric paragraphs after tables --- tests/table/process_stream_tests.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/table/process_stream_tests.rs b/tests/table/process_stream_tests.rs index 44d76836..9a86a530 100644 --- a/tests/table/process_stream_tests.rs +++ b/tests/table/process_stream_tests.rs @@ -80,6 +80,21 @@ fn test_non_table_lines_unchanged() { assert_eq!(output, expected); } +#[test] +fn 
test_process_stream_reflows_table_before_numeric_paragraph() { + let input = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "2024 revenue climbed 10%", + ]; + let expected = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "2024 revenue climbed 10%", + ]; + assert_eq!(process_stream(&input), expected); +} + #[test] fn test_process_stream_only_whitespace() { let input = lines_vec!["", " ", "\t\t"]; From a753452df6a51486f82f1495d340fa873d747da8 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 29 Sep 2025 18:35:57 +0100 Subject: [PATCH 05/11] Extract block classifier module Split block detection into wrap/block.rs so wrapping and table handling reuse shared regexes, add a fallback digit flush for tables, and cover pipe-less rows in classify tests. --- src/process.rs | 3 +- src/wrap.rs | 90 ++------------------------------------------ src/wrap/block.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++ src/wrap/tests.rs | 1 + 4 files changed, 102 insertions(+), 88 deletions(-) create mode 100644 src/wrap/block.rs diff --git a/src/process.rs b/src/process.rs index 6444b6fc..af2e2e9f 100644 --- a/src/process.rs +++ b/src/process.rs @@ -104,7 +104,8 @@ fn handle_table_line( return true; } if *in_table { - if classify_block(line).is_some() { + let digit_prefixed = trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()); + if classify_block(line).is_some() || digit_prefixed { flush_buffer(buf, in_table, out); return false; } diff --git a/src/wrap.rs b/src/wrap.rs index 8b5bcb30..64740ace 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -8,13 +8,15 @@ //! The [`Token`] enum and [`tokenize_markdown`] function are public so callers //! can perform custom token-based processing. 
-use regex::Regex; use unicode_width::UnicodeWidthStr; +mod block; mod fence; mod line_buffer; mod tokenize; pub(crate) use self::line_buffer::LineBuffer; +use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; +pub(crate) use block::{BlockKind, classify_block}; pub use fence::{FenceTracker, is_fence}; /// Token emitted by the `tokenize::segment_inline` parser and used by /// higher-level wrappers. @@ -31,37 +33,6 @@ pub use tokenize::tokenize_markdown; // Permit GFM task list markers with flexible spacing and missing post-marker // spaces in Markdown. -static BULLET_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", - "bullet pattern regex should compile", -); - -static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", - "footnote pattern regex should compile", -); - -static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*(?:>\s*)+)(.*)$", - "blockquote pattern regex should compile", -); - -/// Matches `markdownlint` comment directives. -/// -/// The regex is case-insensitive and recognises these forms with optional rule -/// names (including plugin rules such as `MD013/line-length` or -/// `plugin/rule-name`): -/// - `` -/// - `` -/// - `` -/// - `` -static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| { - Regex::new( - r"(?i)^\s*\s*$", - ) - .expect("valid markdownlint regex") -}); - #[inline] fn is_trailing_punct(c: char) -> bool { // ASCII closers + common Unicode closers and word-final punctuation @@ -244,61 +215,6 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { lines } -/// Describes the Markdown block prefix detected by [`classify_block`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) enum BlockKind { - /// Lines that begin with `#`, `##`, and similar heading prefixes. - Heading, - /// Bullet or ordered list markers matched by [`BULLET_RE`]. - Bullet, - /// Lines that begin with one or more `>` markers. 
- Blockquote, - /// Footnote definitions recognised by [`FOOTNOTE_RE`]. - FootnoteDefinition, - /// HTML-style markdownlint directives recognised by [`is_markdownlint_directive`]. - MarkdownlintDirective, - /// Lines whose first non-whitespace character is an ASCII digit. - DigitPrefix, -} - -/// Classifies block-level Markdown prefixes shared by wrapping and table detection. -/// -/// Detection order determines precedence when a line could match multiple prefixes. -/// The current precedence is: heading, bullet, blockquote, footnote definition, -/// markdownlint directive, digit prefix. Headings outrank bullets and blockquotes, -/// so inputs such as "# 1" remain headings rather than list items. Headings ignore -/// indentation of four or more spaces so indented code remains untouched. -/// For example, passing `"> quote"` returns `Some(BlockKind::Blockquote)` while -/// `"| cell |"` yields `None` because the line is part of a table. -pub(crate) fn classify_block(line: &str) -> Option { - let trimmed = line.trim_start(); - let indent = line.len().saturating_sub(trimmed.len()); - - if indent < 4 && trimmed.starts_with('#') { - return Some(BlockKind::Heading); - } - if BULLET_RE.is_match(line) { - return Some(BlockKind::Bullet); - } - if BLOCKQUOTE_RE.is_match(line) { - return Some(BlockKind::Blockquote); - } - if FOOTNOTE_RE.is_match(line) { - return Some(BlockKind::FootnoteDefinition); - } - if is_markdownlint_directive(line) { - return Some(BlockKind::MarkdownlintDirective); - } - if trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { - return Some(BlockKind::DigitPrefix); - } - None -} - -pub(crate) fn is_markdownlint_directive(line: &str) -> bool { - MARKDOWNLINT_DIRECTIVE_RE.is_match(line) -} - fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { if buf.is_empty() { return; diff --git a/src/wrap/block.rs b/src/wrap/block.rs new file mode 100644 index 00000000..869b804a --- /dev/null +++ b/src/wrap/block.rs @@ -0,0 
+1,96 @@ +//! Block-level Markdown prefix classification shared by wrapping and table detection. +//! +//! The regex helpers centralise detection for headings, lists, blockquotes, footnotes, +//! markdownlint directives, and digit-prefixed paragraphs so wrapping and table handlers +//! stay in sync. + +use regex::Regex; + +pub(super) static BULLET_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", + "bullet pattern regex should compile", +); + +pub(super) static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", + "footnote pattern regex should compile", +); + +pub(super) static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:>\s*)+)(.*)$", + "blockquote pattern regex should compile", +); + +/// Matches `markdownlint` comment directives. +/// +/// The regex is case-insensitive and recognises these forms with optional rule +/// names (including plugin rules such as `MD013/line-length` or +/// `plugin/rule-name`): +/// - `` +/// - `` +/// - `` +/// - `` +pub(super) static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = std::sync::LazyLock::new( + || { + Regex::new( + r"(?i)^\s*\s*$", + ) + .expect("valid markdownlint regex") + }, +); + +/// Describes the Markdown block prefix detected by [`classify_block`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BlockKind { + /// Lines that begin with `#`, `##`, and similar heading prefixes. + Heading, + /// Bullet or ordered list markers matched by [`BULLET_RE`]. + Bullet, + /// Lines that begin with one or more `>` markers. + Blockquote, + /// Footnote definitions recognised by [`FOOTNOTE_RE`]. + FootnoteDefinition, + /// HTML-style markdownlint directives recognised by [`is_markdownlint_directive`]. + MarkdownlintDirective, + /// Lines whose first non-whitespace character is an ASCII digit. + DigitPrefix, +} + +/// Classifies block-level Markdown prefixes shared by wrapping and table detection. 
+/// +/// Detection order determines precedence when a line could match multiple prefixes. +/// The current precedence is: heading, bullet, blockquote, footnote definition, +/// markdownlint directive, digit prefix. Headings outrank bullets and blockquotes, +/// so inputs such as "# 1" remain headings rather than list items. Headings ignore +/// indentation of four or more spaces so indented code remains untouched. +/// For example, passing "> quote" returns `Some(BlockKind::Blockquote)` while +/// "| cell |" yields `None` because the line is part of a table. +pub(crate) fn classify_block(line: &str) -> Option { + let trimmed = line.trim_start(); + let indent = line.len().saturating_sub(trimmed.len()); + + if indent < 4 && trimmed.starts_with('#') { + return Some(BlockKind::Heading); + } + if BULLET_RE.is_match(line) { + return Some(BlockKind::Bullet); + } + if BLOCKQUOTE_RE.is_match(line) { + return Some(BlockKind::Blockquote); + } + if FOOTNOTE_RE.is_match(line) { + return Some(BlockKind::FootnoteDefinition); + } + if is_markdownlint_directive(line) { + return Some(BlockKind::MarkdownlintDirective); + } + if trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { + return Some(BlockKind::DigitPrefix); + } + None +} + +#[inline] +pub(super) fn is_markdownlint_directive(line: &str) -> bool { + MARKDOWNLINT_DIRECTIVE_RE.is_match(line) +} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 7e97bf22..33200545 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -366,6 +366,7 @@ fn wrap_text_preserves_indented_hash_as_text() { Some(BlockKind::MarkdownlintDirective) ), case("2024 revenue", Some(BlockKind::DigitPrefix)), + case("a | b", None), case("plain text", None) )] fn classify_block_detects_markdown_prefixes(line: &str, expected: Option) { From 865c837131a88433b84b94a3423298f16808093b Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 30 Sep 2025 08:59:26 +0100 Subject: [PATCH 06/11] Document shared block classifiers Explain the shared regex statics, 
align the markdownlint directive helper with the lazy_regex! usage, and clarify why table handling still flushes on digit-prefixed paragraphs. --- src/process.rs | 2 ++ src/wrap.rs | 4 +++- src/wrap/block.rs | 14 +++++++------- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/process.rs b/src/process.rs index af2e2e9f..a07249b3 100644 --- a/src/process.rs +++ b/src/process.rs @@ -106,6 +106,8 @@ fn handle_table_line( if *in_table { let digit_prefixed = trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()); if classify_block(line).is_some() || digit_prefixed { + // Flush on digit-prefixed paragraphs as well so numeric introductions + // following tables remain wrap candidates. flush_buffer(buf, in_table, out); return false; } diff --git a/src/wrap.rs b/src/wrap.rs index 64740ace..40f064d5 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -287,11 +287,13 @@ fn handle_prefix_line( } fn is_indented_code_line(line: &str) -> bool { + // CommonMark expands tabs to four spaces when measuring indentation. let indent_width = line .as_bytes() .iter() .take_while(|b| **b == b' ' || **b == 0x09) - .fold(0_usize, |acc, &b| acc + if b == 0x09 { 4 } else { 1 }); + .map(|&b| if b == 0x09 { 4 } else { 1 }) + .sum::(); indent_width >= 4 && line.chars().any(|c| !c.is_whitespace()) } diff --git a/src/wrap/block.rs b/src/wrap/block.rs index 869b804a..723c861f 100644 --- a/src/wrap/block.rs +++ b/src/wrap/block.rs @@ -6,16 +6,19 @@ use regex::Regex; +/// Matches bullet and ordered list prefixes captured for wrapping and table detection. pub(super) static BULLET_RE: std::sync::LazyLock = lazy_regex!( r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", "bullet pattern regex should compile", ); +/// Matches footnote definition prefixes so they remain atomic during wrapping and table parsing. 
pub(super) static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", "footnote pattern regex should compile", ); +/// Matches blockquote prefixes, capturing the marker run and the remainder for reuse. pub(super) static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( r"^(\s*(?:>\s*)+)(.*)$", "blockquote pattern regex should compile", @@ -30,13 +33,9 @@ pub(super) static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( /// - `` /// - `` /// - `` -pub(super) static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = std::sync::LazyLock::new( - || { - Regex::new( - r"(?i)^\s*\s*$", - ) - .expect("valid markdownlint regex") - }, +pub(super) static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = lazy_regex!( + r"(?i)^\s*\s*$", + "markdownlint directive regex should compile", ); /// Describes the Markdown block prefix detected by [`classify_block`]. @@ -90,6 +89,7 @@ pub(crate) fn classify_block(line: &str) -> Option { None } +/// Returns `true` when `line` matches a recognised `markdownlint` directive comment. #[inline] pub(super) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) From 933d3132bf0a802db2f6f6531e425e5f24bfa3c1 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 30 Sep 2025 09:02:07 +0100 Subject: [PATCH 07/11] Document markdownlint helper usage --- src/wrap/block.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/wrap/block.rs b/src/wrap/block.rs index 723c861f..40bd2fb7 100644 --- a/src/wrap/block.rs +++ b/src/wrap/block.rs @@ -90,6 +90,14 @@ pub(crate) fn classify_block(line: &str) -> Option { } /// Returns `true` when `line` matches a recognised `markdownlint` directive comment. 
+/// +/// # Examples +/// +/// ```rust,ignore +/// use crate::wrap::block::is_markdownlint_directive; +/// assert!(is_markdownlint_directive("")); +/// assert!(!is_markdownlint_directive("")); +/// ``` #[inline] pub(super) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) From 9676205679ba8e877d3d658107dfe4e8d3f4c466 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 30 Sep 2025 21:11:55 +0100 Subject: [PATCH 08/11] Refactor wrap helpers and fix table flushing --- src/process.rs | 20 +-- src/wrap.rs | 258 +--------------------------- src/wrap/inline.rs | 194 +++++++++++++++++++++ src/wrap/paragraph.rs | 84 +++++++++ src/wrap/tests.rs | 9 +- tests/table/process_stream_tests.rs | 12 ++ 6 files changed, 304 insertions(+), 273 deletions(-) create mode 100644 src/wrap/inline.rs create mode 100644 src/wrap/paragraph.rs diff --git a/src/process.rs b/src/process.rs index a07249b3..b371d2a1 100644 --- a/src/process.rs +++ b/src/process.rs @@ -6,7 +6,7 @@ use crate::{ footnotes::convert_footnotes, html::convert_html_tables, table::reflow_table, - wrap::{FenceTracker, classify_block, wrap_text}, + wrap::{FenceTracker, wrap_text}, }; /// Column width used when wrapping text. @@ -104,21 +104,9 @@ fn handle_table_line( return true; } if *in_table { - let digit_prefixed = trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()); - if classify_block(line).is_some() || digit_prefixed { - // Flush on digit-prefixed paragraphs as well so numeric introductions - // following tables remain wrap candidates. - flush_buffer(buf, in_table, out); - return false; - } - let indent = line.len().saturating_sub(trimmed.len()); - if indent >= 4 && trimmed.starts_with('#') { - // Treat indented hash-prefixed lines as code, not table rows. - flush_buffer(buf, in_table, out); - return false; - } - buf.push(line.to_string()); - return true; + // Any non-tableish line ends the table; let the caller reprocess this line. 
+ flush_buffer(buf, in_table, out); + return false; } false } diff --git a/src/wrap.rs b/src/wrap.rs index 40f064d5..2fbd3334 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -8,16 +8,16 @@ //! The [`Token`] enum and [`tokenize_markdown`] function are public so callers //! can perform custom token-based processing. -use unicode_width::UnicodeWidthStr; - mod block; mod fence; +mod inline; mod line_buffer; +mod paragraph; mod tokenize; -pub(crate) use self::line_buffer::LineBuffer; use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; pub(crate) use block::{BlockKind, classify_block}; pub use fence::{FenceTracker, is_fence}; +use paragraph::{flush_paragraph, handle_prefix_line}; /// Token emitted by the `tokenize::segment_inline` parser and used by /// higher-level wrappers. /// @@ -33,258 +33,6 @@ pub use tokenize::tokenize_markdown; // Permit GFM task list markers with flexible spacing and missing post-marker // spaces in Markdown. -#[inline] -fn is_trailing_punct(c: char) -> bool { - // ASCII closers + common Unicode closers and word-final punctuation - matches!( - c, - '.' | ',' | ';' | ':' | '!' | '?' 
| ')' | ']' | '"' | '\'' - ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c) -} - -fn looks_like_link(token: &str) -> bool { - (token.starts_with('[') || token.starts_with("![")) - && token.contains("](") - && token.ends_with(')') -} - -fn is_whitespace_token(token: &str) -> bool { - token.chars().all(char::is_whitespace) -} - -fn is_inline_code_token(token: &str) -> bool { - token.starts_with('`') && token.ends_with('`') -} - -fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { - while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - } - j -} - -#[inline] -fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { - debug_assert!( - tokens[i] == "`", - "merge_code_span requires a single backtick opener" - ); - let mut j = i + 1; - while j < tokens.len() && tokens[j] != "`" { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - } - if j < tokens.len() { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - j = extend_punctuation(tokens, j, width); - } - j -} - -#[inline] -fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { - #[derive(PartialEq, Eq)] - enum SpanKind { - General, - Code, - Link, - } - - let mut end = start + 1; - let mut width = UnicodeWidthStr::width(tokens[start].as_str()); - let mut kind = SpanKind::General; - - if tokens[start] == "`" { - kind = SpanKind::Code; - end = merge_code_span(tokens, start, &mut width); - } else if is_inline_code_token(&tokens[start]) { - kind = SpanKind::Code; - end = extend_punctuation(tokens, end, &mut width); - } else if looks_like_link(&tokens[start]) { - kind = SpanKind::Link; - end = extend_punctuation(tokens, end, &mut width); - } - - while end < tokens.len() { - let token = &tokens[end]; - if is_whitespace_token(token) { - if matches!(kind, SpanKind::Code | SpanKind::Link) - && end + 1 < tokens.len() - && (looks_like_link(&tokens[end + 
1]) - || is_inline_code_token(&tokens[end + 1]) - || tokens[end + 1].chars().all(is_trailing_punct)) - { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - continue; - } - break; - } - - if token.chars().all(is_trailing_punct) { - if matches!(kind, SpanKind::Code | SpanKind::Link) { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - continue; - } - break; - } - - let is_link = looks_like_link(token); - let is_code = is_inline_code_token(token); - - if kind == SpanKind::Link && is_link { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - end = extend_punctuation(tokens, end, &mut width); - continue; - } - - if kind == SpanKind::Code && is_code { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - end = extend_punctuation(tokens, end, &mut width); - continue; - } - - break; - } - - (end, width) -} - -fn attach_punctuation_to_previous_line(lines: &mut [String], current: &str, token: &str) -> bool { - if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { - return false; - } - - let Some(last_line) = lines.last_mut() else { - return false; - }; - - if last_line.trim_end().ends_with('`') { - last_line.push_str(token); - return true; - } - - false -} - -fn wrap_preserving_code(text: &str, width: usize) -> Vec { - let tokens = tokenize::segment_inline(text); - if tokens.is_empty() { - return Vec::new(); - } - - let mut lines = Vec::new(); - let mut buffer = LineBuffer::new(); - let mut i = 0; - - while i < tokens.len() { - let (group_end, group_width) = determine_token_span(&tokens, i); - - if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) { - i += 1; - continue; - } - - if buffer.width() + group_width <= width { - buffer.push_span(&tokens, i, group_end); - i = group_end; - continue; - } - - if buffer.split_with_span(&mut lines, &tokens, i, group_end, width) { - i = group_end; - continue; - } - - if buffer.flush_trailing_whitespace(&mut lines, &tokens, i, 
group_end) { - i = group_end; - continue; - } - - buffer.flush_into(&mut lines); - buffer.push_non_whitespace_span(&tokens, i, group_end); - i = group_end; - } - - buffer.flush_into(&mut lines); - lines -} - -fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { - if buf.is_empty() { - return; - } - let mut segment = String::new(); - for (text, hard_break) in buf { - if !segment.is_empty() { - segment.push(' '); - } - segment.push_str(text); - if *hard_break { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - segment.clear(); - } - } - if !segment.is_empty() { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - } -} - -fn append_wrapped_with_prefix( - out: &mut Vec, - prefix: &str, - text: &str, - width: usize, - repeat_prefix: bool, -) { - let prefix_width = UnicodeWidthStr::width(prefix); - let available = width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(text, available); - if lines.is_empty() { - out.push(prefix.to_string()); - return; - } - - for (i, line) in lines.iter().enumerate() { - if i == 0 { - out.push(format!("{prefix}{line}")); - } else { - out.push(format!("{wrapped_indent}{line}")); - } - } -} - -fn handle_prefix_line( - out: &mut Vec, - buf: &mut Vec<(String, bool)>, - indent: &mut String, - width: usize, - prefix: &str, - rest: &str, - repeat_prefix: bool, -) { - flush_paragraph(out, buf, indent, width); - buf.clear(); - indent.clear(); - append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); -} fn is_indented_code_line(line: &str) -> bool { // 
CommonMark expands tabs to four spaces when measuring indentation. diff --git a/src/wrap/inline.rs b/src/wrap/inline.rs new file mode 100644 index 00000000..d00afbe0 --- /dev/null +++ b/src/wrap/inline.rs @@ -0,0 +1,194 @@ +//! Inline wrapping helpers that keep code spans intact. +//! +//! These functions operate on token streams so `wrap_text` can preserve +//! inline code, links, and trailing punctuation without reimplementing the +//! grouping logic in multiple places. + +use unicode_width::UnicodeWidthStr; + +use super::{line_buffer::LineBuffer, tokenize}; + +#[inline] +fn is_trailing_punct(c: char) -> bool { + // ASCII closers + common Unicode closers and word-final punctuation + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c) +} + +fn looks_like_link(token: &str) -> bool { + (token.starts_with('[') || token.starts_with("![")) + && token.contains("](") + && token.ends_with(')') +} + +fn is_whitespace_token(token: &str) -> bool { + token.chars().all(char::is_whitespace) +} + +fn is_inline_code_token(token: &str) -> bool { + token.starts_with('`') && token.ends_with('`') +} + +fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { + while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + j +} + +#[inline] +fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { + debug_assert!( + tokens[i] == "`", + "merge_code_span requires a single backtick opener" + ); + let mut j = i + 1; + while j < tokens.len() && tokens[j] != "`" { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + if j < tokens.len() { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + j = extend_punctuation(tokens, j, width); + } + j +} + +pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { + #[derive(PartialEq, Eq)] + enum 
SpanKind { + General, + Code, + Link, + } + + let mut end = start + 1; + let mut width = UnicodeWidthStr::width(tokens[start].as_str()); + let mut kind = SpanKind::General; + + if tokens[start] == "`" { + kind = SpanKind::Code; + end = merge_code_span(tokens, start, &mut width); + } else if is_inline_code_token(&tokens[start]) { + kind = SpanKind::Code; + end = extend_punctuation(tokens, end, &mut width); + } else if looks_like_link(&tokens[start]) { + kind = SpanKind::Link; + end = extend_punctuation(tokens, end, &mut width); + } + + while end < tokens.len() { + let token = &tokens[end]; + if is_whitespace_token(token) { + if matches!(kind, SpanKind::Code | SpanKind::Link) + && end + 1 < tokens.len() + && (looks_like_link(&tokens[end + 1]) + || is_inline_code_token(&tokens[end + 1]) + || tokens[end + 1].chars().all(is_trailing_punct)) + { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + continue; + } + break; + } + + if token.chars().all(is_trailing_punct) { + if matches!(kind, SpanKind::Code | SpanKind::Link) { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + continue; + } + break; + } + + let is_link = looks_like_link(token); + let is_code = is_inline_code_token(token); + + if kind == SpanKind::Link && is_link { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + end = extend_punctuation(tokens, end, &mut width); + continue; + } + + if kind == SpanKind::Code && is_code { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + end = extend_punctuation(tokens, end, &mut width); + continue; + } + + break; + } + + (end, width) +} + +pub(super) fn attach_punctuation_to_previous_line( + lines: &mut [String], + current: &str, + token: &str, +) -> bool { + if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { + return false; + } + + let Some(last_line) = lines.last_mut() else { + return false; + }; + + if last_line.trim_end().ends_with('`') { + last_line.push_str(token); + return true; + } + + 
false +} + +pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec { + let tokens = tokenize::segment_inline(text); + if tokens.is_empty() { + return Vec::new(); + } + + let mut lines = Vec::new(); + let mut buffer = LineBuffer::new(); + let mut i = 0; + + while i < tokens.len() { + let (group_end, group_width) = determine_token_span(&tokens, i); + + if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) { + i += 1; + continue; + } + + if buffer.width() + group_width <= width { + buffer.push_span(&tokens, i, group_end); + i = group_end; + continue; + } + + if buffer.split_with_span(&mut lines, &tokens, i, group_end, width) { + i = group_end; + continue; + } + + if buffer.flush_trailing_whitespace(&mut lines, &tokens, i, group_end) { + i = group_end; + continue; + } + + buffer.flush_into(&mut lines); + buffer.push_non_whitespace_span(&tokens, i, group_end); + i = group_end; + } + + buffer.flush_into(&mut lines); + lines +} diff --git a/src/wrap/paragraph.rs b/src/wrap/paragraph.rs new file mode 100644 index 00000000..79e52a17 --- /dev/null +++ b/src/wrap/paragraph.rs @@ -0,0 +1,84 @@ +//! Paragraph wrapping utilities shared by `wrap_text`. +//! +//! These helpers keep paragraph logic focused on buffer management while +//! deferring inline wrapping to `inline::wrap_preserving_code`. 
+ +use unicode_width::UnicodeWidthStr; + +use super::inline::wrap_preserving_code; + +fn append_wrapped_with_prefix( + out: &mut Vec<String>, + prefix: &str, + text: &str, + width: usize, + repeat_prefix: bool, +) { + let prefix_width = UnicodeWidthStr::width(prefix); + let available = width.saturating_sub(prefix_width).max(1); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let wrapped_indent = if repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + let lines = wrap_preserving_code(text, available); + if lines.is_empty() { + out.push(prefix.to_string()); + return; + } + + for (i, line) in lines.iter().enumerate() { + if i == 0 { + out.push(format!("{prefix}{line}")); + } else { + out.push(format!("{wrapped_indent}{line}")); + } + } +} + +pub(super) fn flush_paragraph( + out: &mut Vec<String>, + buf: &[(String, bool)], + indent: &str, + width: usize, +) { + if buf.is_empty() { + return; + } + let mut segment = String::new(); + for (text, hard_break) in buf { + if !segment.is_empty() { + segment.push(' '); + } + segment.push_str(text); + if *hard_break { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + segment.clear(); + } + } + if !segment.is_empty() { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + } +} + +pub(super) fn handle_prefix_line( + out: &mut Vec<String>, + buf: &mut Vec<(String, bool)>, + indent: &mut String, + width: usize, + prefix: &str, + rest: &str, + repeat_prefix: bool, +) { + flush_paragraph(out, buf, indent, width); + buf.clear(); + indent.clear(); + append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); +} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 33200545..58384076 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -6,8
+6,13 @@ use rstest::rstest; use super::{ - LineBuffer, attach_punctuation_to_previous_line, determine_token_span, - tokenize::segment_inline, wrap_preserving_code, + inline::{ + attach_punctuation_to_previous_line, + determine_token_span, + wrap_preserving_code, + }, + line_buffer::LineBuffer, + tokenize::segment_inline, }; use crate::wrap::{BlockKind, classify_block, wrap_text}; diff --git a/tests/table/process_stream_tests.rs b/tests/table/process_stream_tests.rs index 9a86a530..4fb245c5 100644 --- a/tests/table/process_stream_tests.rs +++ b/tests/table/process_stream_tests.rs @@ -95,6 +95,18 @@ fn test_process_stream_reflows_table_before_numeric_paragraph() { assert_eq!(process_stream(&input), expected); } +#[test] +fn flushes_table_before_plain_paragraph_no_blank() { + let input = vec![ + "| a | b |".to_string(), + "|---|---|".to_string(), + "AWS revenue rose".to_string(), + ]; + let out = crate::process::process_stream(&input); + assert!(out.iter().any(|l| l.contains("| a | b |"))); + assert!(out.iter().any(|l| l == "AWS revenue rose")); +} + #[test] fn test_process_stream_only_whitespace() { let input = lines_vec!["", " ", "\t\t"]; From 18a707c97b2af0ea92a611080aa21f25a1d196d5 Mon Sep 17 00:00:00 2001 From: Leynos Date: Tue, 30 Sep 2025 21:12:02 +0100 Subject: [PATCH 09/11] Flush tables with shared block classifier --- src/process.rs | 10 ++++++++-- src/wrap/block.rs | 27 ++++++++++++++++++++++++--- src/wrap/tests.rs | 7 ++----- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/process.rs b/src/process.rs index b371d2a1..5805f846 100644 --- a/src/process.rs +++ b/src/process.rs @@ -6,7 +6,7 @@ use crate::{ footnotes::convert_footnotes, html::convert_html_tables, table::reflow_table, - wrap::{FenceTracker, wrap_text}, + wrap::{FenceTracker, classify_block, wrap_text}, }; /// Column width used when wrapping text. 
@@ -104,7 +104,13 @@ fn handle_table_line( return true; } if *in_table { - // Any non-tableish line ends the table; let the caller reprocess this line. + if classify_block(line).is_some() { + // Flush when a new Markdown block (heading, list, quote, footnote, directive, + // or digit-prefixed text) begins so wrapping and table detection stay aligned. + flush_buffer(buf, in_table, out); + return false; + } + // Plain paragraphs also end the table so the caller can reprocess them for wrapping. flush_buffer(buf, in_table, out); return false; } diff --git a/src/wrap/block.rs b/src/wrap/block.rs index 40bd2fb7..698f7257 100644 --- a/src/wrap/block.rs +++ b/src/wrap/block.rs @@ -6,6 +6,27 @@ use regex::Regex; +/// Returns the indentation width (treating tabs as four columns) and the byte +/// offset of the first non-space or tab character. +fn leading_indent(line: &str) -> (usize, usize) { + let mut width = 0; + let mut bytes = 0; + for &b in line.as_bytes() { + match b { + b' ' => { + width += 1; + bytes += 1; + } + 0x09 => { + width += 4; + bytes += 1; + } + _ => break, + } + } + (width, bytes) +} + /// Matches bullet and ordered list prefixes captured for wrapping and table detection. pub(super) static BULLET_RE: std::sync::LazyLock<Regex> = lazy_regex!( r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", @@ -65,10 +86,10 @@ pub(crate) enum BlockKind { /// For example, passing "> quote" returns `Some(BlockKind::Blockquote)` while /// "| cell |" yields `None` because the line is part of a table.
pub(crate) fn classify_block(line: &str) -> Option<BlockKind> { - let trimmed = line.trim_start(); - let indent = line.len().saturating_sub(trimmed.len()); + let (indent_width, indent_bytes) = leading_indent(line); + let trimmed = line[indent_bytes..].trim_start(); - if indent < 4 && trimmed.starts_with('#') { + if indent_width < 4 && trimmed.starts_with('#') { return Some(BlockKind::Heading); } if BULLET_RE.is_match(line) { diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 58384076..18b7b482 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -6,11 +6,7 @@ use rstest::rstest; use super::{ - inline::{ - attach_punctuation_to_previous_line, - determine_token_span, - wrap_preserving_code, - }, + inline::{attach_punctuation_to_previous_line, determine_token_span, wrap_preserving_code}, line_buffer::LineBuffer, tokenize::segment_inline, }; @@ -362,6 +358,7 @@ fn wrap_text_preserves_indented_hash_as_text() { case("# Heading", Some(BlockKind::Heading)), case(" # Heading", Some(BlockKind::Heading)), case(" # Heading", None), + case(" # Heading", None), case("- item", Some(BlockKind::Bullet)), case("1. item", Some(BlockKind::Bullet)), case("> quote", Some(BlockKind::Blockquote)), From 276e8ead8acdb73ccb13796c9a2d20fc94e586a5 Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 1 Oct 2025 12:38:56 +0100 Subject: [PATCH 10/11] Add tests for block classifier Add unit coverage for classify_block and behavioural regressions in wrapping and table processing to guard shared detection.
--- src/wrap/block.rs | 42 +++++++++++++++++++++++++++++ src/wrap/tests.rs | 18 +++++++++++++ tests/table/process_stream_tests.rs | 16 +++++++++++ 3 files changed, 76 insertions(+) diff --git a/src/wrap/block.rs b/src/wrap/block.rs index 698f7257..8ebe6868 100644 --- a/src/wrap/block.rs +++ b/src/wrap/block.rs @@ -123,3 +123,45 @@ pub(crate) fn classify_block(line: &str) -> Option { pub(super) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) } + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + #[rstest( + line, + expected, + case("# Heading", Some(BlockKind::Heading)), + case(" # Heading", Some(BlockKind::Heading)), + case(" # Code block", None), + case("- item", Some(BlockKind::Bullet)), + case("1. item", Some(BlockKind::Bullet)), + case("> quote", Some(BlockKind::Blockquote)), + case("[^1]: footnote", Some(BlockKind::FootnoteDefinition)), + case( + "", + Some(BlockKind::MarkdownlintDirective) + ), + case("2024 revenue", Some(BlockKind::DigitPrefix)), + case("plain paragraph", None), + case("| a | b |", None), + case("#123", Some(BlockKind::Heading)), + case("1) list", Some(BlockKind::Bullet)), + case(" 2024", Some(BlockKind::DigitPrefix)), + case(" 1. 
code", Some(BlockKind::Bullet)) + )] + fn classify_block_identifies_prefixes(line: &str, expected: Option) { + assert_eq!(classify_block(line), expected); + } + + #[rstest] + #[case("", true)] + #[case("", true)] + #[case("", true)] + #[case("", false)] + #[case("", false)] + fn detects_markdownlint_directives(#[case] line: &str, #[case] expected: bool) { + assert_eq!(is_markdownlint_directive(line), expected); + } +} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 18b7b482..3ea1eacb 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -352,6 +352,24 @@ fn wrap_text_preserves_indented_hash_as_text() { ); } +#[test] +fn wrap_text_flushes_before_heading() { + let input = vec![ + "Paragraph intro.".to_string(), + "# Heading".to_string(), + "Continuation.".to_string(), + ]; + let wrapped = wrap_text(&input, 40); + assert_eq!( + wrapped, + vec![ + "Paragraph intro.".to_string(), + "# Heading".to_string(), + "Continuation.".to_string(), + ] + ); +} + #[rstest( line, expected, diff --git a/tests/table/process_stream_tests.rs b/tests/table/process_stream_tests.rs index 4fb245c5..70e80966 100644 --- a/tests/table/process_stream_tests.rs +++ b/tests/table/process_stream_tests.rs @@ -107,6 +107,22 @@ fn flushes_table_before_plain_paragraph_no_blank() { assert!(out.iter().any(|l| l == "AWS revenue rose")); } +#[test] +fn test_process_stream_reflows_table_before_heading() { + let input = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "# Heading", + ]; + let expected = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "# Heading", + ]; + assert_eq!(process_stream(&input), expected); +} + + #[test] fn test_process_stream_only_whitespace() { let input = lines_vec!["", " ", "\t\t"]; From 066abe70b811c0122273e4eef9283c077738dcb0 Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 1 Oct 2025 12:39:01 +0100 Subject: [PATCH 11/11] Guard block classifier against indented code Ensure bullet, quote, footnote, markdownlint, and digit detection require less than four spaces of 
indentation so indented code lines remain untouched. --- src/wrap/block.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/wrap/block.rs b/src/wrap/block.rs index 8ebe6868..c8fea348 100644 --- a/src/wrap/block.rs +++ b/src/wrap/block.rs @@ -92,19 +92,19 @@ pub(crate) fn classify_block(line: &str) -> Option<BlockKind> { if indent_width < 4 && trimmed.starts_with('#') { return Some(BlockKind::Heading); } - if BULLET_RE.is_match(line) { + if indent_width < 4 && BULLET_RE.is_match(line) { return Some(BlockKind::Bullet); } - if BLOCKQUOTE_RE.is_match(line) { + if indent_width < 4 && BLOCKQUOTE_RE.is_match(line) { return Some(BlockKind::Blockquote); } - if FOOTNOTE_RE.is_match(line) { + if indent_width < 4 && FOOTNOTE_RE.is_match(line) { return Some(BlockKind::FootnoteDefinition); } - if is_markdownlint_directive(line) { + if indent_width < 4 && is_markdownlint_directive(line) { return Some(BlockKind::MarkdownlintDirective); } - if trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { + if indent_width < 4 && trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { return Some(BlockKind::DigitPrefix); } None @@ -149,7 +149,7 @@ mod tests { case("#123", Some(BlockKind::Heading)), case("1) list", Some(BlockKind::Bullet)), case(" 2024", Some(BlockKind::DigitPrefix)), - case(" 1. code", Some(BlockKind::Bullet)) + case(" 1. code", None) )] fn classify_block_identifies_prefixes(line: &str, expected: Option<BlockKind>) { assert_eq!(classify_block(line), expected);