diff --git a/src/process.rs b/src/process.rs index 5cef3989..5805f846 100644 --- a/src/process.rs +++ b/src/process.rs @@ -6,7 +6,7 @@ use crate::{ footnotes::convert_footnotes, html::convert_html_tables, table::reflow_table, - wrap::{FenceTracker, wrap_text}, + wrap::{FenceTracker, classify_block, wrap_text}, }; /// Column width used when wrapping text. @@ -86,7 +86,9 @@ fn handle_table_line( in_table: &mut bool, out: &mut Vec, ) -> bool { - if line.trim_start().starts_with('|') { + let trimmed = line.trim_start(); + + if trimmed.starts_with('|') { *in_table = true; buf.push(line.to_string()); return true; @@ -102,18 +104,15 @@ fn handle_table_line( return true; } if *in_table { - let trimmed = line.trim_start(); - let new_block = trimmed.starts_with('#') - || trimmed.starts_with('*') - || trimmed.starts_with('-') - || trimmed.starts_with('>') - || trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()); - if new_block { + if classify_block(line).is_some() { + // Flush when a new Markdown block (heading, list, quote, footnote, directive, + // or digit-prefixed text) begins so wrapping and table detection stay aligned. flush_buffer(buf, in_table, out); return false; } - buf.push(line.to_string()); - return true; + // Plain paragraphs also end the table so the caller can reprocess them for wrapping. + flush_buffer(buf, in_table, out); + return false; } false } diff --git a/src/wrap.rs b/src/wrap.rs index 8b204073..2fbd3334 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -8,14 +8,16 @@ //! The [`Token`] enum and [`tokenize_markdown`] function are public so callers //! can perform custom token-based processing. -use regex::Regex; -use unicode_width::UnicodeWidthStr; - +mod block; mod fence; +mod inline; mod line_buffer; +mod paragraph; mod tokenize; -pub(crate) use self::line_buffer::LineBuffer; +use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; +pub(crate) use block::{BlockKind, classify_block}; pub use fence::{FenceTracker, is_fence}; +use paragraph::{flush_paragraph, handle_prefix_line}; /// Token emitted by the `tokenize::segment_inline` parser and used by /// higher-level wrappers. /// @@ -31,292 +33,17 @@ pub use tokenize::tokenize_markdown; // Permit GFM task list markers with flexible spacing and missing post-marker // spaces in Markdown. -static BULLET_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", - "bullet pattern regex should compile", -); - -static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", - "footnote pattern regex should compile", -); - -static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( - r"^(\s*(?:>\s*)+)(.*)$", - "blockquote pattern regex should compile", -); - -/// Matches `markdownlint` comment directives. -/// -/// The regex is case-insensitive and recognises these forms with optional rule -/// names (including plugin rules such as `MD013/line-length` or -/// `plugin/rule-name`): -/// - `` -/// - `` -/// - `` -/// - `` -static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| { - Regex::new( - r"(?i)^\s*\s*$", - ) - .expect("valid markdownlint regex") -}); - -#[inline] -fn is_trailing_punct(c: char) -> bool { - // ASCII closers + common Unicode closers and word-final punctuation - matches!( - c, - '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' - ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c) -} - -fn looks_like_link(token: &str) -> bool { - (token.starts_with('[') || token.starts_with("![")) - && token.contains("](") - && token.ends_with(')') -} - -fn is_whitespace_token(token: &str) -> bool { - token.chars().all(char::is_whitespace) -} - -fn is_inline_code_token(token: &str) -> bool { - token.starts_with('`') && token.ends_with('`') -} - -fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { - while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - } - j -} - -#[inline] -fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { - debug_assert!( - tokens[i] == "`", - "merge_code_span requires a single backtick opener" - ); - let mut j = i + 1; - while j < tokens.len() && tokens[j] != "`" { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - } - if j < tokens.len() { - *width += UnicodeWidthStr::width(tokens[j].as_str()); - j += 1; - j = extend_punctuation(tokens, j, width); - } - j -} - -#[inline] -fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { - #[derive(PartialEq, Eq)] - enum SpanKind { - General, - Code, - Link, - } - - let mut end = start + 1; - let mut width = UnicodeWidthStr::width(tokens[start].as_str()); - let mut kind = SpanKind::General; - - if tokens[start] == "`" { - kind = SpanKind::Code; - end = merge_code_span(tokens, start, &mut width); - } else if is_inline_code_token(&tokens[start]) { - kind = SpanKind::Code; - end = extend_punctuation(tokens, end, &mut width); - } else if looks_like_link(&tokens[start]) { - kind = SpanKind::Link; - end = extend_punctuation(tokens, end, &mut width); - } - - while end < tokens.len() { - let token = &tokens[end]; - if is_whitespace_token(token) { - if matches!(kind, SpanKind::Code | SpanKind::Link) - && end + 1 < tokens.len() - && (looks_like_link(&tokens[end + 1]) - || is_inline_code_token(&tokens[end + 1]) - || tokens[end + 1].chars().all(is_trailing_punct)) - { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - continue; - } - break; - } - - if token.chars().all(is_trailing_punct) { - if matches!(kind, SpanKind::Code | SpanKind::Link) { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - continue; - } - break; - } - - let is_link = looks_like_link(token); - let is_code = is_inline_code_token(token); - - if kind == SpanKind::Link && is_link { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - end = extend_punctuation(tokens, end, &mut width); - continue; - } - - if kind == SpanKind::Code && is_code { - width += UnicodeWidthStr::width(token.as_str()); - end += 1; - end = extend_punctuation(tokens, end, &mut width); - continue; - } - - break; - } - - (end, width) -} - -fn attach_punctuation_to_previous_line(lines: &mut [String], current: &str, token: &str) -> bool { - if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { - return false; - } - - let Some(last_line) = lines.last_mut() else { - return false; - }; - - if last_line.trim_end().ends_with('`') { - last_line.push_str(token); - return true; - } - - false -} - -fn wrap_preserving_code(text: &str, width: usize) -> Vec { - let tokens = tokenize::segment_inline(text); - if tokens.is_empty() { - return Vec::new(); - } - let mut lines = Vec::new(); - let mut buffer = LineBuffer::new(); - let mut i = 0; +fn is_indented_code_line(line: &str) -> bool { + // CommonMark expands tabs to four spaces when measuring indentation. + let indent_width = line + .as_bytes() + .iter() + .take_while(|b| **b == b' ' || **b == 0x09) + .map(|&b| if b == 0x09 { 4 } else { 1 }) + .sum::(); - while i < tokens.len() { - let (group_end, group_width) = determine_token_span(&tokens, i); - - if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) { - i += 1; - continue; - } - - if buffer.width() + group_width <= width { - buffer.push_span(&tokens, i, group_end); - i = group_end; - continue; - } - - if buffer.split_with_span(&mut lines, &tokens, i, group_end, width) { - i = group_end; - continue; - } - - if buffer.flush_trailing_whitespace(&mut lines, &tokens, i, group_end) { - i = group_end; - continue; - } - - buffer.flush_into(&mut lines); - buffer.push_non_whitespace_span(&tokens, i, group_end); - i = group_end; - } - - buffer.flush_into(&mut lines); - lines -} - -pub(crate) fn is_markdownlint_directive(line: &str) -> bool { - MARKDOWNLINT_DIRECTIVE_RE.is_match(line) -} - -fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { - if buf.is_empty() { - return; - } - let mut segment = String::new(); - for (text, hard_break) in buf { - if !segment.is_empty() { - segment.push(' '); - } - segment.push_str(text); - if *hard_break { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - segment.clear(); - } - } - if !segment.is_empty() { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - } -} - -fn append_wrapped_with_prefix( - out: &mut Vec, - prefix: &str, - text: &str, - width: usize, - repeat_prefix: bool, -) { - let prefix_width = UnicodeWidthStr::width(prefix); - let available = width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(text, available); - if lines.is_empty() { - out.push(prefix.to_string()); - return; - } - - for (i, line) in lines.iter().enumerate() { - if i == 0 { - out.push(format!("{prefix}{line}")); - } else { - out.push(format!("{wrapped_indent}{line}")); - } - } -} - -fn handle_prefix_line( - out: &mut Vec, - buf: &mut Vec<(String, bool)>, - indent: &mut String, - width: usize, - prefix: &str, - rest: &str, - repeat_prefix: bool, -) { - flush_paragraph(out, buf, indent, width); - buf.clear(); - indent.clear(); - append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); + indent_width >= 4 && line.chars().any(|c| !c.is_whitespace()) } /// Wrap text lines to the given width. @@ -356,15 +83,10 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { continue; } - if line.trim_start().starts_with('#') { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - out.push(line.clone()); - continue; - } - - if is_markdownlint_directive(line) { + if matches!( + classify_block(line), + Some(BlockKind::Heading | BlockKind::MarkdownlintDirective) + ) { flush_paragraph(&mut out, &buf, &indent, width); buf.clear(); indent.clear(); @@ -407,6 +129,15 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { continue; } + if is_indented_code_line(line) { + // Preserve indented code blocks verbatim so wrapping does not merge them into paragraphs. + flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + out.push(line.clone()); + continue; + } + if buf.is_empty() { indent = line.chars().take_while(|c| c.is_whitespace()).collect(); } diff --git a/src/wrap/block.rs b/src/wrap/block.rs new file mode 100644 index 00000000..c8fea348 --- /dev/null +++ b/src/wrap/block.rs @@ -0,0 +1,167 @@ +//! Block-level Markdown prefix classification shared by wrapping and table detection. +//! +//! The regex helpers centralise detection for headings, lists, blockquotes, footnotes, +//! markdownlint directives, and digit-prefixed paragraphs so wrapping and table handlers +//! stay in sync. + +use regex::Regex; + +/// Returns the indentation width (treating tabs as four columns) and the byte +/// offset of the first non-space or tab character. +fn leading_indent(line: &str) -> (usize, usize) { + let mut width = 0; + let mut bytes = 0; + for &b in line.as_bytes() { + match b { + b' ' => { + width += 1; + bytes += 1; + } + 0x09 => { + width += 4; + bytes += 1; + } + _ => break, + } + } + (width, bytes) +} + +/// Matches bullet and ordered list prefixes captured for wrapping and table detection. +pub(super) static BULLET_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:[-*+]|\d+[.)])\s+(?:\[\s*(?:[xX]|\s)\s*\]\s*)?)(.*)", + "bullet pattern regex should compile", +); + +/// Matches footnote definition prefixes so they remain atomic during wrapping and table parsing. +pub(super) static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", + "footnote pattern regex should compile", +); + +/// Matches blockquote prefixes, capturing the marker run and the remainder for reuse. +pub(super) static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:>\s*)+)(.*)$", + "blockquote pattern regex should compile", +); + +/// Matches `markdownlint` comment directives. +/// +/// The regex is case-insensitive and recognises these forms with optional rule +/// names (including plugin rules such as `MD013/line-length` or +/// `plugin/rule-name`): +/// - `` +/// - `` +/// - `` +/// - `` +pub(super) static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = lazy_regex!( + r"(?i)^\s*\s*$", + "markdownlint directive regex should compile", +); + +/// Describes the Markdown block prefix detected by [`classify_block`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BlockKind { + /// Lines that begin with `#`, `##`, and similar heading prefixes. + Heading, + /// Bullet or ordered list markers matched by [`BULLET_RE`]. + Bullet, + /// Lines that begin with one or more `>` markers. + Blockquote, + /// Footnote definitions recognised by [`FOOTNOTE_RE`]. + FootnoteDefinition, + /// HTML-style markdownlint directives recognised by [`is_markdownlint_directive`]. + MarkdownlintDirective, + /// Lines whose first non-whitespace character is an ASCII digit. + DigitPrefix, +} + +/// Classifies block-level Markdown prefixes shared by wrapping and table detection. +/// +/// Detection order determines precedence when a line could match multiple prefixes. +/// The current precedence is: heading, bullet, blockquote, footnote definition, +/// markdownlint directive, digit prefix. Headings outrank bullets and blockquotes, +/// so inputs such as "# 1" remain headings rather than list items. Headings ignore +/// indentation of four or more spaces so indented code remains untouched. +/// For example, passing "> quote" returns `Some(BlockKind::Blockquote)` while +/// "| cell |" yields `None` because the line is part of a table. +pub(crate) fn classify_block(line: &str) -> Option { + let (indent_width, indent_bytes) = leading_indent(line); + let trimmed = line[indent_bytes..].trim_start(); + + if indent_width < 4 && trimmed.starts_with('#') { + return Some(BlockKind::Heading); + } + if indent_width < 4 && BULLET_RE.is_match(line) { + return Some(BlockKind::Bullet); + } + if indent_width < 4 && BLOCKQUOTE_RE.is_match(line) { + return Some(BlockKind::Blockquote); + } + if indent_width < 4 && FOOTNOTE_RE.is_match(line) { + return Some(BlockKind::FootnoteDefinition); + } + if indent_width < 4 && is_markdownlint_directive(line) { + return Some(BlockKind::MarkdownlintDirective); + } + if indent_width < 4 && trimmed.chars().next().is_some_and(|c| c.is_ascii_digit()) { + return Some(BlockKind::DigitPrefix); + } + None +} + +/// Returns `true` when `line` matches a recognised `markdownlint` directive comment. +/// +/// # Examples +/// +/// ```rust,ignore +/// use crate::wrap::block::is_markdownlint_directive; +/// assert!(is_markdownlint_directive("")); +/// assert!(!is_markdownlint_directive("")); +/// ``` +#[inline] +pub(super) fn is_markdownlint_directive(line: &str) -> bool { + MARKDOWNLINT_DIRECTIVE_RE.is_match(line) +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + #[rstest( + line, + expected, + case("# Heading", Some(BlockKind::Heading)), + case(" # Heading", Some(BlockKind::Heading)), + case(" # Code block", None), + case("- item", Some(BlockKind::Bullet)), + case("1. item", Some(BlockKind::Bullet)), + case("> quote", Some(BlockKind::Blockquote)), + case("[^1]: footnote", Some(BlockKind::FootnoteDefinition)), + case( + "", + Some(BlockKind::MarkdownlintDirective) + ), + case("2024 revenue", Some(BlockKind::DigitPrefix)), + case("plain paragraph", None), + case("| a | b |", None), + case("#123", Some(BlockKind::Heading)), + case("1) list", Some(BlockKind::Bullet)), + case(" 2024", Some(BlockKind::DigitPrefix)), + case(" 1. code", None) + )] + fn classify_block_identifies_prefixes(line: &str, expected: Option) { + assert_eq!(classify_block(line), expected); + } + + #[rstest] + #[case("", true)] + #[case("", true)] + #[case("", true)] + #[case("", false)] + #[case("", false)] + fn detects_markdownlint_directives(#[case] line: &str, #[case] expected: bool) { + assert_eq!(is_markdownlint_directive(line), expected); + } +} diff --git a/src/wrap/inline.rs b/src/wrap/inline.rs new file mode 100644 index 00000000..d00afbe0 --- /dev/null +++ b/src/wrap/inline.rs @@ -0,0 +1,194 @@ +//! Inline wrapping helpers that keep code spans intact. +//! +//! These functions operate on token streams so `wrap_text` can preserve +//! inline code, links, and trailing punctuation without reimplementing the +//! grouping logic in multiple places. + +use unicode_width::UnicodeWidthStr; + +use super::{line_buffer::LineBuffer, tokenize}; + +#[inline] +fn is_trailing_punct(c: char) -> bool { + // ASCII closers + common Unicode closers and word-final punctuation + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) || "…—–»›)]】》」』、。,:;!?”.’".contains(c) +} + +fn looks_like_link(token: &str) -> bool { + (token.starts_with('[') || token.starts_with("![")) + && token.contains("](") + && token.ends_with(')') +} + +fn is_whitespace_token(token: &str) -> bool { + token.chars().all(char::is_whitespace) +} + +fn is_inline_code_token(token: &str) -> bool { + token.starts_with('`') && token.ends_with('`') +} + +fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { + while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + j +} + +#[inline] +fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { + debug_assert!( + tokens[i] == "`", + "merge_code_span requires a single backtick opener" + ); + let mut j = i + 1; + while j < tokens.len() && tokens[j] != "`" { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + if j < tokens.len() { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + j = extend_punctuation(tokens, j, width); + } + j +} + +pub(super) fn determine_token_span(tokens: &[String], start: usize) -> (usize, usize) { + #[derive(PartialEq, Eq)] + enum SpanKind { + General, + Code, + Link, + } + + let mut end = start + 1; + let mut width = UnicodeWidthStr::width(tokens[start].as_str()); + let mut kind = SpanKind::General; + + if tokens[start] == "`" { + kind = SpanKind::Code; + end = merge_code_span(tokens, start, &mut width); + } else if is_inline_code_token(&tokens[start]) { + kind = SpanKind::Code; + end = extend_punctuation(tokens, end, &mut width); + } else if looks_like_link(&tokens[start]) { + kind = SpanKind::Link; + end = extend_punctuation(tokens, end, &mut width); + } + + while end < tokens.len() { + let token = &tokens[end]; + if is_whitespace_token(token) { + if matches!(kind, SpanKind::Code | SpanKind::Link) + && end + 1 < tokens.len() + && (looks_like_link(&tokens[end + 1]) + || is_inline_code_token(&tokens[end + 1]) + || tokens[end + 1].chars().all(is_trailing_punct)) + { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + continue; + } + break; + } + + if token.chars().all(is_trailing_punct) { + if matches!(kind, SpanKind::Code | SpanKind::Link) { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + continue; + } + break; + } + + let is_link = looks_like_link(token); + let is_code = is_inline_code_token(token); + + if kind == SpanKind::Link && is_link { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + end = extend_punctuation(tokens, end, &mut width); + continue; + } + + if kind == SpanKind::Code && is_code { + width += UnicodeWidthStr::width(token.as_str()); + end += 1; + end = extend_punctuation(tokens, end, &mut width); + continue; + } + + break; + } + + (end, width) +} + +pub(super) fn attach_punctuation_to_previous_line( + lines: &mut [String], + current: &str, + token: &str, +) -> bool { + if !current.is_empty() || token.len() != 1 || !".?!,:;".contains(token) { + return false; + } + + let Some(last_line) = lines.last_mut() else { + return false; + }; + + if last_line.trim_end().ends_with('`') { + last_line.push_str(token); + return true; + } + + false +} + +pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec { + let tokens = tokenize::segment_inline(text); + if tokens.is_empty() { + return Vec::new(); + } + + let mut lines = Vec::new(); + let mut buffer = LineBuffer::new(); + let mut i = 0; + + while i < tokens.len() { + let (group_end, group_width) = determine_token_span(&tokens, i); + + if attach_punctuation_to_previous_line(lines.as_mut_slice(), buffer.text(), &tokens[i]) { + i += 1; + continue; + } + + if buffer.width() + group_width <= width { + buffer.push_span(&tokens, i, group_end); + i = group_end; + continue; + } + + if buffer.split_with_span(&mut lines, &tokens, i, group_end, width) { + i = group_end; + continue; + } + + if buffer.flush_trailing_whitespace(&mut lines, &tokens, i, group_end) { + i = group_end; + continue; + } + + buffer.flush_into(&mut lines); + buffer.push_non_whitespace_span(&tokens, i, group_end); + i = group_end; + } + + buffer.flush_into(&mut lines); + lines +} diff --git a/src/wrap/paragraph.rs b/src/wrap/paragraph.rs new file mode 100644 index 00000000..79e52a17 --- /dev/null +++ b/src/wrap/paragraph.rs @@ -0,0 +1,84 @@ +//! Paragraph wrapping utilities shared by `wrap_text`. +//! +//! These helpers keep paragraph logic focused on buffer management while +//! deferring inline wrapping to `inline::wrap_preserving_code`. + +use unicode_width::UnicodeWidthStr; + +use super::inline::wrap_preserving_code; + +fn append_wrapped_with_prefix( + out: &mut Vec, + prefix: &str, + text: &str, + width: usize, + repeat_prefix: bool, +) { + let prefix_width = UnicodeWidthStr::width(prefix); + let available = width.saturating_sub(prefix_width).max(1); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let wrapped_indent = if repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + let lines = wrap_preserving_code(text, available); + if lines.is_empty() { + out.push(prefix.to_string()); + return; + } + + for (i, line) in lines.iter().enumerate() { + if i == 0 { + out.push(format!("{prefix}{line}")); + } else { + out.push(format!("{wrapped_indent}{line}")); + } + } +} + +pub(super) fn flush_paragraph( + out: &mut Vec, + buf: &[(String, bool)], + indent: &str, + width: usize, +) { + if buf.is_empty() { + return; + } + let mut segment = String::new(); + for (text, hard_break) in buf { + if !segment.is_empty() { + segment.push(' '); + } + segment.push_str(text); + if *hard_break { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + segment.clear(); + } + } + if !segment.is_empty() { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + } +} + +pub(super) fn handle_prefix_line( + out: &mut Vec, + buf: &mut Vec<(String, bool)>, + indent: &mut String, + width: usize, + prefix: &str, + rest: &str, + repeat_prefix: bool, +) { + flush_paragraph(out, buf, indent, width); + buf.clear(); + indent.clear(); + append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); +} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 86453f19..3ea1eacb 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -6,10 +6,11 @@ use rstest::rstest; use super::{ - LineBuffer, attach_punctuation_to_previous_line, determine_token_span, - tokenize::segment_inline, wrap_preserving_code, + inline::{attach_punctuation_to_previous_line, determine_token_span, wrap_preserving_code}, + line_buffer::LineBuffer, + tokenize::segment_inline, }; -use crate::wrap::wrap_text; +use crate::wrap::{BlockKind, classify_block, wrap_text}; #[rstest] #[case("`code`!", "`code`!")] @@ -333,4 +334,63 @@ fn wrap_text_keeps_trailing_spaces_for_bullet_final_line() { ); } +#[test] +fn wrap_text_preserves_indented_hash_as_text() { + let input = vec![ + "Paragraph intro.".to_string(), + " # code".to_string(), + "Continuation.".to_string(), + ]; + let wrapped = wrap_text(&input, 40); + assert_eq!( + wrapped, + vec![ + "Paragraph intro.".to_string(), + " # code".to_string(), + "Continuation.".to_string(), + ] + ); +} + +#[test] +fn wrap_text_flushes_before_heading() { + let input = vec![ + "Paragraph intro.".to_string(), + "# Heading".to_string(), + "Continuation.".to_string(), + ]; + let wrapped = wrap_text(&input, 40); + assert_eq!( + wrapped, + vec![ + "Paragraph intro.".to_string(), + "# Heading".to_string(), + "Continuation.".to_string(), + ] + ); +} + +#[rstest( + line, + expected, + case("# Heading", Some(BlockKind::Heading)), + case(" # Heading", Some(BlockKind::Heading)), + case(" # Heading", None), + case(" # Heading", None), + case("- item", Some(BlockKind::Bullet)), + case("1. item", Some(BlockKind::Bullet)), + case("> quote", Some(BlockKind::Blockquote)), + case("[^1]: footnote", Some(BlockKind::FootnoteDefinition)), + case( + "", + Some(BlockKind::MarkdownlintDirective) + ), + case("2024 revenue", Some(BlockKind::DigitPrefix)), + case("a | b", None), + case("plain text", None) +)] +fn classify_block_detects_markdown_prefixes(line: &str, expected: Option) { + assert_eq!(classify_block(line), expected); +} + mod fence_tracker; diff --git a/tests/table/process_stream_tests.rs b/tests/table/process_stream_tests.rs index 44d76836..70e80966 100644 --- a/tests/table/process_stream_tests.rs +++ b/tests/table/process_stream_tests.rs @@ -80,6 +80,49 @@ fn test_non_table_lines_unchanged() { assert_eq!(output, expected); } +#[test] +fn test_process_stream_reflows_table_before_numeric_paragraph() { + let input = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "2024 revenue climbed 10%", + ]; + let expected = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "2024 revenue climbed 10%", + ]; + assert_eq!(process_stream(&input), expected); +} + +#[test] +fn flushes_table_before_plain_paragraph_no_blank() { + let input = vec![ + "| a | b |".to_string(), + "|---|---|".to_string(), + "AWS revenue rose".to_string(), + ]; + let out = crate::process::process_stream(&input); + assert!(out.iter().any(|l| l.contains("| a | b |"))); + assert!(out.iter().any(|l| l == "AWS revenue rose")); +} + +#[test] +fn test_process_stream_reflows_table_before_heading() { + let input = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "# Heading", + ]; + let expected = lines_vec![ + "| a | b |", + "| 1 | 22 |", + "# Heading", + ]; + assert_eq!(process_stream(&input), expected); +} + + #[test] fn test_process_stream_only_whitespace() { let input = lines_vec!["", " ", "\t\t"];