From 13b2cfd3af506688256ff0c5e47149e929e80945 Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 09:57:35 +0100 Subject: [PATCH 1/3] Extract tokenize helpers and move wrap tests --- src/wrap.rs | 312 +------------------------------------------ src/wrap/tests.rs | 104 +++++++++++++++ src/wrap/tokenize.rs | 176 ++++++++++++++++++++++++ 3 files changed, 286 insertions(+), 306 deletions(-) create mode 100644 src/wrap/tests.rs create mode 100644 src/wrap/tokenize.rs diff --git a/src/wrap.rs b/src/wrap.rs index 7d331bd3..8932438c 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -6,6 +6,9 @@ use regex::{Captures, Regex}; +mod tokenize; +pub(crate) use tokenize::{Token, tokenize_markdown}; + static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap()); @@ -70,195 +73,6 @@ static HANDLERS: &[PrefixHandler] = &[ }, ]; -/// Markdown token emitted by [`tokenize_markdown`]. -#[derive(Debug, PartialEq)] -pub enum Token<'a> { - /// Line within a fenced code block, including the fence itself. - Fence(&'a str), - /// Inline code span without surrounding backticks. - Code(&'a str), - /// Plain text outside code regions. - Text(&'a str), - /// Line break separating tokens. - Newline, -} - -fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { - let start = i; - if chars[i] == '!' { - i += 1; - } - // skip initial '[' which we know is present - i += 1; - while i < chars.len() && chars[i] != ']' { - i += 1; - } - if i < chars.len() && chars[i] == ']' { - i += 1; - if i < chars.len() && chars[i] == '(' { - i += 1; - let mut depth = 1; - while i < chars.len() && depth > 0 { - match chars[i] { - '(' => depth += 1, - ')' => depth -= 1, - _ => {} - } - i += 1; - } - let tok: String = chars[start..i].iter().collect(); - return (tok, i); - } - } - let tok: String = chars[start..=start].iter().collect(); - (tok, start + 1) -} - -fn is_trailing_punctuation(c: char) -> bool { - matches!( - c, - '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' - ) -} - -fn tokenize_inline(text: &str) -> Vec { - let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); - let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { - let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } else if c == '`' { - let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } - let mut end = i; - while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } - } - end += 1; - } - if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; - } else { - tokens.push(chars[start..end].iter().collect()); - i = end; - } - } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, mut new_i) = parse_link_or_image(&chars, i); - tokens.push(tok); - let mut punct = String::new(); - while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) { - punct.push(chars[new_i]); - new_i += 1; - } - if !punct.is_empty() { - tokens.push(punct); - } - i = new_i; - } else { - let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } - } - tokens -} - -/// Split the input string into [`Token`]s by analysing whitespace and -/// backtick delimiters. -/// -/// The tokenizer groups consecutive whitespace into a single -/// [`Token::Text`] and recognises backtick sequences as inline code spans. -/// When a run of backticks is encountered the parser searches forward for an -/// identical delimiter, allowing nested backticks when the span uses a longer -/// fence. Unmatched delimiter sequences are treated as literal text. -/// -/// ```rust,ignore -/// use mdtablefix::wrap::{Token, tokenize_markdown}; -/// -/// let tokens = tokenize_markdown("Example with `code`"); -/// assert_eq!( -/// tokens, -/// vec![Token::Text("Example with "), Token::Code("code")] -/// ); -/// ``` -pub(crate) fn tokenize_markdown(input: &str) -> Vec> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - -/// Determine if the current line should break at the last whitespace. -/// -/// Returns `true` if `current_width` exceeds `width` and a whitespace split -/// position is available. -/// -/// # Examples -/// -/// ```ignore -/// use mdtablefix::wrap::should_break_line; -/// assert!(should_break_line(10, 12, Some(3))); -/// assert!(!should_break_line(10, 8, Some(3))); -/// ``` -fn should_break_line(width: usize, current_width: usize, last_split: Option) -> bool { - current_width > width && last_split.is_some() -} - fn wrap_preserving_code(text: &str, width: usize) -> Vec { use unicode_width::UnicodeWidthStr; @@ -266,14 +80,14 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let mut current = String::new(); let mut current_width = 0; let mut last_split: Option = None; - let tokens = tokenize_inline(text); + let tokens = tokenize::tokenize_inline(text); let mut i = 0; while i < tokens.len() { let mut j = i + 1; let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); if tokens[i].contains("](") && tokens[i].ends_with(')') { - while j < tokens.len() && tokens[j].chars().all(is_trailing_punctuation) { + while j < tokens.len() && tokens[j].chars().all(tokenize::is_trailing_punctuation) { group_width += UnicodeWidthStr::width(tokens[j].as_str()); j += 1; } @@ -306,7 +120,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { continue; } - if should_break_line(width, current_width + group_width, last_split) { + if tokenize::should_break_line(width, current_width + group_width, last_split) { let pos = last_split.unwrap(); let line = current[..pos].to_string(); let mut rest = current[pos..].trim_start().to_string(); @@ -547,117 +361,3 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { flush_paragraph(&mut out, &buf, &indent, width); out } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); - } - - #[test] - fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat." - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" - .to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." - .to_string(), - ] - ); - } - - #[test] - fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's".to_string(), - "`interpreter` field overrides the setting.".to_string(), - ] - ); - } - - #[test] - fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); - } - - #[test] - fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "Use `` `code` `` to".to_string(), - "quote backticks".to_string() - ] - ); - } - - #[test] - fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a".to_string(), "`dangling code span.".to_string()] - ); - } - - #[test] - fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured" - .to_string(), - "approach to asynchronous WebSocket routing and background worker integration." - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!( - wrapped - .iter() - .any(|l| l.contains("https://falcon.readthedocs.io")) - ); - } -} diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs new file mode 100644 index 00000000..66fb9a94 --- /dev/null +++ b/src/wrap/tests.rs @@ -0,0 +1,104 @@ +#[cfg(test)] +mod tests { + use super::super::*; + + #[test] + fn wrap_text_preserves_hyphenated_words() { + let input = vec!["A word that is very-long-word indeed".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "A word that is".to_string(), + "very-long-word".to_string(), + "indeed".to_string(), + ] + ); + } + + #[test] + fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { + let input = vec![ + concat!( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", + "volutpat.", + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec![ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" + .to_string(), + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." + .to_string(), + ] + ); + } + + #[test] + fn wrap_text_preserves_code_spans() { + let input = vec![ + "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ + unless the manifest's `interpreter` field overrides the setting." + .to_string(), + ]; + let wrapped = wrap_text(&input, 60); + assert_eq!( + wrapped, + vec![ + "with their own escaping rules. On Windows, scripts default".to_string(), + "to `powershell -Command` unless the manifest's".to_string(), + "`interpreter` field overrides the setting.".to_string(), + ] + ); + } + + #[test] + fn wrap_text_multiple_code_spans() { + let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; + let wrapped = wrap_text(&input, 25); + assert_eq!( + wrapped, + vec![ + "combine `foo bar` and".to_string(), + "`baz qux` in one line".to_string(), + ] + ); + } + + #[test] + fn wrap_text_nested_backticks() { + let input = vec!["Use `` `code` `` to quote backticks".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["Use `` `code` `` to".to_string(), "quote backticks".to_string()], + ); + } + + #[test] + fn wrap_text_unmatched_backticks() { + let input = vec!["This has a `dangling code span.".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["This has a".to_string(), "`dangling code span.".to_string()], + ); + } + + #[test] + fn wrap_text_preserves_links() { + let input = vec![ + "`falcon-pachinko` is an extension library for the".to_string(), + "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(), + "approach to asynchronous WebSocket routing and background worker integration.".to_string(), + ]; + let wrapped = wrap_text(&input, 80); + let joined = wrapped.join("\n"); + assert_eq!(joined.matches("https://").count(), 1); + assert!(wrapped.iter().any(|l| l.contains("https://falcon.readthedocs.io"))); + } +} diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs new file mode 100644 index 00000000..ed81a4c3 --- /dev/null +++ b/src/wrap/tokenize.rs @@ -0,0 +1,176 @@ +//! Tokenization helpers for wrapping logic. +//! +//! This module contains utilities for breaking lines into tokens so that +//! inline code spans and Markdown links are preserved during wrapping. + +use super::FENCE_RE; + +/// Markdown token emitted by [`tokenize_markdown`]. +#[derive(Debug, PartialEq)] +pub enum Token<'a> { + /// Line within a fenced code block, including the fence itself. + Fence(&'a str), + /// Inline code span without surrounding backticks. + Code(&'a str), + /// Plain text outside code regions. + Text(&'a str), + /// Line break separating tokens. + Newline, +} + +fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { + let start = i; + if chars[i] == '!' { + i += 1; + } + i += 1; // skip initial '[' which we know is present + while i < chars.len() && chars[i] != ']' { + i += 1; + } + if i < chars.len() && chars[i] == ']' { + i += 1; + if i < chars.len() && chars[i] == '(' { + i += 1; + let mut depth = 1; + while i < chars.len() && depth > 0 { + match chars[i] { + '(' => depth += 1, + ')' => depth -= 1, + _ => {} + } + i += 1; + } + let tok: String = chars[start..i].iter().collect(); + return (tok, i); + } + } + let tok: String = chars[start..=start].iter().collect(); + (tok, start + 1) +} + +pub(super) fn is_trailing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) +} + +pub(super) fn tokenize_inline(text: &str) -> Vec { + let mut tokens = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_whitespace() { + let start = i; + while i < chars.len() && chars[i].is_whitespace() { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } else if c == '`' { + let start = i; + let mut delim_len = 0; + while i < chars.len() && chars[i] == '`' { + i += 1; + delim_len += 1; + } + let mut end = i; + while end < chars.len() { + if chars[end] == '`' { + let mut j = end; + let mut count = 0; + while j < chars.len() && chars[j] == '`' { + j += 1; + count += 1; + } + if count == delim_len { + end = j; + break; + } + } + end += 1; + } + if end >= chars.len() { + tokens.push(chars[start..start + delim_len].iter().collect()); + i = start + delim_len; + } else { + tokens.push(chars[start..end].iter().collect()); + i = end; + } + } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { + let (tok, mut new_i) = parse_link_or_image(&chars, i); + tokens.push(tok); + let mut punct = String::new(); + while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) { + punct.push(chars[new_i]); + new_i += 1; + } + if !punct.is_empty() { + tokens.push(punct); + } + i = new_i; + } else { + let start = i; + while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } + } + tokens +} + +/// Split the input string into [`Token`]s by analysing whitespace and backtick +/// delimiters. +/// +/// The tokenizer groups consecutive whitespace into a single [`Token::Text`] and +/// recognises backtick sequences as inline code spans. When a run of backticks +/// is encountered the parser searches forward for an identical delimiter, +/// allowing nested backticks when the span uses a longer fence. Unmatched +/// delimiter sequences are treated as literal text. +pub(crate) fn tokenize_markdown(input: &str) -> Vec> { + let mut out = Vec::new(); + let mut in_fence = false; + for line in input.split_inclusive('\n') { + let trimmed = line.trim_end_matches('\n'); + if FENCE_RE.is_match(trimmed) { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + in_fence = !in_fence; + continue; + } + if in_fence { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + continue; + } + let mut rest = trimmed; + while let Some(pos) = rest.find('`') { + if pos > 0 { + out.push(Token::Text(&rest[..pos])); + } + if let Some(end) = rest[pos + 1..].find('`') { + out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); + rest = &rest[pos + end + 2..]; + } else { + out.push(Token::Text(&rest[pos..])); + rest = ""; + break; + } + } + if !rest.is_empty() { + out.push(Token::Text(rest)); + } + out.push(Token::Newline); + } + out.pop(); + out +} + +pub(super) fn should_break_line( + width: usize, + current_width: usize, + last_split: Option, +) -> bool { + current_width > width && last_split.is_some() +} From 37876fb1b43e23dd4c9fea36f44afd7c98b2502c Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 11:29:25 +0100 Subject: [PATCH 2/3] Add tokenization tests and helpers --- src/wrap.rs | 5 +- src/wrap/tests.rs | 194 ++++++++++++++++++++++--------------------- src/wrap/tokenize.rs | 109 ++++++++++++++---------- 3 files changed, 169 insertions(+), 139 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index 8932438c..1ba777b2 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -80,7 +80,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let mut current = String::new(); let mut current_width = 0; let mut last_split: Option = None; - let tokens = tokenize::tokenize_inline(text); + let tokens = tokenize::segment_inline(text); let mut i = 0; while i < tokens.len() { let mut j = i + 1; @@ -361,3 +361,6 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { flush_paragraph(&mut out, &buf, &indent, width); out } + +#[cfg(test)] +mod tests; diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 66fb9a94..d9315c9e 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -1,104 +1,106 @@ -#[cfg(test)] -mod tests { - use super::super::*; +use super::super::*; - #[test] - fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); - } +#[test] +fn wrap_text_preserves_hyphenated_words() { + let input = vec!["A word that is very-long-word indeed".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "A word that is".to_string(), + "very-long-word".to_string(), + "indeed".to_string(), + ] + ); +} - #[test] - fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat.", - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" - .to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." - .to_string(), - ] - ); - } +#[test] +fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { + let input = vec![ + concat!( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", + "volutpat.", + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec![ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt".to_string(), + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat.".to_string(), + ] + ); +} - #[test] - fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's".to_string(), - "`interpreter` field overrides the setting.".to_string(), - ] - ); - } +#[test] +fn wrap_text_preserves_code_spans() { + let input = vec![ + "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ + unless the manifest's `interpreter` field overrides the setting." + .to_string(), + ]; + let wrapped = wrap_text(&input, 60); + assert_eq!( + wrapped, + vec![ + "with their own escaping rules. On Windows, scripts default".to_string(), + "to `powershell -Command` unless the manifest's".to_string(), + "`interpreter` field overrides the setting.".to_string(), + ] + ); +} - #[test] - fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); - } +#[test] +fn wrap_text_multiple_code_spans() { + let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; + let wrapped = wrap_text(&input, 25); + assert_eq!( + wrapped, + vec![ + "combine `foo bar` and".to_string(), + "`baz qux` in one line".to_string(), + ] + ); +} - #[test] - fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["Use `` `code` `` to".to_string(), "quote backticks".to_string()], - ); - } +#[test] +fn wrap_text_nested_backticks() { + let input = vec!["Use `` `code` `` to quote backticks".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "Use `` `code` `` to".to_string(), + "quote backticks".to_string() + ], + ); +} - #[test] - fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a".to_string(), "`dangling code span.".to_string()], - ); - } +#[test] +fn wrap_text_unmatched_backticks() { + let input = vec!["This has a `dangling code span.".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["This has a".to_string(), "`dangling code span.".to_string()], + ); +} - #[test] - fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(), - "approach to asynchronous WebSocket routing and background worker integration.".to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!(wrapped.iter().any(|l| l.contains("https://falcon.readthedocs.io"))); - } +#[test] +fn wrap_text_preserves_links() { + let input = vec![ + "`falcon-pachinko` is an extension library for the".to_string(), + "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(), + "approach to asynchronous WebSocket routing and background worker integration.".to_string(), + ]; + let wrapped = wrap_text(&input, 80); + let joined = wrapped.join("\n"); + assert_eq!(joined.matches("https://").count(), 1); + assert!( + wrapped + .iter() + .any(|l| l.contains("https://falcon.readthedocs.io")) + ); } diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index ed81a4c3..0bfd0832 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -5,6 +5,20 @@ use super::FENCE_RE; +fn scan_while(chars: &[char], mut i: usize, cond: F) -> usize +where + F: Fn(char) -> bool, +{ + while i < chars.len() && cond(chars[i]) { + i += 1; + } + i +} + +fn collect_range(chars: &[char], start: usize, end: usize) -> String { + chars[start..end].iter().collect() +} + /// Markdown token emitted by [`tokenize_markdown`]. #[derive(Debug, PartialEq)] pub enum Token<'a> { @@ -24,9 +38,7 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { i += 1; } i += 1; // skip initial '[' which we know is present - while i < chars.len() && chars[i] != ']' { - i += 1; - } + i = scan_while(chars, i, |c| c != ']'); if i < chars.len() && chars[i] == ']' { i += 1; if i < chars.len() && chars[i] == '(' { @@ -40,12 +52,10 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { } i += 1; } - let tok: String = chars[start..i].iter().collect(); - return (tok, i); + return (collect_range(chars, start, i), i); } } - let tok: String = chars[start..=start].iter().collect(); - (tok, start + 1) + (collect_range(chars, start, start + 1), start + 1) } pub(super) fn is_trailing_punctuation(c: char) -> bool { @@ -55,7 +65,7 @@ pub(super) fn is_trailing_punctuation(c: char) -> bool { ) } -pub(super) fn tokenize_inline(text: &str) -> Vec { +pub(super) fn segment_inline(text: &str) -> Vec { let mut tokens = Vec::new(); let chars: Vec = text.chars().collect(); let mut i = 0; @@ -63,58 +73,44 @@ pub(super) fn tokenize_inline(text: &str) -> Vec { let c = chars[i]; if c.is_whitespace() { let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); + i = scan_while(&chars, i, char::is_whitespace); + tokens.push(collect_range(&chars, start, i)); } else if c == '`' { let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } + let fence_end = scan_while(&chars, i, |ch| ch == '`'); + let fence_len = fence_end - start; + i = fence_end; + let mut end = i; while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } + let j = scan_while(&chars, end, |ch| ch == '`'); + if j - end == fence_len { + end = j; + break; } end += 1; } + if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; + tokens.push(collect_range(&chars, start, start + fence_len)); + i = start + fence_len; } else { - tokens.push(chars[start..end].iter().collect()); + tokens.push(collect_range(&chars, start, end)); i = end; } } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { let (tok, mut new_i) = parse_link_or_image(&chars, i); tokens.push(tok); - let mut punct = String::new(); - while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) { - punct.push(chars[new_i]); - new_i += 1; - } - if !punct.is_empty() { - tokens.push(punct); + let punct_start = new_i; + new_i = scan_while(&chars, new_i, is_trailing_punctuation); + if new_i > punct_start { + tokens.push(collect_range(&chars, punct_start, new_i)); } i = new_i; } else { let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); + i = scan_while(&chars, i, |ch| !ch.is_whitespace() && ch != '`'); + tokens.push(collect_range(&chars, start, i)); } } tokens @@ -174,3 +170,32 @@ pub(super) fn should_break_line( ) -> bool { current_width > width && last_split.is_some() } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn link_with_trailing_punctuation() { + let tokens = segment_inline("see [link](url)."); + assert_eq!(tokens, vec!["see", " ", "[link](url)", "."]); + } + + #[test] + fn image_with_nested_parentheses() { + let tokens = segment_inline("![alt](path(a(b)c))"); + assert_eq!(tokens, vec!["![alt](path(a(b)c))"]); + } + + #[test] + fn inline_code_fences() { + let tokens = segment_inline("use ``cmd`` now"); + assert_eq!(tokens, vec!["use", " ", "``cmd``", " ", "now"]); + } + + #[test] + fn unmatched_backticks() { + let tokens = segment_inline("bad `code span"); + assert_eq!(tokens, vec!["bad", " ", "`", "code", " ", "span"]); + } +} From bf00f5635bc3707a715098f7fd4dc67d0499ade4 Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 12:26:14 +0100 Subject: [PATCH 3/3] Add docs and inline helpers --- src/wrap.rs | 11 +++++++++-- src/wrap/tests.rs | 5 +++++ src/wrap/tokenize.rs | 15 ++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index 1ba777b2..8a45e20a 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -87,7 +87,14 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); if tokens[i].contains("](") && tokens[i].ends_with(')') { - while j < tokens.len() && tokens[j].chars().all(tokenize::is_trailing_punctuation) { + while j < tokens.len() + && tokens[j].chars().all(|c| { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) + }) + { group_width += UnicodeWidthStr::width(tokens[j].as_str()); j += 1; } @@ -120,7 +127,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { continue; } - if tokenize::should_break_line(width, current_width + group_width, last_split) { + if current_width + group_width > width && last_split.is_some() { let pos = last_split.unwrap(); let line = current[..pos].to_string(); let mut rest = current[pos..].trim_start().to_string(); diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index d9315c9e..18bca090 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -1,3 +1,8 @@ +//! Unit tests for text wrapping functionality. +//! +//! This module contains tests for the `wrap_text` function, verifying correct +//! behaviour with code spans, links, hyphenated words, and various line widths. + use super::super::*; #[test] diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index 0bfd0832..184dae3e 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -32,6 +32,11 @@ pub enum Token<'a> { Newline, } +/// Parse a Markdown link or image starting at `i`. +/// +/// Handles nested parentheses within URLs by tracking the depth of opening and +/// closing delimiters. Returns the parsed slice and the index after the closing +/// parenthesis if one is found. fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { let start = i; if chars[i] == '!' { @@ -58,7 +63,7 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { (collect_range(chars, start, start + 1), start + 1) } -pub(super) fn is_trailing_punctuation(c: char) -> bool { +fn is_trailing_punctuation(c: char) -> bool { matches!( c, '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' @@ -163,14 +168,6 @@ pub(crate) fn tokenize_markdown(input: &str) -> Vec> { out } -pub(super) fn should_break_line( - width: usize, - current_width: usize, - last_split: Option, -) -> bool { - current_width > width && last_split.is_some() -} - #[cfg(test)] mod tests { use super::*;