diff --git a/src/wrap.rs b/src/wrap.rs index 7d331bd3..8a45e20a 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -6,6 +6,9 @@ use regex::{Captures, Regex}; +mod tokenize; +pub(crate) use tokenize::{Token, tokenize_markdown}; + static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap()); @@ -70,195 +73,6 @@ static HANDLERS: &[PrefixHandler] = &[ }, ]; -/// Markdown token emitted by [`tokenize_markdown`]. -#[derive(Debug, PartialEq)] -pub enum Token<'a> { - /// Line within a fenced code block, including the fence itself. - Fence(&'a str), - /// Inline code span without surrounding backticks. - Code(&'a str), - /// Plain text outside code regions. - Text(&'a str), - /// Line break separating tokens. - Newline, -} - -fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { - let start = i; - if chars[i] == '!' { - i += 1; - } - // skip initial '[' which we know is present - i += 1; - while i < chars.len() && chars[i] != ']' { - i += 1; - } - if i < chars.len() && chars[i] == ']' { - i += 1; - if i < chars.len() && chars[i] == '(' { - i += 1; - let mut depth = 1; - while i < chars.len() && depth > 0 { - match chars[i] { - '(' => depth += 1, - ')' => depth -= 1, - _ => {} - } - i += 1; - } - let tok: String = chars[start..i].iter().collect(); - return (tok, i); - } - } - let tok: String = chars[start..=start].iter().collect(); - (tok, start + 1) -} - -fn is_trailing_punctuation(c: char) -> bool { - matches!( - c, - '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' - ) -} - -fn tokenize_inline(text: &str) -> Vec { - let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); - let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { - let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } else if c == '`' { - let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } - let mut end = i; - while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } - } - end += 1; - } - if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; - } else { - tokens.push(chars[start..end].iter().collect()); - i = end; - } - } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, mut new_i) = parse_link_or_image(&chars, i); - tokens.push(tok); - let mut punct = String::new(); - while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) { - punct.push(chars[new_i]); - new_i += 1; - } - if !punct.is_empty() { - tokens.push(punct); - } - i = new_i; - } else { - let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } - } - tokens -} - -/// Split the input string into [`Token`]s by analysing whitespace and -/// backtick delimiters. -/// -/// The tokenizer groups consecutive whitespace into a single -/// [`Token::Text`] and recognises backtick sequences as inline code spans. -/// When a run of backticks is encountered the parser searches forward for an -/// identical delimiter, allowing nested backticks when the span uses a longer -/// fence. Unmatched delimiter sequences are treated as literal text. -/// -/// ```rust,ignore -/// use mdtablefix::wrap::{Token, tokenize_markdown}; -/// -/// let tokens = tokenize_markdown("Example with `code`"); -/// assert_eq!( -/// tokens, -/// vec![Token::Text("Example with "), Token::Code("code")] -/// ); -/// ``` -pub(crate) fn tokenize_markdown(input: &str) -> Vec> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - -/// Determine if the current line should break at the last whitespace. -/// -/// Returns `true` if `current_width` exceeds `width` and a whitespace split -/// position is available. -/// -/// # Examples -/// -/// ```ignore -/// use mdtablefix::wrap::should_break_line; -/// assert!(should_break_line(10, 12, Some(3))); -/// assert!(!should_break_line(10, 8, Some(3))); -/// ``` -fn should_break_line(width: usize, current_width: usize, last_split: Option) -> bool { - current_width > width && last_split.is_some() -} - fn wrap_preserving_code(text: &str, width: usize) -> Vec { use unicode_width::UnicodeWidthStr; @@ -266,14 +80,21 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let mut current = String::new(); let mut current_width = 0; let mut last_split: Option = None; - let tokens = tokenize_inline(text); + let tokens = tokenize::segment_inline(text); let mut i = 0; while i < tokens.len() { let mut j = i + 1; let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); if tokens[i].contains("](") && tokens[i].ends_with(')') { - while j < tokens.len() && tokens[j].chars().all(is_trailing_punctuation) { + while j < tokens.len() + && tokens[j].chars().all(|c| { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) + }) + { group_width += UnicodeWidthStr::width(tokens[j].as_str()); j += 1; } @@ -306,7 +127,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { continue; } - if should_break_line(width, current_width + group_width, last_split) { + if current_width + group_width > width && last_split.is_some() { let pos = last_split.unwrap(); let line = current[..pos].to_string(); let mut rest = current[pos..].trim_start().to_string(); @@ -549,115 +370,4 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { } #[cfg(test)] -mod tests { - use super::*; - - #[test] - fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); - } - - #[test] - fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat." - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" - .to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." - .to_string(), - ] - ); - } - - #[test] - fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's".to_string(), - "`interpreter` field overrides the setting.".to_string(), - ] - ); - } - - #[test] - fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); - } - - #[test] - fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "Use `` `code` `` to".to_string(), - "quote backticks".to_string() - ] - ); - } - - #[test] - fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a".to_string(), "`dangling code span.".to_string()] - ); - } - - #[test] - fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured" - .to_string(), - "approach to asynchronous WebSocket routing and background worker integration." - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!( - wrapped - .iter() - .any(|l| l.contains("https://falcon.readthedocs.io")) - ); - } -} +mod tests; diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs new file mode 100644 index 00000000..18bca090 --- /dev/null +++ b/src/wrap/tests.rs @@ -0,0 +1,111 @@ +//! Unit tests for text wrapping functionality. +//! +//! This module contains tests for the `wrap_text` function, verifying correct +//! behaviour with code spans, links, hyphenated words, and various line widths. + +use super::super::*; + +#[test] +fn wrap_text_preserves_hyphenated_words() { + let input = vec!["A word that is very-long-word indeed".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "A word that is".to_string(), + "very-long-word".to_string(), + "indeed".to_string(), + ] + ); +} + +#[test] +fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { + let input = vec![ + concat!( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", + "volutpat.", + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec![ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt".to_string(), + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat.".to_string(), + ] + ); +} + +#[test] +fn wrap_text_preserves_code_spans() { + let input = vec![ + "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ + unless the manifest's `interpreter` field overrides the setting." + .to_string(), + ]; + let wrapped = wrap_text(&input, 60); + assert_eq!( + wrapped, + vec![ + "with their own escaping rules. On Windows, scripts default".to_string(), + "to `powershell -Command` unless the manifest's".to_string(), + "`interpreter` field overrides the setting.".to_string(), + ] + ); +} + +#[test] +fn wrap_text_multiple_code_spans() { + let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; + let wrapped = wrap_text(&input, 25); + assert_eq!( + wrapped, + vec![ + "combine `foo bar` and".to_string(), + "`baz qux` in one line".to_string(), + ] + ); +} + +#[test] +fn wrap_text_nested_backticks() { + let input = vec!["Use `` `code` `` to quote backticks".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "Use `` `code` `` to".to_string(), + "quote backticks".to_string() + ], + ); +} + +#[test] +fn wrap_text_unmatched_backticks() { + let input = vec!["This has a `dangling code span.".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["This has a".to_string(), "`dangling code span.".to_string()], + ); +} + +#[test] +fn wrap_text_preserves_links() { + let input = vec![ + "`falcon-pachinko` is an extension library for the".to_string(), + "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(), + "approach to asynchronous WebSocket routing and background worker integration.".to_string(), + ]; + let wrapped = wrap_text(&input, 80); + let joined = wrapped.join("\n"); + assert_eq!(joined.matches("https://").count(), 1); + assert!( + wrapped + .iter() + .any(|l| l.contains("https://falcon.readthedocs.io")) + ); +} diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs new file mode 100644 index 00000000..184dae3e --- /dev/null +++ b/src/wrap/tokenize.rs @@ -0,0 +1,198 @@ +//! Tokenization helpers for wrapping logic. +//! +//! This module contains utilities for breaking lines into tokens so that +//! inline code spans and Markdown links are preserved during wrapping. + +use super::FENCE_RE; + +fn scan_while(chars: &[char], mut i: usize, cond: F) -> usize +where + F: Fn(char) -> bool, +{ + while i < chars.len() && cond(chars[i]) { + i += 1; + } + i +} + +fn collect_range(chars: &[char], start: usize, end: usize) -> String { + chars[start..end].iter().collect() +} + +/// Markdown token emitted by [`tokenize_markdown`]. +#[derive(Debug, PartialEq)] +pub enum Token<'a> { + /// Line within a fenced code block, including the fence itself. + Fence(&'a str), + /// Inline code span without surrounding backticks. + Code(&'a str), + /// Plain text outside code regions. + Text(&'a str), + /// Line break separating tokens. + Newline, +} + +/// Parse a Markdown link or image starting at `i`. +/// +/// Handles nested parentheses within URLs by tracking the depth of opening and +/// closing delimiters. Returns the parsed slice and the index after the closing +/// parenthesis if one is found. +fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { + let start = i; + if chars[i] == '!' { + i += 1; + } + i += 1; // skip initial '[' which we know is present + i = scan_while(chars, i, |c| c != ']'); + if i < chars.len() && chars[i] == ']' { + i += 1; + if i < chars.len() && chars[i] == '(' { + i += 1; + let mut depth = 1; + while i < chars.len() && depth > 0 { + match chars[i] { + '(' => depth += 1, + ')' => depth -= 1, + _ => {} + } + i += 1; + } + return (collect_range(chars, start, i), i); + } + } + (collect_range(chars, start, start + 1), start + 1) +} + +fn is_trailing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) +} + +pub(super) fn segment_inline(text: &str) -> Vec { + let mut tokens = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_whitespace() { + let start = i; + i = scan_while(&chars, i, char::is_whitespace); + tokens.push(collect_range(&chars, start, i)); + } else if c == '`' { + let start = i; + let fence_end = scan_while(&chars, i, |ch| ch == '`'); + let fence_len = fence_end - start; + i = fence_end; + + let mut end = i; + while end < chars.len() { + let j = scan_while(&chars, end, |ch| ch == '`'); + if j - end == fence_len { + end = j; + break; + } + end += 1; + } + + if end >= chars.len() { + tokens.push(collect_range(&chars, start, start + fence_len)); + i = start + fence_len; + } else { + tokens.push(collect_range(&chars, start, end)); + i = end; + } + } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { + let (tok, mut new_i) = parse_link_or_image(&chars, i); + tokens.push(tok); + let punct_start = new_i; + new_i = scan_while(&chars, new_i, is_trailing_punctuation); + if new_i > punct_start { + tokens.push(collect_range(&chars, punct_start, new_i)); + } + i = new_i; + } else { + let start = i; + i = scan_while(&chars, i, |ch| !ch.is_whitespace() && ch != '`'); + tokens.push(collect_range(&chars, start, i)); + } + } + tokens +} + +/// Split the input string into [`Token`]s by analysing whitespace and backtick +/// delimiters. +/// +/// The tokenizer groups consecutive whitespace into a single [`Token::Text`] and +/// recognises backtick sequences as inline code spans. When a run of backticks +/// is encountered the parser searches forward for an identical delimiter, +/// allowing nested backticks when the span uses a longer fence. Unmatched +/// delimiter sequences are treated as literal text. +pub(crate) fn tokenize_markdown(input: &str) -> Vec> { + let mut out = Vec::new(); + let mut in_fence = false; + for line in input.split_inclusive('\n') { + let trimmed = line.trim_end_matches('\n'); + if FENCE_RE.is_match(trimmed) { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + in_fence = !in_fence; + continue; + } + if in_fence { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + continue; + } + let mut rest = trimmed; + while let Some(pos) = rest.find('`') { + if pos > 0 { + out.push(Token::Text(&rest[..pos])); + } + if let Some(end) = rest[pos + 1..].find('`') { + out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); + rest = &rest[pos + end + 2..]; + } else { + out.push(Token::Text(&rest[pos..])); + rest = ""; + break; + } + } + if !rest.is_empty() { + out.push(Token::Text(rest)); + } + out.push(Token::Newline); + } + out.pop(); + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn link_with_trailing_punctuation() { + let tokens = segment_inline("see [link](url)."); + assert_eq!(tokens, vec!["see", " ", "[link](url)", "."]); + } + + #[test] + fn image_with_nested_parentheses() { + let tokens = segment_inline("![alt](path(a(b)c))"); + assert_eq!(tokens, vec!["![alt](path(a(b)c))"]); + } + + #[test] + fn inline_code_fences() { + let tokens = segment_inline("use ``cmd`` now"); + assert_eq!(tokens, vec!["use", " ", "``cmd``", " ", "now"]); + } + + #[test] + fn unmatched_backticks() { + let tokens = segment_inline("bad `code span"); + assert_eq!(tokens, vec!["bad", " ", "`", "code", " ", "span"]); + } +}