From 8c0c79f0556caf6ce76ba9b902c550fd929a06dd Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 30 Jul 2025 02:00:52 +0100 Subject: [PATCH 1/5] Fix instructions typo and simplify wrap prefixes --- AGENTS.md | 9 +++-- src/wrap.rs | 94 +++++++++++++++++++---------------------------------- 2 files changed, 38 insertions(+), 65 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 06f65d12..fb33d43f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -28,11 +28,10 @@ examples demonstrating the usage and outcome of the function. Test documentation should omit examples where the example serves only to reiterate the test logic. -- **Keep file size managable.** No single code file may be longer than 400 - lines. - Long switch statements or dispatch tables should be broken up by feature and - constituents colocated with targets. Large blocks of test data should be - moved to external data files. +- **Keep file size manageable.** No single code file may be longer than 400 + lines. Long switch statements or dispatch tables should be broken up by + feature and constituents colocated with targets. Large blocks of test data + should be moved to external data files. ## Documentation Maintenance diff --git a/src/wrap.rs b/src/wrap.rs index cfd1431b..a656f5cb 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -4,55 +4,25 @@ //! `docs/architecture.md` and uses the `unicode-width` crate for accurate //! display calculations. 
-use regex::{Captures, Regex}; +use regex::Regex; static FENCE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap()); + lazy_regex!(r"^\s*(```|~~~).*", "fence pattern regex should compile",); -static BULLET_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)").unwrap()); +static BULLET_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)", + "bullet pattern regex should compile", +); -static FOOTNOTE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$").unwrap()); +static FOOTNOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$", + "footnote pattern regex should compile", +); -static BLOCKQUOTE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:>\s*)+)(.*)$").unwrap()); - -struct PrefixHandler { - re: &'static std::sync::LazyLock, - is_bq: bool, - build_prefix: fn(&Captures) -> String, - rest_group: usize, -} - -impl PrefixHandler { - fn build_bullet_prefix(cap: &Captures) -> String { cap[1].to_string() } - - fn build_footnote_prefix(cap: &Captures) -> String { format!("{}{}", &cap[1], &cap[2]) } - - fn build_blockquote_prefix(cap: &Captures) -> String { cap[1].to_string() } -} - -static HANDLERS: &[PrefixHandler] = &[ - PrefixHandler { - re: &BULLET_RE, - is_bq: false, - build_prefix: PrefixHandler::build_bullet_prefix, - rest_group: 2, - }, - PrefixHandler { - re: &FOOTNOTE_RE, - is_bq: false, - build_prefix: PrefixHandler::build_footnote_prefix, - rest_group: 3, - }, - PrefixHandler { - re: &BLOCKQUOTE_RE, - is_bq: true, - build_prefix: PrefixHandler::build_blockquote_prefix, - rest_group: 2, - }, -]; +static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( + r"^(\s*(?:>\s*)+)(.*)$", + "blockquote pattern regex should compile", +); /// Markdown token emitted by [`tokenize_markdown`]. 
#[derive(Debug, PartialEq)] @@ -390,7 +360,7 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { let mut indent = String::new(); let mut in_code = false; - 'line_loop: for line in lines { + for line in lines { if FENCE_RE.is_match(line) { flush_paragraph(&mut out, &buf, &indent, width); buf.clear(); @@ -429,21 +399,25 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { continue; } - for handler in HANDLERS { - if let Some(cap) = handler.re.captures(line) { - let prefix = (handler.build_prefix)(&cap); - let rest = cap.get(handler.rest_group).unwrap().as_str(); - handle_prefix_line( - &mut out, - &mut buf, - &mut indent, - width, - &prefix, - rest, - handler.is_bq, - ); - continue 'line_loop; - } + if let Some(cap) = BULLET_RE.captures(line) { + let prefix = cap.get(1).unwrap().as_str(); + let rest = cap.get(2).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, false); + continue; + } + + if let Some(cap) = FOOTNOTE_RE.captures(line) { + let prefix = format!("{}{}", &cap[1], &cap[2]); + let rest = cap.get(3).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, &prefix, rest, false); + continue; + } + + if let Some(cap) = BLOCKQUOTE_RE.captures(line) { + let prefix = cap.get(1).unwrap().as_str(); + let rest = cap.get(2).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, true); + continue; } if buf.is_empty() { From 1fddf48a045d88446613c7a65ee3da1a51ee529d Mon Sep 17 00:00:00 2001 From: Leynos Date: Wed, 30 Jul 2025 02:23:42 +0100 Subject: [PATCH 2/5] Refactor tokenization and prefix handling --- src/lib.rs | 1 + src/tokenize.rs | 155 +++++++++++++++++++++++ src/wrap.rs | 304 ++------------------------------------------- tests/wrap_unit.rs | 106 ++++++++++++++++ 4 files changed, 275 insertions(+), 291 deletions(-) create mode 100644 src/tokenize.rs create mode 100644 tests/wrap_unit.rs diff --git a/src/lib.rs b/src/lib.rs index 
9ae0f228..138c95b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,7 @@ pub mod lists; pub mod process; mod reflow; pub mod table; +mod tokenize; pub mod wrap; #[doc(hidden)] diff --git a/src/tokenize.rs b/src/tokenize.rs new file mode 100644 index 00000000..05c28c5c --- /dev/null +++ b/src/tokenize.rs @@ -0,0 +1,155 @@ +//! Tokenization utilities for Markdown wrapping. +//! +//! Provides `Token` and helpers to parse inline code, links, and fences. + +use std::sync::LazyLock; + +use regex::Regex; + +static FENCE_RE: LazyLock = + lazy_regex!(r"^\s*(```|~~~).*", "fence pattern regex should compile"); + +/// Markdown token emitted by [`tokenize_markdown`]. +#[derive(Debug, PartialEq)] +pub enum Token<'a> { + /// Line within a fenced code block, including the fence itself. + Fence(&'a str), + /// Inline code span without surrounding backticks. + Code(&'a str), + /// Plain text outside code regions. + Text(&'a str), + /// Line break separating tokens. + Newline, +} + +fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { + let start = i; + if chars[i] == '!' 
{ + i += 1; + } + // skip initial '[' which we know is present + i += 1; + while i < chars.len() && chars[i] != ']' { + i += 1; + } + if i < chars.len() && chars[i] == ']' { + i += 1; + if i < chars.len() && chars[i] == '(' { + i += 1; + let mut depth = 1; + while i < chars.len() && depth > 0 { + match chars[i] { + '(' => depth += 1, + ')' => depth -= 1, + _ => {} + } + i += 1; + } + let tok: String = chars[start..i].iter().collect(); + return (tok, i); + } + } + let tok: String = chars[start..=start].iter().collect(); + (tok, start + 1) +} + +#[must_use] +pub fn tokenize_inline(text: &str) -> Vec { + let mut tokens = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_whitespace() { + let start = i; + while i < chars.len() && chars[i].is_whitespace() { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } else if c == '`' { + let start = i; + let mut delim_len = 0; + while i < chars.len() && chars[i] == '`' { + i += 1; + delim_len += 1; + } + let mut end = i; + while end < chars.len() { + if chars[end] == '`' { + let mut j = end; + let mut count = 0; + while j < chars.len() && chars[j] == '`' { + j += 1; + count += 1; + } + if count == delim_len { + end = j; + break; + } + } + end += 1; + } + if end >= chars.len() { + tokens.push(chars[start..start + delim_len].iter().collect()); + i = start + delim_len; + } else { + tokens.push(chars[start..end].iter().collect()); + i = end; + } + } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { + let (tok, new_i) = parse_link_or_image(&chars, i); + tokens.push(tok); + i = new_i; + } else { + let start = i; + while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } + } + tokens +} + +/// Split the input string into [`Token`]s by analysing whitespace and backtick delimiters. 
+pub fn tokenize_markdown(input: &str) -> Vec> { + let mut out = Vec::new(); + let mut in_fence = false; + for line in input.split_inclusive('\n') { + let trimmed = line.trim_end_matches('\n'); + if FENCE_RE.is_match(trimmed) { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + in_fence = !in_fence; + continue; + } + if in_fence { + out.push(Token::Fence(trimmed)); + out.push(Token::Newline); + continue; + } + let mut rest = trimmed; + while let Some(pos) = rest.find('`') { + if pos > 0 { + out.push(Token::Text(&rest[..pos])); + } + if let Some(end) = rest[pos + 1..].find('`') { + out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); + rest = &rest[pos + end + 2..]; + } else { + out.push(Token::Text(&rest[pos..])); + rest = ""; + break; + } + } + if !rest.is_empty() { + out.push(Token::Text(rest)); + } + out.push(Token::Newline); + } + out.pop(); + out +} + +#[doc(hidden)] +pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } diff --git a/src/wrap.rs b/src/wrap.rs index a656f5cb..2be560c8 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -6,8 +6,7 @@ use regex::Regex; -static FENCE_RE: std::sync::LazyLock = - lazy_regex!(r"^\s*(```|~~~).*", "fence pattern regex should compile",); +pub use crate::tokenize::{Token, is_fence, tokenize_inline, tokenize_markdown}; static BULLET_RE: std::sync::LazyLock = lazy_regex!( r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)", @@ -24,176 +23,10 @@ static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( "blockquote pattern regex should compile", ); -/// Markdown token emitted by [`tokenize_markdown`]. -#[derive(Debug, PartialEq)] -pub enum Token<'a> { - /// Line within a fenced code block, including the fence itself. - Fence(&'a str), - /// Inline code span without surrounding backticks. - Code(&'a str), - /// Plain text outside code regions. - Text(&'a str), - /// Line break separating tokens. - Newline, -} - -fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { - let start = i; - if chars[i] == '!' 
{ - i += 1; - } - // skip initial '[' which we know is present - i += 1; - while i < chars.len() && chars[i] != ']' { - i += 1; - } - if i < chars.len() && chars[i] == ']' { - i += 1; - if i < chars.len() && chars[i] == '(' { - i += 1; - let mut depth = 1; - while i < chars.len() && depth > 0 { - match chars[i] { - '(' => depth += 1, - ')' => depth -= 1, - _ => {} - } - i += 1; - } - let tok: String = chars[start..i].iter().collect(); - return (tok, i); - } - } - let tok: String = chars[start..=start].iter().collect(); - (tok, start + 1) -} - -fn tokenize_inline(text: &str) -> Vec { - let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); - let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { - let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } else if c == '`' { - let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } - let mut end = i; - while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } - } - end += 1; - } - if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; - } else { - tokens.push(chars[start..end].iter().collect()); - i = end; - } - } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, new_i) = parse_link_or_image(&chars, i); - tokens.push(tok); - i = new_i; - } else { - let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } - } - tokens -} - -/// Split the input string into [`Token`]s by analysing whitespace and -/// backtick delimiters. 
-/// -/// The tokenizer groups consecutive whitespace into a single -/// [`Token::Text`] and recognises backtick sequences as inline code spans. -/// When a run of backticks is encountered the parser searches forward for an -/// identical delimiter, allowing nested backticks when the span uses a longer -/// fence. Unmatched delimiter sequences are treated as literal text. -/// -/// ```rust,ignore -/// use mdtablefix::wrap::{Token, tokenize_markdown}; -/// -/// let tokens = tokenize_markdown("Example with `code`"); -/// assert_eq!( -/// tokens, -/// vec![Token::Text("Example with "), Token::Code("code")] -/// ); -/// ``` -pub(crate) fn tokenize_markdown(input: &str) -> Vec> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - /// Determine if the current line should break at the last whitespace. /// /// Returns `true` if `current_width` exceeds `width` and a whitespace split /// position is available. 
-/// -/// # Examples -/// -/// ```ignore -/// use mdtablefix::wrap::should_break_line; -/// assert!(should_break_line(10, 12, Some(3))); -/// assert!(!should_break_line(10, 8, Some(3))); -/// ``` fn should_break_line(width: usize, current_width: usize, last_split: Option) -> bool { current_width > width && last_split.is_some() } @@ -273,9 +106,6 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { lines } -#[doc(hidden)] -pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } - fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { if buf.is_empty() { return; @@ -361,7 +191,7 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { let mut in_code = false; for line in lines { - if FENCE_RE.is_match(line) { + if is_fence(line) { flush_paragraph(&mut out, &buf, &indent, width); buf.clear(); indent.clear(); @@ -400,22 +230,28 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { } if let Some(cap) = BULLET_RE.captures(line) { - let prefix = cap.get(1).unwrap().as_str(); - let rest = cap.get(2).unwrap().as_str(); + let prefix = cap.get(1).expect("bullet regex capture").as_str(); + let rest = cap.get(2).expect("bullet regex remainder capture").as_str(); handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, false); continue; } if let Some(cap) = FOOTNOTE_RE.captures(line) { let prefix = format!("{}{}", &cap[1], &cap[2]); - let rest = cap.get(3).unwrap().as_str(); + let rest = cap + .get(3) + .expect("footnote regex remainder capture") + .as_str(); handle_prefix_line(&mut out, &mut buf, &mut indent, width, &prefix, rest, false); continue; } if let Some(cap) = BLOCKQUOTE_RE.captures(line) { - let prefix = cap.get(1).unwrap().as_str(); - let rest = cap.get(2).unwrap().as_str(); + let prefix = cap.get(1).expect("blockquote prefix capture").as_str(); + let rest = cap + .get(2) + .expect("blockquote regex remainder capture") + .as_str(); handle_prefix_line(&mut out, &mut buf, &mut indent, width, 
prefix, rest, true); continue; } @@ -452,117 +288,3 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { flush_paragraph(&mut out, &buf, &indent, width); out } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); - } - - #[test] - fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat." - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" - .to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." - .to_string(), - ] - ); - } - - #[test] - fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. 
On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's".to_string(), - "`interpreter` field overrides the setting.".to_string(), - ] - ); - } - - #[test] - fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); - } - - #[test] - fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "Use `` `code` `` to".to_string(), - "quote backticks".to_string() - ] - ); - } - - #[test] - fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a".to_string(), "`dangling code span.".to_string()] - ); - } - - #[test] - fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured" - .to_string(), - "approach to asynchronous WebSocket routing and background worker integration." 
- .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!( - wrapped - .iter() - .any(|l| l.contains("https://falcon.readthedocs.io")) - ); - } -} diff --git a/tests/wrap_unit.rs b/tests/wrap_unit.rs new file mode 100644 index 00000000..7f127cf9 --- /dev/null +++ b/tests/wrap_unit.rs @@ -0,0 +1,106 @@ +use mdtablefix::wrap::wrap_text; + +#[test] +fn wrap_text_preserves_hyphenated_words() { + let input = vec!["A word that is very-long-word indeed".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "A word that is".to_string(), + "very-long-word".to_string(), + "indeed".to_string(), + ] + ); +} + +#[test] +fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { + let input = vec![ + concat!( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", + "volutpat." + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec![ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt".to_string(), + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat.".to_string(), + ] + ); +} + +#[test] +fn wrap_text_preserves_code_spans() { + let input = vec![ + "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ + unless the manifest's `interpreter` field overrides the setting." + .to_string(), + ]; + let wrapped = wrap_text(&input, 60); + assert_eq!( + wrapped, + vec![ + "with their own escaping rules. 
On Windows, scripts default".to_string(), + "to `powershell -Command` unless the manifest's".to_string(), + "`interpreter` field overrides the setting.".to_string(), + ] + ); +} + +#[test] +fn wrap_text_multiple_code_spans() { + let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; + let wrapped = wrap_text(&input, 25); + assert_eq!( + wrapped, + vec![ + "combine `foo bar` and".to_string(), + "`baz qux` in one line".to_string(), + ] + ); +} + +#[test] +fn wrap_text_nested_backticks() { + let input = vec!["Use `` `code` `` to quote backticks".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "Use `` `code` `` to".to_string(), + "quote backticks".to_string() + ] + ); +} + +#[test] +fn wrap_text_unmatched_backticks() { + let input = vec!["This has a `dangling code span.".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["This has a".to_string(), "`dangling code span.".to_string()] + ); +} + +#[test] +fn wrap_text_preserves_links() { + let input = vec![ + "`falcon-pachinko` is an extension library for the".to_string(), + "[Falcon](https://falcon.readthedocs.io) web framework. 
It adds a structured".to_string(), + "approach to asynchronous WebSocket routing and background worker integration.".to_string(), + ]; + let wrapped = wrap_text(&input, 80); + let joined = wrapped.join("\n"); + assert_eq!(joined.matches("https://").count(), 1); + assert!( + wrapped + .iter() + .any(|l| l.contains("https://falcon.readthedocs.io")) + ); +} From 28908d7eb8ec984d3f71d9b2e22a05b83c52ce37 Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 2 Aug 2025 00:29:50 +0100 Subject: [PATCH 3/5] Remove unused tokenizer and tidy wrap helpers (#174) --- src/html.rs | 4 +- src/io.rs | 4 +- src/lib.rs | 1 - src/tokenize.rs | 155 ------------------------------------------------ src/wrap.rs | 51 +--------------- 5 files changed, 9 insertions(+), 206 deletions(-) delete mode 100644 src/tokenize.rs diff --git a/src/html.rs b/src/html.rs index 2742d7b3..b148e314 100644 --- a/src/html.rs +++ b/src/html.rs @@ -84,7 +84,9 @@ fn is_element(handle: &Handle, tag: &str) -> bool { } /// Returns `true` if `handle` represents a `` or `` element. -fn is_table_cell(handle: &Handle) -> bool { is_element(handle, "td") || is_element(handle, "th") } +fn is_table_cell(handle: &Handle) -> bool { + is_element(handle, "td") || is_element(handle, "th") +} /// Walks the DOM tree collecting `` nodes under `handle`. fn collect_tables(handle: &Handle, tables: &mut Vec) { diff --git a/src/io.rs b/src/io.rs index cb30bea4..e9bd9c17 100644 --- a/src/io.rs +++ b/src/io.rs @@ -30,7 +30,9 @@ where /// /// # Errors /// Returns an error if reading or writing the file fails. -pub fn rewrite(path: &Path) -> std::io::Result<()> { rewrite_with(path, process_stream) } +pub fn rewrite(path: &Path) -> std::io::Result<()> { + rewrite_with(path, process_stream) +} /// Rewrite a file in place without wrapping text. 
/// diff --git a/src/lib.rs b/src/lib.rs index c7e7678d..3edae610 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,6 @@ pub mod lists; pub mod process; mod reflow; pub mod table; -mod tokenize; pub mod textproc; pub mod wrap; diff --git a/src/tokenize.rs b/src/tokenize.rs deleted file mode 100644 index 05c28c5c..00000000 --- a/src/tokenize.rs +++ /dev/null @@ -1,155 +0,0 @@ -//! Tokenization utilities for Markdown wrapping. -//! -//! Provides `Token` and helpers to parse inline code, links, and fences. - -use std::sync::LazyLock; - -use regex::Regex; - -static FENCE_RE: LazyLock = - lazy_regex!(r"^\s*(```|~~~).*", "fence pattern regex should compile"); - -/// Markdown token emitted by [`tokenize_markdown`]. -#[derive(Debug, PartialEq)] -pub enum Token<'a> { - /// Line within a fenced code block, including the fence itself. - Fence(&'a str), - /// Inline code span without surrounding backticks. - Code(&'a str), - /// Plain text outside code regions. - Text(&'a str), - /// Line break separating tokens. - Newline, -} - -fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { - let start = i; - if chars[i] == '!' 
{ - i += 1; - } - // skip initial '[' which we know is present - i += 1; - while i < chars.len() && chars[i] != ']' { - i += 1; - } - if i < chars.len() && chars[i] == ']' { - i += 1; - if i < chars.len() && chars[i] == '(' { - i += 1; - let mut depth = 1; - while i < chars.len() && depth > 0 { - match chars[i] { - '(' => depth += 1, - ')' => depth -= 1, - _ => {} - } - i += 1; - } - let tok: String = chars[start..i].iter().collect(); - return (tok, i); - } - } - let tok: String = chars[start..=start].iter().collect(); - (tok, start + 1) -} - -#[must_use] -pub fn tokenize_inline(text: &str) -> Vec { - let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); - let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { - let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } else if c == '`' { - let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } - let mut end = i; - while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } - } - end += 1; - } - if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; - } else { - tokens.push(chars[start..end].iter().collect()); - i = end; - } - } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, new_i) = parse_link_or_image(&chars, i); - tokens.push(tok); - i = new_i; - } else { - let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } - } - tokens -} - -/// Split the input string into [`Token`]s by analysing whitespace and backtick delimiters. 
-pub fn tokenize_markdown(input: &str) -> Vec> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - -#[doc(hidden)] -pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } diff --git a/src/wrap.rs b/src/wrap.rs index 02508ca7..eb148236 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -38,17 +38,6 @@ static BLOCKQUOTE_RE: std::sync::LazyLock = lazy_regex!( "blockquote pattern regex should compile", ); -/// Determine if the current line should break at the last whitespace. -/// -/// Returns `true` if `current_width` exceeds `width` and a whitespace split -/// position is available. -fn should_break_line(width: usize, current_width: usize, last_split: Option) -> bool { - current_width > width && last_split.is_some() -} - -static BLOCKQUOTE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:>\s*)+)(.*)$").unwrap()); - /// Matches `markdownlint` comment directives. 
/// /// The regex is case-insensitive and recognises these forms with optional rule @@ -65,42 +54,6 @@ static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock = std::sync::LazyLo .expect("valid markdownlint regex") }); -struct PrefixHandler { - re: &'static std::sync::LazyLock, - is_bq: bool, - build_prefix: fn(&Captures) -> String, - rest_group: usize, -} - -impl PrefixHandler { - fn build_bullet_prefix(cap: &Captures) -> String { cap[1].to_string() } - - fn build_footnote_prefix(cap: &Captures) -> String { format!("{}{}", &cap[1], &cap[2]) } - - fn build_blockquote_prefix(cap: &Captures) -> String { cap[1].to_string() } -} - -static HANDLERS: &[PrefixHandler] = &[ - PrefixHandler { - re: &BULLET_RE, - is_bq: false, - build_prefix: PrefixHandler::build_bullet_prefix, - rest_group: 2, - }, - PrefixHandler { - re: &FOOTNOTE_RE, - is_bq: false, - build_prefix: PrefixHandler::build_footnote_prefix, - rest_group: 3, - }, - PrefixHandler { - re: &BLOCKQUOTE_RE, - is_bq: true, - build_prefix: PrefixHandler::build_blockquote_prefix, - rest_group: 2, - }, -]; - fn wrap_preserving_code(text: &str, width: usize) -> Vec { use unicode_width::UnicodeWidthStr; @@ -210,7 +163,9 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { } #[doc(hidden)] -pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } +pub fn is_fence(line: &str) -> bool { + FENCE_RE.is_match(line) +} pub(crate) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) From ce4519a0586314a760cc31501f38f135379421a4 Mon Sep 17 00:00:00 2001 From: Payton McIntosh Date: Sat, 2 Aug 2025 00:39:12 +0100 Subject: [PATCH 4/5] Formatting for rust-fmt --- src/html.rs | 4 +--- src/io.rs | 4 +--- src/wrap.rs | 4 +--- tests/common/mod.rs | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/html.rs b/src/html.rs index b148e314..2742d7b3 100644 --- a/src/html.rs +++ b/src/html.rs @@ -84,9 +84,7 @@ fn is_element(handle: &Handle, tag: &str) -> bool { } /// 
Returns `true` if `handle` represents a `<td>` or `<th>` element.
-fn is_table_cell(handle: &Handle) -> bool { - is_element(handle, "td") || is_element(handle, "th") -} +fn is_table_cell(handle: &Handle) -> bool { is_element(handle, "td") || is_element(handle, "th") } /// Walks the DOM tree collecting `` nodes under `handle`. fn collect_tables(handle: &Handle, tables: &mut Vec) { diff --git a/src/io.rs b/src/io.rs index e9bd9c17..cb30bea4 100644 --- a/src/io.rs +++ b/src/io.rs @@ -30,9 +30,7 @@ where /// /// # Errors /// Returns an error if reading or writing the file fails. -pub fn rewrite(path: &Path) -> std::io::Result<()> { - rewrite_with(path, process_stream) -} +pub fn rewrite(path: &Path) -> std::io::Result<()> { rewrite_with(path, process_stream) } /// Rewrite a file in place without wrapping text. /// diff --git a/src/wrap.rs b/src/wrap.rs index eb148236..430e4be9 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -163,9 +163,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { } #[doc(hidden)] -pub fn is_fence(line: &str) -> bool { - FENCE_RE.is_match(line) -} +pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } pub(crate) fn is_markdownlint_directive(line: &str) -> bool { MARKDOWNLINT_DIRECTIVE_RE.is_match(line) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index ff4729e8..64b5fc38 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -19,7 +19,7 @@ macro_rules! lines_vec { /// /// Example: /// ``` -/// let input: Vec = include_lines!("data/bold_header_input.txt"); +/// let input: Vec = include_lines!("data/bold_header_input.txt"); /// ``` #[expect(unused_macros, reason = "macros are optional helpers across modules")] macro_rules! 
include_lines { From bcad4683af8eca72c440c4b36b3c435dbb65237b Mon Sep 17 00:00:00 2001 From: Payton McIntosh Date: Sat, 2 Aug 2025 00:42:34 +0100 Subject: [PATCH 5/5] Reformat per clippy --- src/html.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/html.rs b/src/html.rs index 2742d7b3..c3372881 100644 --- a/src/html.rs +++ b/src/html.rs @@ -112,10 +112,10 @@ fn is_bold_tag(tag: &str) -> bool { /// Returns `true` if `handle` contains a `` or `` descendant. fn contains_strong(handle: &Handle) -> bool { - if let NodeData::Element { name, .. } = &handle.data { - if is_bold_tag(name.local.as_ref()) { - return true; - } + if let NodeData::Element { name, .. } = &handle.data + && is_bold_tag(name.local.as_ref()) + { + return true; } let children = handle.children.borrow(); children.iter().any(contains_strong)