From 13b2cfd3af506688256ff0c5e47149e929e80945 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 09:57:35 +0100
Subject: [PATCH 1/3] Extract tokenize helpers and move wrap tests

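---
 Review notes (outside the commit message):
  * src/wrap/tests.rs is created here, but no hunk in this patch adds a
    `mod tests;` declaration to src/wrap.rs, so the moved tests are not
    compiled at this commit. The declaration only arrives in PATCH 2/3.
  * The move drops the `rust,ignore` usage example from the doc comment of
    `tokenize_markdown` and the entire doc comment (with its example) of
    `should_break_line`.
  * The doc comment kept on `tokenize_markdown` talks about grouping
    whitespace and matching backtick runs of identical length, but the
    moved body only pairs a backtick with the next single backtick found
    by `find`. The wording appears to describe `tokenize_inline` instead.
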
 src/wrap.rs          | 312 +------------------------------------------
 src/wrap/tests.rs    | 104 +++++++++++++++
 src/wrap/tokenize.rs | 176 ++++++++++++++++++++++++
 3 files changed, 286 insertions(+), 306 deletions(-)
 create mode 100644 src/wrap/tests.rs
 create mode 100644 src/wrap/tokenize.rs
diff --git a/src/wrap.rs b/src/wrap.rs
index 7d331bd3..8932438c 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -6,6 +6,9 @@
 
 use regex::{Captures, Regex};
 
+mod tokenize;
+pub(crate) use tokenize::{Token, tokenize_markdown};
+
 static FENCE_RE: std::sync::LazyLock<Regex> =
     std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap());
 
@@ -70,195 +73,6 @@ static HANDLERS: &[PrefixHandler] = &[
     },
 ];
 
-/// Markdown token emitted by [`tokenize_markdown`].
-#[derive(Debug, PartialEq)]
-pub enum Token<'a> {
-    /// Line within a fenced code block, including the fence itself.
-    Fence(&'a str),
-    /// Inline code span without surrounding backticks.
-    Code(&'a str),
-    /// Plain text outside code regions.
-    Text(&'a str),
-    /// Line break separating tokens.
-    Newline,
-}
-
-fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
-    let start = i;
-    if chars[i] == '!' {
-        i += 1;
-    }
-    // skip initial '[' which we know is present
-    i += 1;
-    while i < chars.len() && chars[i] != ']' {
-        i += 1;
-    }
-    if i < chars.len() && chars[i] == ']' {
-        i += 1;
-        if i < chars.len() && chars[i] == '(' {
-            i += 1;
-            let mut depth = 1;
-            while i < chars.len() && depth > 0 {
-                match chars[i] {
-                    '(' => depth += 1,
-                    ')' => depth -= 1,
-                    _ => {}
-                }
-                i += 1;
-            }
-            let tok: String = chars[start..i].iter().collect();
-            return (tok, i);
-        }
-    }
-    let tok: String = chars[start..=start].iter().collect();
-    (tok, start + 1)
-}
-
-fn is_trailing_punctuation(c: char) -> bool {
-    matches!(
-        c,
-        '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\''
-    )
-}
-
-fn tokenize_inline(text: &str) -> Vec<String> {
-    let mut tokens = Vec::new();
-    let chars: Vec<char> = text.chars().collect();
-    let mut i = 0;
-    while i < chars.len() {
-        let c = chars[i];
-        if c.is_whitespace() {
-            let start = i;
-            while i < chars.len() && chars[i].is_whitespace() {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
-        } else if c == '`' {
-            let start = i;
-            let mut delim_len = 0;
-            while i < chars.len() && chars[i] == '`' {
-                i += 1;
-                delim_len += 1;
-            }
-            let mut end = i;
-            while end < chars.len() {
-                if chars[end] == '`' {
-                    let mut j = end;
-                    let mut count = 0;
-                    while j < chars.len() && chars[j] == '`' {
-                        j += 1;
-                        count += 1;
-                    }
-                    if count == delim_len {
-                        end = j;
-                        break;
-                    }
-                }
-                end += 1;
-            }
-            if end >= chars.len() {
-                tokens.push(chars[start..start + delim_len].iter().collect());
-                i = start + delim_len;
-            } else {
-                tokens.push(chars[start..end].iter().collect());
-                i = end;
-            }
-        } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
-            let (tok, mut new_i) = parse_link_or_image(&chars, i);
-            tokens.push(tok);
-            let mut punct = String::new();
-            while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) {
-                punct.push(chars[new_i]);
-                new_i += 1;
-            }
-            if !punct.is_empty() {
-                tokens.push(punct);
-            }
-            i = new_i;
-        } else {
-            let start = i;
-            while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
-        }
-    }
-    tokens
-}
-
-/// Split the input string into [`Token`]s by analysing whitespace and
-/// backtick delimiters.
-///
-/// The tokenizer groups consecutive whitespace into a single
-/// [`Token::Text`] and recognises backtick sequences as inline code spans.
-/// When a run of backticks is encountered the parser searches forward for an
-/// identical delimiter, allowing nested backticks when the span uses a longer
-/// fence. Unmatched delimiter sequences are treated as literal text.
-///
-/// ```rust,ignore
-/// use mdtablefix::wrap::{Token, tokenize_markdown};
-///
-/// let tokens = tokenize_markdown("Example with `code`");
-/// assert_eq!(
-///     tokens,
-///     vec![Token::Text("Example with "), Token::Code("code")]
-/// );
-/// ```
-pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
-    let mut out = Vec::new();
-    let mut in_fence = false;
-    for line in input.split_inclusive('\n') {
-        let trimmed = line.trim_end_matches('\n');
-        if FENCE_RE.is_match(trimmed) {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            in_fence = !in_fence;
-            continue;
-        }
-        if in_fence {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            continue;
-        }
-        let mut rest = trimmed;
-        while let Some(pos) = rest.find('`') {
-            if pos > 0 {
-                out.push(Token::Text(&rest[..pos]));
-            }
-            if let Some(end) = rest[pos + 1..].find('`') {
-                out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
-                rest = &rest[pos + end + 2..];
-            } else {
-                out.push(Token::Text(&rest[pos..]));
-                rest = "";
-                break;
-            }
-        }
-        if !rest.is_empty() {
-            out.push(Token::Text(rest));
-        }
-        out.push(Token::Newline);
-    }
-    out.pop();
-    out
-}
-
-/// Determine if the current line should break at the last whitespace.
-///
-/// Returns `true` if `current_width` exceeds `width` and a whitespace split
-/// position is available.
-///
-/// # Examples
-///
-/// ```ignore
-/// use mdtablefix::wrap::should_break_line;
-/// assert!(should_break_line(10, 12, Some(3)));
-/// assert!(!should_break_line(10, 8, Some(3)));
-/// ```
-fn should_break_line(width: usize, current_width: usize, last_split: Option<usize>) -> bool {
-    current_width > width && last_split.is_some()
-}
-
 fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
     use unicode_width::UnicodeWidthStr;
 
@@ -266,14 +80,14 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
     let mut current = String::new();
     let mut current_width = 0;
     let mut last_split: Option<usize> = None;
-    let tokens = tokenize_inline(text);
+    let tokens = tokenize::tokenize_inline(text);
     let mut i = 0;
     while i < tokens.len() {
         let mut j = i + 1;
         let mut group_width = UnicodeWidthStr::width(tokens[i].as_str());
 
         if tokens[i].contains("](") && tokens[i].ends_with(')') {
-            while j < tokens.len() && tokens[j].chars().all(is_trailing_punctuation) {
+            while j < tokens.len() && tokens[j].chars().all(tokenize::is_trailing_punctuation) {
                 group_width += UnicodeWidthStr::width(tokens[j].as_str());
                 j += 1;
             }
@@ -306,7 +120,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
             continue;
         }
 
-        if should_break_line(width, current_width + group_width, last_split) {
+        if tokenize::should_break_line(width, current_width + group_width, last_split) {
             let pos = last_split.unwrap();
             let line = current[..pos].to_string();
             let mut rest = current[pos..].trim_start().to_string();
@@ -547,117 +361,3 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec<String> {
     flush_paragraph(&mut out, &buf, &indent, width);
     out
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn wrap_text_preserves_hyphenated_words() {
-        let input = vec!["A word that is very-long-word indeed".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec![
-                "A word that is".to_string(),
-                "very-long-word".to_string(),
-                "indeed".to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn wrap_text_does_not_insert_spaces_in_hyphenated_words() {
-        let input = vec![
-            concat!(
-                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ",
-                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ",
-                "volutpat."
-            )
-            .to_string(),
-        ];
-        let wrapped = wrap_text(&input, 80);
-        assert_eq!(
-            wrapped,
-            vec![
-                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt"
-                    .to_string(),
-                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat."
-                    .to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn wrap_text_preserves_code_spans() {
-        let input = vec![
-            "with their own escaping rules. On Windows, scripts default to `powershell -Command` \
-             unless the manifest's `interpreter` field overrides the setting."
-                .to_string(),
-        ];
-        let wrapped = wrap_text(&input, 60);
-        assert_eq!(
-            wrapped,
-            vec![
-                "with their own escaping rules. On Windows, scripts default".to_string(),
-                "to `powershell -Command` unless the manifest's".to_string(),
-                "`interpreter` field overrides the setting.".to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn wrap_text_multiple_code_spans() {
-        let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()];
-        let wrapped = wrap_text(&input, 25);
-        assert_eq!(
-            wrapped,
-            vec![
-                "combine `foo bar` and".to_string(),
-                "`baz qux` in one line".to_string(),
-            ]
-        );
-    }
-
-    #[test]
-    fn wrap_text_nested_backticks() {
-        let input = vec!["Use `` `code` `` to quote backticks".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec![
-                "Use `` `code` `` to".to_string(),
-                "quote backticks".to_string()
-            ]
-        );
-    }
-
-    #[test]
-    fn wrap_text_unmatched_backticks() {
-        let input = vec!["This has a `dangling code span.".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec!["This has a".to_string(), "`dangling code span.".to_string()]
-        );
-    }
-
-    #[test]
-    fn wrap_text_preserves_links() {
-        let input = vec![
-            "`falcon-pachinko` is an extension library for the".to_string(),
-            "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured"
-                .to_string(),
-            "approach to asynchronous WebSocket routing and background worker integration."
-                .to_string(),
-        ];
-        let wrapped = wrap_text(&input, 80);
-        let joined = wrapped.join("\n");
-        assert_eq!(joined.matches("https://").count(), 1);
-        assert!(
-            wrapped
-                .iter()
-                .any(|l| l.contains("https://falcon.readthedocs.io"))
-        );
-    }
-}
diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs
new file mode 100644
index 00000000..66fb9a94
--- /dev/null
+++ b/src/wrap/tests.rs
@@ -0,0 +1,104 @@
+#[cfg(test)]
+mod tests {
+    use super::super::*;
+
+    #[test]
+    fn wrap_text_preserves_hyphenated_words() {
+        let input = vec!["A word that is very-long-word indeed".to_string()];
+        let wrapped = wrap_text(&input, 20);
+        assert_eq!(
+            wrapped,
+            vec![
+                "A word that is".to_string(),
+                "very-long-word".to_string(),
+                "indeed".to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn wrap_text_does_not_insert_spaces_in_hyphenated_words() {
+        let input = vec![
+            concat!(
+                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ",
+                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ",
+                "volutpat.",
+            )
+            .to_string(),
+        ];
+        let wrapped = wrap_text(&input, 80);
+        assert_eq!(
+            wrapped,
+            vec![
+                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt"
+                    .to_string(),
+                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat."
+                    .to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn wrap_text_preserves_code_spans() {
+        let input = vec![
+            "with their own escaping rules. On Windows, scripts default to `powershell -Command` \
+             unless the manifest's `interpreter` field overrides the setting."
+                .to_string(),
+        ];
+        let wrapped = wrap_text(&input, 60);
+        assert_eq!(
+            wrapped,
+            vec![
+                "with their own escaping rules. On Windows, scripts default".to_string(),
+                "to `powershell -Command` unless the manifest's".to_string(),
+                "`interpreter` field overrides the setting.".to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn wrap_text_multiple_code_spans() {
+        let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()];
+        let wrapped = wrap_text(&input, 25);
+        assert_eq!(
+            wrapped,
+            vec![
+                "combine `foo bar` and".to_string(),
+                "`baz qux` in one line".to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn wrap_text_nested_backticks() {
+        let input = vec!["Use `` `code` `` to quote backticks".to_string()];
+        let wrapped = wrap_text(&input, 20);
+        assert_eq!(
+            wrapped,
+            vec!["Use `` `code` `` to".to_string(), "quote backticks".to_string()],
+        );
+    }
+
+    #[test]
+    fn wrap_text_unmatched_backticks() {
+        let input = vec!["This has a `dangling code span.".to_string()];
+        let wrapped = wrap_text(&input, 20);
+        assert_eq!(
+            wrapped,
+            vec!["This has a".to_string(), "`dangling code span.".to_string()],
+        );
+    }
+
+    #[test]
+    fn wrap_text_preserves_links() {
+        let input = vec![
+            "`falcon-pachinko` is an extension library for the".to_string(),
+            "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(),
+            "approach to asynchronous WebSocket routing and background worker integration.".to_string(),
+        ];
+        let wrapped = wrap_text(&input, 80);
+        let joined = wrapped.join("\n");
+        assert_eq!(joined.matches("https://").count(), 1);
+        assert!(wrapped.iter().any(|l| l.contains("https://falcon.readthedocs.io")));
+    }
+}
diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
new file mode 100644
index 00000000..ed81a4c3
--- /dev/null
+++ b/src/wrap/tokenize.rs
@@ -0,0 +1,176 @@
+//! Tokenization helpers for wrapping logic.
+//!
+//! This module contains utilities for breaking lines into tokens so that
+//! inline code spans and Markdown links are preserved during wrapping.
+
+use super::FENCE_RE;
+
+/// Markdown token emitted by [`tokenize_markdown`].
+#[derive(Debug, PartialEq)]
+pub enum Token<'a> {
+    /// Line within a fenced code block, including the fence itself.
+    Fence(&'a str),
+    /// Inline code span without surrounding backticks.
+    Code(&'a str),
+    /// Plain text outside code regions.
+    Text(&'a str),
+    /// Line break separating tokens.
+    Newline,
+}
+
+fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
+    let start = i;
+    if chars[i] == '!' {
+        i += 1;
+    }
+    i += 1; // skip initial '[' which we know is present
+    while i < chars.len() && chars[i] != ']' {
+        i += 1;
+    }
+    if i < chars.len() && chars[i] == ']' {
+        i += 1;
+        if i < chars.len() && chars[i] == '(' {
+            i += 1;
+            let mut depth = 1;
+            while i < chars.len() && depth > 0 {
+                match chars[i] {
+                    '(' => depth += 1,
+                    ')' => depth -= 1,
+                    _ => {}
+                }
+                i += 1;
+            }
+            let tok: String = chars[start..i].iter().collect();
+            return (tok, i);
+        }
+    }
+    let tok: String = chars[start..=start].iter().collect();
+    (tok, start + 1)
+}
+
+pub(super) fn is_trailing_punctuation(c: char) -> bool {
+    matches!(
+        c,
+        '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\''
+    )
+}
+
+pub(super) fn tokenize_inline(text: &str) -> Vec<String> {
+    let mut tokens = Vec::new();
+    let chars: Vec<char> = text.chars().collect();
+    let mut i = 0;
+    while i < chars.len() {
+        let c = chars[i];
+        if c.is_whitespace() {
+            let start = i;
+            while i < chars.len() && chars[i].is_whitespace() {
+                i += 1;
+            }
+            tokens.push(chars[start..i].iter().collect());
+        } else if c == '`' {
+            let start = i;
+            let mut delim_len = 0;
+            while i < chars.len() && chars[i] == '`' {
+                i += 1;
+                delim_len += 1;
+            }
+            let mut end = i;
+            while end < chars.len() {
+                if chars[end] == '`' {
+                    let mut j = end;
+                    let mut count = 0;
+                    while j < chars.len() && chars[j] == '`' {
+                        j += 1;
+                        count += 1;
+                    }
+                    if count == delim_len {
+                        end = j;
+                        break;
+                    }
+                }
+                end += 1;
+            }
+            if end >= chars.len() {
+                tokens.push(chars[start..start + delim_len].iter().collect());
+                i = start + delim_len;
+            } else {
+                tokens.push(chars[start..end].iter().collect());
+                i = end;
+            }
+        } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
+            let (tok, mut new_i) = parse_link_or_image(&chars, i);
+            tokens.push(tok);
+            let mut punct = String::new();
+            while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) {
+                punct.push(chars[new_i]);
+                new_i += 1;
+            }
+            if !punct.is_empty() {
+                tokens.push(punct);
+            }
+            i = new_i;
+        } else {
+            let start = i;
+            while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' {
+                i += 1;
+            }
+            tokens.push(chars[start..i].iter().collect());
+        }
+    }
+    tokens
+}
+
+/// Split the input string into [`Token`]s by analysing whitespace and backtick
+/// delimiters.
+///
+/// The tokenizer groups consecutive whitespace into a single [`Token::Text`] and
+/// recognises backtick sequences as inline code spans. When a run of backticks
+/// is encountered the parser searches forward for an identical delimiter,
+/// allowing nested backticks when the span uses a longer fence. Unmatched
+/// delimiter sequences are treated as literal text.
+pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
+    let mut out = Vec::new();
+    let mut in_fence = false;
+    for line in input.split_inclusive('\n') {
+        let trimmed = line.trim_end_matches('\n');
+        if FENCE_RE.is_match(trimmed) {
+            out.push(Token::Fence(trimmed));
+            out.push(Token::Newline);
+            in_fence = !in_fence;
+            continue;
+        }
+        if in_fence {
+            out.push(Token::Fence(trimmed));
+            out.push(Token::Newline);
+            continue;
+        }
+        let mut rest = trimmed;
+        while let Some(pos) = rest.find('`') {
+            if pos > 0 {
+                out.push(Token::Text(&rest[..pos]));
+            }
+            if let Some(end) = rest[pos + 1..].find('`') {
+                out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
+                rest = &rest[pos + end + 2..];
+            } else {
+                out.push(Token::Text(&rest[pos..]));
+                rest = "";
+                break;
+            }
+        }
+        if !rest.is_empty() {
+            out.push(Token::Text(rest));
+        }
+        out.push(Token::Newline);
+    }
+    out.pop();
+    out
+}
+
+pub(super) fn should_break_line(
+    width: usize,
+    current_width: usize,
+    last_split: Option<usize>,
+) -> bool {
+    current_width > width && last_split.is_some()
+}

From 37876fb1b43e23dd4c9fea36f44afd7c98b2502c Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 11:29:25 +0100
Subject: [PATCH 2/3] Add tokenization tests and helpers

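---
 Review notes (outside the commit message):
  * src/wrap/tests.rs still starts with `use super::super::*;` after the
    inner `mod tests { ... }` wrapper is removed. The file is now the
    `tests` child of `wrap`, so that path names the parent of `wrap`
    rather than `wrap` itself. `wrap_text` then resolves only if it is
    re-exported there (not visible in this series; please confirm), and
    `use super::*;` would import it from `wrap` directly.
  * In `segment_inline`, a closing backtick run that ends exactly at the
    end of the input leaves `end == chars.len()`, so the following
    `if end >= chars.len()` branch treats the matched span as unmatched
    and emits only the opening delimiter. This looks like it splits a
    code span that sits at the very end of the text into separate tokens.
    The refactor preserves that behaviour from the pre-existing loop.
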
 src/wrap.rs          |   5 +-
 src/wrap/tests.rs    | 194 ++++++++++++++++++++++---------------------
 src/wrap/tokenize.rs | 109 ++++++++++++++----------
 3 files changed, 169 insertions(+), 139 deletions(-)

diff --git a/src/wrap.rs b/src/wrap.rs
index 8932438c..1ba777b2 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -80,7 +80,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
     let mut current = String::new();
     let mut current_width = 0;
     let mut last_split: Option<usize> = None;
-    let tokens = tokenize::tokenize_inline(text);
+    let tokens = tokenize::segment_inline(text);
     let mut i = 0;
     while i < tokens.len() {
         let mut j = i + 1;
@@ -361,3 +361,6 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec<String> {
     flush_paragraph(&mut out, &buf, &indent, width);
     out
 }
+
+#[cfg(test)]
+mod tests;
diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs
index 66fb9a94..d9315c9e 100644
--- a/src/wrap/tests.rs
+++ b/src/wrap/tests.rs
@@ -1,104 +1,106 @@
-#[cfg(test)]
-mod tests {
-    use super::super::*;
+use super::super::*;
 
-    #[test]
-    fn wrap_text_preserves_hyphenated_words() {
-        let input = vec!["A word that is very-long-word indeed".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec![
-                "A word that is".to_string(),
-                "very-long-word".to_string(),
-                "indeed".to_string(),
-            ]
-        );
-    }
+#[test]
+fn wrap_text_preserves_hyphenated_words() {
+    let input = vec!["A word that is very-long-word indeed".to_string()];
+    let wrapped = wrap_text(&input, 20);
+    assert_eq!(
+        wrapped,
+        vec![
+            "A word that is".to_string(),
+            "very-long-word".to_string(),
+            "indeed".to_string(),
+        ]
+    );
+}
 
-    #[test]
-    fn wrap_text_does_not_insert_spaces_in_hyphenated_words() {
-        let input = vec![
-            concat!(
-                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ",
-                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ",
-                "volutpat.",
-            )
-            .to_string(),
-        ];
-        let wrapped = wrap_text(&input, 80);
-        assert_eq!(
-            wrapped,
-            vec![
-                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt"
-                    .to_string(),
-                "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat."
-                    .to_string(),
-            ]
-        );
-    }
+#[test]
+fn wrap_text_does_not_insert_spaces_in_hyphenated_words() {
+    let input = vec![
+        concat!(
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ",
+            "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ",
+            "volutpat.",
+        )
+        .to_string(),
+    ];
+    let wrapped = wrap_text(&input, 80);
+    assert_eq!(
+        wrapped,
+        vec![
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt".to_string(),
+            "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat.".to_string(),
+        ]
+    );
+}
 
-    #[test]
-    fn wrap_text_preserves_code_spans() {
-        let input = vec![
-            "with their own escaping rules. On Windows, scripts default to `powershell -Command` \
-             unless the manifest's `interpreter` field overrides the setting."
-                .to_string(),
-        ];
-        let wrapped = wrap_text(&input, 60);
-        assert_eq!(
-            wrapped,
-            vec![
-                "with their own escaping rules. On Windows, scripts default".to_string(),
-                "to `powershell -Command` unless the manifest's".to_string(),
-                "`interpreter` field overrides the setting.".to_string(),
-            ]
-        );
-    }
+#[test]
+fn wrap_text_preserves_code_spans() {
+    let input = vec![
+        "with their own escaping rules. On Windows, scripts default to `powershell -Command` \
+         unless the manifest's `interpreter` field overrides the setting."
+            .to_string(),
+    ];
+    let wrapped = wrap_text(&input, 60);
+    assert_eq!(
+        wrapped,
+        vec![
+            "with their own escaping rules. On Windows, scripts default".to_string(),
+            "to `powershell -Command` unless the manifest's".to_string(),
+            "`interpreter` field overrides the setting.".to_string(),
+        ]
+    );
+}
 
-    #[test]
-    fn wrap_text_multiple_code_spans() {
-        let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()];
-        let wrapped = wrap_text(&input, 25);
-        assert_eq!(
-            wrapped,
-            vec![
-                "combine `foo bar` and".to_string(),
-                "`baz qux` in one line".to_string(),
-            ]
-        );
-    }
+#[test]
+fn wrap_text_multiple_code_spans() {
+    let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()];
+    let wrapped = wrap_text(&input, 25);
+    assert_eq!(
+        wrapped,
+        vec![
+            "combine `foo bar` and".to_string(),
+            "`baz qux` in one line".to_string(),
+        ]
+    );
+}
 
-    #[test]
-    fn wrap_text_nested_backticks() {
-        let input = vec!["Use `` `code` `` to quote backticks".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec!["Use `` `code` `` to".to_string(), "quote backticks".to_string()],
-        );
-    }
+#[test]
+fn wrap_text_nested_backticks() {
+    let input = vec!["Use `` `code` `` to quote backticks".to_string()];
+    let wrapped = wrap_text(&input, 20);
+    assert_eq!(
+        wrapped,
+        vec![
+            "Use `` `code` `` to".to_string(),
+            "quote backticks".to_string()
+        ],
+    );
+}
 
-    #[test]
-    fn wrap_text_unmatched_backticks() {
-        let input = vec!["This has a `dangling code span.".to_string()];
-        let wrapped = wrap_text(&input, 20);
-        assert_eq!(
-            wrapped,
-            vec!["This has a".to_string(), "`dangling code span.".to_string()],
-        );
-    }
+#[test]
+fn wrap_text_unmatched_backticks() {
+    let input = vec!["This has a `dangling code span.".to_string()];
+    let wrapped = wrap_text(&input, 20);
+    assert_eq!(
+        wrapped,
+        vec!["This has a".to_string(), "`dangling code span.".to_string()],
+    );
+}
 
-    #[test]
-    fn wrap_text_preserves_links() {
-        let input = vec![
-            "`falcon-pachinko` is an extension library for the".to_string(),
-            "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(),
-            "approach to asynchronous WebSocket routing and background worker integration.".to_string(),
-        ];
-        let wrapped = wrap_text(&input, 80);
-        let joined = wrapped.join("\n");
-        assert_eq!(joined.matches("https://").count(), 1);
-        assert!(wrapped.iter().any(|l| l.contains("https://falcon.readthedocs.io")));
-    }
+#[test]
+fn wrap_text_preserves_links() {
+    let input = vec![
+        "`falcon-pachinko` is an extension library for the".to_string(),
+        "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(),
+        "approach to asynchronous WebSocket routing and background worker integration.".to_string(),
+    ];
+    let wrapped = wrap_text(&input, 80);
+    let joined = wrapped.join("\n");
+    assert_eq!(joined.matches("https://").count(), 1);
+    assert!(
+        wrapped
+            .iter()
+            .any(|l| l.contains("https://falcon.readthedocs.io"))
+    );
 }
diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index ed81a4c3..0bfd0832 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -5,6 +5,20 @@
 
 use super::FENCE_RE;
 
+fn scan_while<F>(chars: &[char], mut i: usize, cond: F) -> usize
+where
+    F: Fn(char) -> bool,
+{
+    while i < chars.len() && cond(chars[i]) {
+        i += 1;
+    }
+    i
+}
+
+fn collect_range(chars: &[char], start: usize, end: usize) -> String {
+    chars[start..end].iter().collect()
+}
+
 /// Markdown token emitted by [`tokenize_markdown`].
 #[derive(Debug, PartialEq)]
 pub enum Token<'a> {
@@ -24,9 +38,7 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
         i += 1;
     }
     i += 1; // skip initial '[' which we know is present
-    while i < chars.len() && chars[i] != ']' {
-        i += 1;
-    }
+    i = scan_while(chars, i, |c| c != ']');
     if i < chars.len() && chars[i] == ']' {
         i += 1;
         if i < chars.len() && chars[i] == '(' {
@@ -40,12 +52,10 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
                 }
                 i += 1;
             }
-            let tok: String = chars[start..i].iter().collect();
-            return (tok, i);
+            return (collect_range(chars, start, i), i);
         }
     }
-    let tok: String = chars[start..=start].iter().collect();
-    (tok, start + 1)
+    (collect_range(chars, start, start + 1), start + 1)
 }
 
 pub(super) fn is_trailing_punctuation(c: char) -> bool {
@@ -55,7 +65,7 @@ pub(super) fn is_trailing_punctuation(c: char) -> bool {
     )
 }
 
-pub(super) fn tokenize_inline(text: &str) -> Vec<String> {
+pub(super) fn segment_inline(text: &str) -> Vec<String> {
     let mut tokens = Vec::new();
     let chars: Vec<char> = text.chars().collect();
     let mut i = 0;
@@ -63,58 +73,44 @@ pub(super) fn tokenize_inline(text: &str) -> Vec<String> {
         let c = chars[i];
         if c.is_whitespace() {
             let start = i;
-            while i < chars.len() && chars[i].is_whitespace() {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
+            i = scan_while(&chars, i, char::is_whitespace);
+            tokens.push(collect_range(&chars, start, i));
         } else if c == '`' {
             let start = i;
-            let mut delim_len = 0;
-            while i < chars.len() && chars[i] == '`' {
-                i += 1;
-                delim_len += 1;
-            }
-            let mut end = i;
+            let fence_end = scan_while(&chars, i, |ch| ch == '`');
+            let fence_len = fence_end - start;
+            // Set once a closing run of `fence_len` backticks is found, even at EOF.
+            let mut close = None;
+            let mut end = fence_end;
             while end < chars.len() {
-                if chars[end] == '`' {
-                    let mut j = end;
-                    let mut count = 0;
-                    while j < chars.len() && chars[j] == '`' {
-                        j += 1;
-                        count += 1;
-                    }
-                    if count == delim_len {
-                        end = j;
-                        break;
-                    }
+                let j = scan_while(&chars, end, |ch| ch == '`');
+                if j - end == fence_len {
+                    close = Some(j);
+                    break;
                 }
-                end += 1;
+                // Skip a mismatched run whole so its tail cannot close the span.
+                end = j.max(end + 1);
             }
-            if end >= chars.len() {
-                tokens.push(chars[start..start + delim_len].iter().collect());
-                i = start + delim_len;
+            if let Some(stop) = close {
+                tokens.push(collect_range(&chars, start, stop));
+                i = stop;
             } else {
-                tokens.push(chars[start..end].iter().collect());
-                i = end;
+                tokens.push(collect_range(&chars, start, fence_end));
+                i = fence_end;
             }
         } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
             let (tok, mut new_i) = parse_link_or_image(&chars, i);
             tokens.push(tok);
-            let mut punct = String::new();
-            while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) {
-                punct.push(chars[new_i]);
-                new_i += 1;
-            }
-            if !punct.is_empty() {
-                tokens.push(punct);
+            let punct_start = new_i;
+            new_i = scan_while(&chars, new_i, is_trailing_punctuation);
+            if new_i > punct_start {
+                tokens.push(collect_range(&chars, punct_start, new_i));
             }
             i = new_i;
         } else {
             let start = i;
-            while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
+            i = scan_while(&chars, i, |ch| !ch.is_whitespace() && ch != '`');
+            tokens.push(collect_range(&chars, start, i));
         }
     }
     tokens
@@ -174,3 +170,32 @@ pub(super) fn should_break_line(
 ) -> bool {
     current_width > width && last_split.is_some()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Punctuation directly after a link is emitted as its own segment.
+    #[test]
+    fn link_with_trailing_punctuation() {
+        assert_eq!(segment_inline("see [link](url)."), ["see", " ", "[link](url)", "."]);
+    }
+
+    /// Balanced parentheses inside an image target stay within one segment.
+    #[test]
+    fn image_with_nested_parentheses() {
+        assert_eq!(segment_inline("![alt](path(a(b)c))"), ["![alt](path(a(b)c))"]);
+    }
+
+    /// A double-backtick span is kept whole, delimiters included.
+    #[test]
+    fn inline_code_fences() {
+        assert_eq!(segment_inline("use ``cmd`` now"), ["use", " ", "``cmd``", " ", "now"]);
+    }
+
+    /// An unclosed backtick becomes a lone segment; the rest splits as text.
+    #[test]
+    fn unmatched_backticks() {
+        assert_eq!(segment_inline("bad `code span"), ["bad", " ", "`", "code", " ", "span"]);
+    }
+}

From bf00f5635bc3707a715098f7fd4dc67d0499ade4 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 12:26:14 +0100
Subject: [PATCH 3/3] Add docs and inline helpers

---
 src/wrap.rs          | 11 +++++++++--
 src/wrap/tests.rs    |  5 +++++
 src/wrap/tokenize.rs | 15 ++++++---------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/wrap.rs b/src/wrap.rs
index 1ba777b2..8a45e20a 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -87,7 +87,14 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
         let mut group_width = UnicodeWidthStr::width(tokens[i].as_str());
 
         if tokens[i].contains("](") && tokens[i].ends_with(')') {
-            while j < tokens.len() && tokens[j].chars().all(tokenize::is_trailing_punctuation) {
+            while j < tokens.len()
+                && tokens[j].chars().all(|c| {
+                    matches!(
+                        c,
+                        '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\''
+                    )
+                })
+            {
                 group_width += UnicodeWidthStr::width(tokens[j].as_str());
                 j += 1;
             }
@@ -120,7 +127,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
             continue;
         }
 
-        if tokenize::should_break_line(width, current_width + group_width, last_split) {
+        if current_width + group_width > width && last_split.is_some() {
             let pos = last_split.unwrap();
             let line = current[..pos].to_string();
             let mut rest = current[pos..].trim_start().to_string();
diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs
index d9315c9e..18bca090 100644
--- a/src/wrap/tests.rs
+++ b/src/wrap/tests.rs
@@ -1,3 +1,8 @@
+//! Unit tests for text wrapping functionality.
+//!
+//! This module contains tests for the `wrap_text` function, verifying correct
+//! behaviour with code spans, links, hyphenated words, and various line widths.
+
 use super::super::*;
 
 #[test]
diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index 0bfd0832..184dae3e 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -32,6 +32,11 @@ pub enum Token<'a> {
     Newline,
 }
 
+/// Parse a Markdown link or image starting at `i`.
+///
+/// Nested parentheses in the URL are balanced by tracking delimiter depth.
+/// Returns the matched text and the index just past it; when no `](` follows,
+/// returns only the single character at `i` together with `i + 1`.
 fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
     let start = i;
     if chars[i] == '!' {
@@ -58,7 +63,7 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
     (collect_range(chars, start, start + 1), start + 1)
 }
 
-pub(super) fn is_trailing_punctuation(c: char) -> bool {
+fn is_trailing_punctuation(c: char) -> bool {
     matches!(
         c,
         '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\''
@@ -163,14 +168,6 @@ pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
     out
 }
 
-pub(super) fn should_break_line(
-    width: usize,
-    current_width: usize,
-    last_split: Option<usize>,
-) -> bool {
-    current_width > width && last_split.is_some()
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;