From fda10bdaa8234781c8d618b86a75a913e01b83e2 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Sun, 9 Nov 2025 14:21:01 +0000
Subject: [PATCH 1/7] refactor(wrap/tokenize): use &str instead of &[char] to
 improve efficiency

Refactored the tokenize module to operate directly on string slices (&str) rather than allocating and indexing slices of chars (&[char]). This avoids unnecessary allocation and improves performance by iterating over string slices and handling UTF-8 boundaries correctly. Helper functions were updated accordingly to maintain consistent byte indexing and UTF-8 character handling throughout the tokenization logic.

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 src/wrap/tokenize.rs | 146 +++++++++++++++++++++++--------------------
 1 file changed, 79 insertions(+), 67 deletions(-)
diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index 975286e3..9ec94bc4 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -3,26 +3,31 @@
 //! This module contains utilities for breaking lines into tokens so that
 //! inline code spans and Markdown links are preserved during wrapping.
 
-/// Advance `i` while the predicate evaluates to `true`.
+/// Advance `idx` while the predicate evaluates to `true`.
 ///
-/// Returns the index of the first character for which `cond` fails. This small
-/// helper keeps the scanning loops concise.
+/// Returns the byte index of the first character for which `cond` fails.
+/// This small helper keeps the scanning loops concise and avoids
+/// materialising the source as a char buffer.
 ///
 /// # Examples
 ///
 /// ```rust,ignore
-/// let chars: Vec<char> = "abc123".chars().collect();
-/// let end = scan_while(&chars, 0, char::is_alphabetic);
+/// let text = "abc123";
+/// let end = scan_while(text, 0, char::is_alphabetic);
 /// assert_eq!(end, 3);
 /// ```
-fn scan_while<F>(chars: &[char], mut i: usize, mut cond: F) -> usize
+fn scan_while<F>(text: &str, mut idx: usize, mut cond: F) -> usize
 where
     F: FnMut(char) -> bool,
 {
-    while i < chars.len() && cond(chars[i]) {
-        i += 1;
+    while idx < text.len() {
+        let ch = text[idx..].chars().next().expect("valid char boundary");
+        if !cond(ch) {
+            break;
+        }
+        idx += ch.len_utf8();
     }
-    i
+    idx
 }
 
 /// Collect a range of characters into a [`String`].
@@ -30,33 +35,31 @@ where
 /// # Examples
 ///
 /// ```rust,ignore
-/// let chars: Vec<char> = ['a', 'b', 'c'];
-/// assert_eq!(collect_range(&chars, 0, 2), "ab");
+/// let text = "abc";
+/// assert_eq!(collect_range(text, 0, 2), "ab");
 /// ```
-fn collect_range(chars: &[char], start: usize, end: usize) -> String {
-    chars[start..end].iter().collect()
+fn collect_range(text: &str, start: usize, end: usize) -> String {
+    text[start..end].to_string()
 }
 
-const BACKSLASH: char = '\\';
 const BACKSLASH_BYTE: u8 = b'\\';
 
-/// Check if a character at the given index is preceded by an odd number of backslashes.
-///
-/// An odd number of preceding backslashes means the character is escaped.
-fn has_odd_backslash_escape(chars: &[char], mut idx: usize) -> bool {
-    let mut count = 0;
-    while idx > 0 {
-        idx -= 1;
-        if chars[idx] == BACKSLASH {
-            count += 1;
-        } else {
-            break;
-        }
+fn char_at(text: &str, idx: usize) -> Option<char> {
+    if idx >= text.len() {
+        None
+    } else {
+        text[idx..].chars().next()
     }
-    count % 2 == 1
 }
 
-/// Check if a byte at the given index is preceded by an odd number of backslashes.
+fn advance_char(text: &str, idx: usize) -> usize {
+    char_at(text, idx)
+        .map(|ch| idx + ch.len_utf8())
+        .unwrap_or_else(|| text.len())
+}
+
+/// Check if a byte at the given index is preceded by an odd number of
+/// backslashes.
 ///
 /// An odd number of preceding backslashes means the byte is escaped.
 fn has_odd_backslash_escape_bytes(bytes: &[u8], mut idx: usize) -> bool {
@@ -98,35 +101,44 @@ pub enum Token<'a> {
 /// # Examples
 ///
 /// ```rust,ignore
-/// let chars: Vec<char> = "![alt](a(b)c)".chars().collect();
-/// let (tok, idx) = parse_link_or_image(&chars, 0);
+/// let text = "![alt](a(b)c)";
+/// let (tok, idx) = parse_link_or_image(text, 0);
 /// assert_eq!(tok, "![alt](a(b)c)");
-/// assert_eq!(idx, chars.len());
+/// assert_eq!(idx, text.len());
 /// ```
-fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) {
-    let start = i;
-    if chars[i] == '!' {
-        i += 1;
+fn parse_link_or_image(text: &str, mut idx: usize) -> (String, usize) {
+    let start = idx;
+    if char_at(text, idx) == Some('!') {
+        idx = advance_char(text, idx);
     }
-    i += 1; // skip initial '[' which we know is present
-    i = scan_while(chars, i, |c| c != ']');
-    if i < chars.len() && chars[i] == ']' {
-        i += 1;
-        if i < chars.len() && chars[i] == '(' {
-            i += 1;
+
+    if char_at(text, idx) != Some('[') {
+        let next = advance_char(text, start);
+        return (collect_range(text, start, next), next);
+    }
+
+    idx = advance_char(text, idx); // skip '['
+    idx = scan_while(text, idx, |c| c != ']');
+    if idx < text.len() && char_at(text, idx) == Some(']') {
+        idx = advance_char(text, idx);
+        if idx < text.len() && char_at(text, idx) == Some('(') {
+            idx = advance_char(text, idx);
             let mut depth = 1;
-            while i < chars.len() && depth > 0 {
-                match chars[i] {
+            while idx < text.len() && depth > 0 {
+                let ch = char_at(text, idx).expect("valid char boundary");
+                idx = advance_char(text, idx);
+                match ch {
                     '(' => depth += 1,
                     ')' => depth -= 1,
                     _ => {}
                 }
-                i += 1;
             }
-            return (collect_range(chars, start, i), i);
+            return (collect_range(text, start, idx), idx);
         }
     }
-    (collect_range(chars, start, start + 1), start + 1)
+
+    let next = advance_char(text, start);
+    (collect_range(text, start, next), next)
 }
 
 /// Determine whether a character is considered trailing punctuation.
@@ -172,61 +184,61 @@ fn is_trailing_punctuation(c: char) -> bool {
 /// ```
 pub(super) fn segment_inline(text: &str) -> Vec<String> {
     let mut tokens = Vec::new();
-    let chars: Vec<char> = text.chars().collect();
+    let bytes = text.as_bytes();
     let mut i = 0;
-    while i < chars.len() {
-        let c = chars[i];
+    while i < text.len() {
+        let c = char_at(text, i).expect("valid char boundary");
         if c.is_whitespace() {
             let start = i;
-            i = scan_while(&chars, i, char::is_whitespace);
-            tokens.push(collect_range(&chars, start, i));
+            i = scan_while(text, i, char::is_whitespace);
+            tokens.push(collect_range(text, start, i));
         } else if c == '`' {
-            if has_odd_backslash_escape(&chars, i) {
+            if has_odd_backslash_escape_bytes(bytes, i) {
                 if let Some(last) = tokens.last_mut() {
                     last.push('`');
                 } else {
                     tokens.push(String::from("`"));
                 }
-                i += 1;
+                i += c.len_utf8();
                 continue;
             }
 
             let start = i;
-            let fence_end = scan_while(&chars, i, |ch| ch == '`');
+            let fence_end = scan_while(text, i, |ch| ch == '`');
             let fence_len = fence_end - start;
             i = fence_end;
 
             let mut end = i;
             let mut closing = None;
-            while end < chars.len() {
-                let j = scan_while(&chars, end, |ch| ch == '`');
-                if j - end == fence_len && !has_odd_backslash_escape(&chars, end) {
+            while end < text.len() {
+                let j = scan_while(text, end, |ch| ch == '`');
+                if j - end == fence_len && !has_odd_backslash_escape_bytes(bytes, end) {
                     closing = Some(j);
                     break;
                 }
-                end += 1;
+                end = advance_char(text, end);
             }
 
             if let Some(end_idx) = closing {
-                tokens.push(collect_range(&chars, start, end_idx));
+                tokens.push(collect_range(text, start, end_idx));
                 i = end_idx;
             } else {
-                tokens.push(collect_range(&chars, start, start + fence_len));
-                i = start + fence_len;
+                tokens.push(collect_range(text, start, fence_end));
+                i = fence_end;
             }
-        } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
-            let (tok, mut new_i) = parse_link_or_image(&chars, i);
+        } else if c == '[' || (c == '!' && char_at(text, advance_char(text, i)) == Some('[')) {
+            let (tok, mut new_i) = parse_link_or_image(text, i);
             tokens.push(tok);
             let punct_start = new_i;
-            new_i = scan_while(&chars, new_i, is_trailing_punctuation);
+            new_i = scan_while(text, new_i, is_trailing_punctuation);
             if new_i > punct_start {
-                tokens.push(collect_range(&chars, punct_start, new_i));
+                tokens.push(collect_range(text, punct_start, new_i));
             }
             i = new_i;
         } else {
             let start = i;
-            i = scan_while(&chars, i, |ch| !ch.is_whitespace() && ch != '`');
-            tokens.push(collect_range(&chars, start, i));
+            i = scan_while(text, i, |ch| !ch.is_whitespace() && ch != '`');
+            tokens.push(collect_range(text, start, i));
         }
     }
     tokens

From 6630ef3675f24e0dbdd1f07b68cd28781b3c3509 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 11 Nov 2025 01:33:07 +0000
Subject: [PATCH 2/7] refactor(wrap/tokenize): simplify advance_char function
 using map_or

Rewrote advance_char to use map_or instead of map followed by unwrap_or_else for more concise and readable code.

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 src/wrap/tokenize.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index 9ec94bc4..df80bd67 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -53,9 +53,7 @@ fn char_at(text: &str, idx: usize) -> Option<char> {
 }
 
 fn advance_char(text: &str, idx: usize) -> usize {
-    char_at(text, idx)
-        .map(|ch| idx + ch.len_utf8())
-        .unwrap_or_else(|| text.len())
+    char_at(text, idx).map_or(text.len(), |ch| idx + ch.len_utf8())
 }
 
 /// Check if a byte at the given index is preceded by an odd number of

From 80fafee3cfcccb1d12cfc8841a7f9708a1a11e07 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 11 Nov 2025 02:04:31 +0000
Subject: [PATCH 3/7] docs(tokenizer): add architecture documentation
 describing inline tokenizer flow

Added a detailed section with a flowchart diagram explaining the control flow of the inline tokenizer. This includes handling of whitespace, code spans, and links, improving the documentation on how tokens are generated from input text.

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 docs/architecture.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/docs/architecture.md b/docs/architecture.md
index 56acd825..e8e609d0 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -333,6 +333,37 @@ provides streaming helpers that combine the lower-level functions. The `io`
 module handles filesystem operations, delegating the text processing to
 `process`.
 
+### Tokenizer flow
+
+The inline tokenizer iterates over the source string lazily so no duplicate
+`Vec<char>` representation is required. The following diagram summarises the
+control flow, highlighting the helpers touched during whitespace, code span,
+and link handling.
+
+```mermaid
+flowchart TD
+    A["Input text (&str)"] --> B["Initialize tokens Vec"]
+    B --> C["Iterate over text by byte index"]
+    C --> D{"Current char is whitespace?"}
+    D -- Yes --> E["scan_while for whitespace"]
+    E --> F["collect_range and push token"]
+    D -- No --> G{"Current char is '`'?"}
+    G -- Yes --> H["Check backslash escape (has_odd_backslash_escape_bytes)"]
+    H -- Escaped --> I["Append '`' to last token, or push it if none"]
+    H -- Not escaped --> J["scan_while for code fence"]
+    J --> K["Find closing fence, collect_range and push token"]
+    G -- No --> L{"Current char is '[' or '!['?"}
+    L -- Yes --> M["parse_link_or_image"]
+    M --> N["Push link/image token"]
+    N --> O["scan_while for trailing punctuation"]
+    O --> P["collect_range and push punctuation token, if any"]
+    L -- No --> Q["scan_while for non-whitespace/non-` chars"]
+    Q --> R["collect_range and push token"]
+    F & I & K & P & R --> S["Continue iteration"]
+    S --> C
+    C -->|End| T["Return tokens Vec"]
+```
+
 The helper `html_table_to_markdown` is retained for backward compatibility but
 is deprecated. New code should call `convert_html_tables` instead.
 

From ada31508a0aaa3d73c16eb9d1a9e37547d934efb Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 11 Nov 2025 02:10:50 +0000
Subject: [PATCH 4/7] refactor(wrap/tokenize): improve multibyte character
 handling and parsing logic

- Replace index-based char access with slicing and char iterators to properly handle multibyte chars.
- Remove unsafe index manipulations and simplify advancing through UTF-8 strings.
- Improve parse_link_or_image to handle nested parentheses correctly.
- Refactor segment_inline to better handle multibyte tokens and special markdown chars.
- Add extensive tests for multibyte char handling and parsing correctness.

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 src/wrap/tokenize.rs            | 137 +++++++++++++++++++++++---------
 tests/wrap/tokenize_markdown.rs |  17 ++++
 2 files changed, 117 insertions(+), 37 deletions(-)

diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index df80bd67..ee6c081b 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -16,12 +16,12 @@
 /// let end = scan_while(text, 0, char::is_alphabetic);
 /// assert_eq!(end, 3);
 /// ```
-fn scan_while<F>(text: &str, mut idx: usize, mut cond: F) -> usize
+fn scan_while<F>(text: &str, start: usize, mut cond: F) -> usize
 where
     F: FnMut(char) -> bool,
 {
-    while idx < text.len() {
-        let ch = text[idx..].chars().next().expect("valid char boundary");
+    let mut idx = start;
+    for ch in text[start..].chars() {
         if !cond(ch) {
             break;
         }
@@ -44,18 +44,6 @@ fn collect_range(text: &str, start: usize, end: usize) -> String {
 
 const BACKSLASH_BYTE: u8 = b'\\';
 
-fn char_at(text: &str, idx: usize) -> Option<char> {
-    if idx >= text.len() {
-        None
-    } else {
-        text[idx..].chars().next()
-    }
-}
-
-fn advance_char(text: &str, idx: usize) -> usize {
-    char_at(text, idx).map_or(text.len(), |ch| idx + ch.len_utf8())
-}
-
 /// Check if a byte at the given index is preceded by an odd number of
 /// backslashes.
 ///
@@ -106,36 +94,46 @@ pub enum Token<'a> {
 /// ```
 fn parse_link_or_image(text: &str, mut idx: usize) -> (String, usize) {
     let start = idx;
-    if char_at(text, idx) == Some('!') {
-        idx = advance_char(text, idx);
+
+    if text[idx..].starts_with('!') {
+        idx += '!'.len_utf8();
     }
 
-    if char_at(text, idx) != Some('[') {
-        let next = advance_char(text, start);
+    if !text[idx..].starts_with('[') {
+        let next = text[start..]
+            .chars()
+            .next()
+            .map_or(text.len(), |ch| start + ch.len_utf8());
         return (collect_range(text, start, next), next);
     }
 
-    idx = advance_char(text, idx); // skip '['
+    idx += '['.len_utf8();
     idx = scan_while(text, idx, |c| c != ']');
-    if idx < text.len() && char_at(text, idx) == Some(']') {
-        idx = advance_char(text, idx);
-        if idx < text.len() && char_at(text, idx) == Some('(') {
-            idx = advance_char(text, idx);
+    if idx < text.len() && text[idx..].starts_with(']') {
+        idx += ']'.len_utf8();
+        if idx < text.len() && text[idx..].starts_with('(') {
+            idx += '('.len_utf8();
             let mut depth = 1;
             while idx < text.len() && depth > 0 {
-                let ch = char_at(text, idx).expect("valid char boundary");
-                idx = advance_char(text, idx);
-                match ch {
-                    '(' => depth += 1,
-                    ')' => depth -= 1,
-                    _ => {}
+                if let Some(ch) = text[idx..].chars().next() {
+                    idx += ch.len_utf8();
+                    match ch {
+                        '(' => depth += 1,
+                        ')' => depth -= 1,
+                        _ => {}
+                    }
+                } else {
+                    break;
                 }
             }
             return (collect_range(text, start, idx), idx);
         }
     }
 
-    let next = advance_char(text, start);
+    let next = text[start..]
+        .chars()
+        .next()
+        .map_or(text.len(), |ch| start + ch.len_utf8());
     (collect_range(text, start, next), next)
 }
 
@@ -185,19 +183,20 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
     let bytes = text.as_bytes();
     let mut i = 0;
     while i < text.len() {
-        let c = char_at(text, i).expect("valid char boundary");
-        if c.is_whitespace() {
+        let ch = text[i..].chars().next().expect("valid char boundary");
+        if ch.is_whitespace() {
             let start = i;
             i = scan_while(text, i, char::is_whitespace);
             tokens.push(collect_range(text, start, i));
-        } else if c == '`' {
+            continue;
+        } else if ch == '`' {
             if has_odd_backslash_escape_bytes(bytes, i) {
                 if let Some(last) = tokens.last_mut() {
                     last.push('`');
                 } else {
                     tokens.push(String::from("`"));
                 }
-                i += c.len_utf8();
+                i += ch.len_utf8();
                 continue;
             }
 
@@ -214,7 +213,11 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
                     closing = Some(j);
                     break;
                 }
-                end = advance_char(text, end);
+                if let Some(next) = text[end..].chars().next() {
+                    end += next.len_utf8();
+                } else {
+                    break;
+                }
             }
 
             if let Some(end_idx) = closing {
@@ -224,7 +227,13 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
                 tokens.push(collect_range(text, start, fence_end));
                 i = fence_end;
             }
-        } else if c == '[' || (c == '!' && char_at(text, advance_char(text, i)) == Some('[')) {
+            continue;
+        }
+
+        let after_bang = i + ch.len_utf8();
+        let looks_like_image =
+            ch == '!' && after_bang <= text.len() && text[after_bang..].starts_with('[');
+        if ch == '[' || looks_like_image {
             let (tok, mut new_i) = parse_link_or_image(text, i);
             tokens.push(tok);
             let punct_start = new_i;
@@ -406,6 +415,60 @@ pub fn tokenize_markdown(source: &str) -> Vec<Token<'_>> {
 mod tests {
     use super::*;
 
+    #[test]
+    fn scan_while_respects_predicate_boundaries() {
+        let text = "abc123";
+        assert_eq!(scan_while(text, 0, char::is_alphabetic), 3);
+        assert_eq!(scan_while(text, 3, char::is_numeric), text.len());
+    }
+
+    #[test]
+    fn scan_while_advances_over_multibyte_characters() {
+        let text = "åßç123";
+        let idx = scan_while(text, 0, char::is_alphabetic);
+        assert_eq!(&text[..idx], "åßç");
+    }
+
+    #[test]
+    fn collect_range_extracts_multibyte_segments() {
+        let text = "αβγδε";
+        let first_two = "αβ".len();
+        let middle = first_two + "γδ".len();
+        assert_eq!(collect_range(text, 0, first_two), "αβ");
+        assert_eq!(collect_range(text, first_two, middle), "γδ");
+    }
+
+    #[test]
+    fn parse_link_or_image_handles_nested_parentheses() {
+        let text = "![alt](path(a(b)c)) more";
+        let (token, idx) = parse_link_or_image(text, 0);
+        assert_eq!(token, "![alt](path(a(b)c))");
+        assert_eq!(idx, token.len());
+    }
+
+    #[test]
+    fn parse_link_or_image_falls_back_on_malformed_input() {
+        let text = "[broken";
+        let (token, idx) = parse_link_or_image(text, 0);
+        assert_eq!(token, "[");
+        assert_eq!(idx, "[".len());
+    }
+
+    #[test]
+    fn segment_inline_handles_multibyte_tokens() {
+        let tokens = segment_inline("ßß `λ` фин");
+        assert_eq!(
+            tokens,
+            vec![
+                String::from("ßß"),
+                String::from(" "),
+                String::from("`λ`"),
+                String::from(" "),
+                String::from("фин"),
+            ]
+        );
+    }
+
     #[test]
     fn link_with_trailing_punctuation() {
         let tokens = segment_inline("see [link](url).");
diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs
index 4e41a826..21a2850c 100644
--- a/tests/wrap/tokenize_markdown.rs
+++ b/tests/wrap/tokenize_markdown.rs
@@ -77,3 +77,20 @@ fn multiple_unmatched_backticks_are_text() {
     );
 }
 
+#[test]
+fn multibyte_characters_round_trip() {
+    let source = "ßß `λ` fin";
+    let tokens = wrap::tokenize_markdown(source);
+    assert_eq!(
+        tokens,
+        vec![
+            Token::Text("ßß "),
+            Token::Code {
+                raw: "`λ`",
+                fence: "`",
+                code: "λ",
+            },
+            Token::Text(" fin"),
+        ]
+    );
+}

From c0e24651df6af899b4f0a2d9cf8c299f897649cd Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 11 Nov 2025 02:43:21 +0000
Subject: [PATCH 5/7] refactor(tokenize): extract backtick fence handling into
 a dedicated function

Refactored the backtick fence parsing logic in segment_inline by moving it
into a new helper function `handle_backtick_fence`. This improves code
readability by isolating the backtick fence scanning and matching logic.
segment_inline also swaps its `expect` for a `let ... else` that breaks.

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 src/wrap/tokenize.rs | 56 ++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index ee6c081b..9c0abb83 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -156,6 +156,28 @@ fn is_trailing_punctuation(c: char) -> bool {
     )
 }
 
+fn handle_backtick_fence(text: &str, bytes: &[u8], start_idx: usize) -> (String, usize) {
+    let start = start_idx;
+    let fence_end = scan_while(text, start_idx, |ch| ch == '`');
+    let fence_len = fence_end - start;
+    let mut end = fence_end;
+
+    while end < text.len() {
+        let candidate_end = scan_while(text, end, |ch| ch == '`');
+        if candidate_end - end == fence_len && !has_odd_backslash_escape_bytes(bytes, end) {
+            return (collect_range(text, start, candidate_end), candidate_end);
+        }
+
+        if let Some(next) = text[end..].chars().next() {
+            end += next.len_utf8();
+        } else {
+            break;
+        }
+    }
+
+    (collect_range(text, start, fence_end), fence_end)
+}
+
 /// Break a single line of text into inline token strings.
 ///
 /// Code spans, links, images and surrounding whitespace are preserved as
@@ -183,7 +205,9 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
     let bytes = text.as_bytes();
     let mut i = 0;
     while i < text.len() {
-        let ch = text[i..].chars().next().expect("valid char boundary");
+        let Some(ch) = text[i..].chars().next() else {
+            break;
+        };
         if ch.is_whitespace() {
             let start = i;
             i = scan_while(text, i, char::is_whitespace);
@@ -200,33 +224,9 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
                 continue;
             }
 
-            let start = i;
-            let fence_end = scan_while(text, i, |ch| ch == '`');
-            let fence_len = fence_end - start;
-            i = fence_end;
-
-            let mut end = i;
-            let mut closing = None;
-            while end < text.len() {
-                let j = scan_while(text, end, |ch| ch == '`');
-                if j - end == fence_len && !has_odd_backslash_escape_bytes(bytes, end) {
-                    closing = Some(j);
-                    break;
-                }
-                if let Some(next) = text[end..].chars().next() {
-                    end += next.len_utf8();
-                } else {
-                    break;
-                }
-            }
-
-            if let Some(end_idx) = closing {
-                tokens.push(collect_range(text, start, end_idx));
-                i = end_idx;
-            } else {
-                tokens.push(collect_range(text, start, fence_end));
-                i = fence_end;
-            }
+            let (token, new_i) = handle_backtick_fence(text, bytes, i);
+            tokens.push(token);
+            i = new_i;
             continue;
         }
 

From fa392b4c2ada1f7087eb9636e3e7abd3e60be1ec Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Tue, 11 Nov 2025 02:45:06 +0000
Subject: [PATCH 6/7] docs(architecture): fix typo by adding missing comma in
 architecture.md

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 docs/architecture.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index e8e609d0..c7d67ecf 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -335,7 +335,7 @@ module handles filesystem operations, delegating the text processing to
 
 ### Tokenizer flow
 
-The inline tokenizer iterates over the source string lazily so no duplicate
+The inline tokenizer iterates over the source string lazily, so no duplicate
 `Vec<char>` representation is required. The following diagram summarises the
 control flow, highlighting the helpers touched during whitespace, code span,
 and link handling.

From 5503cf3a7eadd9c994fbd4c7e597d4bdc7f54170 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Wed, 12 Nov 2025 00:50:41 +0000
Subject: [PATCH 7/7] docs(architecture): correct spelling of 'summarises' to
 'summarizes' in documentation

Co-authored-by: terragon-labs[bot] <terragon-labs[bot]@users.noreply.github.com>
---
 docs/architecture.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index c7d67ecf..8f5c8850 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -336,7 +336,7 @@ module handles filesystem operations, delegating the text processing to
 ### Tokenizer flow
 
 The inline tokenizer iterates over the source string lazily, so no duplicate
-`Vec<char>` representation is required. The following diagram summarises the
+`Vec<char>` representation is required. The following diagram summarizes the
 control flow, highlighting the helpers touched during whitespace, code span,
 and link handling.