8 changes: 5 additions & 3 deletions docs/architecture.md
@@ -266,9 +266,11 @@ classDiagram
The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalization, while `footnotes` converts bare
references. The `textproc` module contains shared token-processing helpers used
by both the `ellipsis` and `footnotes` modules. The `process` module provides
streaming helpers that combine the lower-level functions. The `io` module
handles filesystem operations, delegating the text processing to `process`.
by both the `ellipsis` and `footnotes` modules. Tokenization is handled by
`wrap::tokenize_markdown`, replacing the small state machine that previously
resided in `process_tokens`. The `process` module provides streaming helpers
that combine the lower-level functions. The `io` module handles filesystem
operations, delegating the text processing to `process`.

The helper `html_table_to_markdown` is retained for backward compatibility but
is deprecated. New code should call `convert_html_tables` instead.
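For illustration, a minimal sketch of the preferred call, assuming `convert_html_tables` follows the crate's line-oriented shape (a slice of lines in, rewritten lines out); the exact signature is not shown in this diff:

```rust
use mdtablefix::convert_html_tables;

fn main() {
    // Assumption: like the other helpers described above, this takes a slice
    // of lines and returns the rewritten lines. Prefer this over the
    // deprecated `html_table_to_markdown`.
    let lines = vec!["<table><tr><td>x</td></tr></table>".to_string()];
    let converted = convert_html_tables(&lines);
    println!("{converted:?}");
}
```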
72 changes: 10 additions & 62 deletions src/textproc.rs
@@ -6,61 +6,7 @@
//! then reconstructs the lines. Trailing blank lines roundtrip
//! correctly.

pub use crate::wrap::Token;
use crate::wrap::is_fence;

fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
F: FnMut(Token<'a>),
{
let mut rest = text;
while let Some(pos) = rest.find('`') {
if pos > 0 {
emit(Token::Text(&rest[..pos]));
}
let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
let search = &rest[pos + delim_len..];
let closing = "`".repeat(delim_len);
if let Some(end) = search.find(&closing) {
emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
rest = &search[end + delim_len..];
} else {
emit(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
emit(Token::Text(rest));
}
}

fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
where
F: FnMut(Token<'a>, &mut String),
{
if is_fence(line) {
f(Token::Fence(line), out);
if !last {
f(Token::Newline, out);
}
*in_fence = !*in_fence;
return;
}

if *in_fence {
f(Token::Fence(line), out);
if !last {
f(Token::Newline, out);
}
return;
}

tokenize_inline(line, &mut |tok| f(tok, out));
if !last {
f(Token::Newline, out);
}
}
pub use crate::wrap::{Token, tokenize_markdown};

Check failure on line 9 in src/textproc.rs (GitHub Actions / build-test): unresolved import `crate::wrap::tokenize_markdown`

/// Apply a transformation to a sequence of [`Token`]s.
///
@@ -101,21 +47,23 @@
return vec![String::new(); lines.len()];
}

let source = lines.join("\n");
let mut out = String::new();
let mut in_fence = false;
let last_idx = lines.len() - 1;
for (i, line) in lines.iter().enumerate() {
handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
for token in tokenize_markdown(&source) {
f(token, &mut out);
}

if out.is_empty() {
return Vec::new();
}

let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
let mut result: Vec<String> = out.split('\n').map(ToOwned::to_owned).collect();
let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
for _ in out_blanks..trailing_blanks {
result.push(String::new());
if out_blanks < trailing_blanks {
result.extend(std::iter::repeat_n(
String::new(),
trailing_blanks - out_blanks,
));
}
result
}
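A usage sketch for the rewritten helper, assuming it is the `process_tokens` mentioned in the architecture notes and that it accepts a slice of lines plus an `FnMut(Token, &mut String)` callback (names and signature are inferred from the diff above, not from a published API):

```rust
use mdtablefix::textproc::{process_tokens, Token};

fn main() {
    let lines = vec!["before `code` after".to_string()];
    // Uppercase prose while passing code spans through unchanged; single
    // backticks are reinstated by hand because `Token::Code` stores only the
    // span contents, not its delimiters.
    let shouted = process_tokens(&lines, |token, out| match token {
        Token::Text(s) => out.push_str(&s.to_uppercase()),
        Token::Code(s) => {
            out.push('`');
            out.push_str(s);
            out.push('`');
        }
        Token::Fence(s) => out.push_str(s),
        Token::Newline => out.push('\n'),
        // Defensive arm in case `Token` has variants not modelled here.
        _ => {}
    });
    assert_eq!(shouted, vec!["BEFORE `code` AFTER".to_string()]);
}
```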
64 changes: 64 additions & 0 deletions src/wrap/tokenize.rs
@@ -119,6 +119,70 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
tokens
}

fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
F: FnMut(Token<'a>),
{
let mut rest = text;
while let Some(pos) = rest.find('`') {
if pos > 0 {
emit(Token::Text(&rest[..pos]));
}
let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
let search = &rest[pos + delim_len..];
let closing = "`".repeat(delim_len);
if let Some(end) = search.find(&closing) {
emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
rest = &search[end + delim_len..];
} else {
emit(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
emit(Token::Text(rest));
}
}

/// Tokenize a block of Markdown into [`Token`]s.
#[must_use]
pub fn tokenize_markdown(source: &str) -> Vec<Token<'_>> {
if source.is_empty() {
return Vec::new();
}

let mut tokens = Vec::new();
let lines: Vec<&str> = source.split('\n').collect();
let last_idx = lines.len() - 1;
let mut in_fence = false;

for (i, line) in lines.iter().enumerate() {
if super::is_fence(line) {
tokens.push(Token::Fence(line));
if i != last_idx {
tokens.push(Token::Newline);
}
in_fence = !in_fence;
continue;
}

if in_fence {
tokens.push(Token::Fence(line));
if i != last_idx {
tokens.push(Token::Newline);
}
continue;
}

tokenize_inline(line, &mut |tok| tokens.push(tok));
if i != last_idx {
tokens.push(Token::Newline);
}
}
tokens
}

/// Split the input string into [`Token`]s by analysing whitespace and backtick
/// delimiters.
///
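For simple inputs the token stream round-trips back to the source. A sketch, assuming `Token::Code` drops its backtick delimiters (as `tokenize_inline` above shows), so only single-backtick spans are restored faithfully:

```rust
use mdtablefix::wrap::{tokenize_markdown, Token};

fn roundtrip(source: &str) -> String {
    let mut out = String::new();
    for token in tokenize_markdown(source) {
        match token {
            Token::Text(s) | Token::Fence(s) => out.push_str(s),
            // Delimiter length is not recorded on the token, so this only
            // restores single-backtick code spans faithfully.
            Token::Code(s) => {
                out.push('`');
                out.push_str(s);
                out.push('`');
            }
            Token::Newline => out.push('\n'),
            // Defensive arm in case `Token` has variants not modelled here.
            _ => {}
        }
    }
    out
}

fn main() {
    assert_eq!(roundtrip("a `b`\n```\nc\n```"), "a `b`\n```\nc\n```");
}
```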
52 changes: 52 additions & 0 deletions tests/wrap/tokenize_markdown.rs
@@ -0,0 +1,52 @@
//! Tests for the tokenize_markdown helper.

use mdtablefix::wrap::{self, Token};

#[test]
fn unclosed_fence_yields_fence_tokens() {
let lines = vec!["```rust", "let x = 42;", "fn foo() {}"];
let joined = lines.join("\n");
let tokens = wrap::tokenize_markdown(&joined);
assert_eq!(
tokens,
vec![
Token::Fence("```rust"),
Token::Newline,
Token::Fence("let x = 42;"),
Token::Newline,
Token::Fence("fn foo() {}"),
]
);
}

#[test]
fn malformed_fence_is_text() {
let source = "``~~\ncode\n``~~";
let tokens = wrap::tokenize_markdown(source);
assert_eq!(
tokens,
vec![
Token::Text("``~~"),
Token::Newline,
Token::Text("code"),
Token::Newline,
Token::Text("``~~"),
]
);
}

#[test]
fn incorrect_fence_length_is_text() {
let source = "````\ncode\n````";
let tokens = wrap::tokenize_markdown(source);
assert_eq!(
tokens,
vec![
Token::Text("````"),
Token::Newline,
Token::Text("code"),
Token::Newline,
Token::Text("````"),
]
);
}
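The committed tests exercise fence handling; a sketch of a companion case for inline code spans, derived from the `tokenize_inline` behaviour shown above and using the same imports as the file (this test is not part of the change set):

```rust
#[test]
fn inline_code_spans_tokenize() {
    let tokens = wrap::tokenize_markdown("before `code` after");
    assert_eq!(
        tokens,
        vec![
            Token::Text("before "),
            Token::Code("code"),
            Token::Text(" after"),
        ]
    );
}
```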