From 3b3f05be49506917ca113d8eacea28ee79ab8270 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 19:57:06 +0100
Subject: [PATCH 1/3] Simplify token processing

---
 docs/architecture.md |  8 +++--
 src/textproc.rs      | 73 ++++++--------------------------------------
 src/wrap.rs          |  2 +-
 src/wrap/tokenize.rs | 64 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+), 68 deletions(-)
diff --git a/docs/architecture.md b/docs/architecture.md
index 5baf68bc..d4f52a9b 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -266,9 +266,11 @@ classDiagram
 The `lib` module re-exports the public API from the other modules. The
 `ellipsis` module performs text normalization, while `footnotes` converts bare
 references. The `textproc` module contains shared token-processing helpers used
-by both the `ellipsis` and `footnotes` modules. The `process` module provides
-streaming helpers that combine the lower-level functions. The `io` module
-handles filesystem operations, delegating the text processing to `process`.
+by both the `ellipsis` and `footnotes` modules. Tokenization is handled by
+`wrap::tokenize_markdown`, replacing the small state machine that previously
+resided in `process_tokens`. The `process` module provides streaming helpers
+that combine the lower-level functions. The `io` module handles filesystem
+operations, delegating the text processing to `process`.
 
 The helper `html_table_to_markdown` is retained for backward compatibility but
 is deprecated. New code should call `convert_html_tables` instead.
diff --git a/src/textproc.rs b/src/textproc.rs
index 9ea8545f..331edcf9 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -6,61 +6,7 @@
 //! then reconstructs the lines. Trailing blank lines roundtrip
 //! correctly.
 
-pub use crate::wrap::Token;
-use crate::wrap::is_fence;
-
-fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
-where
-    F: FnMut(Token<'a>),
-{
-    let mut rest = text;
-    while let Some(pos) = rest.find('`') {
-        if pos > 0 {
-            emit(Token::Text(&rest[..pos]));
-        }
-        let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
-        let search = &rest[pos + delim_len..];
-        let closing = "`".repeat(delim_len);
-        if let Some(end) = search.find(&closing) {
-            emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
-            rest = &search[end + delim_len..];
-        } else {
-            emit(Token::Text(&rest[pos..]));
-            rest = "";
-            break;
-        }
-    }
-    if !rest.is_empty() {
-        emit(Token::Text(rest));
-    }
-}
-
-fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
-where
-    F: FnMut(Token<'a>, &mut String),
-{
-    if is_fence(line) {
-        f(Token::Fence(line), out);
-        if !last {
-            f(Token::Newline, out);
-        }
-        *in_fence = !*in_fence;
-        return;
-    }
-
-    if *in_fence {
-        f(Token::Fence(line), out);
-        if !last {
-            f(Token::Newline, out);
-        }
-        return;
-    }
-
-    tokenize_inline(line, &mut |tok| f(tok, out));
-    if !last {
-        f(Token::Newline, out);
-    }
-}
+pub use crate::wrap::{Token, tokenize_markdown};
 
 /// Apply a transformation to a sequence of [`Token`]s.
 ///
@@ -101,22 +47,21 @@ where
         return vec![String::new(); lines.len()];
     }
 
+    let source = lines.join("\n");
     let mut out = String::new();
-    let mut in_fence = false;
-    let last_idx = lines.len() - 1;
-    for (i, line) in lines.iter().enumerate() {
-        handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
+    for token in tokenize_markdown(&source) {
+        f(token, &mut out);
     }
 
     if out.is_empty() {
         return Vec::new();
     }
 
-    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
-    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
-    for _ in out_blanks..trailing_blanks {
-        result.push(String::new());
-    }
+    let mut result: Vec<String> = out.split('\n').map(ToOwned::to_owned).collect();
+    result.extend(std::iter::repeat_n(
+        String::new(),
+        trailing_blanks.saturating_sub(result.len()),
+    ));
     result
 }
 
diff --git a/src/wrap.rs b/src/wrap.rs
index ce97ecfd..6b1ef390 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -8,7 +8,7 @@
 use regex::{Captures, Regex};
 
 mod tokenize;
-pub use tokenize::Token;
+pub use tokenize::{Token, tokenize_markdown};
 
 static FENCE_RE: std::sync::LazyLock<Regex> =
     std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap());
diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs
index 6be74ffe..a01f052e 100644
--- a/src/wrap/tokenize.rs
+++ b/src/wrap/tokenize.rs
@@ -119,6 +119,70 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
     tokens
 }
 
+fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
+where
+    F: FnMut(Token<'a>),
+{
+    let mut rest = text; // unconsumed tail of `text`; shrinks as tokens are emitted
+    while let Some(pos) = rest.find('`') {
+        if pos > 0 {
+            emit(Token::Text(&rest[..pos])); // plain text before the backtick run
+        }
+        let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
+        let search = &rest[pos + delim_len..]; // everything after the opening run
+        let closing = "`".repeat(delim_len);
+        if let Some(end) = search.find(&closing) {
+            emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end])); // no delimiters
+            rest = &search[end + delim_len..];
+        } else {
+            emit(Token::Text(&rest[pos..])); // unmatched opener: rest of line is text
+            rest = "";
+            break;
+        }
+    }
+    if !rest.is_empty() {
+        emit(Token::Text(rest)); // whatever text remains once the loop is done
+    }
+}
+
+/// Tokenize Markdown line by line into [`Token`]s, joined by [`Token::Newline`].
+#[must_use]
+pub fn tokenize_markdown(source: &str) -> Vec<Token<'_>> {
+    if source.is_empty() {
+        return Vec::new();
+    }
+
+    let mut tokens = Vec::new();
+    let lines: Vec<&str> = source.split('\n').collect(); // keeps trailing empty lines
+    let last_idx = lines.len() - 1; // `split` always yields at least one item
+    let mut in_fence = false;
+
+    for (i, line) in lines.iter().enumerate() {
+        if super::is_fence(line) {
+            tokens.push(Token::Fence(line)); // delimiter line is emitted verbatim
+            if i != last_idx {
+                tokens.push(Token::Newline); // no Newline after the final line
+            }
+            in_fence = !in_fence; // every fence line toggles the state
+            continue;
+        }
+
+        if in_fence {
+            tokens.push(Token::Fence(line)); // fenced content skips inline tokenizing
+            if i != last_idx {
+                tokens.push(Token::Newline);
+            }
+            continue;
+        }
+
+        tokenize_inline(line, &mut |tok| tokens.push(tok));
+        if i != last_idx {
+            tokens.push(Token::Newline);
+        }
+    }
+    tokens
+}
+
 /// Split the input string into [`Token`]s by analysing whitespace and backtick
 /// delimiters.
 ///

From 31c0f6809fbe3ad5ff9ea388266bf25f532bee75 Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 20:27:51 +0100
Subject: [PATCH 2/3] Fix token processing and add tokenizer tests

---
 src/textproc.rs                 | 11 ++++++----
 tests/wrap/tokenize_markdown.rs | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 tests/wrap/tokenize_markdown.rs

diff --git a/src/textproc.rs b/src/textproc.rs
index 331edcf9..74008660 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -58,10 +58,13 @@ where
     }
 
     let mut result: Vec<String> = out.split('\n').map(ToOwned::to_owned).collect();
-    result.extend(std::iter::repeat_n(
-        String::new(),
-        trailing_blanks.saturating_sub(result.len()),
-    ));
+    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
+    if out_blanks < trailing_blanks {
+        result.extend(std::iter::repeat_n(
+            String::new(),
+            trailing_blanks - out_blanks,
+        ));
+    }
     result
 }
 
diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs
new file mode 100644
index 00000000..4d146f59
--- /dev/null
+++ b/tests/wrap/tokenize_markdown.rs
@@ -0,0 +1,36 @@
+//! Tests for the `tokenize_markdown` helper.
+
+use mdtablefix::wrap::{self, Token};
+
+#[test]
+fn unclosed_fence_yields_fence_tokens() {
+    let lines = vec!["```rust", "let x = 42;", "fn foo() {}"]; // opening fence, no closer
+    let joined = lines.join("\n");
+    let tokens = wrap::tokenize_markdown(&joined);
+    assert_eq!(
+        tokens,
+        vec![
+            Token::Fence("```rust"),
+            Token::Newline,
+            Token::Fence("let x = 42;"),
+            Token::Newline,
+            Token::Fence("fn foo() {}"), // last line: no trailing Newline token
+        ]
+    );
+}
+
+#[test]
+fn malformed_fence_is_text() {
+    let source = "``~~\ncode\n``~~"; // mixed backtick/tilde markers; expected to stay text
+    let tokens = wrap::tokenize_markdown(source);
+    assert_eq!(
+        tokens,
+        vec![
+            Token::Text("``~~"),
+            Token::Newline,
+            Token::Text("code"),
+            Token::Newline,
+            Token::Text("``~~"),
+        ]
+    );
+}

From d0e2b776c4e963a78b51789c38fb14cf7251cdea Mon Sep 17 00:00:00 2001
From: Leynos <leynos@troubledskies.net>
Date: Thu, 31 Jul 2025 20:40:07 +0100
Subject: [PATCH 3/3] Add tests for malformed fence lengths

---
 tests/wrap/tokenize_markdown.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs
index 4d146f59..33f67f0b 100644
--- a/tests/wrap/tokenize_markdown.rs
+++ b/tests/wrap/tokenize_markdown.rs
@@ -34,3 +34,19 @@ fn malformed_fence_is_text() {
         ]
     );
 }
+
+#[test]
+fn four_backtick_fence_yields_fence_tokens() {
+    let source = "````\ncode\n````"; // NOTE(review): `^\s*(```|~~~).*` matches 4 backticks too
+    let tokens = wrap::tokenize_markdown(source);
+    assert_eq!(
+        tokens,
+        vec![
+            Token::Fence("````"),
+            Token::Newline,
+            Token::Fence("code"), // inside the fence, so passed through as Fence
+            Token::Newline,
+            Token::Fence("````"),
+        ]
+    );
+}