From f2c5cace477a621c965b009d815e53ddc711aaea Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 17:54:40 +0100
Subject: [PATCH 1/4] Add helper for token transformations

---
 src/ellipsis.rs  | 46 ++++++++++++------------------
 src/footnotes.rs | 28 +++++++------------
 src/lib.rs       |  1 +
 src/textproc.rs  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 46 deletions(-)
 create mode 100644 src/textproc.rs

diff --git a/src/ellipsis.rs b/src/ellipsis.rs
index 1b84f0b1..10925d9d 100644
--- a/src/ellipsis.rs
+++ b/src/ellipsis.rs
@@ -9,41 +9,31 @@ use std::sync::LazyLock;
 
 use regex::Regex;
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::{textproc::process_tokens, wrap::Token};
 
 static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");
 
 /// Replace `...` with `…` outside code spans and fences.
 #[must_use]
 pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => {
-                let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
-                    let len = caps[0].len();
-                    let ellipses = "…".repeat(len / 3);
-                    let leftover = ".".repeat(len % 3);
-                    format!("{ellipses}{leftover}")
-                });
-                out.push_str(&replaced);
-            }
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => {
-                out.push_str(f);
-            }
-            Token::Newline => out.push('\n'),
+    process_tokens(lines, |token, out| match token {
+        Token::Text(t) => {
+            let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
+                let len = caps[0].len();
+                let ellipses = "…".repeat(len / 3);
+                let leftover = ".".repeat(len % 3);
+                format!("{ellipses}{leftover}")
+            });
+            out.push_str(&replaced);
         }
-    }
-    out.split('\n').map(str::to_string).collect()
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
+        }
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    })
 }
 
 #[cfg(test)]
diff --git a/src/footnotes.rs b/src/footnotes.rs
index 24b5491a..d926b8ed 100644
--- a/src/footnotes.rs
+++ b/src/footnotes.rs
@@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
     "footnote line pattern should compile",
 );
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::{textproc::process_tokens, wrap::Token};
 
 /// Extract the components of an inline footnote reference.
 #[inline]
@@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
 /// Convert bare numeric footnote references to Markdown footnote syntax.
 #[must_use]
 pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => out.push_str(&convert_inline(t)),
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => out.push_str(f),
-            Token::Newline => out.push('\n'),
+    let mut lines = process_tokens(lines, |token, out| match token {
+        Token::Text(t) => out.push_str(&convert_inline(t)),
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
         }
-    }
-    let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    });
     convert_block(&mut lines);
     lines
 }
diff --git a/src/lib.rs b/src/lib.rs
index 9ae0f228..9a392498 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,7 @@ pub mod lists;
 pub mod process;
 mod reflow;
 pub mod table;
+mod textproc;
 pub mod wrap;
 
 #[doc(hidden)]
diff --git a/src/textproc.rs b/src/textproc.rs
new file mode 100644
index 00000000..0fafeae0
--- /dev/null
+++ b/src/textproc.rs
@@ -0,0 +1,73 @@
+//! Token-level transformation utilities.
+//!
+//! This module provides helpers for processing Markdown input by
+//! reusing the tokenizer from the [`wrap`] module. Each helper joins
+//! incoming lines, tokenizes them, and feeds the tokens to
+//! caller-provided logic before splitting the output back into lines.
+
+use crate::wrap::{Token, tokenize_markdown};
+
+/// Apply a transformation to a sequence of [`Token`]s.
+///
+/// The `lines` slice is joined with newlines and tokenized. Each token
+/// is passed to `f` along with the output accumulator. The final
+/// string is split on newline characters and returned as a vector of
+/// lines.
+///
+/// # Examples
+///
+/// ```
+/// use mdtablefix::{
+///     textproc::process_tokens,
+///     wrap::{Token, tokenize_markdown},
+/// };
+///
+/// let lines = vec!["code".to_string()];
+/// let out = process_tokens(&lines, |tok, out| match tok {
+///     Token::Text(t) => out.push_str(t),
+///     Token::Code(c) => {
+///         out.push('`');
+///         out.push_str(c);
+///         out.push('`');
+///     }
+///     Token::Fence(f) => out.push_str(f),
+///     Token::Newline => out.push('\n'),
+/// });
+/// assert_eq!(out, lines);
+/// ```
+#[must_use]
+pub(crate) fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
+where
+    F: FnMut(Token<'_>, &mut String),
+{
+    if lines.is_empty() {
+        return Vec::new();
+    }
+    let joined = lines.join("\n");
+    let mut out = String::new();
+    for token in tokenize_markdown(&joined) {
+        f(token, &mut out);
+    }
+    out.split('\n').map(str::to_string).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn identity_transformation_returns_input() {
+        let lines = vec!["a `b`".to_string()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
+}

From 082cd08db09df448d2774e6ecd96045c89e89c33 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 18:35:08 +0100
Subject: [PATCH 2/4] Add tests and preserve trailing blanks

---
 docs/architecture.md | 15 +++++++++++----
 src/textproc.rs      | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index dbf2524e..fbcadc05 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -225,6 +225,10 @@ classDiagram
         <<module>>
         +convert_footnotes()
     }
+    class textproc {
+        <<module>>
+        +process_tokens()
+    }
     class process {
         <<module>>
         +process_stream()
@@ -249,20 +253,23 @@ classDiagram
     table ..> reflow : uses parse_rows, etc.
     lists ..> wrap : uses is_fence
     breaks ..> wrap : uses is_fence
     ellipsis ..> wrap : uses tokenize_markdown
+    ellipsis ..> textproc : uses process_tokens
     process ..> html : uses convert_html_tables
     process ..> table : uses reflow_table
     process ..> wrap : uses wrap_text, is_fence
     process ..> fences : uses compress_fences, attach_orphan_specifiers
     process ..> ellipsis : uses replace_ellipsis
     process ..> footnotes : uses convert_footnotes
+    footnotes ..> textproc : uses process_tokens
     io ..> process : uses process_stream, process_stream_no_wrap
 ```
 
 The `lib` module re-exports the public API from the other modules. The
-`ellipsis` module performs text normalization. The `process` module provides
-streaming helpers that combine the lower-level functions, including ellipsis
-replacement and footnote conversion. The `io` module handles filesystem
-operations, delegating the text processing to `process`.
+`ellipsis` module performs text normalization, while `footnotes` converts bare
+references. The `textproc` module contains shared token-processing helpers used
+by both. The `process` module provides streaming helpers that combine the
+lower-level functions. The `io` module handles filesystem operations,
+delegating the text processing to `process`.
 
 ## Concurrency with `rayon`
diff --git a/src/textproc.rs b/src/textproc.rs
index 0fafeae0..8a1126b5 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -43,12 +43,18 @@ where
     if lines.is_empty() {
         return Vec::new();
     }
+    let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
     let joined = lines.join("\n");
     let mut out = String::new();
     for token in tokenize_markdown(&joined) {
         f(token, &mut out);
     }
-    out.split('\n').map(str::to_string).collect()
+    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
+    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
+    for _ in out_blanks..trailing_blanks {
+        result.push(String::new());
+    }
+    result
 }
 
 #[cfg(test)]
@@ -70,4 +76,27 @@ mod tests {
         });
         assert_eq!(out, lines);
     }
+
+    #[test]
+    fn empty_input_returns_empty_vector() {
+        let lines: Vec<String> = Vec::new();
+        let out = process_tokens(&lines, |_tok, _out| unreachable!());
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn preserves_trailing_blank_lines() {
+        let lines = vec!["a".to_string(), String::new(), String::new()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
 }

From 4b39408d15bf43cbcea6c63a27f58f575541bc48 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 19:32:00 +0100
Subject: [PATCH 3/4] Handle empty output in token processor

---
 src/textproc.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/textproc.rs b/src/textproc.rs
index 8a1126b5..3d8d0e57 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -49,6 +49,9 @@ where
     for token in tokenize_markdown(&joined) {
         f(token, &mut out);
     }
+    if out.is_empty() {
+        return Vec::new();
+    }
     let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
     let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
     for _ in out_blanks..trailing_blanks {
@@ -84,6 +87,13 @@ mod tests {
         assert!(out.is_empty());
     }
 
+    #[test]
+    fn transformation_can_remove_all_content() {
+        let lines = vec!["data".to_string()];
+        let out = process_tokens(&lines, |_tok, _out| {});
+        assert!(out.is_empty());
+    }
+
     #[test]
     fn preserves_trailing_blank_lines() {
         let lines = vec!["a".to_string(), String::new(), String::new()];

From 191360c2efb8688b3e6bdbda6edaf8e4e9b908f3 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Wed, 30 Jul 2025 01:04:08 +0100
Subject: [PATCH 4/4] Refactor token processing

---
 docs/architecture.md |  1 -
 src/textproc.rs      | 81 ++++++++++++++++++++++++++++++++++++--------
 src/wrap.rs          | 60 +--------------------------------
 3 files changed, 67 insertions(+), 75 deletions(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index fbcadc05..0ec0ea11 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -252,7 +252,6 @@ classDiagram
     table ..> reflow : uses parse_rows, etc.
     lists ..> wrap : uses is_fence
     breaks ..> wrap : uses is_fence
-    ellipsis ..> wrap : uses tokenize_markdown
     ellipsis ..> textproc : uses process_tokens
     process ..> html : uses convert_html_tables
     process ..> table : uses reflow_table
diff --git a/src/textproc.rs b/src/textproc.rs
index 3d8d0e57..a46fa8dc 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -1,25 +1,26 @@
-//! Token-level transformation utilities.
+//! Provides helpers for token-based transformations of Markdown lines.
 //!
-//! This module provides helpers for processing Markdown input by
-//! reusing the tokenizer from the [`wrap`] module. Each helper joins
-//! incoming lines, tokenizes them, and feeds the tokens to
-//! caller-provided logic before splitting the output back into lines.
+//! This module tokenizes Markdown lines directly, reusing the
+//! [`wrap`] module's `Token` type and fence detection, and offers a
+//! streaming API for rewriting Markdown. Each helper feeds the
+//! resulting tokens to caller-provided logic and then reconstructs
+//! the lines. Trailing blank lines roundtrip correctly.
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::wrap::{Token, is_fence};
 
 /// Apply a transformation to a sequence of [`Token`]s.
 ///
-/// The `lines` slice is joined with newlines and tokenized. Each token
-/// is passed to `f` along with the output accumulator. The final
-/// string is split on newline characters and returned as a vector of
-/// lines.
+/// The `lines` slice is tokenized in order, preserving fence context.
+/// Each token is passed to `f` along with the output accumulator. The
+/// final string is split on newline characters and returned as a
+/// vector of lines.
 ///
 /// # Examples
 ///
-/// ```
+/// ```ignore
 /// use mdtablefix::{
 ///     textproc::process_tokens,
-///     wrap::{Token, tokenize_markdown},
+///     wrap::Token,
 /// };
 ///
 /// let lines = vec!["code".to_string()];
@@ -43,15 +44,58 @@ where
     if lines.is_empty() {
         return Vec::new();
     }
+
     let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
-    let joined = lines.join("\n");
+    if trailing_blanks == lines.len() {
+        return vec![String::new(); lines.len()];
+    }
+
     let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        f(token, &mut out);
+    let mut in_fence = false;
+    let last_idx = lines.len() - 1;
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.as_str();
+        if is_fence(trimmed) {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            in_fence = !in_fence;
+            continue;
+        }
+        if in_fence {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            continue;
+        }
+        let mut rest = trimmed;
+        while let Some(pos) = rest.find('`') {
+            if pos > 0 {
+                f(Token::Text(&rest[..pos]), &mut out);
+            }
+            if let Some(end) = rest[pos + 1..].find('`') {
+                f(Token::Code(&rest[pos + 1..pos + 1 + end]), &mut out);
+                rest = &rest[pos + end + 2..];
+            } else {
+                f(Token::Text(&rest[pos..]), &mut out);
+                rest = "";
+                break;
+            }
+        }
+        if !rest.is_empty() {
+            f(Token::Text(rest), &mut out);
+        }
+        if i < last_idx {
+            f(Token::Newline, &mut out);
+        }
     }
+
     if out.is_empty() {
         return Vec::new();
     }
+
     let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
     let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
     for _ in out_blanks..trailing_blanks {
@@ -109,4 +153,11 @@ mod tests {
         });
         assert_eq!(out, lines);
     }
+
+    #[test]
+    fn blanks_only_are_preserved() {
+        let lines = vec![String::new(), String::new()];
+        let out = process_tokens(&lines, |_tok, _buf| {});
+        assert_eq!(out, lines);
+    }
 }
diff --git a/src/wrap.rs b/src/wrap.rs
index 9cf3e670..0f240ffa 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -71,6 +71,6 @@ static HANDLERS: &[PrefixHandler] = &[
 ];
 
-/// Markdown token emitted by [`tokenize_markdown`].
+/// Markdown token emitted by token-processing helpers.
 #[derive(Debug, PartialEq)]
 pub enum Token<'a> {
     /// Line within a fenced code block, including the fence itself.
@@ -170,64 +170,6 @@ fn tokenize_inline(text: &str) -> Vec<Token<'_>> {
     }
     tokens
 }
-
-/// Split the input string into [`Token`]s by analysing whitespace and
-/// backtick delimiters.
-///
-/// The tokenizer groups consecutive whitespace into a single
-/// [`Token::Text`] and recognises backtick sequences as inline code spans.
-/// When a run of backticks is encountered the parser searches forward for an
-/// identical delimiter, allowing nested backticks when the span uses a longer
-/// fence. Unmatched delimiter sequences are treated as literal text.
-///
-/// ```rust,ignore
-/// use mdtablefix::wrap::{Token, tokenize_markdown};
-///
-/// let tokens = tokenize_markdown("Example with `code`");
-/// assert_eq!(
-///     tokens,
-///     vec![Token::Text("Example with "), Token::Code("code")]
-/// );
-/// ```
-pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
-    let mut out = Vec::new();
-    let mut in_fence = false;
-    for line in input.split_inclusive('\n') {
-        let trimmed = line.trim_end_matches('\n');
-        if FENCE_RE.is_match(trimmed) {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            in_fence = !in_fence;
-            continue;
-        }
-        if in_fence {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            continue;
-        }
-        let mut rest = trimmed;
-        while let Some(pos) = rest.find('`') {
-            if pos > 0 {
-                out.push(Token::Text(&rest[..pos]));
-            }
-            if let Some(end) = rest[pos + 1..].find('`') {
-                out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
-                rest = &rest[pos + end + 2..];
-            } else {
-                out.push(Token::Text(&rest[pos..]));
-                rest = "";
-                break;
-            }
-        }
-        if !rest.is_empty() {
-            out.push(Token::Text(rest));
-        }
-        out.push(Token::Newline);
-    }
-    out.pop();
-    out
-}
-
 /// Determine if the current line should break at the last whitespace.
 ///
 /// Returns `true` if `current_width` exceeds `width` and a whitespace split
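
Note on reuse, following the series: the sketch below shows how a further line-level transform would plug into the helper once patch 4 lands. `replace_dashes` and its en-dash rule are hypothetical, invented here purely for illustration; only `process_tokens`, `Token`, and the callback shape come from the patches, mirroring `replace_ellipsis` in patch 1. Because `process_tokens` is `pub(crate)`, such a function would live inside the crate, for example in a new module alongside `ellipsis` and `footnotes`.

    use crate::{textproc::process_tokens, wrap::Token};

    /// Replace `--` with an en dash outside code spans and fences.
    /// (Hypothetical example; not part of the patch series above.)
    #[must_use]
    pub fn replace_dashes(lines: &[String]) -> Vec<String> {
        process_tokens(lines, |token, out| match token {
            // Only plain text is rewritten; the remaining arms re-emit
            // the token verbatim so code spans and fenced blocks pass
            // through untouched.
            Token::Text(t) => out.push_str(&t.replace("--", "\u{2013}")),
            Token::Code(c) => {
                out.push('`');
                out.push_str(c);
                out.push('`');
            }
            Token::Fence(f) => out.push_str(f),
            Token::Newline => out.push('\n'),
        })
    }

Each such transform repeats the `Code`, `Fence`, and `Newline` pass-through arms; if more of them accumulate, a follow-up patch could fold that boilerplate into `process_tokens` itself and let callers supply only the `Token::Text` handler.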