Closed
16 changes: 11 additions & 5 deletions docs/architecture.md
@@ -225,6 +225,10 @@ classDiagram
<<module>>
+convert_footnotes()
}
class textproc {
<<module>>
+process_tokens()
}
class process {
<<module>>
+process_stream()
@@ -248,21 +252,23 @@ classDiagram
table ..> reflow : uses parse_rows, etc.
lists ..> wrap : uses is_fence
breaks ..> wrap : uses is_fence
ellipsis ..> wrap : uses tokenize_markdown
ellipsis ..> textproc : uses process_tokens
process ..> html : uses convert_html_tables
process ..> table : uses reflow_table
process ..> wrap : uses wrap_text, is_fence
process ..> fences : uses compress_fences, attach_orphan_specifiers
process ..> ellipsis : uses replace_ellipsis
process ..> footnotes : uses convert_footnotes
footnotes ..> textproc : uses process_tokens
io ..> process : uses process_stream, process_stream_no_wrap
```

The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalization. The `process` module provides
streaming helpers that combine the lower-level functions, including ellipsis
replacement and footnote conversion. The `io` module handles filesystem
operations, delegating the text processing to `process`.
`ellipsis` module performs text normalization, while `footnotes` converts bare
references. The `textproc` module contains shared token-processing helpers used
by both. The `process` module provides streaming helpers that combine the
lower-level functions. The `io` module handles filesystem operations,
delegating the text processing to `process`.

## Concurrency with `rayon`

46 changes: 18 additions & 28 deletions src/ellipsis.rs
@@ -9,41 +9,31 @@ use std::sync::LazyLock;

use regex::Regex;

use crate::wrap::{Token, tokenize_markdown};
use crate::{textproc::process_tokens, wrap::Token};
🧹 Nitpick (assertive)

Reduce coupling to wrap by re-exporting Token via textproc.

Importing Token directly from wrap keeps ellipsis tied to an implementation detail that the new textproc helper is trying to abstract away. Re-export Token from the textproc module (e.g. pub use crate::wrap::Token;) and import it here via crate::textproc::Token instead. This makes future refactors (or a replacement tokenizer) localised to textproc.
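
A minimal sketch of that re-export, assuming a simplified two-variant Token (the real enum in the PR also has Fence and Newline variants, and the module bodies here are hypothetical stand-ins):

```rust
// Hypothetical sketch of the suggested re-export; module contents are
// simplified stand-ins for the real mdtablefix modules.
pub mod wrap {
    #[derive(Debug, PartialEq)]
    pub enum Token<'a> {
        Text(&'a str),
        Code(&'a str),
    }
}

pub mod textproc {
    // Callers import crate::textproc::Token, so a future tokenizer swap
    // only touches this module.
    pub use crate::wrap::Token;
}
```

With this in place, ellipsis.rs could import `use crate::textproc::{process_tokens, Token};` and drop its direct dependency on wrap.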



static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");

/// Replace `...` with `…` outside code spans and fences.
#[must_use]
pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => {
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => {
out.push_str(f);
}
Token::Newline => out.push('\n'),
process_tokens(lines, |token, out| match token {
Token::Text(t) => {
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
Comment on lines +19 to +27
🧹 Nitpick (assertive)

Avoid running the full replacement machinery when no match is present.

Regex::replace_all returns Cow::Borrowed when nothing matches, so the miss path does not allocate, but the call is still heavier than a simple membership check. For the common case where a Token::Text chunk has no "...", exit early:

 Token::Text(t) => {
-    let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
+    if !DOT_RE.is_match(t) {
+        out.push_str(t);
+        return;
+    }
+
+    let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
         let len = caps[0].len();
         let ellipses = "…".repeat(len / 3);
         let leftover = ".".repeat(len % 3);
         format!("{ellipses}{leftover}")
     });
     out.push_str(&replaced);
 }

The early return keeps the common no-match path explicit and cheap.
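
As an aside, the len / 3 and len % 3 arithmetic the closure performs can be sketched as a standalone, regex-free function (replace_dot_runs is a hypothetical name, not part of the crate):

```rust
// Regex-free sketch of the `...` -> `…` rule: every complete group of three
// dots becomes one ellipsis, and any remainder dots are kept literally.
fn replace_dot_runs(text: &str) -> String {
    let mut out = String::new();
    let mut dots = 0usize;
    for ch in text.chars() {
        if ch == '.' {
            dots += 1;
        } else {
            flush_dots(&mut out, &mut dots);
            out.push(ch);
        }
    }
    flush_dots(&mut out, &mut dots);
    out
}

fn flush_dots(out: &mut String, dots: &mut usize) {
    if *dots >= 3 {
        out.push_str(&"…".repeat(*dots / 3));
        out.push_str(&".".repeat(*dots % 3));
    } else {
        // Runs shorter than three dots pass through unchanged,
        // matching the \.{3,} pattern.
        out.push_str(&".".repeat(*dots));
    }
    *dots = 0;
}
```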


}
}
out.split('\n').map(str::to_string).collect()
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
Comment on lines +30 to +35
⚠️ Potential issue

Push a newline after fences to preserve line structure.

Token::Fence(f) writes the raw fence text but omits the trailing newline that was present in the original lines slice. This breaks round-tripping for inputs where the fence is on its own line ("```"). Append '\n' to align with the handling in wrap::tokenize_markdown:

-        Token::Fence(f) => out.push_str(f),
+        Token::Fence(f) => {
+            out.push_str(f);
+            out.push('\n');
+        },

Verify the existing tests still pass and add one that checks for exact line preservation around fenced blocks.
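
The round-trip failure mode can be shown in isolation (rebuild below is a hypothetical reduction of the token loop, not crate code):

```rust
// Minimal demonstration: dropping the newline separator after a fence line
// merges the fence with the following line when the output is split back up.
fn rebuild(newline_after_fence: bool) -> Vec<String> {
    let lines = ["```", "code", "```"];
    let mut out = String::new();
    for (i, line) in lines.iter().enumerate() {
        out.push_str(line); // fence or fenced-content text
        if newline_after_fence && i + 1 < lines.len() {
            out.push('\n'); // the separator the fix appends
        }
    }
    out.split('\n').map(str::to_string).collect()
}
```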


})
}

#[cfg(test)]
28 changes: 10 additions & 18 deletions src/footnotes.rs
@@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
"footnote line pattern should compile",
);

use crate::wrap::{Token, tokenize_markdown};
use crate::{textproc::process_tokens, wrap::Token};

/// Extract the components of an inline footnote reference.
#[inline]
@@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
/// Convert bare numeric footnote references to Markdown footnote syntax.
#[must_use]
pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
let mut lines = process_tokens(lines, |token, out| match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
}
let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
});
convert_block(&mut lines);
lines
}
1 change: 1 addition & 0 deletions src/lib.rs
@@ -29,6 +29,7 @@ pub mod lists;
pub mod process;
mod reflow;
pub mod table;
mod textproc;
pub mod wrap;

#[doc(hidden)]
163 changes: 163 additions & 0 deletions src/textproc.rs
@@ -0,0 +1,163 @@
//! Provides helpers for token-based transformations of Markdown lines.
//!
//! This module reuses the tokenizer from the [`wrap`] module and offers
//! a streaming API for rewriting Markdown. Each helper tokenizes lines
//! on the fly, feeds the resulting tokens to caller-provided logic, and
//! then reconstructs the lines. Trailing blank lines roundtrip
//! correctly.

use crate::wrap::{Token, is_fence};

/// Apply a transformation to a sequence of [`Token`]s.
///
/// The `lines` slice is tokenized in order, preserving fence context.
/// Each token is passed to `f` along with the output accumulator. The
/// final string is split on newline characters and returned as a
/// vector of lines.
///
/// # Examples
///
/// ```ignore
/// use mdtablefix::{
/// textproc::process_tokens,
/// wrap::Token,
/// };
///
/// let lines = vec!["code".to_string()];
/// let out = process_tokens(&lines, |tok, out| match tok {
/// Token::Text(t) => out.push_str(t),
/// Token::Code(c) => {
/// out.push('`');
/// out.push_str(c);
/// out.push('`');
/// }
/// Token::Fence(f) => out.push_str(f),
/// Token::Newline => out.push('\n'),
/// });
/// assert_eq!(out, lines);
/// ```
#[must_use]
pub(crate) fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
where
F: FnMut(Token<'_>, &mut String),
{
if lines.is_empty() {
return Vec::new();
}

let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
if trailing_blanks == lines.len() {
return vec![String::new(); lines.len()];
}

let mut out = String::new();
let mut in_fence = false;
let last_idx = lines.len() - 1;
for (i, line) in lines.iter().enumerate() {
let trimmed = line.as_str();
if is_fence(trimmed) {
f(Token::Fence(trimmed), &mut out);
if i < last_idx {
f(Token::Newline, &mut out);
}
in_fence = !in_fence;
continue;
}
if in_fence {
f(Token::Fence(trimmed), &mut out);
if i < last_idx {
f(Token::Newline, &mut out);
}
continue;
}
let mut rest = trimmed;
while let Some(pos) = rest.find('`') {
if pos > 0 {
f(Token::Text(&rest[..pos]), &mut out);
}
if let Some(end) = rest[pos + 1..].find('`') {
f(Token::Code(&rest[pos + 1..pos + 1 + end]), &mut out);
rest = &rest[pos + end + 2..];
} else {
f(Token::Text(&rest[pos..]), &mut out);
rest = "";
break;
}
}
if !rest.is_empty() {
f(Token::Text(rest), &mut out);
}
if i < last_idx {
f(Token::Newline, &mut out);
}
}

if out.is_empty() {
return Vec::new();
}

let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
for _ in out_blanks..trailing_blanks {
result.push(String::new());
}
result
}
Comment on lines +47 to +105
💡 Verification agent

🧩 Analysis chain

Token processing logic is sound but verify fence state handling.

The implementation correctly handles empty input, preserves trailing blank lines, and processes tokens appropriately. However, verify that the fence state tracking correctly handles nested or malformed fence sequences.


🏁 Script executed:

#!/bin/bash
# Description: Search for fence-related logic in tests to ensure comprehensive coverage
# Expected: Find tests that verify fence state handling edge cases

echo "Searching for fence-related test cases:"
rg -A 10 -B 2 "fence" src/textproc.rs

echo -e "\nSearching for fence validation logic in wrap module:"
rg -A 5 -B 2 "is_fence" src/wrap.rs

Length of output: 1703


Add fence state handling tests

No existing tests verify transitions of in_fence or malformed fence sequences. Add unit tests covering:

  • Opening and closing code fences (is_fence toggle)
  • Nested backticks inside a fence (should remain in_fence)
  • Malformed fences (unmatched backticks, language identifiers)

Target locations:

  • src/textproc.rs: process_tokens loop around lines 47–105
  • src/wrap.rs: is_fence regex in FENCE_RE

Write tests in tests/textproc.rs (or equivalent) to confirm correct token emission and fence state tracking.
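
A rough shape for the requested toggle test, with a deliberately simplified stand-in for is_fence (the real FENCE_RE presumably also matches ~~~ fences and info strings):

```rust
// Simplified fence detector -- a hypothetical stand-in for wrap::is_fence.
fn is_fence(line: &str) -> bool {
    line.trim_start().starts_with("```")
}

// Report, per line, whether it is emitted in fence context, toggling the
// state on fence lines the way process_tokens does.
fn fence_states(lines: &[&str]) -> Vec<bool> {
    let mut in_fence = false;
    lines
        .iter()
        .map(|line| {
            if is_fence(line) {
                in_fence = !in_fence;
                true // the fence line itself is emitted as a fence token
            } else {
                in_fence // inner lines stay fenced; inline backticks don't toggle
            }
        })
        .collect()
}
```

An unmatched opening fence leaves every subsequent line in fence context, which is worth asserting explicitly in the malformed-fence case.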



#[cfg(test)]
mod tests {
use super::*;

#[test]
fn identity_transformation_returns_input() {
let lines = vec!["a `b`".to_string()];
let out = process_tokens(&lines, |tok, buf| match tok {
Token::Text(t) => buf.push_str(t),
Token::Code(c) => {
buf.push('`');
buf.push_str(c);
buf.push('`');
}
Token::Fence(f) => buf.push_str(f),
Token::Newline => buf.push('\n'),
});
assert_eq!(out, lines);
}

#[test]
fn empty_input_returns_empty_vector() {
let lines: Vec<String> = Vec::new();
let out = process_tokens(&lines, |_tok, _out| unreachable!());
assert!(out.is_empty());
}
coderabbitai[bot] marked this conversation as resolved.

#[test]
fn transformation_can_remove_all_content() {
let lines = vec!["data".to_string()];
let out = process_tokens(&lines, |_tok, _out| {});
assert!(out.is_empty());
}
coderabbitai[bot] marked this conversation as resolved.

#[test]
fn preserves_trailing_blank_lines() {
let lines = vec!["a".to_string(), String::new(), String::new()];
let out = process_tokens(&lines, |tok, buf| match tok {
Token::Text(t) => buf.push_str(t),
Token::Code(c) => {
buf.push('`');
buf.push_str(c);
buf.push('`');
}
Token::Fence(f) => buf.push_str(f),
Token::Newline => buf.push('\n'),
});
assert_eq!(out, lines);
}

#[test]
fn blanks_only_are_preserved() {
let lines = vec![String::new(), String::new()];
let out = process_tokens(&lines, |_tok, _buf| {});
assert_eq!(out, lines);
}
}
59 changes: 1 addition & 58 deletions src/wrap.rs
@@ -71,6 +71,7 @@ static HANDLERS: &[PrefixHandler] = &[
];

/// Markdown token emitted by [`tokenize_markdown`].
/// Markdown token emitted by token-processing helpers.
#[derive(Debug, PartialEq)]
pub enum Token<'a> {
/// Line within a fenced code block, including the fence itself.
@@ -170,64 +171,6 @@ fn tokenize_inline(text: &str) -> Vec<String> {
}
tokens
}

/// Split the input string into [`Token`]s by analysing whitespace and
/// backtick delimiters.
///
/// The tokenizer groups consecutive whitespace into a single
/// [`Token::Text`] and recognises backtick sequences as inline code spans.
/// When a run of backticks is encountered the parser searches forward for an
/// identical delimiter, allowing nested backticks when the span uses a longer
/// fence. Unmatched delimiter sequences are treated as literal text.
///
/// ```rust,ignore
/// use mdtablefix::wrap::{Token, tokenize_markdown};
///
/// let tokens = tokenize_markdown("Example with `code`");
/// assert_eq!(
/// tokens,
/// vec![Token::Text("Example with "), Token::Code("code")]
/// );
/// ```
pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
let mut out = Vec::new();
let mut in_fence = false;
for line in input.split_inclusive('\n') {
let trimmed = line.trim_end_matches('\n');
if FENCE_RE.is_match(trimmed) {
out.push(Token::Fence(trimmed));
out.push(Token::Newline);
in_fence = !in_fence;
continue;
}
if in_fence {
out.push(Token::Fence(trimmed));
out.push(Token::Newline);
continue;
}
let mut rest = trimmed;
while let Some(pos) = rest.find('`') {
if pos > 0 {
out.push(Token::Text(&rest[..pos]));
}
if let Some(end) = rest[pos + 1..].find('`') {
out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
rest = &rest[pos + end + 2..];
} else {
out.push(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
out.push(Token::Text(rest));
}
out.push(Token::Newline);
}
out.pop();
out
}

/// Determine if the current line should break at the last whitespace.
///
/// Returns `true` if `current_width` exceeds `width` and a whitespace split