Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,10 @@ classDiagram
<<module>>
+convert_footnotes()
}
class textproc {
<<module>>
+process_tokens()
}
class process {
<<module>>
+process_stream()
Expand All @@ -248,21 +252,23 @@ classDiagram
table ..> reflow : uses parse_rows, etc.
lists ..> wrap : uses is_fence
breaks ..> wrap : uses is_fence
ellipsis ..> wrap : uses tokenize_markdown
ellipsis ..> textproc : uses process_tokens
process ..> html : uses convert_html_tables
process ..> table : uses reflow_table
process ..> wrap : uses wrap_text, is_fence
process ..> fences : uses compress_fences, attach_orphan_specifiers
process ..> ellipsis : uses replace_ellipsis
process ..> footnotes : uses convert_footnotes
footnotes ..> textproc : uses process_tokens
io ..> process : uses process_stream, process_stream_no_wrap
```

The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalization. The `process` module provides
streaming helpers that combine the lower-level functions, including ellipsis
replacement and footnote conversion. The `io` module handles filesystem
operations, delegating the text processing to `process`.
`ellipsis` module performs text normalization, while `footnotes` converts bare
references. The `textproc` module contains shared token-processing helpers used
by both the `ellipsis` and `footnotes` modules. The `process` module provides
streaming helpers that combine the lower-level functions. The `io` module
handles filesystem operations, delegating the text processing to `process`.

The helper `html_table_to_markdown` is retained for backward compatibility but
is deprecated. New code should call `convert_html_tables` instead.
Expand Down
48 changes: 21 additions & 27 deletions src/ellipsis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,41 +9,35 @@ use std::sync::LazyLock;

use regex::Regex;

use crate::wrap::{Token, tokenize_markdown};
use crate::textproc::{Token, process_tokens};

static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");

/// Replace `...` with `…` outside code spans and fences.
#[must_use]
pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => {
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => {
out.push_str(f);
process_tokens(lines, |token, out| match token {
Token::Text(t) => {
if !DOT_RE.is_match(t) {
out.push_str(t);
return;
}
Token::Newline => out.push('\n'),
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
}
out.split('\n').map(str::to_string).collect()
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
})
}

#[cfg(test)]
Expand Down
28 changes: 10 additions & 18 deletions src/footnotes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
"footnote line pattern should compile",
);

use crate::wrap::{Token, tokenize_markdown};
use crate::textproc::{Token, process_tokens};

/// Extract the components of an inline footnote reference.
#[inline]
Expand Down Expand Up @@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
/// Convert bare numeric footnote references to Markdown footnote syntax.
#[must_use]
pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
let mut lines = process_tokens(lines, |token, out| match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
}
let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
});
convert_block(&mut lines);
lines
}
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//! - `ellipsis` for normalizing textual ellipses.
//! - `fences` for issues with code block fences
//! - `footnotes` for converting bare footnote links.
//! - `textproc` for token-based transformations.
//! - `process` for stream processing.
//! - `io` for file helpers.

Expand All @@ -29,6 +30,7 @@ pub mod lists;
pub mod process;
mod reflow;
pub mod table;
pub mod textproc;
pub mod wrap;

#[deprecated(note = "this function is legacy; use `convert_html_tables` instead")]
Expand Down
227 changes: 227 additions & 0 deletions src/textproc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
//! Provides helpers for token-based transformations of Markdown lines.
//!
//! This module reuses the tokenizer from the [`wrap`] module and offers
//! a streaming API for rewriting Markdown. Each helper tokenizes lines
//! on the fly, feeds the resulting tokens to caller-provided logic, and
//! then reconstructs the lines. Trailing blank lines roundtrip
//! correctly.

pub use crate::wrap::Token;
use crate::wrap::is_fence;

/// Split a single line into `Text` and `Code` tokens.
///
/// Backtick runs open a code span that must be closed by a run of the
/// same length; an unmatched opener is emitted verbatim as plain text.
fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
    F: FnMut(Token<'a>),
{
    let mut remaining = text;
    loop {
        let Some(start) = remaining.find('`') else { break };
        if start > 0 {
            emit(Token::Text(&remaining[..start]));
        }
        // The length of the opening backtick run fixes the closing run.
        let run = remaining[start..]
            .bytes()
            .take_while(|&b| b == b'`')
            .count();
        let after_open = &remaining[start + run..];
        match after_open.find(&"`".repeat(run)) {
            Some(close) => {
                emit(Token::Code(&after_open[..close]));
                remaining = &after_open[close + run..];
            }
            None => {
                // No closing delimiter: the remainder is plain text.
                emit(Token::Text(&remaining[start..]));
                return;
            }
        }
    }
    if !remaining.is_empty() {
        emit(Token::Text(remaining));
    }
}

/// Feed the tokens for one line to `f`, tracking fence state.
///
/// Fence delimiters and lines inside a fence are passed through as
/// `Token::Fence`; ordinary lines are split into inline tokens. Every
/// line except the final one is followed by a `Token::Newline`.
fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
where
    F: FnMut(Token<'a>, &mut String),
{
    let fence_delim = is_fence(line);
    if fence_delim || *in_fence {
        f(Token::Fence(line), out);
    } else {
        tokenize_inline(line, &mut |tok| f(tok, out));
    }
    if fence_delim {
        // An opening or closing delimiter flips the fence state.
        *in_fence = !*in_fence;
    }
    if !last {
        f(Token::Newline, out);
    }
}

/// Apply a transformation to a sequence of [`Token`]s.
///
/// The `lines` slice is tokenized in order, preserving fence context.
/// Each token is passed to `f` along with the output accumulator. The
/// final string is split on newline characters and returned as a
/// vector of lines.
///
/// # Examples
///
/// ```rust
/// use mdtablefix::{textproc::process_tokens, wrap::Token};
///
/// let lines = vec!["code".to_string()];
/// let out = process_tokens(&lines, |tok, out| match tok {
///     Token::Text(t) => out.push_str(t),
///     Token::Code(c) => {
///         out.push('`');
///         out.push_str(c);
///         out.push('`');
///     }
///     Token::Fence(f) => out.push_str(f),
///     Token::Newline => out.push('\n'),
/// });
/// assert_eq!(out, lines);
/// ```
#[must_use]
pub fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
where
    F: FnMut(Token<'_>, &mut String),
{
    if lines.is_empty() {
        return Vec::new();
    }

    // Splitting the rebuilt string on '\n' cannot distinguish "ends with a
    // newline" from "ends with blank lines", so the trailing-blank count is
    // recorded up front and restored after the split.
    let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
    if trailing_blanks == lines.len() {
        // All-blank input round-trips unchanged.
        // NOTE(review): `f` is bypassed entirely on this path, so a
        // transformation that rewrites `Newline` tokens never sees
        // blank-only input — confirm this is intended.
        return vec![String::new(); lines.len()];
    }

    let mut out = String::new();
    let mut in_fence = false;
    let last_idx = lines.len() - 1;
    for (i, line) in lines.iter().enumerate() {
        handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
    }

    if out.is_empty() {
        // `f` dropped every token, including newlines; nothing to emit.
        // NOTE(review): trailing blanks are lost on this path as well.
        return Vec::new();
    }

    // Re-append any trailing blank lines that the newline split swallowed.
    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
    for _ in out_blanks..trailing_blanks {
        result.push(String::new());
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transformation: re-emits every token verbatim.
    fn passthrough(tok: Token<'_>, buf: &mut String) {
        match tok {
            Token::Text(t) => buf.push_str(t),
            Token::Code(c) => {
                buf.push('`');
                buf.push_str(c);
                buf.push('`');
            }
            Token::Fence(f) => buf.push_str(f),
            Token::Newline => buf.push('\n'),
        }
    }

    /// Collects the `Debug` rendering of every token produced for `lines`.
    fn debug_tokens(lines: &[String]) -> Vec<String> {
        let mut seen = Vec::new();
        let _ = process_tokens(lines, |tok, _| seen.push(format!("{tok:?}")));
        seen
    }

    #[test]
    fn identity_transformation_returns_input() {
        let input = vec!["a `b`".to_string()];
        assert_eq!(process_tokens(&input, passthrough), input);
    }

    #[test]
    fn empty_input_returns_empty_vector() {
        let input: Vec<String> = Vec::new();
        assert!(process_tokens(&input, |_tok, _out| unreachable!()).is_empty());
    }

    #[test]
    fn transformation_can_remove_all_content() {
        let input = vec!["data".to_string()];
        assert!(process_tokens(&input, |_tok, _out| {}).is_empty());
    }

    #[test]
    fn preserves_trailing_blank_lines() {
        let input = vec!["a".to_string(), String::new(), String::new()];
        assert_eq!(process_tokens(&input, passthrough), input);
    }

    #[test]
    fn blanks_only_are_preserved() {
        let input = vec![String::new(), String::new()];
        assert_eq!(process_tokens(&input, |_tok, _buf| {}), input);
    }

    #[test]
    fn token_stream_handles_fences() {
        let input = vec![
            "```rust".to_string(),
            "fn main() {".to_string(),
            "    println!(\"hi\");".to_string(),
            "```".to_string(),
        ];
        let expected = vec![
            "Fence(\"```rust\")".to_string(),
            "Newline".to_string(),
            "Fence(\"fn main() {\")".to_string(),
            "Newline".to_string(),
            "Fence(\"    println!(\\\"hi\\\");\")".to_string(),
            "Newline".to_string(),
            "Fence(\"```\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }

    #[test]
    fn malformed_fence_sequence_returns_tokens() {
        let input = vec!["```".to_string(), "code".to_string()];
        let expected = vec![
            "Fence(\"```\")".to_string(),
            "Newline".to_string(),
            "Fence(\"code\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }

    #[test]
    fn multi_backtick_spans_are_recognised() {
        let input = vec!["A ``code`` span".to_string()];
        let expected = vec![
            "Text(\"A \")".to_string(),
            "Code(\"code\")".to_string(),
            "Text(\" span\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }
}
2 changes: 1 addition & 1 deletion src/wrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
use regex::{Captures, Regex};

mod tokenize;
pub(crate) use tokenize::{Token, tokenize_markdown};
pub use tokenize::Token;

static FENCE_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap());
Expand Down
Loading
Loading