From f2c5cace477a621c965b009d815e53ddc711aaea Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 17:54:40 +0100
Subject: [PATCH 1/4] Add helper for token transformations

---
 src/ellipsis.rs  | 46 ++++++++++++------------------
 src/footnotes.rs | 28 +++++++------------
 src/lib.rs       |  1 +
 src/textproc.rs  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 46 deletions(-)
 create mode 100644 src/textproc.rs

diff --git a/src/ellipsis.rs b/src/ellipsis.rs
index 1b84f0b1..10925d9d 100644
--- a/src/ellipsis.rs
+++ b/src/ellipsis.rs
@@ -9,41 +9,31 @@ use std::sync::LazyLock;
 
 use regex::Regex;
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::{textproc::process_tokens, wrap::Token};
 
 static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");
 
 /// Replace `...` with `…` outside code spans and fences.
 #[must_use]
 pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => {
-                let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
-                    let len = caps[0].len();
-                    let ellipses = "…".repeat(len / 3);
-                    let leftover = ".".repeat(len % 3);
-                    format!("{ellipses}{leftover}")
-                });
-                out.push_str(&replaced);
-            }
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => {
-                out.push_str(f);
-            }
-            Token::Newline => out.push('\n'),
+    process_tokens(lines, |token, out| match token {
+        Token::Text(t) => {
+            let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
+                let len = caps[0].len();
+                let ellipses = "…".repeat(len / 3);
+                let leftover = ".".repeat(len % 3);
+                format!("{ellipses}{leftover}")
+            });
+            out.push_str(&replaced);
         }
-    }
-    out.split('\n').map(str::to_string).collect()
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
+        }
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    })
 }
 
 #[cfg(test)]
diff --git a/src/footnotes.rs b/src/footnotes.rs
index 24b5491a..d926b8ed 100644
--- a/src/footnotes.rs
+++ b/src/footnotes.rs
@@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
     "footnote line pattern should compile",
 );
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::{textproc::process_tokens, wrap::Token};
 
 /// Extract the components of an inline footnote reference.
 #[inline]
@@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
 /// Convert bare numeric footnote references to Markdown footnote syntax.
 #[must_use]
 pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => out.push_str(&convert_inline(t)),
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => out.push_str(f),
-            Token::Newline => out.push('\n'),
+    let mut lines = process_tokens(lines, |token, out| match token {
+        Token::Text(t) => out.push_str(&convert_inline(t)),
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
         }
-    }
-    let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    });
     convert_block(&mut lines);
     lines
 }
diff --git a/src/lib.rs b/src/lib.rs
index 9ae0f228..9a392498 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,7 @@ pub mod lists;
 pub mod process;
 mod reflow;
 pub mod table;
+mod textproc;
 pub mod wrap;
 
 #[doc(hidden)]
diff --git a/src/textproc.rs b/src/textproc.rs
new file mode 100644
index 00000000..0fafeae0
--- /dev/null
+++ b/src/textproc.rs
@@ -0,0 +1,73 @@
+//! Token-level transformation utilities.
+//!
+//! This module provides helpers for processing Markdown input by
+//! reusing the tokenizer from the [`wrap`] module. Each helper joins
+//! incoming lines, tokenizes them, and feeds the tokens to
+//! caller-provided logic before splitting the output back into lines.
+
+use crate::wrap::{Token, tokenize_markdown};
+
+/// Apply a transformation to a sequence of [`Token`]s.
+///
+/// The `lines` slice is joined with newlines and tokenized. Each token
+/// is passed to `f` along with the output accumulator. The final
+/// string is split on newline characters and returned as a vector of
+/// lines.
+///
+/// # Examples
+///
+/// ```
+/// use mdtablefix::{
+///     textproc::process_tokens,
+///     wrap::{Token, tokenize_markdown},
+/// };
+///
+/// let lines = vec!["code".to_string()];
+/// let out = process_tokens(&lines, |tok, out| match tok {
+///     Token::Text(t) => out.push_str(t),
+///     Token::Code(c) => {
+///         out.push('`');
+///         out.push_str(c);
+///         out.push('`');
+///     }
+///     Token::Fence(f) => out.push_str(f),
+///     Token::Newline => out.push('\n'),
+/// });
+/// assert_eq!(out, lines);
+/// ```
+#[must_use]
+pub(crate) fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
+where
+    F: FnMut(Token<'_>, &mut String),
+{
+    if lines.is_empty() {
+        return Vec::new();
+    }
+    let joined = lines.join("\n");
+    let mut out = String::new();
+    for token in tokenize_markdown(&joined) {
+        f(token, &mut out);
+    }
+    out.split('\n').map(str::to_string).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn identity_transformation_returns_input() {
+        let lines = vec!["a `b`".to_string()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
+}

From 082cd08db09df448d2774e6ecd96045c89e89c33 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 18:35:08 +0100
Subject: [PATCH 2/4] Add tests and preserve trailing blanks

---
 docs/architecture.md | 15 +++++++++++----
 src/textproc.rs      | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index dbf2524e..fbcadc05 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -225,6 +225,10 @@ classDiagram
         <<module>>
         +convert_footnotes()
     }
+    class textproc {
+        <<module>>
+        +process_tokens()
+    }
     class process {
         <<module>>
         +process_stream()
@@ -249,20 +253,23 @@ classDiagram
     table ..> reflow : uses parse_rows, etc.
     lists ..> wrap : uses is_fence
     breaks ..> wrap : uses is_fence
     ellipsis ..> wrap : uses tokenize_markdown
+    ellipsis ..> textproc : uses process_tokens
     process ..> html : uses convert_html_tables
     process ..> table : uses reflow_table
     process ..> wrap : uses wrap_text, is_fence
     process ..> fences : uses compress_fences, attach_orphan_specifiers
     process ..> ellipsis : uses replace_ellipsis
     process ..> footnotes : uses convert_footnotes
+    footnotes ..> textproc : uses process_tokens
     io ..> process : uses process_stream, process_stream_no_wrap
 ```
 
 The `lib` module re-exports the public API from the other modules. The
-`ellipsis` module performs text normalization. The `process` module provides
-streaming helpers that combine the lower-level functions, including ellipsis
-replacement and footnote conversion. The `io` module handles filesystem
-operations, delegating the text processing to `process`.
+`ellipsis` module performs text normalization, while `footnotes` converts bare
+references. The `textproc` module contains shared token-processing helpers used
+by both. The `process` module provides streaming helpers that combine the
+lower-level functions. The `io` module handles filesystem operations,
+delegating the text processing to `process`.
 
 ## Concurrency with `rayon`
diff --git a/src/textproc.rs b/src/textproc.rs
index 0fafeae0..8a1126b5 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -43,12 +43,18 @@ where
     if lines.is_empty() {
         return Vec::new();
     }
+    let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
     let joined = lines.join("\n");
     let mut out = String::new();
     for token in tokenize_markdown(&joined) {
         f(token, &mut out);
     }
-    out.split('\n').map(str::to_string).collect()
+    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
+    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
+    for _ in out_blanks..trailing_blanks {
+        result.push(String::new());
+    }
+    result
 }
 
 #[cfg(test)]
@@ -70,4 +76,27 @@ mod tests {
         });
         assert_eq!(out, lines);
     }
+
+    #[test]
+    fn empty_input_returns_empty_vector() {
+        let lines: Vec<String> = Vec::new();
+        let out = process_tokens(&lines, |_tok, _out| unreachable!());
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn preserves_trailing_blank_lines() {
+        let lines = vec!["a".to_string(), String::new(), String::new()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
 }

From 4b39408d15bf43cbcea6c63a27f58f575541bc48 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Tue, 29 Jul 2025 19:32:00 +0100
Subject: [PATCH 3/4] Handle empty output in token processor

---
 src/textproc.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/textproc.rs b/src/textproc.rs
index 8a1126b5..3d8d0e57 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -49,6 +49,9 @@ where
     for token in tokenize_markdown(&joined) {
         f(token, &mut out);
     }
+    if out.is_empty() {
+        return Vec::new();
+    }
     let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
     let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
     for _ in out_blanks..trailing_blanks {
@@ -84,6 +87,13 @@ mod tests {
         assert!(out.is_empty());
     }
 
+    #[test]
+    fn transformation_can_remove_all_content() {
+        let lines = vec!["data".to_string()];
+        let out = process_tokens(&lines, |_tok, _out| {});
+        assert!(out.is_empty());
+    }
+
     #[test]
     fn preserves_trailing_blank_lines() {
         let lines = vec!["a".to_string(), String::new(), String::new()];

From 191360c2efb8688b3e6bdbda6edaf8e4e9b908f3 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Wed, 30 Jul 2025 01:04:08 +0100
Subject: [PATCH 4/4] Refactor token processing

---
 docs/architecture.md |  1 -
 src/textproc.rs      | 81 ++++++++++++++++++++++++++++++++++++--------
 src/wrap.rs          | 60 +--------------------------------
 3 files changed, 67 insertions(+), 75 deletions(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index fbcadc05..0ec0ea11 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -252,7 +252,6 @@ classDiagram
     table ..> reflow : uses parse_rows, etc.
     lists ..> wrap : uses is_fence
     breaks ..> wrap : uses is_fence
-    ellipsis ..> wrap : uses tokenize_markdown
     ellipsis ..> textproc : uses process_tokens
     process ..> html : uses convert_html_tables
     process ..> table : uses reflow_table
diff --git a/src/textproc.rs b/src/textproc.rs
index 3d8d0e57..a46fa8dc 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -1,25 +1,26 @@
-//! Token-level transformation utilities.
+//! Provides helpers for token-based transformations of Markdown lines.
 //!
-//! This module provides helpers for processing Markdown input by
-//! reusing the tokenizer from the [`wrap`] module. Each helper joins
-//! incoming lines, tokenizes them, and feeds the tokens to
-//! caller-provided logic before splitting the output back into lines.
+//! This module tokenizes Markdown lines directly, reusing the
+//! [`wrap`] module's `Token` type and fence detection, and offers a
+//! streaming API for rewriting Markdown. Each helper feeds the
+//! resulting tokens to caller-provided logic and then reconstructs
+//! the lines. Trailing blank lines roundtrip correctly.
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::wrap::{Token, is_fence};
 
 /// Apply a transformation to a sequence of [`Token`]s.
 ///
-/// The `lines` slice is joined with newlines and tokenized. Each token
-/// is passed to `f` along with the output accumulator. The final
-/// string is split on newline characters and returned as a vector of
-/// lines.
+/// The `lines` slice is tokenized in order, preserving fence context.
+/// Each token is passed to `f` along with the output accumulator. The
+/// final string is split on newline characters and returned as a
+/// vector of lines.
 ///
 /// # Examples
 ///
-/// ```
+/// ```ignore
 /// use mdtablefix::{
 ///     textproc::process_tokens,
-///     wrap::{Token, tokenize_markdown},
+///     wrap::Token,
 /// };
 ///
 /// let lines = vec!["code".to_string()];
@@ -43,15 +44,58 @@ where
     if lines.is_empty() {
         return Vec::new();
     }
+
     let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
-    let joined = lines.join("\n");
+    if trailing_blanks == lines.len() {
+        return vec![String::new(); lines.len()];
+    }
+
     let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        f(token, &mut out);
+    let mut in_fence = false;
+    let last_idx = lines.len() - 1;
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.as_str();
+        if is_fence(trimmed) {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            in_fence = !in_fence;
+            continue;
+        }
+        if in_fence {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            continue;
+        }
+        let mut rest = trimmed;
+        while let Some(pos) = rest.find('`') {
+            if pos > 0 {
+                f(Token::Text(&rest[..pos]), &mut out);
+            }
+            if let Some(end) = rest[pos + 1..].find('`') {
+                f(Token::Code(&rest[pos + 1..pos + 1 + end]), &mut out);
+                rest = &rest[pos + end + 2..];
+            } else {
+                f(Token::Text(&rest[pos..]), &mut out);
+                rest = "";
+                break;
+            }
+        }
+        if !rest.is_empty() {
+            f(Token::Text(rest), &mut out);
+        }
+        if i < last_idx {
+            f(Token::Newline, &mut out);
+        }
     }
+
     if out.is_empty() {
         return Vec::new();
     }
+
     let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
     let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
     for _ in out_blanks..trailing_blanks {
@@ -109,4 +153,11 @@ mod tests {
         });
         assert_eq!(out, lines);
     }
+
+    #[test]
+    fn blanks_only_are_preserved() {
+        let lines = vec![String::new(), String::new()];
+        let out = process_tokens(&lines, |_tok, _buf| {});
+        assert_eq!(out, lines);
+    }
 }
diff --git a/src/wrap.rs b/src/wrap.rs
index 9cf3e670..0f240ffa 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -71,6 +71,6 @@ static HANDLERS: &[PrefixHandler] = &[
 ];
 
-/// Markdown token emitted by [`tokenize_markdown`].
+/// Markdown token emitted by token-processing helpers.
 #[derive(Debug, PartialEq)]
 pub enum Token<'a> {
     /// Line within a fenced code block, including the fence itself.
@@ -170,64 +170,6 @@ fn tokenize_inline(text: &str) -> Vec<Token<'_>> {
     }
     tokens
 }
-
-/// Split the input string into [`Token`]s by analysing whitespace and
-/// backtick delimiters.
-///
-/// The tokenizer groups consecutive whitespace into a single
-/// [`Token::Text`] and recognises backtick sequences as inline code spans.
-/// When a run of backticks is encountered the parser searches forward for an
-/// identical delimiter, allowing nested backticks when the span uses a longer
-/// fence. Unmatched delimiter sequences are treated as literal text.
-///
-/// ```rust,ignore
-/// use mdtablefix::wrap::{Token, tokenize_markdown};
-///
-/// let tokens = tokenize_markdown("Example with `code`");
-/// assert_eq!(
-///     tokens,
-///     vec![Token::Text("Example with "), Token::Code("code")]
-/// );
-/// ```
-pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
-    let mut out = Vec::new();
-    let mut in_fence = false;
-    for line in input.split_inclusive('\n') {
-        let trimmed = line.trim_end_matches('\n');
-        if FENCE_RE.is_match(trimmed) {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            in_fence = !in_fence;
-            continue;
-        }
-        if in_fence {
-            out.push(Token::Fence(trimmed));
-            out.push(Token::Newline);
-            continue;
-        }
-        let mut rest = trimmed;
-        while let Some(pos) = rest.find('`') {
-            if pos > 0 {
-                out.push(Token::Text(&rest[..pos]));
-            }
-            if let Some(end) = rest[pos + 1..].find('`') {
-                out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
-                rest = &rest[pos + end + 2..];
-            } else {
-                out.push(Token::Text(&rest[pos..]));
-                rest = "";
-                break;
-            }
-        }
-        if !rest.is_empty() {
-            out.push(Token::Text(rest));
-        }
-        out.push(Token::Newline);
-    }
-    out.pop();
-    out
-}
-
 /// Determine if the current line should break at the last whitespace.
 ///
 /// Returns `true` if `current_width` exceeds `width` and a whitespace split
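
Note on reuse, following the series: the sketch below shows how a further line-level transform would plug into the helper once patch 4 lands. `replace_dashes` and its en-dash rule are hypothetical, invented here purely for illustration; only `process_tokens`, `Token`, and the callback shape come from the patches, mirroring `replace_ellipsis` in patch 1. Because `process_tokens` is `pub(crate)`, such a function would live inside the crate, for example in a new module alongside `ellipsis` and `footnotes`.

    use crate::{textproc::process_tokens, wrap::Token};

    /// Replace `--` with an en dash outside code spans and fences.
    /// (Hypothetical example; not part of the patch series above.)
    #[must_use]
    pub fn replace_dashes(lines: &[String]) -> Vec<String> {
        process_tokens(lines, |token, out| match token {
            // Only plain text is rewritten; the remaining arms re-emit
            // the token verbatim so code spans and fenced blocks pass
            // through untouched.
            Token::Text(t) => out.push_str(&t.replace("--", "\u{2013}")),
            Token::Code(c) => {
                out.push('`');
                out.push_str(c);
                out.push('`');
            }
            Token::Fence(f) => out.push_str(f),
            Token::Newline => out.push('\n'),
        })
    }

Each such transform repeats the `Code`, `Fence`, and `Newline` pass-through arms; if more of them accumulate, a follow-up patch could fold that boilerplate into `process_tokens` itself and let callers supply only the `Token::Text` handler.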