diff --git a/docs/architecture.md b/docs/architecture.md index 61739015..5baf68bc 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -225,6 +225,10 @@ classDiagram <<module>> +convert_footnotes() } + class textproc { + <<module>> + +process_tokens() + } class process { <<module>> +process_stream() @@ -248,21 +252,23 @@ classDiagram table ..> reflow : uses parse_rows, etc. lists ..> wrap : uses is_fence breaks ..> wrap : uses is_fence - ellipsis ..> wrap : uses tokenize_markdown + ellipsis ..> textproc : uses process_tokens process ..> html : uses convert_html_tables process ..> table : uses reflow_table process ..> wrap : uses wrap_text, is_fence process ..> fences : uses compress_fences, attach_orphan_specifiers process ..> ellipsis : uses replace_ellipsis process ..> footnotes : uses convert_footnotes + footnotes ..> textproc : uses process_tokens io ..> process : uses process_stream, process_stream_no_wrap ``` The `lib` module re-exports the public API from the other modules. The -`ellipsis` module performs text normalization. The `process` module provides -streaming helpers that combine the lower-level functions, including ellipsis -replacement and footnote conversion. The `io` module handles filesystem -operations, delegating the text processing to `process`. +`ellipsis` module performs text normalization, while `footnotes` converts bare +references. The `textproc` module contains shared token-processing helpers used +by both the `ellipsis` and `footnotes` modules. The `process` module provides +streaming helpers that combine the lower-level functions. The `io` module +handles filesystem operations, delegating the text processing to `process`. The helper `html_table_to_markdown` is retained for backward compatibility but is deprecated. New code should call `convert_html_tables` instead. 
diff --git a/src/ellipsis.rs b/src/ellipsis.rs index 1b84f0b1..c9db9d82 100644 --- a/src/ellipsis.rs +++ b/src/ellipsis.rs @@ -9,41 +9,35 @@ use std::sync::LazyLock; use regex::Regex; -use crate::wrap::{Token, tokenize_markdown}; +use crate::textproc::{Token, process_tokens}; static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile"); /// Replace `...` with `…` outside code spans and fences. #[must_use] pub fn replace_ellipsis(lines: &[String]) -> Vec<String> { - if lines.is_empty() { - return Vec::new(); - } - let joined = lines.join("\n"); - let mut out = String::new(); - for token in tokenize_markdown(&joined) { - match token { - Token::Text(t) => { - let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| { - let len = caps[0].len(); - let ellipses = "…".repeat(len / 3); - let leftover = ".".repeat(len % 3); - format!("{ellipses}{leftover}") - }); - out.push_str(&replaced); - } - Token::Code(c) => { - out.push('`'); - out.push_str(c); - out.push('`'); - } - Token::Fence(f) => { - out.push_str(f); + process_tokens(lines, |token, out| match token { + Token::Text(t) => { + if !DOT_RE.is_match(t) { + out.push_str(t); + return; } - Token::Newline => out.push('\n'), + let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| { + let len = caps[0].len(); + let ellipses = "…".repeat(len / 3); + let leftover = ".".repeat(len % 3); + format!("{ellipses}{leftover}") + }); + out.push_str(&replaced); } - } - out.split('\n').map(str::to_string).collect() + Token::Code(c) => { + out.push('`'); + out.push_str(c); + out.push('`'); + } + Token::Fence(f) => out.push_str(f), + Token::Newline => out.push('\n'), + }) } #[cfg(test)] diff --git a/src/footnotes.rs b/src/footnotes.rs index 24b5491a..d4bbd590 100644 --- a/src/footnotes.rs +++ b/src/footnotes.rs @@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!( "footnote line pattern should compile", ); -use crate::wrap::{Token, tokenize_markdown}; +use crate::textproc::{Token, 
process_tokens}; /// Extract the components of an inline footnote reference. #[inline] @@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) { /// Convert bare numeric footnote references to Markdown footnote syntax. #[must_use] pub fn convert_footnotes(lines: &[String]) -> Vec<String> { - if lines.is_empty() { - return Vec::new(); - } - let joined = lines.join("\n"); - let mut out = String::new(); - for token in tokenize_markdown(&joined) { - match token { - Token::Text(t) => out.push_str(&convert_inline(t)), - Token::Code(c) => { - out.push('`'); - out.push_str(c); - out.push('`'); - } - Token::Fence(f) => out.push_str(f), - Token::Newline => out.push('\n'), + let mut lines = process_tokens(lines, |token, out| match token { + Token::Text(t) => out.push_str(&convert_inline(t)), + Token::Code(c) => { + out.push('`'); + out.push_str(c); + out.push('`'); } - } - let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect(); + Token::Fence(f) => out.push_str(f), + Token::Newline => out.push('\n'), + }); convert_block(&mut lines); lines } diff --git a/src/lib.rs b/src/lib.rs index 197edba1..3edae610 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ //! - `ellipsis` for normalizing textual ellipses. //! - `fences` for issues with code block fences //! - `footnotes` for converting bare footnote links. +//! - `textproc` for token-based transformations. //! - `process` for stream processing. //! - `io` for file helpers. @@ -29,6 +30,7 @@ pub mod lists; pub mod process; mod reflow; pub mod table; +pub mod textproc; pub mod wrap; #[deprecated(note = "this function is legacy; use `convert_html_tables` instead")] diff --git a/src/textproc.rs b/src/textproc.rs new file mode 100644 index 00000000..9ea8545f --- /dev/null +++ b/src/textproc.rs @@ -0,0 +1,227 @@ +//! Provides helpers for token-based transformations of Markdown lines. +//! +//! This module reuses the tokenizer from the [`wrap`] module and offers +//! a streaming API for rewriting Markdown. 
Each helper tokenizes lines +//! on the fly, feeds the resulting tokens to caller-provided logic, and +//! then reconstructs the lines. Trailing blank lines roundtrip +//! correctly. + +pub use crate::wrap::Token; +use crate::wrap::is_fence; + +fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F) +where + F: FnMut(Token<'a>), +{ + let mut rest = text; + while let Some(pos) = rest.find('`') { + if pos > 0 { + emit(Token::Text(&rest[..pos])); + } + let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count(); + let search = &rest[pos + delim_len..]; + let closing = "`".repeat(delim_len); + if let Some(end) = search.find(&closing) { + emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end])); + rest = &search[end + delim_len..]; + } else { + emit(Token::Text(&rest[pos..])); + rest = ""; + break; + } + } + if !rest.is_empty() { + emit(Token::Text(rest)); + } +} + +fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String) +where + F: FnMut(Token<'a>, &mut String), +{ + if is_fence(line) { + f(Token::Fence(line), out); + if !last { + f(Token::Newline, out); + } + *in_fence = !*in_fence; + return; + } + + if *in_fence { + f(Token::Fence(line), out); + if !last { + f(Token::Newline, out); + } + return; + } + + tokenize_inline(line, &mut |tok| f(tok, out)); + if !last { + f(Token::Newline, out); + } +} + +/// Apply a transformation to a sequence of [`Token`]s. +/// +/// The `lines` slice is tokenized in order, preserving fence context. +/// Each token is passed to `f` along with the output accumulator. The +/// final string is split on newline characters and returned as a +/// vector of lines. 
+/// +/// # Examples +/// +/// ```rust +/// use mdtablefix::{textproc::process_tokens, wrap::Token}; +/// +/// let lines = vec!["code".to_string()]; +/// let out = process_tokens(&lines, |tok, out| match tok { +/// Token::Text(t) => out.push_str(t), +/// Token::Code(c) => { +/// out.push('`'); +/// out.push_str(c); +/// out.push('`'); +/// } +/// Token::Fence(f) => out.push_str(f), +/// Token::Newline => out.push('\n'), +/// }); +/// assert_eq!(out, lines); +/// ``` +#[must_use] +pub fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String> +where + F: FnMut(Token<'_>, &mut String), +{ + if lines.is_empty() { + return Vec::new(); + } + + let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count(); + if trailing_blanks == lines.len() { + return vec![String::new(); lines.len()]; + } + + let mut out = String::new(); + let mut in_fence = false; + let last_idx = lines.len() - 1; + for (i, line) in lines.iter().enumerate() { + handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out); + } + + if out.is_empty() { + return Vec::new(); + } + + let mut result: Vec<String> = out.split('\n').map(str::to_string).collect(); + let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count(); + for _ in out_blanks..trailing_blanks { + result.push(String::new()); + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn identity_transformation_returns_input() { + let lines = vec!["a `b`".to_string()]; + let out = process_tokens(&lines, |tok, buf| match tok { + Token::Text(t) => buf.push_str(t), + Token::Code(c) => { + buf.push('`'); + buf.push_str(c); + buf.push('`'); + } + Token::Fence(f) => buf.push_str(f), + Token::Newline => buf.push('\n'), + }); + assert_eq!(out, lines); + } + + #[test] + fn empty_input_returns_empty_vector() { + let lines: Vec<String> = Vec::new(); + let out = process_tokens(&lines, |_tok, _out| unreachable!()); + assert!(out.is_empty()); + } + + #[test] + fn transformation_can_remove_all_content() { + let lines = 
vec!["data".to_string()]; + let out = process_tokens(&lines, |_tok, _out| {}); + assert!(out.is_empty()); + } + + #[test] + fn preserves_trailing_blank_lines() { + let lines = vec!["a".to_string(), String::new(), String::new()]; + let out = process_tokens(&lines, |tok, buf| match tok { + Token::Text(t) => buf.push_str(t), + Token::Code(c) => { + buf.push('`'); + buf.push_str(c); + buf.push('`'); + } + Token::Fence(f) => buf.push_str(f), + Token::Newline => buf.push('\n'), + }); + assert_eq!(out, lines); + } + + #[test] + fn blanks_only_are_preserved() { + let lines = vec![String::new(), String::new()]; + let out = process_tokens(&lines, |_tok, _buf| {}); + assert_eq!(out, lines); + } + + #[test] + fn token_stream_handles_fences() { + let lines = vec![ + "```rust".to_string(), + "fn main() {".to_string(), + " println!(\"hi\");".to_string(), + "```".to_string(), + ]; + let mut tokens = Vec::new(); + let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}"))); + let expected = vec![ + "Fence(\"```rust\")".to_string(), + "Newline".to_string(), + "Fence(\"fn main() {\")".to_string(), + "Newline".to_string(), + "Fence(\" println!(\\\"hi\\\");\")".to_string(), + "Newline".to_string(), + "Fence(\"```\")".to_string(), + ]; + assert_eq!(tokens, expected); + } + + #[test] + fn malformed_fence_sequence_returns_tokens() { + let lines = vec!["```".to_string(), "code".to_string()]; + let mut tokens = Vec::new(); + let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}"))); + let expected = vec![ + "Fence(\"```\")".to_string(), + "Newline".to_string(), + "Fence(\"code\")".to_string(), + ]; + assert_eq!(tokens, expected); + } + + #[test] + fn multi_backtick_spans_are_recognised() { + let lines = vec!["A ``code`` span".to_string()]; + let mut tokens = Vec::new(); + let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}"))); + let expected = vec![ + "Text(\"A \")".to_string(), + "Code(\"code\")".to_string(), + "Text(\" span\")".to_string(), + 
]; + assert_eq!(tokens, expected); + } +} diff --git a/src/wrap.rs b/src/wrap.rs index 4627f00e..ce97ecfd 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -8,7 +8,7 @@ use regex::{Captures, Regex}; mod tokenize; -pub(crate) use tokenize::{Token, tokenize_markdown}; +pub use tokenize::Token; static FENCE_RE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap()); diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index 184dae3e..6be74ffe 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -3,8 +3,6 @@ //! This module contains utilities for breaking lines into tokens so that //! inline code spans and Markdown links are preserved during wrapping. -use super::FENCE_RE; - fn scan_while<F>(chars: &[char], mut i: usize, cond: F) -> usize where F: Fn(char) -> bool, @@ -19,7 +17,7 @@ fn collect_range(chars: &[char], start: usize, end: usize) -> String { chars[start..end].iter().collect() } -/// Markdown token emitted by [`tokenize_markdown`]. +/// Markdown token emitted by [`segment_inline`]. #[derive(Debug, PartialEq)] pub enum Token<'a> { /// Line within a fenced code block, including the fence itself. @@ -129,45 +127,6 @@ pub(super) fn segment_inline(text: &str) -> Vec<Token> { /// is encountered the parser searches forward for an identical delimiter, /// allowing nested backticks when the span uses a longer fence. Unmatched /// delimiter sequences are treated as literal text. 
-pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - #[cfg(test)] mod tests { use super::*;