From 3b3f05be49506917ca113d8eacea28ee79ab8270 Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 19:57:06 +0100 Subject: [PATCH 1/3] Simplify token processing --- docs/architecture.md | 8 +++-- src/textproc.rs | 73 ++++++-------------------------------------- src/wrap.rs | 2 +- src/wrap/tokenize.rs | 64 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 68 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 5baf68bc..d4f52a9b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -266,9 +266,11 @@ classDiagram The `lib` module re-exports the public API from the other modules. The `ellipsis` module performs text normalization, while `footnotes` converts bare references. The `textproc` module contains shared token-processing helpers used -by both the `ellipsis` and `footnotes` modules. The `process` module provides -streaming helpers that combine the lower-level functions. The `io` module -handles filesystem operations, delegating the text processing to `process`. +by both the `ellipsis` and `footnotes` modules. Tokenization is handled by +`wrap::tokenize_markdown`, replacing the small state machine that previously +resided in `process_tokens`. The `process` module provides streaming helpers +that combine the lower-level functions. The `io` module handles filesystem +operations, delegating the text processing to `process`. The helper `html_table_to_markdown` is retained for backward compatibility but is deprecated. New code should call `convert_html_tables` instead. diff --git a/src/textproc.rs b/src/textproc.rs index 9ea8545f..331edcf9 100644 --- a/src/textproc.rs +++ b/src/textproc.rs @@ -6,61 +6,7 @@ //! then reconstructs the lines. Trailing blank lines roundtrip //! correctly. -pub use crate::wrap::Token; -use crate::wrap::is_fence; - -fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F) -where - F: FnMut(Token<'a>), -{ - let mut rest = text; - while let Some(pos) = rest.find('`') { - if pos > 0 { - emit(Token::Text(&rest[..pos])); - } - let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count(); - let search = &rest[pos + delim_len..]; - let closing = "`".repeat(delim_len); - if let Some(end) = search.find(&closing) { - emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end])); - rest = &search[end + delim_len..]; - } else { - emit(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - emit(Token::Text(rest)); - } -} - -fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String) -where - F: FnMut(Token<'a>, &mut String), -{ - if is_fence(line) { - f(Token::Fence(line), out); - if !last { - f(Token::Newline, out); - } - *in_fence = !*in_fence; - return; - } - - if *in_fence { - f(Token::Fence(line), out); - if !last { - f(Token::Newline, out); - } - return; - } - - tokenize_inline(line, &mut |tok| f(tok, out)); - if !last { - f(Token::Newline, out); - } -} +pub use crate::wrap::{Token, tokenize_markdown}; /// Apply a transformation to a sequence of [`Token`]s. /// @@ -101,22 +47,21 @@ where return vec![String::new(); lines.len()]; } + let source = lines.join("\n"); let mut out = String::new(); - let mut in_fence = false; - let last_idx = lines.len() - 1; - for (i, line) in lines.iter().enumerate() { - handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out); + for token in tokenize_markdown(&source) { + f(token, &mut out); } if out.is_empty() { return Vec::new(); } - let mut result: Vec = out.split('\n').map(str::to_string).collect(); - let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count(); - for _ in out_blanks..trailing_blanks { - result.push(String::new()); - } + let mut result: Vec = out.split('\n').map(ToOwned::to_owned).collect(); + result.extend(std::iter::repeat_n( + String::new(), + trailing_blanks.saturating_sub(result.len()), + )); result } diff --git a/src/wrap.rs b/src/wrap.rs index ce97ecfd..6b1ef390 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -8,7 +8,7 @@ use regex::{Captures, Regex}; mod tokenize; -pub use tokenize::Token; +pub use tokenize::{Token, tokenize_markdown}; static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap()); diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index 6be74ffe..a01f052e 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -119,6 +119,70 @@ pub(super) fn segment_inline(text: &str) -> Vec { tokens } +fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F) +where + F: FnMut(Token<'a>), +{ + let mut rest = text; + while let Some(pos) = rest.find('`') { + if pos > 0 { + emit(Token::Text(&rest[..pos])); + } + let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count(); + let search = &rest[pos + delim_len..]; + let closing = "`".repeat(delim_len); + if let Some(end) = search.find(&closing) { + emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end])); + rest = &search[end + delim_len..]; + } else { + emit(Token::Text(&rest[pos..])); + rest = ""; + break; + } + } + if !rest.is_empty() { + emit(Token::Text(rest)); + } +} + +/// Tokenize a block of Markdown into [`Token`]s. +#[must_use] +pub fn tokenize_markdown(source: &str) -> Vec> { + if source.is_empty() { + return Vec::new(); + } + + let mut tokens = Vec::new(); + let lines: Vec<&str> = source.split('\n').collect(); + let last_idx = lines.len() - 1; + let mut in_fence = false; + + for (i, line) in lines.iter().enumerate() { + if super::is_fence(line) { + tokens.push(Token::Fence(line)); + if i != last_idx { + tokens.push(Token::Newline); + } + in_fence = !in_fence; + continue; + } + + if in_fence { + tokens.push(Token::Fence(line)); + if i != last_idx { + tokens.push(Token::Newline); + } + continue; + } + + tokenize_inline(line, &mut |tok| tokens.push(tok)); + if i != last_idx { + tokens.push(Token::Newline); + } + } + tokens +} + /// Split the input string into [`Token`]s by analysing whitespace and backtick /// delimiters. /// From 31c0f6809fbe3ad5ff9ea388266bf25f532bee75 Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 20:27:51 +0100 Subject: [PATCH 2/3] Fix token processing and add tokenizer tests --- src/textproc.rs | 11 ++++++---- tests/wrap/tokenize_markdown.rs | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 tests/wrap/tokenize_markdown.rs diff --git a/src/textproc.rs b/src/textproc.rs index 331edcf9..74008660 100644 --- a/src/textproc.rs +++ b/src/textproc.rs @@ -58,10 +58,13 @@ where } let mut result: Vec = out.split('\n').map(ToOwned::to_owned).collect(); - result.extend(std::iter::repeat_n( - String::new(), - trailing_blanks.saturating_sub(result.len()), - )); + let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count(); + if out_blanks < trailing_blanks { + result.extend(std::iter::repeat_n( + String::new(), + trailing_blanks - out_blanks, + )); + } result } diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs new file mode 100644 index 00000000..4d146f59 --- /dev/null +++ b/tests/wrap/tokenize_markdown.rs @@ -0,0 +1,36 @@ +//! Tests for the tokenize_markdown helper. + +use mdtablefix::wrap::{self, Token}; + +#[test] +fn unclosed_fence_yields_fence_tokens() { + let lines = vec!["```rust", "let x = 42;", "fn foo() {}"]; + let joined = lines.join("\n"); + let tokens = wrap::tokenize_markdown(&joined); + assert_eq!( + tokens, + vec![ + Token::Fence("```rust"), + Token::Newline, + Token::Fence("let x = 42;"), + Token::Newline, + Token::Fence("fn foo() {}"), + ] + ); +} + +#[test] +fn malformed_fence_is_text() { + let source = "``~~\ncode\n``~~"; + let tokens = wrap::tokenize_markdown(source); + assert_eq!( + tokens, + vec![ + Token::Text("``~~"), + Token::Newline, + Token::Text("code"), + Token::Newline, + Token::Text("``~~"), + ] + ); +} From d0e2b776c4e963a78b51789c38fb14cf7251cdea Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 31 Jul 2025 20:40:07 +0100 Subject: [PATCH 3/3] Add tests for malformed fence lengths --- tests/wrap/tokenize_markdown.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs index 4d146f59..33f67f0b 100644 --- a/tests/wrap/tokenize_markdown.rs +++ b/tests/wrap/tokenize_markdown.rs @@ -34,3 +34,19 @@ fn malformed_fence_is_text() { ] ); } + +#[test] +fn incorrect_fence_length_is_text() { + let source = "````\ncode\n````"; + let tokens = wrap::tokenize_markdown(source); + assert_eq!( + tokens, + vec![ + Token::Text("````"), + Token::Newline, + Token::Text("code"), + Token::Newline, + Token::Text("````"), + ] + ); +}