Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,10 @@ classDiagram
<<module>>
+convert_footnotes()
}
class textproc {
<<module>>
+process_tokens()
}
class process {
<<module>>
+process_stream()
Expand All @@ -248,21 +252,23 @@ classDiagram
table ..> reflow : uses parse_rows, etc.
lists ..> wrap : uses is_fence
breaks ..> wrap : uses is_fence
ellipsis ..> wrap : uses tokenize_markdown
ellipsis ..> textproc : uses process_tokens
process ..> html : uses convert_html_tables
process ..> table : uses reflow_table
process ..> wrap : uses wrap_text, is_fence
process ..> fences : uses compress_fences, attach_orphan_specifiers
process ..> ellipsis : uses replace_ellipsis
process ..> footnotes : uses convert_footnotes
footnotes ..> textproc : uses process_tokens
io ..> process : uses process_stream, process_stream_no_wrap
```

The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalization. The `process` module provides
streaming helpers that combine the lower-level functions, including ellipsis
replacement and footnote conversion. The `io` module handles filesystem
operations, delegating the text processing to `process`.
`ellipsis` module performs text normalization, while `footnotes` converts bare
references. The `textproc` module contains shared token-processing helpers used
by both the `ellipsis` and `footnotes` modules. The `process` module provides
streaming helpers that combine the lower-level functions. The `io` module
handles filesystem operations, delegating the text processing to `process`.

The helper `html_table_to_markdown` is retained for backward compatibility but
is deprecated. New code should call `convert_html_tables` instead.
Expand Down
48 changes: 21 additions & 27 deletions src/ellipsis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,41 +9,35 @@ use std::sync::LazyLock;

use regex::Regex;

use crate::wrap::{Token, tokenize_markdown};
use crate::textproc::{Token, process_tokens};

static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");

/// Replace `...` with `…` outside code spans and fences.
#[must_use]
pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => {
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => {
out.push_str(f);
process_tokens(lines, |token, out| match token {
Token::Text(t) => {
if !DOT_RE.is_match(t) {
out.push_str(t);
return;
}
Token::Newline => out.push('\n'),
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
}
out.split('\n').map(str::to_string).collect()
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
})
}

#[cfg(test)]
Expand Down
28 changes: 10 additions & 18 deletions src/footnotes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
"footnote line pattern should compile",
);

use crate::wrap::{Token, tokenize_markdown};
use crate::textproc::{Token, process_tokens};

/// Extract the components of an inline footnote reference.
#[inline]
Expand Down Expand Up @@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
/// Convert bare numeric footnote references to Markdown footnote syntax.
#[must_use]
pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
let mut lines = process_tokens(lines, |token, out| match token {
Token::Text(t) => out.push_str(&convert_inline(t)),
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
}
let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
Token::Fence(f) => out.push_str(f),
Token::Newline => out.push('\n'),
});
convert_block(&mut lines);
lines
}
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//! - `ellipsis` for normalizing textual ellipses.
//! - `fences` for issues with code block fences
//! - `footnotes` for converting bare footnote links.
//! - `textproc` for token-based transformations.
//! - `process` for stream processing.
//! - `io` for file helpers.

Expand All @@ -29,6 +30,7 @@ pub mod lists;
pub mod process;
mod reflow;
pub mod table;
pub mod textproc;
pub mod wrap;

#[deprecated(note = "this function is legacy; use `convert_html_tables` instead")]
Expand Down
227 changes: 227 additions & 0 deletions src/textproc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
//! Provides helpers for token-based transformations of Markdown lines.
//!
//! This module reuses the tokenizer from the [`wrap`] module and offers
//! a streaming API for rewriting Markdown. Each helper tokenizes lines
//! on the fly, feeds the resulting tokens to caller-provided logic, and
//! then reconstructs the lines. Trailing blank lines roundtrip
//! correctly.

pub use crate::wrap::Token;
use crate::wrap::is_fence;

/// Split a single line into `Text` and `Code` tokens.
///
/// Backtick runs open a code span that must be closed by a run of the
/// same length; an unmatched opener is emitted verbatim as plain text.
fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
    F: FnMut(Token<'a>),
{
    let mut remaining = text;
    loop {
        let Some(start) = remaining.find('`') else { break };
        if start > 0 {
            emit(Token::Text(&remaining[..start]));
        }
        // The length of the opening backtick run fixes the closing run.
        let run = remaining[start..]
            .bytes()
            .take_while(|&b| b == b'`')
            .count();
        let after_open = &remaining[start + run..];
        match after_open.find(&"`".repeat(run)) {
            Some(close) => {
                emit(Token::Code(&after_open[..close]));
                remaining = &after_open[close + run..];
            }
            None => {
                // No closing delimiter: the remainder is plain text.
                emit(Token::Text(&remaining[start..]));
                return;
            }
        }
    }
    if !remaining.is_empty() {
        emit(Token::Text(remaining));
    }
}

/// Feed the tokens for one line to `f`, tracking fence state.
///
/// Fence delimiters and lines inside a fence are passed through as
/// `Token::Fence`; ordinary lines are split into inline tokens. Every
/// line except the final one is followed by a `Token::Newline`.
fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
where
    F: FnMut(Token<'a>, &mut String),
{
    let fence_delim = is_fence(line);
    if fence_delim || *in_fence {
        f(Token::Fence(line), out);
    } else {
        tokenize_inline(line, &mut |tok| f(tok, out));
    }
    if fence_delim {
        // An opening or closing delimiter flips the fence state.
        *in_fence = !*in_fence;
    }
    if !last {
        f(Token::Newline, out);
    }
}

/// Apply a transformation to a sequence of [`Token`]s.
///
/// The `lines` slice is tokenized in order, preserving fence context.
/// Each token is passed to `f` along with the output accumulator. The
/// final string is split on newline characters and returned as a
/// vector of lines.
///
/// # Examples
///
/// ```rust
/// use mdtablefix::{textproc::process_tokens, wrap::Token};
///
/// let lines = vec!["code".to_string()];
/// let out = process_tokens(&lines, |tok, out| match tok {
///     Token::Text(t) => out.push_str(t),
///     Token::Code(c) => {
///         out.push('`');
///         out.push_str(c);
///         out.push('`');
///     }
///     Token::Fence(f) => out.push_str(f),
///     Token::Newline => out.push('\n'),
/// });
/// assert_eq!(out, lines);
/// ```
#[must_use]
pub fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
where
    F: FnMut(Token<'_>, &mut String),
{
    if lines.is_empty() {
        return Vec::new();
    }

    // Splitting the rebuilt string on '\n' cannot distinguish "ends with a
    // newline" from "ends with blank lines", so the trailing-blank count is
    // recorded up front and restored after the split.
    let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
    if trailing_blanks == lines.len() {
        // All-blank input round-trips unchanged.
        // NOTE(review): `f` is bypassed entirely on this path, so a
        // transformation that rewrites `Newline` tokens never sees
        // blank-only input — confirm this is intended.
        return vec![String::new(); lines.len()];
    }

    let mut out = String::new();
    let mut in_fence = false;
    let last_idx = lines.len() - 1;
    for (i, line) in lines.iter().enumerate() {
        handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
    }

    if out.is_empty() {
        // `f` dropped every token, including newlines; nothing to emit.
        // NOTE(review): trailing blanks are lost on this path as well.
        return Vec::new();
    }

    // Re-append any trailing blank lines that the newline split swallowed.
    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
    for _ in out_blanks..trailing_blanks {
        result.push(String::new());
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transformation: re-emits every token verbatim.
    fn passthrough(tok: Token<'_>, buf: &mut String) {
        match tok {
            Token::Text(t) => buf.push_str(t),
            Token::Code(c) => {
                buf.push('`');
                buf.push_str(c);
                buf.push('`');
            }
            Token::Fence(f) => buf.push_str(f),
            Token::Newline => buf.push('\n'),
        }
    }

    /// Collects the `Debug` rendering of every token produced for `lines`.
    fn debug_tokens(lines: &[String]) -> Vec<String> {
        let mut seen = Vec::new();
        let _ = process_tokens(lines, |tok, _| seen.push(format!("{tok:?}")));
        seen
    }

    #[test]
    fn identity_transformation_returns_input() {
        let input = vec!["a `b`".to_string()];
        assert_eq!(process_tokens(&input, passthrough), input);
    }

    #[test]
    fn empty_input_returns_empty_vector() {
        let input: Vec<String> = Vec::new();
        assert!(process_tokens(&input, |_tok, _out| unreachable!()).is_empty());
    }

    #[test]
    fn transformation_can_remove_all_content() {
        let input = vec!["data".to_string()];
        assert!(process_tokens(&input, |_tok, _out| {}).is_empty());
    }

    #[test]
    fn preserves_trailing_blank_lines() {
        let input = vec!["a".to_string(), String::new(), String::new()];
        assert_eq!(process_tokens(&input, passthrough), input);
    }

    #[test]
    fn blanks_only_are_preserved() {
        let input = vec![String::new(), String::new()];
        assert_eq!(process_tokens(&input, |_tok, _buf| {}), input);
    }

    #[test]
    fn token_stream_handles_fences() {
        let input = vec![
            "```rust".to_string(),
            "fn main() {".to_string(),
            "    println!(\"hi\");".to_string(),
            "```".to_string(),
        ];
        let expected = vec![
            "Fence(\"```rust\")".to_string(),
            "Newline".to_string(),
            "Fence(\"fn main() {\")".to_string(),
            "Newline".to_string(),
            "Fence(\"    println!(\\\"hi\\\");\")".to_string(),
            "Newline".to_string(),
            "Fence(\"```\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }

    #[test]
    fn malformed_fence_sequence_returns_tokens() {
        let input = vec!["```".to_string(), "code".to_string()];
        let expected = vec![
            "Fence(\"```\")".to_string(),
            "Newline".to_string(),
            "Fence(\"code\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }

    #[test]
    fn multi_backtick_spans_are_recognised() {
        let input = vec!["A ``code`` span".to_string()];
        let expected = vec![
            "Text(\"A \")".to_string(),
            "Code(\"code\")".to_string(),
            "Text(\" span\")".to_string(),
        ];
        assert_eq!(debug_tokens(&input), expected);
    }
}
2 changes: 1 addition & 1 deletion src/wrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
use regex::{Captures, Regex};

mod tokenize;
pub(crate) use tokenize::{Token, tokenize_markdown};
pub use tokenize::Token;

static FENCE_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap());
Expand Down
Loading
Loading