8 changes: 5 additions & 3 deletions docs/architecture.md
@@ -266,9 +266,11 @@ classDiagram
The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalization, while `footnotes` converts bare
references. The `textproc` module contains shared token-processing helpers used
by both the `ellipsis` and `footnotes` modules. The `process` module provides
streaming helpers that combine the lower-level functions. The `io` module
handles filesystem operations, delegating the text processing to `process`.
by both the `ellipsis` and `footnotes` modules. Tokenization is handled by
`wrap::tokenize_markdown`, replacing the small state machine that previously
resided in `process_tokens`. The `process` module provides streaming helpers
that combine the lower-level functions. The `io` module handles filesystem
operations, delegating the text processing to `process`.

The helper `html_table_to_markdown` is retained for backward compatibility but
is deprecated. New code should call `convert_html_tables` instead.
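For illustration, a minimal sketch of the preferred call, assuming `convert_html_tables` follows the crate's line-oriented shape (a slice of lines in, rewritten lines out); the exact signature is not shown in this diff:

```rust
use mdtablefix::convert_html_tables;

fn main() {
    // Assumption: like the other helpers described above, this takes a slice
    // of lines and returns the rewritten lines. Prefer this over the
    // deprecated `html_table_to_markdown`.
    let lines = vec!["<table><tr><td>x</td></tr></table>".to_string()];
    let converted = convert_html_tables(&lines);
    println!("{converted:?}");
}
```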
72 changes: 10 additions & 62 deletions src/textproc.rs
@@ -6,61 +6,7 @@
//! then reconstructs the lines. Trailing blank lines roundtrip
//! correctly.

pub use crate::wrap::Token;
use crate::wrap::is_fence;

fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
F: FnMut(Token<'a>),
{
let mut rest = text;
while let Some(pos) = rest.find('`') {
if pos > 0 {
emit(Token::Text(&rest[..pos]));
}
let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
let search = &rest[pos + delim_len..];
let closing = "`".repeat(delim_len);
if let Some(end) = search.find(&closing) {
emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
rest = &search[end + delim_len..];
} else {
emit(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
emit(Token::Text(rest));
}
}

fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
where
F: FnMut(Token<'a>, &mut String),
{
if is_fence(line) {
f(Token::Fence(line), out);
if !last {
f(Token::Newline, out);
}
*in_fence = !*in_fence;
return;
}

if *in_fence {
f(Token::Fence(line), out);
if !last {
f(Token::Newline, out);
}
return;
}

tokenize_inline(line, &mut |tok| f(tok, out));
if !last {
f(Token::Newline, out);
}
}
pub use crate::wrap::{Token, tokenize_markdown};

Check failure on line 9 in src/textproc.rs (GitHub Actions / build-test): unresolved import `crate::wrap::tokenize_markdown`

/// Apply a transformation to a sequence of [`Token`]s.
///
@@ -101,21 +47,23 @@
return vec![String::new(); lines.len()];
}

let source = lines.join("\n");
let mut out = String::new();
let mut in_fence = false;
let last_idx = lines.len() - 1;
for (i, line) in lines.iter().enumerate() {
handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
for token in tokenize_markdown(&source) {
f(token, &mut out);
}

if out.is_empty() {
return Vec::new();
}

let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
let mut result: Vec<String> = out.split('\n').map(ToOwned::to_owned).collect();
let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
for _ in out_blanks..trailing_blanks {
result.push(String::new());
if out_blanks < trailing_blanks {
result.extend(std::iter::repeat_n(
String::new(),
trailing_blanks - out_blanks,
));
}
result
}
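A usage sketch for the rewritten helper, assuming it is the `process_tokens` mentioned in the architecture notes and that it accepts a slice of lines plus an `FnMut(Token, &mut String)` callback (names and signature are inferred from the diff above, not from a published API):

```rust
use mdtablefix::textproc::{process_tokens, Token};

fn main() {
    let lines = vec!["before `code` after".to_string()];
    // Uppercase prose while passing code spans through unchanged; single
    // backticks are reinstated by hand because `Token::Code` stores only the
    // span contents, not its delimiters.
    let shouted = process_tokens(&lines, |token, out| match token {
        Token::Text(s) => out.push_str(&s.to_uppercase()),
        Token::Code(s) => {
            out.push('`');
            out.push_str(s);
            out.push('`');
        }
        Token::Fence(s) => out.push_str(s),
        Token::Newline => out.push('\n'),
        // Defensive arm in case `Token` has variants not modelled here.
        _ => {}
    });
    assert_eq!(shouted, vec!["BEFORE `code` AFTER".to_string()]);
}
```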
64 changes: 64 additions & 0 deletions src/wrap/tokenize.rs
@@ -119,6 +119,70 @@ pub(super) fn segment_inline(text: &str) -> Vec<String> {
tokens
}

fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
where
F: FnMut(Token<'a>),
{
let mut rest = text;
while let Some(pos) = rest.find('`') {
if pos > 0 {
emit(Token::Text(&rest[..pos]));
}
let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
let search = &rest[pos + delim_len..];
let closing = "`".repeat(delim_len);
if let Some(end) = search.find(&closing) {
emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
rest = &search[end + delim_len..];
} else {
emit(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
emit(Token::Text(rest));
}
}

/// Tokenize a block of Markdown into [`Token`]s.
#[must_use]
pub fn tokenize_markdown(source: &str) -> Vec<Token<'_>> {
if source.is_empty() {
return Vec::new();
}

let mut tokens = Vec::new();
let lines: Vec<&str> = source.split('\n').collect();
let last_idx = lines.len() - 1;
let mut in_fence = false;

for (i, line) in lines.iter().enumerate() {
if super::is_fence(line) {
tokens.push(Token::Fence(line));
if i != last_idx {
tokens.push(Token::Newline);
}
in_fence = !in_fence;
continue;
}

if in_fence {
tokens.push(Token::Fence(line));
if i != last_idx {
tokens.push(Token::Newline);
}
continue;
}

tokenize_inline(line, &mut |tok| tokens.push(tok));
if i != last_idx {
tokens.push(Token::Newline);
}
}
tokens
}

/// Split the input string into [`Token`]s by analysing whitespace and backtick
/// delimiters.
///
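For simple inputs the token stream round-trips back to the source. A sketch, assuming `Token::Code` drops its backtick delimiters (as `tokenize_inline` above shows), so only single-backtick spans are restored faithfully:

```rust
use mdtablefix::wrap::{tokenize_markdown, Token};

fn roundtrip(source: &str) -> String {
    let mut out = String::new();
    for token in tokenize_markdown(source) {
        match token {
            Token::Text(s) | Token::Fence(s) => out.push_str(s),
            // Delimiter length is not recorded on the token, so this only
            // restores single-backtick code spans faithfully.
            Token::Code(s) => {
                out.push('`');
                out.push_str(s);
                out.push('`');
            }
            Token::Newline => out.push('\n'),
            // Defensive arm in case `Token` has variants not modelled here.
            _ => {}
        }
    }
    out
}

fn main() {
    assert_eq!(roundtrip("a `b`\n```\nc\n```"), "a `b`\n```\nc\n```");
}
```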
52 changes: 52 additions & 0 deletions tests/wrap/tokenize_markdown.rs
@@ -0,0 +1,52 @@
//! Tests for the tokenize_markdown helper.

use mdtablefix::wrap::{self, Token};

#[test]
fn unclosed_fence_yields_fence_tokens() {
let lines = vec!["```rust", "let x = 42;", "fn foo() {}"];
let joined = lines.join("\n");
let tokens = wrap::tokenize_markdown(&joined);
assert_eq!(
tokens,
vec![
Token::Fence("```rust"),
Token::Newline,
Token::Fence("let x = 42;"),
Token::Newline,
Token::Fence("fn foo() {}"),
]
);
}

#[test]
fn malformed_fence_is_text() {
let source = "``~~\ncode\n``~~";
let tokens = wrap::tokenize_markdown(source);
assert_eq!(
tokens,
vec![
Token::Text("``~~"),
Token::Newline,
Token::Text("code"),
Token::Newline,
Token::Text("``~~"),
]
);
}

#[test]
fn incorrect_fence_length_is_text() {
let source = "````\ncode\n````";
let tokens = wrap::tokenize_markdown(source);
assert_eq!(
tokens,
vec![
Token::Text("````"),
Token::Newline,
Token::Text("code"),
Token::Newline,
Token::Text("````"),
]
);
}
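The committed tests exercise fence handling; a sketch of a companion case for inline code spans, derived from the `tokenize_inline` behaviour shown above and using the same imports as the file (this test is not part of the change set):

```rust
#[test]
fn inline_code_spans_tokenize() {
    let tokens = wrap::tokenize_markdown("before `code` after");
    assert_eq!(
        tokens,
        vec![
            Token::Text("before "),
            Token::Code("code"),
            Token::Text(" after"),
        ]
    );
}
```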