From 616c2ac8c4c16b5a097d240603708c827a5a8ed2 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Wed, 30 Jul 2025 20:16:24 +0100
Subject: [PATCH 1/3] Refine text token helper

---
 docs/architecture.md |  16 ++--
 src/ellipsis.rs      |  48 +++++------
 src/footnotes.rs     |  28 +++---
 src/lib.rs           |   1 +
 src/textproc.rs      | 199 +++++++++++++++++++++++++++++++++++++++++++
 src/wrap.rs          |  83 ++++++++++++++++++
 6 files changed, 325 insertions(+), 50 deletions(-)
 create mode 100644 src/textproc.rs

diff --git a/docs/architecture.md b/docs/architecture.md
index 61739015..84f58e56 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -225,6 +225,10 @@ classDiagram
         <<module>>
         +convert_footnotes()
     }
+    class textproc {
+        <<module>>
+        +process_tokens()
+    }
     class process {
         <<module>>
         +process_stream()
     }
@@ -248,21 +252,23 @@ classDiagram
     table ..> reflow : uses parse_rows, etc.
     lists ..> wrap : uses is_fence
     breaks ..> wrap : uses is_fence
-    ellipsis ..> wrap : uses tokenize_markdown
+    ellipsis ..> textproc : uses process_tokens
     process ..> html : uses convert_html_tables
     process ..> table : uses reflow_table
     process ..> wrap : uses wrap_text, is_fence
     process ..> fences : uses compress_fences, attach_orphan_specifiers
     process ..> ellipsis : uses replace_ellipsis
     process ..> footnotes : uses convert_footnotes
+    footnotes ..> textproc : uses process_tokens
     io ..> process : uses process_stream, process_stream_no_wrap
 ```
 
 The `lib` module re-exports the public API from the other modules. The
-`ellipsis` module performs text normalization. The `process` module provides
-streaming helpers that combine the lower-level functions, including ellipsis
-replacement and footnote conversion. The `io` module handles filesystem
-operations, delegating the text processing to `process`.
+`ellipsis` module performs text normalization, while `footnotes` converts bare
+references. The `textproc` module contains shared token-processing helpers used
+by both. The `process` module provides streaming helpers that combine the
+lower-level functions. The `io` module handles filesystem operations,
+delegating the text processing to `process`.
 
 The helper `html_table_to_markdown` is retained for backward compatibility
 but is deprecated. New code should call `convert_html_tables` instead.
diff --git a/src/ellipsis.rs b/src/ellipsis.rs
index 1b84f0b1..c9db9d82 100644
--- a/src/ellipsis.rs
+++ b/src/ellipsis.rs
@@ -9,41 +9,35 @@ use std::sync::LazyLock;
 
 use regex::Regex;
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::textproc::{Token, process_tokens};
 
 static DOT_RE: LazyLock<Regex> = lazy_regex!(r"\.{3,}", "ellipsis pattern regex should compile");
 
 /// Replace `...` with `…` outside code spans and fences.
 #[must_use]
 pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => {
-                let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
-                    let len = caps[0].len();
-                    let ellipses = "…".repeat(len / 3);
-                    let leftover = ".".repeat(len % 3);
-                    format!("{ellipses}{leftover}")
-                });
-                out.push_str(&replaced);
-            }
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => {
-                out.push_str(f);
+    process_tokens(lines, |token, out| match token {
+        Token::Text(t) => {
+            if !DOT_RE.is_match(t) {
+                out.push_str(t);
+                return;
             }
-            Token::Newline => out.push('\n'),
+            let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
+                let len = caps[0].len();
+                let ellipses = "…".repeat(len / 3);
+                let leftover = ".".repeat(len % 3);
+                format!("{ellipses}{leftover}")
+            });
+            out.push_str(&replaced);
         }
-    }
-    out.split('\n').map(str::to_string).collect()
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
+        }
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    })
 }
 
 #[cfg(test)]
diff --git a/src/footnotes.rs b/src/footnotes.rs
index 24b5491a..d4bbd590 100644
--- a/src/footnotes.rs
+++ b/src/footnotes.rs
@@ -18,7 +18,7 @@ static FOOTNOTE_LINE_RE: LazyLock<Regex> = lazy_regex!(
     "footnote line pattern should compile",
 );
 
-use crate::wrap::{Token, tokenize_markdown};
+use crate::textproc::{Token, process_tokens};
 
 /// Extract the components of an inline footnote reference.
 #[inline]
@@ -96,24 +96,16 @@ fn convert_block(lines: &mut [String]) {
 /// Convert bare numeric footnote references to Markdown footnote syntax.
 #[must_use]
 pub fn convert_footnotes(lines: &[String]) -> Vec<String> {
-    if lines.is_empty() {
-        return Vec::new();
-    }
-    let joined = lines.join("\n");
-    let mut out = String::new();
-    for token in tokenize_markdown(&joined) {
-        match token {
-            Token::Text(t) => out.push_str(&convert_inline(t)),
-            Token::Code(c) => {
-                out.push('`');
-                out.push_str(c);
-                out.push('`');
-            }
-            Token::Fence(f) => out.push_str(f),
-            Token::Newline => out.push('\n'),
+    let mut lines = process_tokens(lines, |token, out| match token {
+        Token::Text(t) => out.push_str(&convert_inline(t)),
+        Token::Code(c) => {
+            out.push('`');
+            out.push_str(c);
+            out.push('`');
         }
-    }
-    let mut lines: Vec<String> = out.split('\n').map(str::to_string).collect();
+        Token::Fence(f) => out.push_str(f),
+        Token::Newline => out.push('\n'),
+    });
     convert_block(&mut lines);
     lines
 }
diff --git a/src/lib.rs b/src/lib.rs
index 197edba1..9b2ad0d8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,7 @@ pub mod lists;
 pub mod process;
 mod reflow;
 pub mod table;
+mod textproc;
 pub mod wrap;
 
 #[deprecated(note = "this function is legacy; use `convert_html_tables` instead")]
diff --git a/src/textproc.rs b/src/textproc.rs
new file mode 100644
index 00000000..d72da7b4
--- /dev/null
+++ b/src/textproc.rs
@@ -0,0 +1,199 @@
+//! Provides helpers for token-based transformations of Markdown lines.
+//!
+//! This module reuses the tokenizer from the [`wrap`] module and offers
+//! a streaming API for rewriting Markdown. Each helper tokenizes lines
+//! on the fly, feeds the resulting tokens to caller-provided logic, and
+//! then reconstructs the lines. Trailing blank lines roundtrip
+//! correctly.
+
+pub use crate::wrap::Token;
+use crate::wrap::is_fence;
+
+/// Apply a transformation to a sequence of [`Token`]s.
+///
+/// The `lines` slice is tokenized in order, preserving fence context.
+/// Each token is passed to `f` along with the output accumulator. The
+/// final string is split on newline characters and returned as a
+/// vector of lines.
+///
+/// # Examples
+///
+/// ```ignore
+/// use mdtablefix::{
+///     textproc::process_tokens,
+///     wrap::Token,
+/// };
+///
+/// let lines = vec!["code".to_string()];
+/// let out = process_tokens(&lines, |tok, out| match tok {
+///     Token::Text(t) => out.push_str(t),
+///     Token::Code(c) => {
+///         out.push('`');
+///         out.push_str(c);
+///         out.push('`');
+///     }
+///     Token::Fence(f) => out.push_str(f),
+///     Token::Newline => out.push('\n'),
+/// });
+/// assert_eq!(out, lines);
+/// ```
+#[must_use]
+pub(crate) fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
+where
+    F: FnMut(Token<'_>, &mut String),
+{
+    if lines.is_empty() {
+        return Vec::new();
+    }
+
+    let trailing_blanks = lines.iter().rev().take_while(|l| l.is_empty()).count();
+    if trailing_blanks == lines.len() {
+        return vec![String::new(); lines.len()];
+    }
+
+    let mut out = String::new();
+    let mut in_fence = false;
+    let last_idx = lines.len() - 1;
+    for (i, line) in lines.iter().enumerate() {
+        let trimmed = line.as_str();
+        if is_fence(trimmed) {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            in_fence = !in_fence;
+            continue;
+        }
+        if in_fence {
+            f(Token::Fence(trimmed), &mut out);
+            if i < last_idx {
+                f(Token::Newline, &mut out);
+            }
+            continue;
+        }
+        let mut rest = trimmed;
+        while let Some(pos) = rest.find('`') {
+            if pos > 0 {
+                f(Token::Text(&rest[..pos]), &mut out);
+            }
+            if let Some(end) = rest[pos + 1..].find('`') {
+                f(Token::Code(&rest[pos + 1..pos + 1 + end]), &mut out);
+                rest = &rest[pos + end + 2..];
+            } else {
+                f(Token::Text(&rest[pos..]), &mut out);
+                rest = "";
+                break;
+            }
+        }
+        if !rest.is_empty() {
+            f(Token::Text(rest), &mut out);
+        }
+        if i < last_idx {
+            f(Token::Newline, &mut out);
+        }
+    }
+
+    if out.is_empty() {
+        return Vec::new();
+    }
+
+    let mut result: Vec<String> = out.split('\n').map(str::to_string).collect();
+    let out_blanks = result.iter().rev().take_while(|l| l.is_empty()).count();
+    for _ in out_blanks..trailing_blanks {
+        result.push(String::new());
+    }
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn identity_transformation_returns_input() {
+        let lines = vec!["a `b`".to_string()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
+
+    #[test]
+    fn empty_input_returns_empty_vector() {
+        let lines: Vec<String> = Vec::new();
+        let out = process_tokens(&lines, |_tok, _out| unreachable!());
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn transformation_can_remove_all_content() {
+        let lines = vec!["data".to_string()];
+        let out = process_tokens(&lines, |_tok, _out| {});
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn preserves_trailing_blank_lines() {
+        let lines = vec!["a".to_string(), String::new(), String::new()];
+        let out = process_tokens(&lines, |tok, buf| match tok {
+            Token::Text(t) => buf.push_str(t),
+            Token::Code(c) => {
+                buf.push('`');
+                buf.push_str(c);
+                buf.push('`');
+            }
+            Token::Fence(f) => buf.push_str(f),
+            Token::Newline => buf.push('\n'),
+        });
+        assert_eq!(out, lines);
+    }
+
+    #[test]
+    fn blanks_only_are_preserved() {
+        let lines =
+            vec![String::new(), String::new()];
+        let out = process_tokens(&lines, |_tok, _buf| {});
+        assert_eq!(out, lines);
+    }
+
+    #[test]
+    fn token_stream_handles_fences() {
+        let lines = vec![
+            "```rust".to_string(),
+            "fn main() {".to_string(),
+            "    println!(\"hi\");".to_string(),
+            "```".to_string(),
+        ];
+        let mut tokens = Vec::new();
+        let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}")));
+        let expected = vec![
+            "Fence(\"```rust\")".to_string(),
+            "Newline".to_string(),
+            "Fence(\"fn main() {\")".to_string(),
+            "Newline".to_string(),
+            "Fence(\"    println!(\\\"hi\\\");\")".to_string(),
+            "Newline".to_string(),
+            "Fence(\"```\")".to_string(),
+        ];
+        assert_eq!(tokens, expected);
+    }
+
+    #[test]
+    fn malformed_fence_sequence_returns_tokens() {
+        let lines = vec!["```".to_string(), "code".to_string()];
+        let mut tokens = Vec::new();
+        let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}")));
+        let expected = vec![
+            "Fence(\"```\")".to_string(),
+            "Newline".to_string(),
+            "Fence(\"code\")".to_string(),
+        ];
+        assert_eq!(tokens, expected);
+    }
+}
diff --git a/src/wrap.rs b/src/wrap.rs
index 4627f00e..6bb7bcfd 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -37,6 +37,18 @@ static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock<Regex> = std::sync::LazyLo
     )
     .expect("valid markdownlint regex")
 });
+/// Markdown token emitted by token-processing helpers.
+#[derive(Debug, PartialEq)]
+pub enum Token<'a> {
+    /// Line within a fenced code block, including the fence itself.
+    Fence(&'a str),
+    /// Inline code span without surrounding backticks.
+    Code(&'a str),
+    /// Plain text outside code regions.
+    Text(&'a str),
+    /// Line break separating tokens.
+    Newline,
+}
 
 struct PrefixHandler {
     re: &'static std::sync::LazyLock<Regex>,
@@ -51,6 +63,77 @@ impl PrefixHandler {
 fn build_footnote_prefix(cap: &Captures) -> String { format!("{}{}", &cap[1], &cap[2]) }
 
 fn build_blockquote_prefix(cap: &Captures) -> String { cap[1].to_string() }
+fn tokenize_inline(text: &str) -> Vec<String> {
+    let mut tokens = Vec::new();
+    let chars: Vec<char> = text.chars().collect();
+    let mut i = 0;
+    while i < chars.len() {
+        let c = chars[i];
+        if c.is_whitespace() {
+            let start = i;
+            while i < chars.len() && chars[i].is_whitespace() {
+                i += 1;
+            }
+            tokens.push(chars[start..i].iter().collect());
+        } else if c == '`' {
+            let start = i;
+            let mut delim_len = 0;
+            while i < chars.len() && chars[i] == '`' {
+                i += 1;
+                delim_len += 1;
+            }
+            let mut end = i;
+            while end < chars.len() {
+                if chars[end] == '`' {
+                    let mut j = end;
+                    let mut count = 0;
+                    while j < chars.len() && chars[j] == '`' {
+                        j += 1;
+                        count += 1;
+                    }
+                    if count == delim_len {
+                        end = j;
+                        break;
+                    }
+                }
+                end += 1;
+            }
+            if end >= chars.len() {
+                tokens.push(chars[start..start + delim_len].iter().collect());
+                i = start + delim_len;
+            } else {
+                tokens.push(chars[start..end].iter().collect());
+                i = end;
+            }
+        } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
+            let (tok, new_i) = parse_link_or_image(&chars, i);
+            tokens.push(tok);
+            i = new_i;
+        } else {
+            let start = i;
+            while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' {
+                i += 1;
+            }
+            tokens.push(chars[start..i].iter().collect());
+        }
+    }
+    tokens
+}
+
+/// Determine if the current line should break at the last whitespace.
+///
+/// Returns `true` if `current_width` exceeds `width` and a whitespace split
+/// position is available.
+///
+/// # Examples
+///
+/// ```ignore
+/// use mdtablefix::wrap::should_break_line;
+/// assert!(should_break_line(10, 12, Some(3)));
+/// assert!(!should_break_line(10, 8, Some(3)));
+/// ```
+fn should_break_line(width: usize, current_width: usize, last_split: Option<usize>) -> bool {
+    current_width > width && last_split.is_some()
 }
 
 static HANDLERS: &[PrefixHandler] = &[

From ec1102e484de254f5e19c8d60c05138ef90dfd18 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Thu, 31 Jul 2025 18:44:00 +0100
Subject: [PATCH 2/3] Expose process_tokens for doctests

---
 docs/architecture.md |   6 +--
 src/lib.rs           |   3 +-
 src/textproc.rs      | 112 +++++++++++++++++++++++++++----------------
 3 files changed, 75 insertions(+), 46 deletions(-)

diff --git a/docs/architecture.md b/docs/architecture.md
index 84f58e56..5baf68bc 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -266,9 +266,9 @@ classDiagram
 The `lib` module re-exports the public API from the other modules. The
 `ellipsis` module performs text normalization, while `footnotes` converts bare
 references. The `textproc` module contains shared token-processing helpers used
-by both. The `process` module provides streaming helpers that combine the
-lower-level functions. The `io` module handles filesystem operations,
-delegating the text processing to `process`.
+by both the `ellipsis` and `footnotes` modules. The `process` module provides
+streaming helpers that combine the lower-level functions. The `io` module
+handles filesystem operations, delegating the text processing to `process`.
 
 The helper `html_table_to_markdown` is retained for backward compatibility
 but is deprecated. New code should call `convert_html_tables` instead.
diff --git a/src/lib.rs b/src/lib.rs
index 9b2ad0d8..3edae610 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,6 +9,7 @@
 //! - `ellipsis` for normalizing textual ellipses.
 //! - `fences` for issues with code block fences
 //! - `footnotes` for converting bare footnote links.
+//! - `textproc` for token-based transformations.
 //! - `process` for stream processing.
 //! - `io` for file helpers.
@@ -29,7 +30,7 @@ pub mod lists;
 pub mod process;
 mod reflow;
 pub mod table;
-mod textproc;
+pub mod textproc;
 pub mod wrap;
 
 #[deprecated(note = "this function is legacy; use `convert_html_tables` instead")]
diff --git a/src/textproc.rs b/src/textproc.rs
index d72da7b4..9ea8545f 100644
--- a/src/textproc.rs
+++ b/src/textproc.rs
@@ -9,6 +9,59 @@
 pub use crate::wrap::Token;
 use crate::wrap::is_fence;
 
+fn tokenize_inline<'a, F>(text: &'a str, emit: &mut F)
+where
+    F: FnMut(Token<'a>),
+{
+    let mut rest = text;
+    while let Some(pos) = rest.find('`') {
+        if pos > 0 {
+            emit(Token::Text(&rest[..pos]));
+        }
+        let delim_len = rest[pos..].chars().take_while(|&c| c == '`').count();
+        let search = &rest[pos + delim_len..];
+        let closing = "`".repeat(delim_len);
+        if let Some(end) = search.find(&closing) {
+            emit(Token::Code(&rest[pos + delim_len..pos + delim_len + end]));
+            rest = &search[end + delim_len..];
+        } else {
+            emit(Token::Text(&rest[pos..]));
+            rest = "";
+            break;
+        }
+    }
+    if !rest.is_empty() {
+        emit(Token::Text(rest));
+    }
+}
+
+fn handle_line<'a, F>(line: &'a str, last: bool, in_fence: &mut bool, f: &mut F, out: &mut String)
+where
+    F: FnMut(Token<'a>, &mut String),
+{
+    if is_fence(line) {
+        f(Token::Fence(line), out);
+        if !last {
+            f(Token::Newline, out);
+        }
+        *in_fence = !*in_fence;
+        return;
+    }
+
+    if *in_fence {
+        f(Token::Fence(line), out);
+        if !last {
+            f(Token::Newline, out);
+        }
+        return;
+    }
+
+    tokenize_inline(line, &mut |tok| f(tok, out));
+    if !last {
+        f(Token::Newline, out);
+    }
+}
+
 /// Apply a transformation to a sequence of [`Token`]s.
 ///
 /// The `lines` slice is tokenized in order, preserving fence context.
@@ -18,11 +71,8 @@ use crate::wrap::is_fence;
 ///
 /// # Examples
 ///
-/// ```ignore
-/// use mdtablefix::{
-///     textproc::process_tokens,
-///     wrap::Token,
-/// };
+/// ```rust
+/// use mdtablefix::{textproc::process_tokens, wrap::Token};
 ///
 /// let lines = vec!["code".to_string()];
 /// let out = process_tokens(&lines, |tok, out| match tok {
@@ -38,7 +88,7 @@ use crate::wrap::is_fence;
 /// assert_eq!(out, lines);
 /// ```
 #[must_use]
-pub(crate) fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
+pub fn process_tokens<F>(lines: &[String], mut f: F) -> Vec<String>
 where
     F: FnMut(Token<'_>, &mut String),
 {
@@ -55,42 +105,7 @@ where
     let mut in_fence = false;
     let last_idx = lines.len() - 1;
     for (i, line) in lines.iter().enumerate() {
-        let trimmed = line.as_str();
-        if is_fence(trimmed) {
-            f(Token::Fence(trimmed), &mut out);
-            if i < last_idx {
-                f(Token::Newline, &mut out);
-            }
-            in_fence = !in_fence;
-            continue;
-        }
-        if in_fence {
-            f(Token::Fence(trimmed), &mut out);
-            if i < last_idx {
-                f(Token::Newline, &mut out);
-            }
-            continue;
-        }
-        let mut rest = trimmed;
-        while let Some(pos) = rest.find('`') {
-            if pos > 0 {
-                f(Token::Text(&rest[..pos]), &mut out);
-            }
-            if let Some(end) = rest[pos + 1..].find('`') {
-                f(Token::Code(&rest[pos + 1..pos + 1 + end]), &mut out);
-                rest = &rest[pos + end + 2..];
-            } else {
-                f(Token::Text(&rest[pos..]), &mut out);
-                rest = "";
-                break;
-            }
-        }
-        if !rest.is_empty() {
-            f(Token::Text(rest), &mut out);
-        }
-        if i < last_idx {
-            f(Token::Newline, &mut out);
-        }
+        handle_line(line, i == last_idx, &mut in_fence, &mut f, &mut out);
     }
 
     if out.is_empty() {
@@ -196,4 +211,17 @@ mod tests {
         ];
         assert_eq!(tokens, expected);
     }
+
+    #[test]
+    fn multi_backtick_spans_are_recognised() {
+        let lines = vec!["A ``code`` span".to_string()];
+        let mut tokens = Vec::new();
+        let _ = process_tokens(&lines, |tok, _| tokens.push(format!("{tok:?}")));
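+        // Two backticks open the span, so only a matching two-backtick run
+        // closes it; the Code token carries the inner text with no delimiters.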
+        let expected = vec![
+            "Text(\"A \")".to_string(),
+            "Code(\"code\")".to_string(),
+            "Text(\" span\")".to_string(),
+        ];
+        assert_eq!(tokens, expected);
+    }
 }

From b44776b9b1d122227e6892934fda2ccf5fa26876 Mon Sep 17 00:00:00 2001
From: Leynos
Date: Thu, 31 Jul 2025 19:23:38 +0100
Subject: [PATCH 3/3] Fix unclosed delimiter and remove unused code (#160)

---
 src/wrap.rs          | 85 +-------------------------------------------
 src/wrap/tokenize.rs | 43 +---------------------
 2 files changed, 2 insertions(+), 126 deletions(-)

diff --git a/src/wrap.rs b/src/wrap.rs
index 6bb7bcfd..ce97ecfd 100644
--- a/src/wrap.rs
+++ b/src/wrap.rs
@@ -8,7 +8,7 @@ use regex::{Captures, Regex};
 
 mod tokenize;
 
-pub(crate) use tokenize::{Token, tokenize_markdown};
+pub use tokenize::Token;
 
 static FENCE_RE: std::sync::LazyLock<Regex> =
     std::sync::LazyLock::new(|| Regex::new(r"^\s*(```|~~~).*").unwrap());
@@ -37,18 +37,6 @@ static MARKDOWNLINT_DIRECTIVE_RE: std::sync::LazyLock<Regex> = std::sync::LazyLo
     )
     .expect("valid markdownlint regex")
 });
-/// Markdown token emitted by token-processing helpers.
-#[derive(Debug, PartialEq)]
-pub enum Token<'a> {
-    /// Line within a fenced code block, including the fence itself.
-    Fence(&'a str),
-    /// Inline code span without surrounding backticks.
-    Code(&'a str),
-    /// Plain text outside code regions.
-    Text(&'a str),
-    /// Line break separating tokens.
-    Newline,
-}
 
 struct PrefixHandler {
     re: &'static std::sync::LazyLock<Regex>,
@@ -63,77 +51,6 @@ impl PrefixHandler {
 fn build_footnote_prefix(cap: &Captures) -> String { format!("{}{}", &cap[1], &cap[2]) }
 
 fn build_blockquote_prefix(cap: &Captures) -> String { cap[1].to_string() }
-fn tokenize_inline(text: &str) -> Vec<String> {
-    let mut tokens = Vec::new();
-    let chars: Vec<char> = text.chars().collect();
-    let mut i = 0;
-    while i < chars.len() {
-        let c = chars[i];
-        if c.is_whitespace() {
-            let start = i;
-            while i < chars.len() && chars[i].is_whitespace() {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
-        } else if c == '`' {
-            let start = i;
-            let mut delim_len = 0;
-            while i < chars.len() && chars[i] == '`' {
-                i += 1;
-                delim_len += 1;
-            }
-            let mut end = i;
-            while end < chars.len() {
-                if chars[end] == '`' {
-                    let mut j = end;
-                    let mut count = 0;
-                    while j < chars.len() && chars[j] == '`' {
-                        j += 1;
-                        count += 1;
-                    }
-                    if count == delim_len {
-                        end = j;
-                        break;
-                    }
-                }
-                end += 1;
-            }
-            if end >= chars.len() {
-                tokens.push(chars[start..start + delim_len].iter().collect());
-                i = start + delim_len;
-            } else {
-                tokens.push(chars[start..end].iter().collect());
-                i = end;
-            }
-        } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') {
-            let (tok, new_i) = parse_link_or_image(&chars, i);
-            tokens.push(tok);
-            i = new_i;
-        } else {
-            let start = i;
-            while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' {
-                i += 1;
-            }
-            tokens.push(chars[start..i].iter().collect());
-        }
-    }
-    tokens
-}
-
-/// Determine if the current line should break at the last whitespace.
-///
-/// Returns `true` if `current_width` exceeds `width` and a whitespace split
-/// position is available.
-/// -/// # Examples -/// -/// ```ignore -/// use mdtablefix::wrap::should_break_line; -/// assert!(should_break_line(10, 12, Some(3))); -/// assert!(!should_break_line(10, 8, Some(3))); -/// ``` -fn should_break_line(width: usize, current_width: usize, last_split: Option) -> bool { - current_width > width && last_split.is_some() } static HANDLERS: &[PrefixHandler] = &[ diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index 184dae3e..6be74ffe 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -3,8 +3,6 @@ //! This module contains utilities for breaking lines into tokens so that //! inline code spans and Markdown links are preserved during wrapping. -use super::FENCE_RE; - fn scan_while(chars: &[char], mut i: usize, cond: F) -> usize where F: Fn(char) -> bool, @@ -19,7 +17,7 @@ fn collect_range(chars: &[char], start: usize, end: usize) -> String { chars[start..end].iter().collect() } -/// Markdown token emitted by [`tokenize_markdown`]. +/// Markdown token emitted by [`segment_inline`]. #[derive(Debug, PartialEq)] pub enum Token<'a> { /// Line within a fenced code block, including the fence itself. @@ -129,45 +127,6 @@ pub(super) fn segment_inline(text: &str) -> Vec { /// is encountered the parser searches forward for an identical delimiter, /// allowing nested backticks when the span uses a longer fence. Unmatched /// delimiter sequences are treated as literal text. -pub(crate) fn tokenize_markdown(input: &str) -> Vec> { - let mut out = Vec::new(); - let mut in_fence = false; - for line in input.split_inclusive('\n') { - let trimmed = line.trim_end_matches('\n'); - if FENCE_RE.is_match(trimmed) { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - in_fence = !in_fence; - continue; - } - if in_fence { - out.push(Token::Fence(trimmed)); - out.push(Token::Newline); - continue; - } - let mut rest = trimmed; - while let Some(pos) = rest.find('`') { - if pos > 0 { - out.push(Token::Text(&rest[..pos])); - } - if let Some(end) = rest[pos + 1..].find('`') { - out.push(Token::Code(&rest[pos + 1..pos + 1 + end])); - rest = &rest[pos + end + 2..]; - } else { - out.push(Token::Text(&rest[pos..])); - rest = ""; - break; - } - } - if !rest.is_empty() { - out.push(Token::Text(rest)); - } - out.push(Token::Newline); - } - out.pop(); - out -} - #[cfg(test)] mod tests { use super::*;