diff --git a/docs/architecture.md b/docs/architecture.md index 56acd825..8f5c8850 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -333,6 +333,37 @@ provides streaming helpers that combine the lower-level functions. The `io` module handles filesystem operations, delegating the text processing to `process`. +### Tokenizer flow + +The inline tokenizer iterates over the source string lazily, so no duplicate +`Vec<char>` representation is required. The following diagram summarizes the +control flow, highlighting the helpers touched during whitespace, code span, +and link handling. + +```mermaid +flowchart TD + A["Input text (&str)"] --> B["Initialize tokens Vec"] + B --> C["Iterate over text by byte index"] + C --> D{"Current char is whitespace?"} + D -- Yes --> E["scan_while for whitespace"] + E --> F["collect_range and push token"] + D -- No --> G{"Current char is '`'?"} + G -- Yes --> H["Check backslash escape (has_odd_backslash_escape_bytes)"] + H -- Escaped --> I["Push '`' as token"] + H -- Not escaped --> J["scan_while for code fence"] + J --> K["Find closing fence, collect_range and push token"] + G -- No --> L{"Current char is '[' or '!['?"} + L -- Yes --> M["parse_link_or_image"] + M --> N["Push link/image token"] + N --> O["scan_while for trailing punctuation"] + O --> P["collect_range and push punctuation token"] + L -- No --> Q["scan_while for non-whitespace/non-` chars"] + Q --> R["collect_range and push token"] + F & I & K & P & R --> S["Continue iteration"] + S --> C + C -->|End| T["Return tokens Vec"] +``` + The helper `html_table_to_markdown` is retained for backward compatibility but is deprecated. New code should call `convert_html_tables` instead. diff --git a/src/wrap/tokenize.rs b/src/wrap/tokenize.rs index 975286e3..9c0abb83 100644 --- a/src/wrap/tokenize.rs +++ b/src/wrap/tokenize.rs @@ -3,26 +3,31 @@ //! This module contains utilities for breaking lines into tokens so that //! 
inline code spans and Markdown links are preserved during wrapping. -/// Advance `i` while the predicate evaluates to `true`. +/// Advance `idx` while the predicate evaluates to `true`. /// -/// Returns the index of the first character for which `cond` fails. This small -/// helper keeps the scanning loops concise. +/// Returns the byte index of the first character for which `cond` fails. +/// This small helper keeps the scanning loops concise and avoids +/// materialising the source as a char buffer. /// /// # Examples /// /// ```rust,ignore -/// let chars: Vec = "abc123".chars().collect(); -/// let end = scan_while(&chars, 0, char::is_alphabetic); +/// let text = "abc123"; +/// let end = scan_while(text, 0, char::is_alphabetic); /// assert_eq!(end, 3); /// ``` -fn scan_while(chars: &[char], mut i: usize, mut cond: F) -> usize +fn scan_while(text: &str, start: usize, mut cond: F) -> usize where F: FnMut(char) -> bool, { - while i < chars.len() && cond(chars[i]) { - i += 1; + let mut idx = start; + for ch in text[start..].chars() { + if !cond(ch) { + break; + } + idx += ch.len_utf8(); } - i + idx } /// Collect a range of characters into a [`String`]. @@ -30,33 +35,17 @@ where /// # Examples /// /// ```rust,ignore -/// let chars: Vec = ['a', 'b', 'c']; -/// assert_eq!(collect_range(&chars, 0, 2), "ab"); +/// let text = "abc"; +/// assert_eq!(collect_range(text, 0, 2), "ab"); /// ``` -fn collect_range(chars: &[char], start: usize, end: usize) -> String { - chars[start..end].iter().collect() +fn collect_range(text: &str, start: usize, end: usize) -> String { + text[start..end].to_string() } -const BACKSLASH: char = '\\'; const BACKSLASH_BYTE: u8 = b'\\'; -/// Check if a character at the given index is preceded by an odd number of backslashes. -/// -/// An odd number of preceding backslashes means the character is escaped. 
-fn has_odd_backslash_escape(chars: &[char], mut idx: usize) -> bool { - let mut count = 0; - while idx > 0 { - idx -= 1; - if chars[idx] == BACKSLASH { - count += 1; - } else { - break; - } - } - count % 2 == 1 -} - -/// Check if a byte at the given index is preceded by an odd number of backslashes. +/// Check if a byte at the given index is preceded by an odd number of +/// backslashes. /// /// An odd number of preceding backslashes means the byte is escaped. fn has_odd_backslash_escape_bytes(bytes: &[u8], mut idx: usize) -> bool { @@ -98,35 +87,54 @@ pub enum Token<'a> { /// # Examples /// /// ```rust,ignore -/// let chars: Vec = "![alt](a(b)c)".chars().collect(); -/// let (tok, idx) = parse_link_or_image(&chars, 0); +/// let text = "![alt](a(b)c)"; +/// let (tok, idx) = parse_link_or_image(text, 0); /// assert_eq!(tok, "![alt](a(b)c)"); -/// assert_eq!(idx, chars.len()); +/// assert_eq!(idx, text.len()); /// ``` -fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { - let start = i; - if chars[i] == '!' { - i += 1; +fn parse_link_or_image(text: &str, mut idx: usize) -> (String, usize) { + let start = idx; + + if text[idx..].starts_with('!') { + idx += '!'.len_utf8(); } - i += 1; // skip initial '[' which we know is present - i = scan_while(chars, i, |c| c != ']'); - if i < chars.len() && chars[i] == ']' { - i += 1; - if i < chars.len() && chars[i] == '(' { - i += 1; + + if !text[idx..].starts_with('[') { + let next = text[start..] 
+ .chars() + .next() + .map_or(text.len(), |ch| start + ch.len_utf8()); + return (collect_range(text, start, next), next); + } + + idx += '['.len_utf8(); + idx = scan_while(text, idx, |c| c != ']'); + if idx < text.len() && text[idx..].starts_with(']') { + idx += ']'.len_utf8(); + if idx < text.len() && text[idx..].starts_with('(') { + idx += '('.len_utf8(); let mut depth = 1; - while i < chars.len() && depth > 0 { - match chars[i] { - '(' => depth += 1, - ')' => depth -= 1, - _ => {} + while idx < text.len() && depth > 0 { + if let Some(ch) = text[idx..].chars().next() { + idx += ch.len_utf8(); + match ch { + '(' => depth += 1, + ')' => depth -= 1, + _ => {} + } + } else { + break; } - i += 1; } - return (collect_range(chars, start, i), i); + return (collect_range(text, start, idx), idx); } } - (collect_range(chars, start, start + 1), start + 1) + + let next = text[start..] + .chars() + .next() + .map_or(text.len(), |ch| start + ch.len_utf8()); + (collect_range(text, start, next), next) } /// Determine whether a character is considered trailing punctuation. @@ -148,6 +156,28 @@ fn is_trailing_punctuation(c: char) -> bool { ) } +fn handle_backtick_fence(text: &str, bytes: &[u8], start_idx: usize) -> (String, usize) { + let start = start_idx; + let fence_end = scan_while(text, start_idx, |ch| ch == '`'); + let fence_len = fence_end - start; + let mut end = fence_end; + + while end < text.len() { + let candidate_end = scan_while(text, end, |ch| ch == '`'); + if candidate_end - end == fence_len && !has_odd_backslash_escape_bytes(bytes, end) { + return (collect_range(text, start, candidate_end), candidate_end); + } + + if let Some(next) = text[end..].chars().next() { + end += next.len_utf8(); + } else { + break; + } + } + + (collect_range(text, start, fence_end), fence_end) +} + /// Break a single line of text into inline token strings. 
/// /// Code spans, links, images and surrounding whitespace are preserved as @@ -172,61 +202,50 @@ fn is_trailing_punctuation(c: char) -> bool { /// ``` pub(super) fn segment_inline(text: &str) -> Vec { let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); + let bytes = text.as_bytes(); let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { + while i < text.len() { + let Some(ch) = text[i..].chars().next() else { + break; + }; + if ch.is_whitespace() { let start = i; - i = scan_while(&chars, i, char::is_whitespace); - tokens.push(collect_range(&chars, start, i)); - } else if c == '`' { - if has_odd_backslash_escape(&chars, i) { + i = scan_while(text, i, char::is_whitespace); + tokens.push(collect_range(text, start, i)); + continue; + } else if ch == '`' { + if has_odd_backslash_escape_bytes(bytes, i) { if let Some(last) = tokens.last_mut() { last.push('`'); } else { tokens.push(String::from("`")); } - i += 1; + i += ch.len_utf8(); continue; } - let start = i; - let fence_end = scan_while(&chars, i, |ch| ch == '`'); - let fence_len = fence_end - start; - i = fence_end; - - let mut end = i; - let mut closing = None; - while end < chars.len() { - let j = scan_while(&chars, end, |ch| ch == '`'); - if j - end == fence_len && !has_odd_backslash_escape(&chars, end) { - closing = Some(j); - break; - } - end += 1; - } + let (token, new_i) = handle_backtick_fence(text, bytes, i); + tokens.push(token); + i = new_i; + continue; + } - if let Some(end_idx) = closing { - tokens.push(collect_range(&chars, start, end_idx)); - i = end_idx; - } else { - tokens.push(collect_range(&chars, start, start + fence_len)); - i = start + fence_len; - } - } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, mut new_i) = parse_link_or_image(&chars, i); + let after_bang = i + ch.len_utf8(); + let looks_like_image = + ch == '!' 
&& after_bang <= text.len() && text[after_bang..].starts_with('['); + if ch == '[' || looks_like_image { + let (tok, mut new_i) = parse_link_or_image(text, i); tokens.push(tok); let punct_start = new_i; - new_i = scan_while(&chars, new_i, is_trailing_punctuation); + new_i = scan_while(text, new_i, is_trailing_punctuation); if new_i > punct_start { - tokens.push(collect_range(&chars, punct_start, new_i)); + tokens.push(collect_range(text, punct_start, new_i)); } i = new_i; } else { let start = i; - i = scan_while(&chars, i, |ch| !ch.is_whitespace() && ch != '`'); - tokens.push(collect_range(&chars, start, i)); + i = scan_while(text, i, |ch| !ch.is_whitespace() && ch != '`'); + tokens.push(collect_range(text, start, i)); } } tokens @@ -396,6 +415,60 @@ pub fn tokenize_markdown(source: &str) -> Vec> { mod tests { use super::*; + #[test] + fn scan_while_respects_predicate_boundaries() { + let text = "abc123"; + assert_eq!(scan_while(text, 0, char::is_alphabetic), 3); + assert_eq!(scan_while(text, 3, char::is_numeric), text.len()); + } + + #[test] + fn scan_while_advances_over_multibyte_characters() { + let text = "åßç123"; + let idx = scan_while(text, 0, char::is_alphabetic); + assert_eq!(&text[..idx], "åßç"); + } + + #[test] + fn collect_range_extracts_multibyte_segments() { + let text = "αβγδε"; + let first_two = "αβ".len(); + let middle = first_two + "γδ".len(); + assert_eq!(collect_range(text, 0, first_two), "αβ"); + assert_eq!(collect_range(text, first_two, middle), "γδ"); + } + + #[test] + fn parse_link_or_image_handles_nested_parentheses() { + let text = "![alt](path(a(b)c)) more"; + let (token, idx) = parse_link_or_image(text, 0); + assert_eq!(token, "![alt](path(a(b)c))"); + assert_eq!(idx, token.len()); + } + + #[test] + fn parse_link_or_image_falls_back_on_malformed_input() { + let text = "[broken"; + let (token, idx) = parse_link_or_image(text, 0); + assert_eq!(token, "["); + assert_eq!(idx, "[".len()); + } + + #[test] + fn 
segment_inline_handles_multibyte_tokens() { + let tokens = segment_inline("ßß `λ` фин"); + assert_eq!( + tokens, + vec![ + String::from("ßß"), + String::from(" "), + String::from("`λ`"), + String::from(" "), + String::from("фин"), + ] + ); + } + #[test] fn link_with_trailing_punctuation() { let tokens = segment_inline("see [link](url)."); diff --git a/tests/wrap/tokenize_markdown.rs b/tests/wrap/tokenize_markdown.rs index 4e41a826..21a2850c 100644 --- a/tests/wrap/tokenize_markdown.rs +++ b/tests/wrap/tokenize_markdown.rs @@ -77,3 +77,20 @@ fn multiple_unmatched_backticks_are_text() { ); } +#[test] +fn multibyte_characters_round_trip() { + let source = "ßß `λ` fin"; + let tokens = wrap::tokenize_markdown(source); + assert_eq!( + tokens, + vec![ + Token::Text("ßß "), + Token::Code { + raw: "`λ`", + fence: "`", + code: "λ", + }, + Token::Text(" fin"), + ] + ); +}