From af32d7ecbfbfc877369fe4e72512c91fab299d82 Mon Sep 17 00:00:00 2001 From: Leynos Date: Thu, 11 Sep 2025 13:07:09 +0100 Subject: [PATCH 1/5] Preserve trailing spaces on final flush --- src/wrap.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index 48b21563..264aaba0 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -156,11 +156,9 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { continue; } - let trimmed = current.trim_end(); - if !trimmed.is_empty() { - lines.push(trimmed.to_string()); + if !current.is_empty() { + lines.push(std::mem::take(&mut current)); } - current.clear(); current_width = 0; last_split = None; From 7a72eb348213bc31bad8a3279942d1f2ca73efe3 Mon Sep 17 00:00:00 2001 From: Leynos Date: Fri, 12 Sep 2025 08:23:56 +0100 Subject: [PATCH 2/5] Reuse buffer capacity after wrap flush --- src/wrap.rs | 3 +++ src/wrap/tests.rs | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index 264aaba0..a367ea28 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -157,7 +157,10 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { } if !current.is_empty() { + // Reuse allocation to avoid repeated growth on long wraps. + let prev_capacity = current.capacity(); lines.push(std::mem::take(&mut current)); + current = String::with_capacity(prev_capacity); } current_width = 0; last_split = None; diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 8c10c636..8fd44c85 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -113,23 +113,27 @@ fn wrap_text_preserves_links() { } #[rstest] -#[case("ends with space ", 80, &["ends with space "])] -#[case("four spaces ", 80, &["four spaces "])] -#[case(" ", 80, &[" "])] -#[case("word1 word2 ", 8, &["word1", "word2 "])] -fn wrap_preserving_code_keeps_trailing_spaces( +#[case("trail ", 80, &["trail "])] +#[case("`code span` ", 13, &["`code span` "])] +fn preserves_trailing_spaces(#[case] input: &str, #[case] width: usize, #[case] expected: &[&str]) { + let out = super::wrap_preserving_code(input, width); + assert_eq!( + out, + expected.iter().map(|&s| s.to_string()).collect::>() + ); +} + +#[rstest] +#[case("aaaaaaaaaaaa", 5, &["aaaaaaaaaaaa"])] // forced flush without split +fn no_split_forced_flush_no_trim( #[case] input: &str, #[case] width: usize, #[case] expected: &[&str], ) { - // The final flush must not trim trailing spaces, even after wrapping. - let lines = super::wrap_preserving_code(input, width); + let out = super::wrap_preserving_code(input, width); assert_eq!( - lines, - expected - .iter() - .map(ToString::to_string) - .collect::>() + out, + expected.iter().map(|&s| s.to_string()).collect::>() ); } From 3df0c18e8a7d7603965225cf0b517686c8dba446 Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 13 Sep 2025 23:14:24 +0100 Subject: [PATCH 3/5] Handle trailing spaces on forced flush --- src/wrap.rs | 24 +++++++++++++++++------- src/wrap/tests.rs | 7 ++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index a367ea28..3dcc3a07 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -71,7 +71,6 @@ fn is_trailing_punct(c: char) -> bool { fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usize { use unicode_width::UnicodeWidthStr; - while j < tokens.len() && tokens[j].chars().all(is_trailing_punct) { *width += UnicodeWidthStr::width(tokens[j].as_str()); j += 1; @@ -79,9 +78,15 @@ fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usi j } +fn flush_trailing_whitespace(lines: &mut Vec, current: &mut String, token: &str) { + let prev_capacity = current.capacity(); + current.push_str(token); + lines.push(std::mem::take(current)); + *current = String::with_capacity(prev_capacity); +} + fn wrap_preserving_code(text: &str, width: usize) -> Vec { use unicode_width::UnicodeWidthStr; - let mut lines = Vec::new(); let mut current = String::new(); let mut current_width = 0; @@ -91,11 +96,9 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { while i < tokens.len() { let mut j = i + 1; let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); - if tokens[i].contains("](") && tokens[i].ends_with(')') { j = extend_punctuation(&tokens, j, &mut group_width); } - if tokens[i].starts_with('`') && tokens[i].ends_with('`') { // Keep trailing punctuation glued to inline code spans. j = extend_punctuation(&tokens, j, &mut group_width); @@ -115,7 +118,6 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { i += 1; continue; } - if current_width + group_width <= width { for tok in &tokens[i..j] { current.push_str(tok); @@ -155,7 +157,16 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { i = j; continue; } - + if tokens[i].chars().all(char::is_whitespace) && j == tokens.len() { + // Preserve trailing spaces that forced a flush. + if !current.is_empty() { + flush_trailing_whitespace(&mut lines, &mut current, &tokens[i]); + } + current_width = 0; + last_split = None; + i = j; + continue; + } if !current.is_empty() { // Reuse allocation to avoid repeated growth on long wraps. let prev_capacity = current.capacity(); @@ -219,7 +230,6 @@ fn append_wrapped_with_prefix( repeat_prefix: bool, ) { use unicode_width::UnicodeWidthStr; - let prefix_width = UnicodeWidthStr::width(prefix); let available = width.saturating_sub(prefix_width).max(1); let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 8fd44c85..1c2859a1 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -6,6 +6,7 @@ use rstest::rstest; use super::super::*; +use super::wrap_preserving_code; #[test] fn wrap_text_preserves_hyphenated_words() { @@ -114,9 +115,9 @@ fn wrap_text_preserves_links() { #[rstest] #[case("trail ", 80, &["trail "])] -#[case("`code span` ", 13, &["`code span` "])] +#[case("`code span` ", 12, &["`code span` "])] fn preserves_trailing_spaces(#[case] input: &str, #[case] width: usize, #[case] expected: &[&str]) { - let out = super::wrap_preserving_code(input, width); + let out = wrap_preserving_code(input, width); assert_eq!( out, expected.iter().map(|&s| s.to_string()).collect::>() @@ -130,7 +131,7 @@ fn no_split_forced_flush_no_trim( #[case] width: usize, #[case] expected: &[&str], ) { - let out = super::wrap_preserving_code(input, width); + let out = wrap_preserving_code(input, width); assert_eq!( out, expected.iter().map(|&s| s.to_string()).collect::>() From a09c9cc3b0dadd0a226c7f294945aaff66ab361e Mon Sep 17 00:00:00 2001 From: Leynos Date: Sun, 14 Sep 2025 17:24:38 +0100 Subject: [PATCH 4/5] Handle over-width code spans without splitting --- src/wrap.rs | 38 ++++++++++++++++++++++++++++++++------ src/wrap/tests.rs | 4 ++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/wrap.rs b/src/wrap.rs index 3dcc3a07..29c4f340 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -78,11 +78,35 @@ fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usi j } +fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { + use unicode_width::UnicodeWidthStr; + let mut j = i + 1; + while j < tokens.len() && tokens[j] != "`" { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + if j < tokens.len() { + *width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + j = extend_punctuation(tokens, j, width); + } + j +} + +#[inline] +fn flush_current(lines: &mut Vec, current: &mut String) { + let cap = current.capacity(); + lines.push(std::mem::take(current)); + *current = String::with_capacity(cap); +} + fn flush_trailing_whitespace(lines: &mut Vec, current: &mut String, token: &str) { - let prev_capacity = current.capacity(); + debug_assert!( + token.chars().all(char::is_whitespace), + "expected whitespace token" + ); current.push_str(token); - lines.push(std::mem::take(current)); - *current = String::with_capacity(prev_capacity); + flush_current(lines, current); } fn wrap_preserving_code(text: &str, width: usize) -> Vec { @@ -96,6 +120,10 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { while i < tokens.len() { let mut j = i + 1; let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); + if tokens[i] == "`" { + j = merge_code_span(&tokens, i, &mut group_width); + } + if tokens[i].contains("](") && tokens[i].ends_with(')') { j = extend_punctuation(&tokens, j, &mut group_width); } @@ -169,9 +197,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { } if !current.is_empty() { // Reuse allocation to avoid repeated growth on long wraps. - let prev_capacity = current.capacity(); - lines.push(std::mem::take(&mut current)); - current = String::with_capacity(prev_capacity); + flush_current(&mut lines, &mut current); } current_width = 0; last_split = None; diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 1c2859a1..9c9ed54e 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -116,6 +116,8 @@ fn wrap_text_preserves_links() { #[rstest] #[case("trail ", 80, &["trail "])] #[case("`code span` ", 12, &["`code span` "])] +#[case("foo ", 3, &["foo "])] +#[case("x ", 1, &["x "])] fn preserves_trailing_spaces(#[case] input: &str, #[case] width: usize, #[case] expected: &[&str]) { let out = wrap_preserving_code(input, width); assert_eq!( @@ -126,6 +128,8 @@ fn preserves_trailing_spaces(#[case] input: &str, #[case] width: usize, #[case] #[rstest] #[case("aaaaaaaaaaaa", 5, &["aaaaaaaaaaaa"])] // forced flush without split +#[case("abcde", 3, &["abcde"])] +#[case("`codespan`", 6, &["`codespan`"])] fn no_split_forced_flush_no_trim( #[case] input: &str, #[case] width: usize, From 98588595b089c6c55607abdd6cdb321c94ef74fc Mon Sep 17 00:00:00 2001 From: Leynos Date: Sun, 14 Sep 2025 18:43:35 +0100 Subject: [PATCH 5/5] Preserve trailing spaces across HTML and tables --- src/html.rs | 11 +++++------ src/process.rs | 6 +++--- src/reflow.rs | 2 +- src/wrap.rs | 7 +++++++ tests/table/convert_html.rs | 15 +++++++++++++++ 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/html.rs b/src/html.rs index f495c185..d3ab29b7 100644 --- a/src/html.rs +++ b/src/html.rs @@ -29,7 +29,10 @@ fn node_text(handle: &Handle) -> String { let mut out = String::new(); let mut last_space = false; collect_text(handle, &mut out, &mut last_space); - out.trim().to_string() + if last_space { + out.push(' '); + } + out.trim_start().to_string() } fn is_ignored_tag(tag: &str) -> bool { @@ -178,11 +181,7 @@ fn table_lines_to_markdown(lines: &[String]) -> Vec { .first() .map(|l| l.chars().take_while(|c| c.is_whitespace()).collect()) .unwrap_or_default(); - let html: String = lines - .iter() - .map(|l| l.trim_end()) - .collect::>() - .join("\n"); + let html: String = lines.join("\n"); let opts = ParseOpts::default(); let dom: RcDom = parse_document(RcDom::default(), opts).one(html); diff --git a/src/process.rs b/src/process.rs index 81f6e6b7..24fd3135 100644 --- a/src/process.rs +++ b/src/process.rs @@ -88,7 +88,7 @@ fn handle_table_line( ) -> bool { if line.trim_start().starts_with('|') { *in_table = true; - buf.push(line.trim_end().to_string()); + buf.push(line.to_string()); return true; } if line.trim().is_empty() { @@ -98,7 +98,7 @@ fn handle_table_line( return false; } if *in_table && (line.contains('|') || crate::table::SEP_RE.is_match(line.trim())) { - buf.push(line.trim_end().to_string()); + buf.push(line.to_string()); return true; } if *in_table { @@ -112,7 +112,7 @@ fn handle_table_line( flush_buffer(buf, in_table, out); return false; } - buf.push(line.trim_end().to_string()); + buf.push(line.to_string()); return true; } false diff --git a/src/reflow.rs b/src/reflow.rs index b3b244ee..a67a9158 100644 --- a/src/reflow.rs +++ b/src/reflow.rs @@ -26,7 +26,7 @@ fn collect_cells(chunks: &[&str]) -> Vec { for (idx, chunk) in chunks.iter().enumerate() { let mut ch = (*chunk).to_string(); if idx != chunks.len() - 1 { - ch = ch.trim_end().to_string() + " |ROW_END|"; + ch.push_str(" |ROW_END|"); } cells.extend(split_cells(&ch)); } diff --git a/src/wrap.rs b/src/wrap.rs index 29c4f340..03b4d8a5 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -78,8 +78,13 @@ fn extend_punctuation(tokens: &[String], mut j: usize, width: &mut usize) -> usi j } +#[inline] fn merge_code_span(tokens: &[String], i: usize, width: &mut usize) -> usize { use unicode_width::UnicodeWidthStr; + debug_assert!( + tokens[i] == "`", + "merge_code_span requires a single backtick opener" + ); let mut j = i + 1; while j < tokens.len() && tokens[j] != "`" { *width += UnicodeWidthStr::width(tokens[j].as_str()); @@ -162,6 +167,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let pos = last_split.unwrap(); let line = current[..pos].to_string(); let mut rest = current[pos..].trim_start().to_string(); + // Mid-wrap lines discard trailing spaces. let trimmed = line.trim_end(); if !trimmed.is_empty() { lines.push(trimmed.to_string()); @@ -177,6 +183,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { None }; if current_width > width { + // Mid-wrap overflow flush trims trailing spaces. lines.push(current.trim_end().to_string()); current.clear(); current_width = 0; diff --git a/tests/table/convert_html.rs b/tests/table/convert_html.rs index 1ace15a5..21f57e24 100644 --- a/tests/table/convert_html.rs +++ b/tests/table/convert_html.rs @@ -62,3 +62,18 @@ fn test_convert_html_table_bold_header() { let expected: Vec = include_lines!("data/bold_header_expected.txt"); assert_eq!(convert_html_tables(&input), expected); } +#[test] +fn preserves_trailing_spaces_in_cells() { + let input = lines_vec![ + "", + "", + "", + "
H
cell
", + ]; + let expected = lines_vec![ + "| H |", + "| --- |", + "| cell |", + ]; + assert_eq!(convert_html_tables(&input), expected); +}