diff --git a/Cargo.lock b/Cargo.lock index 8d889b3e..510f7e10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -58,7 +58,7 @@ checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -85,9 +85,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "bitflags" @@ -114,9 +114,9 @@ checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "clap" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" dependencies = [ "clap_builder", "clap_derive", @@ -124,9 +124,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" dependencies = [ "anstream", "anstyle", @@ -136,9 +136,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" +checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491" dependencies = [ "heck", "proc-macro2", @@ -172,12 +172,12 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "errno" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -337,9 +337,9 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "libc" -version = "0.2.173" +version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8cfeafaffdbc32176b64fb251369d52ea9f0a8fbc6f8759edffef7b525d64bb" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" [[package]] name = "linux-raw-sys" @@ -407,7 +407,7 @@ dependencies = [ "regex", "rstest", "tempfile", - "textwrap", + "unicode-width", ] [[package]] @@ -454,7 +454,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -560,9 +560,9 @@ dependencies = [ [[package]] name = "r-efi" -version = "5.2.0" +version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "rand" @@ -671,7 +671,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -714,12 +714,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] +checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" [[package]] name = "smallvec" @@ -727,12 +724,6 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" -[[package]] -name = "smawk" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" - [[package]] name = "string_cache" version = "0.8.9" @@ -766,9 +757,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.103" +version = "2.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" dependencies = [ "proc-macro2", "quote", @@ -785,7 +776,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -805,34 +796,17 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" -[[package]] -name = "textwrap" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" -dependencies = [ - "smawk", - "unicode-linebreak", - "unicode-width", -] - [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" -[[package]] -name = "unicode-linebreak" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" - [[package]] name = "unicode-width" -version = "0.2.1" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "utf-8" @@ -870,7 +844,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", ] [[package]] @@ -879,14 +862,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -895,48 +894,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "wit-bindgen-rt" version = "0.39.0" diff --git a/Cargo.toml b/Cargo.toml index 37dff9c7..7c529e07 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ clap = { version = "4", features = ["derive"] } regex = "1" html5ever = "0.27" markup5ever_rcdom = "0.3" -textwrap = "^0.16" +unicode-width = ">=0.1, <0.2" [dev-dependencies] diff --git a/docs/unicode-width.md b/docs/unicode-width.md new file mode 100644 index 00000000..2b1da2be --- /dev/null +++ b/docs/unicode-width.md @@ -0,0 +1,9 @@ +# Unicode Width Handling + +`mdtablefix` wraps paragraphs and list items while respecting the display width of +Unicode characters. The `unicode-width` crate is used to compute the width of +strings when deciding where to break lines. This prevents emojis or other +multi-byte characters from causing unexpected wraps or truncation. + +Whenever wrapping logic examines the length of a token, it relies on +`UnicodeWidthStr::width` to measure visible columns rather than byte length. diff --git a/src/lib.rs b/src/lib.rs index bc025956..5468c4ea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,7 +17,6 @@ use std::{fs, path::Path}; pub use html::convert_html_tables; use regex::Regex; -use textwrap::{Options, WordSplitter, fill}; /// Splits a markdown table line into trimmed cell strings. /// @@ -221,15 +220,65 @@ pub fn reflow_table(lines: &[String]) -> Vec { static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap()); -static CODE_SPAN_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"(`+[^`]*`+)").unwrap()); - static BULLET_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)").unwrap()); static NUMBERED_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^(\s*)([1-9][0-9]*)\.(\s+)(.*)").unwrap()); +fn tokenize_markdown(text: &str) -> Vec { + let mut tokens = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_whitespace() { + let start = i; + while i < chars.len() && chars[i].is_whitespace() { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } else if c == '`' { + let start = i; + let mut delim_len = 0; + while i < chars.len() && chars[i] == '`' { + i += 1; + delim_len += 1; + } + let mut end = i; + while end < chars.len() { + if chars[end] == '`' { + let mut j = end; + let mut count = 0; + while j < chars.len() && chars[j] == '`' { + j += 1; + count += 1; + } + if count == delim_len { + end = j; + break; + } + } + end += 1; + } + if end >= chars.len() { + tokens.push(chars[start..start + delim_len].iter().collect()); + i = start + delim_len; + } else { + tokens.push(chars[start..end].iter().collect()); + i = end; + } + } else { + let start = i; + while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } + } + tokens +} + /// Width of a normalised thematic break. /// The width used when rewriting thematic breaks. pub const THEMATIC_BREAK_LEN: usize = 70; @@ -241,6 +290,39 @@ static THEMATIC_BREAK_RE: std::sync::LazyLock = std::sync::LazyLock::new( static THEMATIC_BREAK_LINE: std::sync::LazyLock = std::sync::LazyLock::new(|| "_".repeat(THEMATIC_BREAK_LEN)); +fn wrap_preserving_code(text: &str, width: usize) -> Vec { + use unicode_width::UnicodeWidthStr; + + let mut lines = Vec::new(); + let mut current = String::new(); + let mut current_width = 0; + for token in tokenize_markdown(text) { + let token_width = UnicodeWidthStr::width(token.as_str()); + if current_width + token_width <= width { + current.push_str(&token); + current_width += token_width; + continue; + } + + let trimmed = current.trim_end(); + if !trimmed.is_empty() { + lines.push(trimmed.to_string()); + } + current.clear(); + current_width = 0; + + if !token.chars().all(char::is_whitespace) { + current.push_str(&token); + current_width = token_width; + } + } + let trimmed = current.trim_end(); + if !trimmed.is_empty() { + lines.push(trimmed.to_string()); + } + lines +} + /// Returns `true` if the line is a fenced code block delimiter (e.g., three backticks or "~~~"). /// /// # Examples @@ -259,22 +341,6 @@ pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } /// Inline code spans are delimited by matching pairs of backticks. This helper /// replaces normal spaces inside those spans with `U+00A0` (non-breaking space) /// so that the wrapping logic does not split them across lines. -fn protect_code_span_spaces(text: &str) -> String { - CODE_SPAN_RE - .replace_all(text, |caps: ®ex::Captures| { - caps[0].replace(' ', "\u{00A0}") - }) - .into_owned() -} - -fn wrap_segment(seg: &str, indent: &str, width: usize, out: &mut Vec) { - let opts = Options::new(width - indent.len()).word_splitter(WordSplitter::NoHyphenation); - let protected = protect_code_span_spaces(seg); - for line in fill(&protected, &opts).lines() { - let restored = line.replace('\u{00A0}', " "); - out.push(format!("{indent}{restored}")); - } -} /// Flushes a buffered paragraph to the output, wrapping text to the specified width and applying /// indentation. /// @@ -293,12 +359,16 @@ fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, } segment.push_str(text); if *hard_break { - wrap_segment(&segment, indent, width, out); + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } segment.clear(); } } if !segment.is_empty() { - wrap_segment(&segment, indent, width, out); + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } } } @@ -389,9 +459,10 @@ pub fn wrap_text(lines: &[String], width: usize) -> Vec { let prefix = cap.get(1).unwrap().as_str(); let rest = cap.get(2).unwrap().as_str().trim(); let spaces = " ".repeat(prefix.len()); - let opts = - Options::new(width - prefix.len()).word_splitter(WordSplitter::NoHyphenation); - for (i, l) in fill(rest, &opts).lines().enumerate() { + for (i, l) in wrap_preserving_code(rest, width - prefix.len()) + .iter() + .enumerate() + { if i == 0 { out.push(format!("{prefix}{l}")); } else { diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 0903d971..72142b7d 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -24,4 +24,30 @@ pub fn assert_wrapped_list_item(output: &[String], prefix: &str, expected: usize for line in output.iter().skip(1) { assert!(line.starts_with(&indent)); } + + let mut open: Option = None; + for line in output { + let chars: Vec = line.chars().collect(); + let mut i = 0; + while i < chars.len() { + if chars[i] == '`' { + let mut len = 0; + while i < chars.len() && chars[i] == '`' { + len += 1; + i += 1; + } + if let Some(open_len) = open { + if open_len == len { + open = None; + } + } else { + open = Some(len); + } + } else { + i += 1; + } + } + assert!(open.is_none(), "code span split across lines"); + } + assert!(open.is_none(), "unclosed code span"); } diff --git a/tests/integration.rs b/tests/integration.rs index e03854ea..c12b6415 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -671,6 +671,7 @@ fn test_wrap_list_item() { #[case("- ", 3)] #[case("1. ", 3)] #[case("10. ", 3)] +#[case("100. ", 3)] fn test_wrap_list_items_with_inline_code(#[case] prefix: &str, #[case] expected: usize) { let input = vec![format!( "{prefix}`script`: A multi-line script declared with the YAML `|` block style. The entire \ @@ -681,6 +682,40 @@ fn test_wrap_list_items_with_inline_code(#[case] prefix: &str, #[case] expected: common::assert_wrapped_list_item(&output, prefix, expected); } +#[test] +fn test_wrap_preserves_inline_code_spans() { + let input = vec![ + "- `script`: A multi-line script declared with the YAML `|` block style. The entire block \ + is passed to an interpreter. If the first line begins with `#!`, Netsuke executes the \ + script verbatim, respecting the shebang." + .to_string(), + ]; + let output = process_stream(&input); + common::assert_wrapped_list_item(&output, "- ", 3); +} + +#[test] +fn test_wrap_multi_backtick_code() { + let input = vec![ + "- ``cmd`` executes ```echo``` output with ``json`` format and prints results to the \ + console" + .to_string(), + ]; + let output = process_stream(&input); + common::assert_wrapped_list_item(&output, "- ", 2); +} + +#[test] +fn test_wrap_multiple_inline_code_spans() { + let input = vec![ + "- Use `foo` and `bar` inside ``baz`` for testing with additional commentary to exceed \ + wrapping width" + .to_string(), + ]; + let output = process_stream(&input); + common::assert_wrapped_list_item(&output, "- ", 2); +} + #[test] /// Verifies that short list items are not wrapped or altered by the stream processing logic. ///