From 46e84a0e7a7578a36b589c1fc0ca705c596b1f64 Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 14 Jun 2025 00:18:01 +0100 Subject: [PATCH 1/4] Add process_stream test for multiple tables --- Cargo.lock | 260 +++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + README.md | 1 + docs/html-table-support.md | 10 ++ src/lib.rs | 124 ++++++++++++++++++ tests/integration.rs | 39 ++++++ 6 files changed, 436 insertions(+) create mode 100644 docs/html-table-support.md diff --git a/Cargo.lock b/Cargo.lock index 3677d533..72627ddf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,6 +186,16 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -305,6 +315,18 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "html5ever" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953cbbe631aae7fc0a112702ad5d3aaf09da38beaf45ea84610d6e1c358f569c" +dependencies = [ + "log", + "mac", + "markup5ever", + "match_token", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -323,6 +345,62 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "lock_api" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a8096766c229e8c88a3900c9b44b7e06aa7f7343cc229158c3e58ef8f9973a" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.5.3-unofficial" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "853740b93240b82f68a23d8b296b2d19fc81521c298fcae44bf34bed6e445f00" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "mdtablefix" version = "0.1.0" @@ -330,6 +408,8 @@ dependencies = [ "anyhow", "assert_cmd", "clap", + "html5ever", + "markup5ever_rcdom", "regex", "rstest", "tempfile", @@ -341,6 +421,12 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "once_cell" version = "1.21.3" @@ -353,6 +439,67 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "parking_lot" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -365,6 +512,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "predicates" version = "3.1.3" @@ -416,6 +569,30 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "redox_syscall" +version = "0.5.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.11.1" @@ -502,6 +679,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.26" @@ -528,6 +711,12 @@ dependencies = [ "syn", ] +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -537,6 +726,37 @@ dependencies = [ "autocfg", ] +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -567,6 +787,17 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termtree" version = "0.5.1" @@ -579,6 +810,12 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8parse" version = "0.2.2" @@ -603,6 +840,18 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -684,3 +933,14 @@ checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ "bitflags", ] + +[[package]] +name = "xml5ever" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a91563ba5a5ab749488164063f1317e327ca1daa80f00e5bd1e670ad0d78154" +dependencies = [ + "log", + "mac", + "markup5ever", +] diff --git a/Cargo.toml b/Cargo.toml index a4cca3c9..ea79d507 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,8 @@ edition = "2024" anyhow = "1" clap = { version = "4", features = ["derive"] } regex = "1" +html5ever = "0.31" +markup5ever_rcdom = "0.5.3-unofficial" [dev-dependencies] diff --git a/README.md b/README.md index f0a05fb5..e6f20e96 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,4 @@ This is a generated project using [Copier](https://copier.readthedocs.io/). A tool to reflow Markdown tables while preserving other document content. +It also converts simple HTML `` elements to Markdown before reflowing. diff --git a/docs/html-table-support.md b/docs/html-table-support.md new file mode 100644 index 00000000..b3c31ec6 --- /dev/null +++ b/docs/html-table-support.md @@ -0,0 +1,10 @@ +# HTML Table Support + +`mdtablefix` uses the `html5ever` parser to recognise simple `
` elements +embedded in Markdown documents. These tables are converted to Markdown before the +normal reflow logic runs so that Markdown and HTML tables are formatted +consistently. + +The crate `markup5ever_rcdom` provides a minimal DOM which `html5ever` populates +and which is traversed to extract rows and cells. Only basic tables containing +``, `
` and `` elements are supported. diff --git a/src/lib.rs b/src/lib.rs index 6376953a..39beeeef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,8 @@ //! //! Functions here reflow tables that were broken during formatting. +use html5ever::{parse_document, tendril::TendrilSink}; +use markup5ever_rcdom::{Handle, NodeData, RcDom}; use regex::Regex; use std::fs; use std::path::Path; @@ -85,6 +87,92 @@ fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec .collect() } +fn node_text(handle: &Handle) -> String { + let mut text = String::new(); + collect_text(handle, &mut text); + text.trim().to_string() +} + +fn collect_text(handle: &Handle, out: &mut String) { + match &handle.data { + NodeData::Text { contents } => out.push_str(&contents.borrow()), + NodeData::Element { .. } | NodeData::Document => { + for child in handle.children.borrow().iter() { + collect_text(child, out); + } + } + _ => {} + } +} + +fn find_table(handle: &Handle) -> Option { + if let NodeData::Element { name, .. } = &handle.data { + if name.local.as_ref() == "table" { + return Some(handle.clone()); + } + } + for child in handle.children.borrow().iter() { + if let Some(t) = find_table(child) { + return Some(t); + } + } + None +} + +fn collect_rows(handle: &Handle, rows: &mut Vec) { + if let NodeData::Element { name, .. } = &handle.data { + if name.local.as_ref() == "tr" { + rows.push(handle.clone()); + } + } + for child in handle.children.borrow().iter() { + collect_rows(child, rows); + } +} + +use html5ever::driver::ParseOpts; + +fn html_table_to_markdown(lines: &[String]) -> Vec { + let html = lines.join("\n"); + let opts = ParseOpts::default(); + let dom: RcDom = parse_document(RcDom::default(), opts).one(html); + let Some(table) = find_table(&dom.document) else { + return lines.to_vec(); + }; + let mut row_handles = Vec::new(); + collect_rows(&table, &mut row_handles); + if row_handles.is_empty() { + return lines.to_vec(); + } + let mut out = Vec::new(); + let mut first_header = false; + let mut col_count = 0; + for (i, row) in row_handles.iter().enumerate() { + let mut cells = Vec::new(); + let mut header_row = false; + for child in row.children.borrow().iter() { + if let NodeData::Element { name, .. } = &child.data { + if name.local.as_ref() == "td" || name.local.as_ref() == "th" { + if name.local.as_ref() == "th" { + header_row = true; + } + cells.push(node_text(child)); + } + } + } + if i == 0 { + first_header = header_row; + col_count = cells.len(); + } + out.push(format!("| {} |", cells.join(" | "))); + } + if first_header { + let sep: Vec = (0..col_count).map(|_| "---".to_string()).collect(); + out.insert(1, format!("| {} |", sep.join(" | "))); + } + reflow_table(&out) +} + /// Reflow a broken markdown table. /// /// # Panics @@ -243,8 +331,10 @@ static FENCE_RE: std::sync::LazyLock = pub fn process_stream(lines: &[String]) -> Vec { let mut out = Vec::new(); let mut buf = Vec::new(); + let mut html_buf = Vec::new(); let mut in_code = false; let mut in_table = false; + let mut in_html = false; for line in lines { if FENCE_RE.is_match(line) { @@ -266,6 +356,36 @@ pub fn process_stream(lines: &[String]) -> Vec { continue; } + if in_html { + html_buf.push(line.trim_end().to_string()); + if line.contains("
") { + out.extend(html_table_to_markdown(&html_buf)); + html_buf.clear(); + in_html = false; + } + continue; + } + + if line.trim_start().starts_with("") { + out.extend(html_table_to_markdown(&html_buf)); + html_buf.clear(); + in_html = false; + } + continue; + } + if line.trim_start().starts_with('|') { if !in_table { in_table = true; @@ -294,6 +414,10 @@ pub fn process_stream(lines: &[String]) -> Vec { } } + if !html_buf.is_empty() { + out.extend(html_table_to_markdown(&html_buf)); + } + out } diff --git a/tests/integration.rs b/tests/integration.rs index 245a20df..aec2f91f 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -63,6 +63,27 @@ fn indented_table() -> Vec { ] } +#[fixture] +fn html_table() -> Vec { + vec![ + "".to_string(), + "".to_string(), + "".to_string(), + "
AB
12
".to_string(), + ] +} + +#[fixture] +fn multiple_tables() -> Vec { + vec![ + "| A | B |".to_string(), + "| 1 | 22 |".to_string(), + String::new(), + "| X | Y |".to_string(), + "| 3 | 4 |".to_string(), + ] +} + #[rstest] /// Tests that `reflow_table` correctly restructures a broken Markdown table into a well-formed table. /// @@ -108,6 +129,24 @@ fn test_reflow_preserves_indentation(indented_table: Vec) { assert_eq!(reflow_table(&indented_table), expected); } +#[rstest] +fn test_process_stream_html_table(html_table: Vec) { + let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"]; + assert_eq!(process_stream(&html_table), expected); +} + +#[rstest] +fn test_process_stream_multiple_tables(multiple_tables: Vec) { + let expected = vec![ + "| A | B |".to_string(), + "| 1 | 22 |".to_string(), + String::new(), + "| X | Y |".to_string(), + "| 3 | 4 |".to_string(), + ]; + assert_eq!(process_stream(&multiple_tables), expected); +} + /// Tests that `process_stream` leaves lines inside code fences unchanged. /// /// Verifies that both backtick (```) and tilde (~~~) fenced code blocks are ignored by the table processing logic, ensuring their contents are not altered. From 252c97306505654ab8ddef0f3a84884fc7aded1c Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 14 Jun 2025 01:38:50 +0100 Subject: [PATCH 2/4] Handle nested HTML tables --- src/lib.rs | 88 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 39beeeef..33641e2e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,14 +88,21 @@ fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec } fn node_text(handle: &Handle) -> String { - let mut text = String::new(); - collect_text(handle, &mut text); - text.trim().to_string() + let mut parts = Vec::new(); + collect_text(handle, &mut parts); + parts + .into_iter() + .filter(|s| !s.trim().is_empty()) + .collect::>() + .join(" ") + .split_whitespace() + .collect::>() + .join(" ") } -fn collect_text(handle: &Handle, out: &mut String) { +fn collect_text(handle: &Handle, out: &mut Vec) { match &handle.data { - NodeData::Text { contents } => out.push_str(&contents.borrow()), + NodeData::Text { contents } => out.push(contents.borrow().to_string()), NodeData::Element { .. } | NodeData::Document => { for child in handle.children.borrow().iter() { collect_text(child, out); @@ -105,18 +112,15 @@ fn collect_text(handle: &Handle, out: &mut String) { } } -fn find_table(handle: &Handle) -> Option { +fn collect_tables(handle: &Handle, tables: &mut Vec) { if let NodeData::Element { name, .. } = &handle.data { if name.local.as_ref() == "table" { - return Some(handle.clone()); + tables.push(handle.clone()); } } for child in handle.children.borrow().iter() { - if let Some(t) = find_table(child) { - return Some(t); - } + collect_tables(child, tables); } - None } fn collect_rows(handle: &Handle, rows: &mut Vec) { @@ -132,18 +136,13 @@ fn collect_rows(handle: &Handle, rows: &mut Vec) { use html5ever::driver::ParseOpts; -fn html_table_to_markdown(lines: &[String]) -> Vec { - let html = lines.join("\n"); - let opts = ParseOpts::default(); - let dom: RcDom = parse_document(RcDom::default(), opts).one(html); - let Some(table) = find_table(&dom.document) else { - return lines.to_vec(); - }; +fn table_node_to_markdown(table: &Handle) -> Vec { let mut row_handles = Vec::new(); - collect_rows(&table, &mut row_handles); + collect_rows(table, &mut row_handles); if row_handles.is_empty() { - return lines.to_vec(); + return Vec::new(); } + let mut out = Vec::new(); let mut first_header = false; let mut col_count = 0; @@ -173,6 +172,34 @@ fn html_table_to_markdown(lines: &[String]) -> Vec { reflow_table(&out) } +fn html_table_to_markdown(lines: &[String]) -> Vec { + let indent: String = lines + .first() + .map(|l| l.chars().take_while(|c| c.is_whitespace()).collect()) + .unwrap_or_default(); + let html: String = lines + .iter() + .map(|l| l.trim_end()) + .collect::>() + .join("\n"); + let opts = ParseOpts::default(); + let dom: RcDom = parse_document(RcDom::default(), opts).one(html); + + let mut tables = Vec::new(); + collect_tables(&dom.document, &mut tables); + if tables.is_empty() { + return lines.to_vec(); + } + + let mut out = Vec::new(); + for table in tables { + for line in table_node_to_markdown(&table) { + out.push(format!("{indent}{line}")); + } + } + out +} + /// Reflow a broken markdown table. /// /// # Panics @@ -332,6 +359,7 @@ pub fn process_stream(lines: &[String]) -> Vec { let mut out = Vec::new(); let mut buf = Vec::new(); let mut html_buf = Vec::new(); + let mut html_depth = 0usize; let mut in_code = false; let mut in_table = false; let mut in_html = false; @@ -358,10 +386,14 @@ pub fn process_stream(lines: &[String]) -> Vec { if in_html { html_buf.push(line.trim_end().to_string()); + html_depth += line.matches("") { - out.extend(html_table_to_markdown(&html_buf)); - html_buf.clear(); - in_html = false; + html_depth = html_depth.saturating_sub(line.matches("").count()); + if html_depth == 0 { + out.extend(html_table_to_markdown(&html_buf)); + html_buf.clear(); + in_html = false; + } } continue; } @@ -378,10 +410,14 @@ pub fn process_stream(lines: &[String]) -> Vec { } in_html = true; html_buf.push(line.trim_end().to_string()); + html_depth = line.matches("") { - out.extend(html_table_to_markdown(&html_buf)); - html_buf.clear(); - in_html = false; + html_depth = html_depth.saturating_sub(line.matches("").count()); + if html_depth == 0 { + out.extend(html_table_to_markdown(&html_buf)); + html_buf.clear(); + in_html = false; + } } continue; } From f4de0a88d04e05834720f80d0fe66e604edc191d Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 14 Jun 2025 02:02:50 +0100 Subject: [PATCH 3/4] Filter non-content HTML nodes --- src/lib.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 33641e2e..833575f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -103,7 +103,21 @@ fn node_text(handle: &Handle) -> String { fn collect_text(handle: &Handle, out: &mut Vec) { match &handle.data { NodeData::Text { contents } => out.push(contents.borrow().to_string()), - NodeData::Element { .. } | NodeData::Document => { + NodeData::Element { name, .. } => { + let tag = name.local.as_ref(); + if tag.eq_ignore_ascii_case("script") + || tag.eq_ignore_ascii_case("style") + || tag.eq_ignore_ascii_case("noscript") + || tag.eq_ignore_ascii_case("template") + || tag.eq_ignore_ascii_case("head") + { + return; + } + for child in handle.children.borrow().iter() { + collect_text(child, out); + } + } + NodeData::Document => { for child in handle.children.borrow().iter() { collect_text(child, out); } From 2e7ab79dab42c5362b7a812b023e165d8db6d7c8 Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 14 Jun 2025 02:14:12 +0100 Subject: [PATCH 4/4] Simplify node text parsing --- Cargo.lock | 52 +++++++++++++++++----------------------------------- Cargo.toml | 4 ++-- src/lib.rs | 45 ++++++++++++++++++++++----------------------- 3 files changed, 41 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72627ddf..936a8d0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,14 +317,16 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "html5ever" -version = "0.31.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953cbbe631aae7fc0a112702ad5d3aaf09da38beaf45ea84610d6e1c358f569c" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" dependencies = [ "log", "mac", "markup5ever", - "match_token", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -335,9 +337,9 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "libc" -version = "0.2.172" +version = "0.2.173" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "d8cfeafaffdbc32176b64fb251369d52ea9f0a8fbc6f8759edffef7b525d64bb" [[package]] name = "linux-raw-sys" @@ -369,20 +371,23 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.16.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a8096766c229e8c88a3900c9b44b7e06aa7f7343cc229158c3e58ef8f9973a" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" dependencies = [ "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", "tendril", - "web_atoms", ] [[package]] name = "markup5ever_rcdom" -version = "0.5.3-unofficial" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "853740b93240b82f68a23d8b296b2d19fc81521c298fcae44bf34bed6e445f00" +checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18" dependencies = [ "html5ever", "markup5ever", @@ -390,17 +395,6 @@ dependencies = [ "xml5ever", ] -[[package]] -name = "match_token" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "mdtablefix" version = "0.1.0" @@ -840,18 +834,6 @@ dependencies = [ "wit-bindgen-rt", ] -[[package]] -name = "web_atoms" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" -dependencies = [ - "phf", - "phf_codegen", - "string_cache", - "string_cache_codegen", -] - [[package]] name = "windows-sys" version = "0.59.0" @@ -936,9 +918,9 @@ dependencies = [ [[package]] name = "xml5ever" -version = "0.22.1" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a91563ba5a5ab749488164063f1317e327ca1daa80f00e5bd1e670ad0d78154" +checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69" dependencies = [ "log", "mac", diff --git a/Cargo.toml b/Cargo.toml index ea79d507..ac0f5d6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,8 @@ edition = "2024" anyhow = "1" clap = { version = "4", features = ["derive"] } regex = "1" -html5ever = "0.31" -markup5ever_rcdom = "0.5.3-unofficial" +html5ever = "0.27" +markup5ever_rcdom = "0.3" [dev-dependencies] diff --git a/src/lib.rs b/src/lib.rs index 833575f5..e49cda51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,9 +91,6 @@ fn node_text(handle: &Handle) -> String { let mut parts = Vec::new(); collect_text(handle, &mut parts); parts - .into_iter() - .filter(|s| !s.trim().is_empty()) - .collect::>() .join(" ") .split_whitespace() .collect::>() @@ -214,6 +211,25 @@ fn html_table_to_markdown(lines: &[String]) -> Vec { out } +fn push_html_line( + line: &str, + html_buf: &mut Vec, + html_depth: &mut usize, + in_html: &mut bool, + out: &mut Vec, +) { + html_buf.push(line.trim_end().to_string()); + *html_depth += line.matches("") { + *html_depth = html_depth.saturating_sub(line.matches("").count()); + if *html_depth == 0 { + out.extend(html_table_to_markdown(html_buf)); + html_buf.clear(); + *in_html = false; + } + } +} + /// Reflow a broken markdown table. /// /// # Panics @@ -399,16 +415,7 @@ pub fn process_stream(lines: &[String]) -> Vec { } if in_html { - html_buf.push(line.trim_end().to_string()); - html_depth += line.matches("") { - html_depth = html_depth.saturating_sub(line.matches("").count()); - if html_depth == 0 { - out.extend(html_table_to_markdown(&html_buf)); - html_buf.clear(); - in_html = false; - } - } + push_html_line(line, &mut html_buf, &mut html_depth, &mut in_html, &mut out); continue; } @@ -423,16 +430,8 @@ pub fn process_stream(lines: &[String]) -> Vec { in_table = false; } in_html = true; - html_buf.push(line.trim_end().to_string()); - html_depth = line.matches("") { - html_depth = html_depth.saturating_sub(line.matches("").count()); - if html_depth == 0 { - out.extend(html_table_to_markdown(&html_buf)); - html_buf.clear(); - in_html = false; - } - } + html_depth = 0; + push_html_line(line, &mut html_buf, &mut html_depth, &mut in_html, &mut out); continue; }