diff --git a/Cargo.lock b/Cargo.lock
index 3677d533..936a8d0f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -186,6 +186,16 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+[[package]]
+name = "futf"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
+dependencies = [
+ "mac",
+ "new_debug_unreachable",
+]
+
[[package]]
name = "futures"
version = "0.3.31"
@@ -305,6 +315,20 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "html5ever"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
@@ -313,9 +337,9 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "libc"
-version = "0.2.172"
+version = "0.2.173"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
+checksum = "d8cfeafaffdbc32176b64fb251369d52ea9f0a8fbc6f8759edffef7b525d64bb"
[[package]]
name = "linux-raw-sys"
@@ -323,6 +347,54 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
+[[package]]
+name = "lock_api"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
+dependencies = [
+ "autocfg",
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
+
+[[package]]
+name = "mac"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
+
+[[package]]
+name = "markup5ever"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
+dependencies = [
+ "log",
+ "phf",
+ "phf_codegen",
+ "string_cache",
+ "string_cache_codegen",
+ "tendril",
+]
+
+[[package]]
+name = "markup5ever_rcdom"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18"
+dependencies = [
+ "html5ever",
+ "markup5ever",
+ "tendril",
+ "xml5ever",
+]
+
[[package]]
name = "mdtablefix"
version = "0.1.0"
@@ -330,6 +402,8 @@ dependencies = [
"anyhow",
"assert_cmd",
"clap",
+ "html5ever",
+ "markup5ever_rcdom",
"regex",
"rstest",
"tempfile",
@@ -341,6 +415,12 @@ version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
+[[package]]
+name = "new_debug_unreachable"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
+
[[package]]
name = "once_cell"
version = "1.21.3"
@@ -353,6 +433,67 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
+[[package]]
+name = "parking_lot"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-targets",
+]
+
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+dependencies = [
+ "siphasher",
+]
+
[[package]]
name = "pin-project-lite"
version = "0.2.16"
@@ -365,6 +506,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+[[package]]
+name = "precomputed-hash"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
+
[[package]]
name = "predicates"
version = "3.1.3"
@@ -416,6 +563,30 @@ version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6"
+dependencies = [
+ "bitflags",
+]
+
[[package]]
name = "regex"
version = "1.11.1"
@@ -502,6 +673,12 @@ dependencies = [
"windows-sys",
]
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
[[package]]
name = "semver"
version = "1.0.26"
@@ -528,6 +705,12 @@ dependencies = [
"syn",
]
+[[package]]
+name = "siphasher"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
+
[[package]]
name = "slab"
version = "0.4.9"
@@ -537,6 +720,37 @@ dependencies = [
"autocfg",
]
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "string_cache"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
+dependencies = [
+ "new_debug_unreachable",
+ "parking_lot",
+ "phf_shared",
+ "precomputed-hash",
+ "serde",
+]
+
+[[package]]
+name = "string_cache_codegen"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+]
+
[[package]]
name = "strsim"
version = "0.11.1"
@@ -567,6 +781,17 @@ dependencies = [
"windows-sys",
]
+[[package]]
+name = "tendril"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
+dependencies = [
+ "futf",
+ "mac",
+ "utf-8",
+]
+
[[package]]
name = "termtree"
version = "0.5.1"
@@ -579,6 +804,12 @@ version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+
[[package]]
name = "utf8parse"
version = "0.2.2"
@@ -684,3 +915,14 @@ checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags",
]
+
+[[package]]
+name = "xml5ever"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever",
+]
diff --git a/Cargo.toml b/Cargo.toml
index a4cca3c9..ac0f5d6e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,8 @@ edition = "2024"
anyhow = "1"
clap = { version = "4", features = ["derive"] }
regex = "1"
+html5ever = "0.27"
+markup5ever_rcdom = "0.3"
[dev-dependencies]
diff --git a/README.md b/README.md
index f0a05fb5..e6f20e96 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,4 @@
This is a generated project using [Copier](https://copier.readthedocs.io/).
A tool to reflow Markdown tables while preserving other document content.
+It also converts simple HTML `<table>` elements to Markdown before reflowing.
diff --git a/docs/html-table-support.md b/docs/html-table-support.md
new file mode 100644
index 00000000..b3c31ec6
--- /dev/null
+++ b/docs/html-table-support.md
@@ -0,0 +1,10 @@
+# HTML Table Support
+
+`mdtablefix` uses the `html5ever` parser to recognise simple `<table>` elements
+embedded in Markdown documents. These tables are converted to Markdown before the
+normal reflow logic runs so that Markdown and HTML tables are formatted
+consistently.
+
+The crate `markup5ever_rcdom` provides a minimal DOM which `html5ever` populates
+and which is traversed to extract rows and cells. Only basic tables containing
+`<tr>`, `<th>` and `<td>` elements are supported.
diff --git a/src/lib.rs b/src/lib.rs
index 6376953a..e49cda51 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,8 @@
//!
//! Functions here reflow tables that were broken during formatting.
+use html5ever::{parse_document, tendril::TendrilSink};
+use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::fs;
use std::path::Path;
@@ -85,6 +87,149 @@ fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec
.collect()
}
+/// Returns the text beneath `handle` with its whitespace normalised.
+///
+/// The fragments gathered by `collect_text` are joined with single spaces,
+/// after which every run of whitespace is collapsed to one space and any
+/// leading or trailing whitespace is dropped.
+fn node_text(handle: &Handle) -> String {
+    let mut parts = Vec::new();
+    collect_text(handle, &mut parts);
+    parts
+        .join(" ")
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+/// Recursively appends the text found beneath `handle` to `out`.
+///
+/// * Text nodes contribute their contents as one entry.
+/// * `script`, `style`, `noscript`, `template` and `head` elements are
+///   skipped together with everything inside them.
+/// * Any other element, and the document node, is traversed child by child.
+/// * Every other node kind is ignored.
+fn collect_text(handle: &Handle, out: &mut Vec<String>) {
+    match &handle.data {
+        NodeData::Text { contents } => out.push(contents.borrow().to_string()),
+        NodeData::Element { name, .. } => {
+            let tag = name.local.as_ref();
+            // These elements never hold text that belongs in a table cell.
+            if tag.eq_ignore_ascii_case("script")
+                || tag.eq_ignore_ascii_case("style")
+                || tag.eq_ignore_ascii_case("noscript")
+                || tag.eq_ignore_ascii_case("template")
+                || tag.eq_ignore_ascii_case("head")
+            {
+                return;
+            }
+            for child in handle.children.borrow().iter() {
+                collect_text(child, out);
+            }
+        }
+        NodeData::Document => {
+            for child in handle.children.borrow().iter() {
+                collect_text(child, out);
+            }
+        }
+        _ => {}
+    }
+}
+
+/// Appends every `table` element at or below `handle` to `tables`.
+///
+/// The node itself is tested first and its children are visited afterwards.
+/// The walk continues into the children of a matched table, so a table nested
+/// inside another table is collected as well.
+fn collect_tables(handle: &Handle, tables: &mut Vec<Handle>) {
+    if let NodeData::Element { name, .. } = &handle.data {
+        if name.local.as_ref() == "table" {
+            tables.push(handle.clone());
+        }
+    }
+    for child in handle.children.borrow().iter() {
+        collect_tables(child, tables);
+    }
+}
+
+/// Appends every `tr` element at or below `handle` to `rows`.
+///
+/// The node itself is tested first and its children are visited afterwards,
+/// so rows are found at any depth beneath `handle`.
+fn collect_rows(handle: &Handle, rows: &mut Vec<Handle>) {
+    if let NodeData::Element { name, .. } = &handle.data {
+        if name.local.as_ref() == "tr" {
+            rows.push(handle.clone());
+        }
+    }
+    for child in handle.children.borrow().iter() {
+        collect_rows(child, rows);
+    }
+}
+
+use html5ever::driver::ParseOpts;
+
+/// Converts one parsed `table` node into Markdown table lines.
+///
+/// Rows are gathered with `collect_rows`; the direct `td` and `th` children of
+/// each row become its cells, with their text taken from `node_text`. When the
+/// first row contains at least one `th`, a `---` separator row with as many
+/// columns as that first row is inserted directly after it. The assembled
+/// lines are then passed through `reflow_table`.
+///
+/// Returns an empty vector when the table contains no rows.
+fn table_node_to_markdown(table: &Handle) -> Vec<String> {
+    let mut row_handles = Vec::new();
+    collect_rows(table, &mut row_handles);
+    if row_handles.is_empty() {
+        return Vec::new();
+    }
+
+    let mut out = Vec::new();
+    let mut first_header = false;
+    let mut col_count = 0;
+    for (i, row) in row_handles.iter().enumerate() {
+        let mut cells = Vec::new();
+        let mut header_row = false;
+        for child in row.children.borrow().iter() {
+            if let NodeData::Element { name, .. } = &child.data {
+                if name.local.as_ref() == "td" || name.local.as_ref() == "th" {
+                    if name.local.as_ref() == "th" {
+                        header_row = true;
+                    }
+                    cells.push(node_text(child));
+                }
+            }
+        }
+        // Only the first row decides whether a separator is emitted and how
+        // many columns it has.
+        if i == 0 {
+            first_header = header_row;
+            col_count = cells.len();
+        }
+        out.push(format!("| {} |", cells.join(" | ")));
+    }
+    if first_header {
+        let sep: Vec<String> = (0..col_count).map(|_| "---".to_string()).collect();
+        // `out` holds at least one line here because `row_handles` is non-empty.
+        out.insert(1, format!("| {} |", sep.join(" | ")));
+    }
+    reflow_table(&out)
+}
+
+/// Converts buffered lines of HTML into Markdown table lines.
+///
+/// The leading whitespace of the first line is remembered as the indent. The
+/// lines are right-trimmed, joined with newlines and parsed as an HTML
+/// document. Every `table` element found by `collect_tables` is rendered with
+/// `table_node_to_markdown`, and each resulting line is prefixed with the
+/// indent.
+///
+/// Returns a copy of `lines` unchanged when the parsed document contains no
+/// `table` element.
+fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
+    let indent: String = lines
+        .first()
+        .map(|l| l.chars().take_while(|c| c.is_whitespace()).collect())
+        .unwrap_or_default();
+    let html: String = lines
+        .iter()
+        .map(|l| l.trim_end())
+        .collect::<Vec<_>>()
+        .join("\n");
+    let opts = ParseOpts::default();
+    let dom: RcDom = parse_document(RcDom::default(), opts).one(html);
+
+    let mut tables = Vec::new();
+    collect_tables(&dom.document, &mut tables);
+    if tables.is_empty() {
+        return lines.to_vec();
+    }
+
+    let mut out = Vec::new();
+    for table in tables {
+        for line in table_node_to_markdown(&table) {
+            out.push(format!("{indent}{line}"));
+        }
+    }
+    out
+}
+
+/// Buffers one line of an HTML table and flushes the buffer once the table closes.
+///
+/// The right-trimmed line is appended to `html_buf`, and `html_depth` grows by
+/// the number of `<table` occurrences on the line. When the line contains
+/// `</table>`, the depth is reduced (saturating at zero) by the number of
+/// closing tags. Once the depth reaches zero, the buffer is converted with
+/// `html_table_to_markdown`, appended to `out` and cleared, and `in_html` is
+/// reset to `false`.
+fn push_html_line(
+    line: &str,
+    html_buf: &mut Vec<String>,
+    html_depth: &mut usize,
+    in_html: &mut bool,
+    out: &mut Vec<String>,
+) {
+    html_buf.push(line.trim_end().to_string());
+    *html_depth += line.matches("<table").count();
+    if line.contains("</table>") {
+        *html_depth = html_depth.saturating_sub(line.matches("</table>").count());
+        if *html_depth == 0 {
+            out.extend(html_table_to_markdown(html_buf));
+            html_buf.clear();
+            *in_html = false;
+        }
+    }
+}
+
/// Reflow a broken markdown table.
///
/// # Panics
@@ -243,8 +388,11 @@ static FENCE_RE: std::sync::LazyLock =
pub fn process_stream(lines: &[String]) -> Vec {
let mut out = Vec::new();
let mut buf = Vec::new();
+ let mut html_buf = Vec::new();
+ let mut html_depth = 0usize;
let mut in_code = false;
let mut in_table = false;
+ let mut in_html = false;
for line in lines {
if FENCE_RE.is_match(line) {
@@ -266,6 +414,27 @@ pub fn process_stream(lines: &[String]) -> Vec {
continue;
}
+ if in_html {
+ push_html_line(line, &mut html_buf, &mut html_depth, &mut in_html, &mut out);
+ continue;
+ }
+
+ if line.trim_start().starts_with(" Vec {
}
}
+ if !html_buf.is_empty() {
+ out.extend(html_table_to_markdown(&html_buf));
+ }
+
out
}
diff --git a/tests/integration.rs b/tests/integration.rs
index 245a20df..aec2f91f 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -63,6 +63,27 @@ fn indented_table() -> Vec {
]
}
+/// Provides a minimal HTML table with one header row and one data row.
+#[fixture]
+fn html_table() -> Vec<String> {
+    vec![
+        "<table>".to_string(),
+        "<tr><th>A</th><th>B</th></tr>".to_string(),
+        "<tr><td>1</td><td>2</td></tr>".to_string(),
+        "</table>".to_string(),
+    ]
+}
+
+/// Provides two Markdown tables separated by a single blank line.
+#[fixture]
+fn multiple_tables() -> Vec<String> {
+    vec![
+        "| A | B |".to_string(),
+        "| 1 | 22 |".to_string(),
+        String::new(),
+        "| X | Y |".to_string(),
+        "| 3 | 4 |".to_string(),
+    ]
+}
+
#[rstest]
/// Tests that `reflow_table` correctly restructures a broken Markdown table into a well-formed table.
///
@@ -108,6 +129,24 @@ fn test_reflow_preserves_indentation(indented_table: Vec) {
assert_eq!(reflow_table(&indented_table), expected);
}
+#[rstest]
+/// Tests that `process_stream` turns the HTML table fixture into a Markdown
+/// table with a separator row after the header.
+fn test_process_stream_html_table(html_table: Vec<String>) {
+    let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"];
+    assert_eq!(process_stream(&html_table), expected);
+}
+
+#[rstest]
+/// Tests that `process_stream` returns two Markdown tables separated by a
+/// blank line exactly as they were supplied.
+fn test_process_stream_multiple_tables(multiple_tables: Vec<String>) {
+    let expected = vec![
+        "| A | B |".to_string(),
+        "| 1 | 22 |".to_string(),
+        String::new(),
+        "| X | Y |".to_string(),
+        "| 3 | 4 |".to_string(),
+    ];
+    assert_eq!(process_stream(&multiple_tables), expected);
+}
+
/// Tests that `process_stream` leaves lines inside code fences unchanged.
///
/// Verifies that both backtick (```) and tilde (~~~) fenced code blocks are ignored by the table processing logic, ensuring their contents are not altered.
|