From ef724890a63baff0c3689975ff4395525e3bdcb7 Mon Sep 17 00:00:00 2001 From: Leynos Date: Sat, 14 Jun 2025 02:34:55 +0100 Subject: [PATCH] Improve HTML table detection --- docs/html-table-support.md | 1 + src/lib.rs | 12 ++++++---- tests/integration.rs | 48 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/docs/html-table-support.md b/docs/html-table-support.md index b3c31ec6..e5e4cf91 100644 --- a/docs/html-table-support.md +++ b/docs/html-table-support.md @@ -8,3 +8,4 @@ consistently. The crate `markup5ever_rcdom` provides a minimal DOM which `html5ever` populates and which is traversed to extract rows and cells. Only basic tables containing ``, `` and `` elements are supported. +`mdtablefix` detects table elements regardless of attribute usage or tag case. diff --git a/src/lib.rs b/src/lib.rs index e49cda51..a39b7d67 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -219,9 +219,9 @@ fn push_html_line( out: &mut Vec, ) { html_buf.push(line.trim_end().to_string()); - *html_depth += line.matches("") { - *html_depth = html_depth.saturating_sub(line.matches("").count()); + *html_depth += TABLE_START_RE.find_iter(line).count(); + if TABLE_END_RE.is_match(line) { + *html_depth = html_depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); if *html_depth == 0 { out.extend(html_table_to_markdown(html_buf)); html_buf.clear(); @@ -256,6 +256,10 @@ static SENTINEL_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"\|\s*\|\s*").unwrap()); static SEP_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap()); +static TABLE_START_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"(?i)^|$)").unwrap()); +static TABLE_END_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"(?i)").unwrap()); #[must_use] pub fn reflow_table(lines: &[String]) -> Vec { @@ -419,7 +423,7 @@ pub fn process_stream(lines: &[String]) -> Vec { continue; } - if line.trim_start().starts_with(" Vec { ] } +#[fixture] +fn html_table_with_attrs() -> Vec { + vec![ + "".to_string(), + "".to_string(), + "".to_string(), + "
AB
12
".to_string(), + ] +} + +#[fixture] +fn html_table_uppercase() -> Vec { + vec![ + "".to_string(), + "".to_string(), + "".to_string(), + "
AB
12
".to_string(), + ] +} + +#[fixture] +fn html_table_mixed_case() -> Vec { + vec![ + "".to_string(), + "".to_string(), + "".to_string(), + "
AB
12
".to_string(), + ] +} + #[fixture] fn multiple_tables() -> Vec { vec![ @@ -135,6 +165,24 @@ fn test_process_stream_html_table(html_table: Vec) { assert_eq!(process_stream(&html_table), expected); } +#[rstest] +fn test_process_stream_html_table_with_attrs(html_table_with_attrs: Vec) { + let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"]; + assert_eq!(process_stream(&html_table_with_attrs), expected); +} + +#[rstest] +fn test_process_stream_html_table_uppercase(html_table_uppercase: Vec) { + let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"]; + assert_eq!(process_stream(&html_table_uppercase), expected); +} + +#[rstest] +fn test_process_stream_html_table_mixed_case(html_table_mixed_case: Vec) { + let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"]; + assert_eq!(process_stream(&html_table_mixed_case), expected); +} + #[rstest] fn test_process_stream_multiple_tables(multiple_tables: Vec) { let expected = vec![