From a88c1ff888f7b986716053be81b14acbd7f96e0f Mon Sep 17 00:00:00 2001 From: Leynos Date: Sun, 13 Jul 2025 13:54:05 +0100 Subject: [PATCH 1/2] Add fallback header row for HTML tables If no header markup is detected the converter now treats the first row as the header. Updated docs and tests. --- docs/html-table-support.md | 6 ++++++ src/html.rs | 4 ++++ tests/integration.rs | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/html-table-support.md b/docs/html-table-support.md index 3c64cc74..f5a1c56c 100644 --- a/docs/html-table-support.md +++ b/docs/html-table-support.md @@ -16,3 +16,9 @@ rest of the document. 12 ``` + +The converter checks the first table row for `<th>` cells or for `<strong>` or +`<b>` tags inside `<td>` elements to decide whether it is a header. If no such +markers exist and the table contains multiple rows, the first row is still +treated as the header so the Markdown output includes a separator line. This +last-resort behaviour keeps simple tables readable after conversion. diff --git a/src/html.rs b/src/html.rs index 83b347a9..d95a2185 100644 --- a/src/html.rs +++ b/src/html.rs @@ -145,6 +145,10 @@ fn table_node_to_markdown(table: &Handle) -> Vec { } out.push(format!("| {} |", cells.join(" | "))); } + if !first_header && row_handles.len() > 1 { + // Assume a header row when no header markup is present. 
+ first_header = true; + } if first_header { let sep: Vec = (0..col_count).map(|_| "---".to_string()).collect(); out.insert(1, format!("| {} |", sep.join(" | "))); diff --git a/tests/integration.rs b/tests/integration.rs index 959685b8..8a4f8247 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -457,7 +457,7 @@ fn test_convert_html_table_with_colspan() { #[test] fn test_convert_html_table_no_header() { - let expected = vec!["| A | B |", "| 1 | 2 |"]; + let expected = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"]; assert_eq!(convert_html_tables(&html_table_no_header()), expected); } From a2fc033b3ac306a7c70e59564a6714dbfbfbddbf Mon Sep 17 00:00:00 2001 From: Leynos Date: Sun, 13 Jul 2025 14:10:29 +0100 Subject: [PATCH 2/2] Refactor HTML table header fallback --- docs/html-table-support.md | 2 +- src/html.rs | 54 ++++++++++++++++++++------------------ tests/integration.rs | 54 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/docs/html-table-support.md b/docs/html-table-support.md index f5a1c56c..6e255006 100644 --- a/docs/html-table-support.md +++ b/docs/html-table-support.md @@ -20,5 +20,5 @@ rest of the document. The converter checks the first table row for `<th>` cells or for `<strong>` or `<b>` tags inside `<td>` elements to decide whether it is a header. If no such markers exist and the table contains multiple rows, the first row is still -treated as the header so the Markdown output includes a separator line. This +treated as the header, so the Markdown output includes a separator line. This last-resort behaviour keeps simple tables readable after conversion. diff --git a/src/html.rs b/src/html.rs index d95a2185..61a867ee 100644 --- a/src/html.rs +++ b/src/html.rs @@ -113,6 +113,24 @@ fn contains_strong(handle: &Handle) -> bool { children.iter().any(contains_strong) } +/// Extracts cell text from a row and reports whether all cells are header cells. 
+fn parse_row(row: &Handle) -> (Vec, bool) { + let mut cells = Vec::new(); + let mut all_header = true; + for child in row.children.borrow().iter() { + if is_table_cell(child) { + let is_header = if is_element(child, "th") { + true + } else { + contains_strong(child) + }; + all_header &= is_header; + cells.push(node_text(child)); + } + } + (cells, all_header) +} + /// Converts a `<table>` DOM node into Markdown table lines and calls /// `reflow_table` so the columns are uniformly padded. fn table_node_to_markdown(table: &Handle) -> Vec { @@ -122,37 +140,23 @@ fn table_node_to_markdown(table: &Handle) -> Vec { return Vec::new(); } + let (first_cells, explicit_header) = parse_row(&row_handles[0]); + let col_count = first_cells.len(); + let fallback_header = !explicit_header && row_handles.len() > 1; + let has_header = explicit_header || fallback_header; + let mut out = Vec::new(); - let mut first_header = false; - let mut col_count = 0; - for (i, row) in row_handles.iter().enumerate() { - let mut cells = Vec::new(); - let mut all_header = true; - for child in row.children.borrow().iter() { - if is_table_cell(child) { - let is_header = if is_element(child, "th") { - true - } else { - contains_strong(child) - }; - all_header &= is_header; - cells.push(node_text(child)); - } - } - if i == 0 { - first_header = all_header; - col_count = cells.len(); - } + out.push(format!("| {} |", first_cells.join(" | "))); + for row in row_handles.iter().skip(1) { + let (cells, _) = parse_row(row); out.push(format!("| {} |", cells.join(" | "))); } - if !first_header && row_handles.len() > 1 { - // Assume a header row when no header markup is present. 
- first_header = true; - } - if first_header { + + if has_header { let sep: Vec = (0..col_count).map(|_| "---".to_string()).collect(); out.insert(1, format!("| {} |", sep.join(" | "))); } + crate::reflow_table(&out) } diff --git a/tests/integration.rs b/tests/integration.rs index 8a4f8247..c7b738ee 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -101,6 +101,36 @@ fn html_table_no_header() -> Vec { ) } +#[fixture] +fn html_table_empty_row() -> Vec { + lines_vec!( + "
", + "", + "", + "
12
", + ) +} + +#[fixture] +fn html_table_whitespace_header() -> Vec { + lines_vec!( + "", + "", + "", + "
12
", + ) +} + +#[fixture] +fn html_table_inconsistent_first_row() -> Vec { + lines_vec!( + "", + "", + "", + "
A
12
", + ) +} + #[fixture] fn html_table_empty() -> Vec { let lines = lines_vec!("
"); @@ -461,6 +491,30 @@ fn test_convert_html_table_no_header() { assert_eq!(convert_html_tables(&html_table_no_header()), expected); } +#[test] +fn test_convert_html_table_empty_row() { + let expected = vec!["| 1 | 2 |", "| --- | --- |"]; + assert_eq!(convert_html_tables(&html_table_empty_row()), expected); +} + +#[test] +fn test_convert_html_table_whitespace_header() { + let expected = vec!["| --- | --- |", "| --- | --- |", "| 1 | 2 |"]; + assert_eq!( + convert_html_tables(&html_table_whitespace_header()), + expected + ); +} + +#[test] +fn test_convert_html_table_inconsistent_first_row() { + let expected = vec!["| A |", "| --- |", "| 1 | 2 |"]; + assert_eq!( + convert_html_tables(&html_table_inconsistent_first_row()), + expected + ); +} + #[test] fn test_convert_html_table_empty() { assert!(convert_html_tables(&html_table_empty()).is_empty());