Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/html-table-support.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ consistently.
The crate `markup5ever_rcdom` provides a minimal DOM which `html5ever` populates
and which is traversed to extract rows and cells. Only basic tables containing
`<tr>`, `<th>` and `<td>` elements are supported.
`mdtablefix` detects table elements regardless of attribute usage or tag case.
56 changes: 52 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
out
}

/// Buffers lines belonging to an HTML table and converts the table to markdown when complete.
///
/// Tracks the nesting depth of `<table>` tags using regular expressions. When the depth returns to zero, converts the buffered HTML table to markdown and appends the result to the output. Clears the buffer and updates the HTML state accordingly.
fn push_html_line(
line: &str,
html_buf: &mut Vec<String>,
Expand All @@ -219,9 +222,9 @@ fn push_html_line(
out: &mut Vec<String>,
) {
html_buf.push(line.trim_end().to_string());
*html_depth += line.matches("<table").count();
if line.contains("</table>") {
*html_depth = html_depth.saturating_sub(line.matches("</table>").count());
*html_depth += TABLE_START_RE.find_iter(line).count();
if TABLE_END_RE.is_match(line) {
*html_depth = html_depth.saturating_sub(TABLE_END_RE.find_iter(line).count());
if *html_depth == 0 {
out.extend(html_table_to_markdown(html_buf));
html_buf.clear();
Expand Down Expand Up @@ -256,8 +259,35 @@ static SENTINEL_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"\|\s*\|\s*").unwrap());
static SEP_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap());
/// Matches an opening `<table` tag at the very start of the haystack, ignoring case.
///
/// The tag name must be followed by whitespace, `>`, or the end of the input, so
/// `<table>`, `<TABLE class="x">` and a bare `<table` match while `<tablet>` does not.
// NOTE(review): the pattern is anchored with `^` and multi-line mode is not enabled,
// so `find_iter` can yield at most one match, and only when the tag is the first
// thing in the string. `push_html_line` counts matches on a line that has only been
// trimmed at the end, so indented or same-line nested `<table>` openings appear to go
// uncounted there — confirm whether that is intended.
static TABLE_START_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"(?i)^<table(?:\s|>|$)").unwrap());
/// Matches a closing `</table>` tag anywhere in the haystack, ignoring case.
///
/// The pattern is the literal text `</table>`, so a closing tag written with
/// whitespace before the `>` (for example `</table >`) is not recognised.
static TABLE_END_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"(?i)</table>").unwrap());

#[must_use]
/// Reflows a markdown table, aligning columns and formatting separator rows.
///
/// Takes a slice of strings representing a markdown table, detects the separator row, splits and trims cells, checks for consistent column counts, calculates column widths, and reconstructs the table with properly aligned columns and separator. Returns the original lines unchanged if the table is inconsistent or cannot be reflowed.
///
/// # Returns
///
/// A vector of strings representing the reflowed markdown table with aligned columns and formatted separator row, or the original lines if reflow is not possible.
///
/// # Examples
///
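/// ```
/// use mdtablefix::reflow_table;
///
/// let lines: Vec<String> = [
/// "| Name | Age |",
/// "|---|---|",
/// "| Alice | 30 |",
/// "| Bob | 25 |",
/// ]
/// .iter()
/// .map(|row| row.to_string())
/// .collect();
/// let fixed = reflow_table(&lines);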
/// assert_eq!(fixed[0], "| Name | Age |");
/// assert_eq!(fixed[1], "|-------|-----|");
/// assert_eq!(fixed[2], "| Alice | 30 |");
/// assert_eq!(fixed[3], "| Bob | 25 |");
/// ```
pub fn reflow_table(lines: &[String]) -> Vec<String> {
if lines.is_empty() {
return Vec::new();
Expand Down Expand Up @@ -385,6 +415,24 @@ static FENCE_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap());

#[must_use]
/// Processes a sequence of markdown lines, reflowing markdown tables and converting embedded HTML tables to markdown.
///
/// This function preserves fenced code blocks, detects and reflows markdown tables for proper alignment, and converts HTML `<table>` blocks to markdown format. All other lines are passed through unchanged.
///
/// # Examples
///
/// ```
/// use mdtablefix::process_stream;
///
/// let input = vec![
/// "| Header | Value |".to_string(),
/// "|--------|-------|".to_string(),
/// "| a | 1 |".to_string(),
/// String::new(),
/// "<table><tr><td>x</td></tr></table>".to_string(),
/// ];
/// let output = process_stream(&input);
/// assert!(output.iter().any(|line| line.contains("| Header | Value |")));
/// assert!(output.iter().any(|line| line.contains("| x |")));
/// ```
pub fn process_stream(lines: &[String]) -> Vec<String> {
let mut out = Vec::new();
let mut buf = Vec::new();
Expand Down Expand Up @@ -419,7 +467,7 @@ pub fn process_stream(lines: &[String]) -> Vec<String> {
continue;
}

if line.trim_start().starts_with("<table") {
if TABLE_START_RE.is_match(line.trim_start()) {
if !buf.is_empty() {
if in_table {
out.extend(reflow_table(&buf));
Expand Down
133 changes: 133 additions & 0 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ fn indented_table() -> Vec<String> {
}

#[fixture]
/// Returns a vector of strings representing a simple HTML table with two columns and one data row.
///
/// The table includes a header row with columns "A" and "B", and a single data row with values "1" and "2".
///
/// # Examples
///
/// ```
/// let html = html_table();
/// assert_eq!(html[0], "<table>");
/// assert_eq!(html[1], "<tr><th>A</th><th>B</th></tr>");
/// assert_eq!(html[2], "<tr><td>1</td><td>2</td></tr>");
/// assert_eq!(html[3], "</table>");
/// ```
fn html_table() -> Vec<String> {
vec![
"<table>".to_string(),
Expand All @@ -74,6 +87,81 @@ fn html_table() -> Vec<String> {
}

#[fixture]
/// Fixture: the lines of a small HTML table whose opening tag carries an attribute.
///
/// The four lines are the opening `<table class="x">` tag, a header row with
/// cells "A" and "B", a data row with cells "1" and "2", and the closing
/// `</table>` tag.
///
/// # Examples
///
/// ```
/// let lines = html_table_with_attrs();
/// assert_eq!(lines.len(), 4);
/// assert_eq!(lines[0], "<table class=\"x\">");
/// ```
fn html_table_with_attrs() -> Vec<String> {
    let rows = [
        "<table class=\"x\">",
        "<tr><th>A</th><th>B</th></tr>",
        "<tr><td>1</td><td>2</td></tr>",
        "</table>",
    ];
    rows.iter().map(|row| row.to_string()).collect()
}

#[fixture]
/// Fixture: the lines of a small HTML table whose `<TABLE>` tags are upper-case.
///
/// Only the opening and closing table tags are upper-cased; the header row
/// ("A", "B") and the data row ("1", "2") use lower-case `<tr>`, `<th>` and
/// `<td>` tags. Used to exercise case-insensitive table detection.
///
/// # Examples
///
/// ```
/// let table = html_table_uppercase();
/// assert_eq!(table[0], "<TABLE>");
/// assert_eq!(table[3], "</TABLE>");
/// ```
fn html_table_uppercase() -> Vec<String> {
    let rows = [
        "<TABLE>",
        "<tr><th>A</th><th>B</th></tr>",
        "<tr><td>1</td><td>2</td></tr>",
        "</TABLE>",
    ];
    rows.iter().map(|row| row.to_string()).collect()
}

#[fixture]
/// Fixture: the lines of a small HTML table whose table tags are written in mixed case.
///
/// The opening and closing tags are spelled `<TaBlE>` / `</TaBlE>`; the header
/// row ("A", "B") and the data row ("1", "2") use lower-case tags. Used to
/// exercise case-insensitive table detection.
///
/// # Examples
///
/// ```
/// let lines = html_table_mixed_case();
/// assert_eq!(lines[0], "<TaBlE>");
/// assert_eq!(lines[3], "</TaBlE>");
/// ```
fn html_table_mixed_case() -> Vec<String> {
    let rows = [
        "<TaBlE>",
        "<tr><th>A</th><th>B</th></tr>",
        "<tr><td>1</td><td>2</td></tr>",
        "</TaBlE>",
    ];
    rows.iter().map(|row| row.to_string()).collect()
}

#[fixture]
/// Returns a vector of strings representing two separate Markdown tables, each with two columns, separated by an empty line.
///
/// # Examples
///
/// ```
/// let tables = multiple_tables();
/// assert_eq!(tables, vec![
/// "| A | B |",
/// "| 1 | 22 |",
/// "",
/// "| X | Y |",
/// "| 3 | 4 |",
/// ]);
/// ```
fn multiple_tables() -> Vec<String> {
vec![
"| A | B |".to_string(),
Expand Down Expand Up @@ -130,12 +218,57 @@ fn test_reflow_preserves_indentation(indented_table: Vec<String>) {
}

#[rstest]
/// Checks that `process_stream` turns the plain `html_table` fixture into a Markdown table.
///
/// The expected output is three lines: the header row, a `---` separator row,
/// and the single data row.
fn test_process_stream_html_table(html_table: Vec<String>) {
    let rendered = process_stream(&html_table);
    let wanted = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"];
    assert_eq!(rendered, wanted);
}

#[rstest]
/// Checks that `process_stream` converts an HTML table whose `<table>` tag has attributes.
///
/// The expected Markdown is the same three lines (header, separator, data row)
/// as for a table without attributes.
fn test_process_stream_html_table_with_attrs(html_table_with_attrs: Vec<String>) {
    let rendered = process_stream(&html_table_with_attrs);
    let wanted = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"];
    assert_eq!(rendered, wanted);
}

#[rstest]
/// Checks that `process_stream` converts an HTML table delimited by upper-case `<TABLE>` tags.
///
/// Only the table tags of the fixture are upper-case; its row and cell tags are
/// lower-case. The expected Markdown is the header row, a separator row, and
/// the data row.
fn test_process_stream_html_table_uppercase(html_table_uppercase: Vec<String>) {
    let rendered = process_stream(&html_table_uppercase);
    let wanted = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"];
    assert_eq!(rendered, wanted);
}

#[rstest]
/// Checks that `process_stream` converts an HTML table delimited by mixed-case `<TaBlE>` tags.
///
/// The expected Markdown is the header row, a separator row, and the data row.
fn test_process_stream_html_table_mixed_case(html_table_mixed_case: Vec<String>) {
    let rendered = process_stream(&html_table_mixed_case);
    let wanted = vec!["| A | B |", "| --- | --- |", "| 1 | 2 |"];
    assert_eq!(rendered, wanted);
}

#[rstest]
/// Tests that `process_stream` correctly processes multiple Markdown tables separated by blank lines, reflowing each table independently and preserving separation.
///
/// # Examples
///
/// ```
/// let input = vec![
/// "| A | B |".to_string(),
/// "| 1 | 22 |".to_string(),
/// String::new(),
/// "| X | Y |".to_string(),
/// "| 3 | 4 |".to_string(),
/// ];
/// let output = process_stream(&input);
/// assert_eq!(output, input);
/// ```
fn test_process_stream_multiple_tables(multiple_tables: Vec<String>) {
let expected = vec![
"| A | B |".to_string(),
Expand Down