Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/html.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
//! Utilities for converting HTML tables embedded in Markdown into
//! Markdown table syntax.
//!
//! The conversion is intentionally simple: only `<table>`, `<tr>`,
//! `<th>`, and `<td>` tags are recognised. Attributes and tag casing
//! are ignored. The resulting Markdown lines are passed to
//! `reflow_table` to ensure consistent column widths.

use html5ever::driver::ParseOpts;
use html5ever::{parse_document, tendril::TendrilSink};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
Expand All @@ -10,13 +18,17 @@ static TABLE_START_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?i)^<table(?:\s|>|$)").unwrap());
static TABLE_END_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)</table>").unwrap());

/// Extracts the text content of a DOM node, collapsing consecutive
/// whitespace to single spaces.
fn node_text(handle: &Handle) -> String {
let mut out = String::new();
let mut last_space = false;
collect_text(handle, &mut out, &mut last_space);
out.trim().to_string()
}

/// Recursively appends text nodes from `handle` to `out`, tracking whether the
/// previous output was whitespace.
fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) {
match &handle.data {
NodeData::Text { contents } => {
Expand Down Expand Up @@ -55,6 +67,7 @@ fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) {
}
}

/// Walks the DOM tree collecting `<table>` nodes under `handle`.
fn collect_tables(handle: &Handle, tables: &mut Vec<Handle>) {
if let NodeData::Element { name, .. } = &handle.data {
if name.local.as_ref() == "table" {
Expand All @@ -66,6 +79,7 @@ fn collect_tables(handle: &Handle, tables: &mut Vec<Handle>) {
}
}

/// Collects all `<tr>` nodes beneath `handle`.
fn collect_rows(handle: &Handle, rows: &mut Vec<Handle>) {
if let NodeData::Element { name, .. } = &handle.data {
if name.local.as_ref() == "tr" {
Expand All @@ -77,6 +91,8 @@ fn collect_rows(handle: &Handle, rows: &mut Vec<Handle>) {
}
}

/// Converts a `<table>` DOM node into Markdown table lines and calls
/// `reflow_table` so the columns are uniformly padded.
fn table_node_to_markdown(table: &Handle) -> Vec<String> {
let mut row_handles = Vec::new();
collect_rows(table, &mut row_handles);
Expand Down Expand Up @@ -111,6 +127,9 @@ fn table_node_to_markdown(table: &Handle) -> Vec<String> {
crate::reflow_table(&out)
}

/// Parses HTML table markup and returns the equivalent Markdown lines.
///
/// If no `<table>` elements are present, the input is returned unchanged.
fn table_lines_to_markdown(lines: &[String]) -> Vec<String> {
let indent: String = lines
.first()
Expand Down Expand Up @@ -139,6 +158,8 @@ fn table_lines_to_markdown(lines: &[String]) -> Vec<String> {
out
}

/// Buffers a single line of HTML, updating nesting depth and emitting completed
/// tables when an end tag is encountered.
fn push_html_line(
line: &str,
buf: &mut Vec<String>,
Expand All @@ -158,6 +179,7 @@ fn push_html_line(
}
}

/// Converts any HTML tables in `lines` to Markdown syntax.
pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
let mut out = Vec::new();
let mut buf = Vec::new();
Expand Down Expand Up @@ -187,6 +209,11 @@ pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
out
}

/// Processes Markdown lines and converts embedded HTML tables to Markdown.
///
/// Fenced code blocks are left untouched, allowing raw HTML examples to be
/// documented without modification.
#[must_use]
pub fn convert_html_tables(lines: &[String]) -> Vec<String> {
let mut out = Vec::new();
let mut buf = Vec::new();
Expand Down