diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 00000000..f618ef5d --- /dev/null +++ b/src/html.rs @@ -0,0 +1,233 @@ +use html5ever::driver::ParseOpts; +use html5ever::{parse_document, tendril::TendrilSink}; +use markup5ever_rcdom::{Handle, NodeData, RcDom}; +use regex::Regex; +use std::sync::LazyLock; + +use crate::is_fence; + +static TABLE_START_RE: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)^|$)").unwrap()); +static TABLE_END_RE: LazyLock = LazyLock::new(|| Regex::new(r"(?i)").unwrap()); + +fn node_text(handle: &Handle) -> String { + let mut out = String::new(); + let mut last_space = false; + collect_text(handle, &mut out, &mut last_space); + out.trim().to_string() +} + +fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) { + match &handle.data { + NodeData::Text { contents } => { + for ch in contents.borrow().chars() { + if ch.is_whitespace() { + *last_space = true; + } else { + if *last_space && !out.is_empty() { + out.push(' '); + } + out.push(ch); + *last_space = false; + } + } + } + NodeData::Element { name, .. } => { + let tag = name.local.as_ref(); + if tag.eq_ignore_ascii_case("script") + || tag.eq_ignore_ascii_case("style") + || tag.eq_ignore_ascii_case("noscript") + || tag.eq_ignore_ascii_case("template") + || tag.eq_ignore_ascii_case("head") + { + return; + } + for child in handle.children.borrow().iter() { + collect_text(child, out, last_space); + } + } + NodeData::Document => { + for child in handle.children.borrow().iter() { + collect_text(child, out, last_space); + } + } + _ => {} + } +} + +fn collect_tables(handle: &Handle, tables: &mut Vec) { + if let NodeData::Element { name, .. } = &handle.data { + if name.local.as_ref() == "table" { + tables.push(handle.clone()); + } + } + for child in handle.children.borrow().iter() { + collect_tables(child, tables); + } +} + +fn collect_rows(handle: &Handle, rows: &mut Vec) { + if let NodeData::Element { name, .. } = &handle.data { + if name.local.as_ref() == "tr" { + rows.push(handle.clone()); + } + } + for child in handle.children.borrow().iter() { + collect_rows(child, rows); + } +} + +fn table_node_to_markdown(table: &Handle) -> Vec { + let mut row_handles = Vec::new(); + collect_rows(table, &mut row_handles); + if row_handles.is_empty() { + return Vec::new(); + } + + let mut out = Vec::new(); + let mut first_header = false; + let mut col_count = 0; + for (i, row) in row_handles.iter().enumerate() { + let mut cells = Vec::new(); + let mut all_header = true; + for child in row.children.borrow().iter() { + if let NodeData::Element { name, .. } = &child.data { + if name.local.as_ref() == "td" || name.local.as_ref() == "th" { + all_header &= name.local.as_ref() == "th"; + cells.push(node_text(child)); + } + } + } + if i == 0 { + first_header = all_header; + col_count = cells.len(); + } + out.push(format!("| {} |", cells.join(" | "))); + } + if first_header { + let sep: Vec = (0..col_count).map(|_| "---".to_string()).collect(); + out.insert(1, format!("| {} |", sep.join(" | "))); + } + crate::reflow_table(&out) +} + +fn table_lines_to_markdown(lines: &[String]) -> Vec { + let indent: String = lines + .first() + .map(|l| l.chars().take_while(|c| c.is_whitespace()).collect()) + .unwrap_or_default(); + let html: String = lines + .iter() + .map(|l| l.trim_end()) + .collect::>() + .join("\n"); + let opts = ParseOpts::default(); + let dom: RcDom = parse_document(RcDom::default(), opts).one(html); + + let mut tables = Vec::new(); + collect_tables(&dom.document, &mut tables); + if tables.is_empty() { + return lines.to_vec(); + } + + let mut out = Vec::new(); + for table in tables { + for line in table_node_to_markdown(&table) { + out.push(format!("{indent}{line}")); + } + } + out +} + +fn push_html_line( + line: &str, + buf: &mut Vec, + depth: &mut usize, + in_html: &mut bool, + out: &mut Vec, +) { + buf.push(line.trim_end().to_string()); + *depth += TABLE_START_RE.find_iter(line).count(); + if TABLE_END_RE.is_match(line) { + *depth = depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); + if *depth == 0 { + out.extend(html_table_to_markdown(buf)); + buf.clear(); + *in_html = false; + } + } +} + +pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { + let mut out = Vec::new(); + let mut buf = Vec::new(); + let mut depth = 0usize; + + for line in lines { + if depth > 0 || TABLE_START_RE.is_match(line.trim_start()) { + buf.push(line.trim_end().to_string()); + depth += TABLE_START_RE.find_iter(line).count(); + if TABLE_END_RE.is_match(line) { + depth = depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); + if depth == 0 { + out.extend(table_lines_to_markdown(&buf)); + buf.clear(); + } + } + continue; + } + + out.push(line.trim_end().to_string()); + } + + if !buf.is_empty() { + out.extend(buf); + } + + out +} + +pub fn convert_html_tables(lines: &[String]) -> Vec { + let mut out = Vec::new(); + let mut buf = Vec::new(); + let mut depth = 0usize; + let mut in_html = false; + let mut in_code = false; + + for line in lines { + if is_fence(line) { + if in_html { + out.append(&mut buf); + in_html = false; + depth = 0; + } + in_code = !in_code; + out.push(line.trim_end().to_string()); + continue; + } + + if in_code { + out.push(line.trim_end().to_string()); + continue; + } + + if in_html { + push_html_line(line, &mut buf, &mut depth, &mut in_html, &mut out); + continue; + } + + if TABLE_START_RE.is_match(line.trim_start()) { + in_html = true; + push_html_line(line, &mut buf, &mut depth, &mut in_html, &mut out); + continue; + } + + out.push(line.trim_end().to_string()); + } + + if !buf.is_empty() { + out.extend(buf); + } + + out +} diff --git a/src/lib.rs b/src/lib.rs index a39b7d67..0acee547 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,8 @@ //! //! Functions here reflow tables that were broken during formatting. -use html5ever::{parse_document, tendril::TendrilSink}; -use markup5ever_rcdom::{Handle, NodeData, RcDom}; +mod html; + use regex::Regex; use std::fs; use std::path::Path; @@ -87,149 +87,6 @@ fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec .collect() } -fn node_text(handle: &Handle) -> String { - let mut parts = Vec::new(); - collect_text(handle, &mut parts); - parts - .join(" ") - .split_whitespace() - .collect::>() - .join(" ") -} - -fn collect_text(handle: &Handle, out: &mut Vec) { - match &handle.data { - NodeData::Text { contents } => out.push(contents.borrow().to_string()), - NodeData::Element { name, .. } => { - let tag = name.local.as_ref(); - if tag.eq_ignore_ascii_case("script") - || tag.eq_ignore_ascii_case("style") - || tag.eq_ignore_ascii_case("noscript") - || tag.eq_ignore_ascii_case("template") - || tag.eq_ignore_ascii_case("head") - { - return; - } - for child in handle.children.borrow().iter() { - collect_text(child, out); - } - } - NodeData::Document => { - for child in handle.children.borrow().iter() { - collect_text(child, out); - } - } - _ => {} - } -} - -fn collect_tables(handle: &Handle, tables: &mut Vec) { - if let NodeData::Element { name, .. } = &handle.data { - if name.local.as_ref() == "table" { - tables.push(handle.clone()); - } - } - for child in handle.children.borrow().iter() { - collect_tables(child, tables); - } -} - -fn collect_rows(handle: &Handle, rows: &mut Vec) { - if let NodeData::Element { name, .. } = &handle.data { - if name.local.as_ref() == "tr" { - rows.push(handle.clone()); - } - } - for child in handle.children.borrow().iter() { - collect_rows(child, rows); - } -} - -use html5ever::driver::ParseOpts; - -fn table_node_to_markdown(table: &Handle) -> Vec { - let mut row_handles = Vec::new(); - collect_rows(table, &mut row_handles); - if row_handles.is_empty() { - return Vec::new(); - } - - let mut out = Vec::new(); - let mut first_header = false; - let mut col_count = 0; - for (i, row) in row_handles.iter().enumerate() { - let mut cells = Vec::new(); - let mut header_row = false; - for child in row.children.borrow().iter() { - if let NodeData::Element { name, .. } = &child.data { - if name.local.as_ref() == "td" || name.local.as_ref() == "th" { - if name.local.as_ref() == "th" { - header_row = true; - } - cells.push(node_text(child)); - } - } - } - if i == 0 { - first_header = header_row; - col_count = cells.len(); - } - out.push(format!("| {} |", cells.join(" | "))); - } - if first_header { - let sep: Vec = (0..col_count).map(|_| "---".to_string()).collect(); - out.insert(1, format!("| {} |", sep.join(" | "))); - } - reflow_table(&out) -} - -fn html_table_to_markdown(lines: &[String]) -> Vec { - let indent: String = lines - .first() - .map(|l| l.chars().take_while(|c| c.is_whitespace()).collect()) - .unwrap_or_default(); - let html: String = lines - .iter() - .map(|l| l.trim_end()) - .collect::>() - .join("\n"); - let opts = ParseOpts::default(); - let dom: RcDom = parse_document(RcDom::default(), opts).one(html); - - let mut tables = Vec::new(); - collect_tables(&dom.document, &mut tables); - if tables.is_empty() { - return lines.to_vec(); - } - - let mut out = Vec::new(); - for table in tables { - for line in table_node_to_markdown(&table) { - out.push(format!("{indent}{line}")); - } - } - out -} - -fn push_html_line( - line: &str, - html_buf: &mut Vec, - html_depth: &mut usize, - in_html: &mut bool, - out: &mut Vec, -) { - html_buf.push(line.trim_end().to_string()); - *html_depth += TABLE_START_RE.find_iter(line).count(); - if TABLE_END_RE.is_match(line) { - *html_depth = html_depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); - if *html_depth == 0 { - out.extend(html_table_to_markdown(html_buf)); - html_buf.clear(); - *in_html = false; - } - } -} - /// Reflow a broken markdown table. /// /// # Panics @@ -256,10 +113,6 @@ static SENTINEL_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"\|\s*\|\s*").unwrap()); static SEP_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap()); -static TABLE_START_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"(?i)^|$)").unwrap()); -static TABLE_END_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"(?i)").unwrap()); #[must_use] pub fn reflow_table(lines: &[String]) -> Vec { @@ -388,17 +241,20 @@ pub fn reflow_table(lines: &[String]) -> Vec { static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap()); +pub(crate) fn is_fence(line: &str) -> bool { + FENCE_RE.is_match(line) +} + #[must_use] pub fn process_stream(lines: &[String]) -> Vec { + let pre = html::convert_html_tables(lines); + let mut out = Vec::new(); let mut buf = Vec::new(); - let mut html_buf = Vec::new(); - let mut html_depth = 0usize; let mut in_code = false; let mut in_table = false; - let mut in_html = false; - for line in lines { + for line in &pre { if FENCE_RE.is_match(line) { if !buf.is_empty() { if in_table { @@ -418,27 +274,6 @@ pub fn process_stream(lines: &[String]) -> Vec { continue; } - if in_html { - push_html_line(line, &mut html_buf, &mut html_depth, &mut in_html, &mut out); - continue; - } - - if TABLE_START_RE.is_match(line.trim_start()) { - if !buf.is_empty() { - if in_table { - out.extend(reflow_table(&buf)); - } else { - out.extend(buf.clone()); - } - buf.clear(); - in_table = false; - } - in_html = true; - html_depth = 0; - push_html_line(line, &mut html_buf, &mut html_depth, &mut in_html, &mut out); - continue; - } - if line.trim_start().starts_with('|') { if !in_table { in_table = true; @@ -456,6 +291,7 @@ pub fn process_stream(lines: &[String]) -> Vec { buf.clear(); in_table = false; } + out.push(line.trim_end().to_string()); } @@ -467,10 +303,6 @@ pub fn process_stream(lines: &[String]) -> Vec { } } - if !html_buf.is_empty() { - out.extend(html_table_to_markdown(&html_buf)); - } - out }