Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 233 additions & 0 deletions src/html.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
use html5ever::driver::ParseOpts;
use html5ever::{parse_document, tendril::TendrilSink};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::sync::LazyLock;

use crate::is_fence;

/// Matches an opening `<table` tag, case-insensitively, followed by
/// whitespace, `>`, or the end of the haystack.
///
/// The pattern is anchored with `^` and compiled without the multi-line flag,
/// so it can only match at the very start of the text it is given. Callers in
/// this file that want to recognise an indented tag strip leading whitespace
/// before matching.
static TABLE_START_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?i)^<table(?:\s|>|$)").unwrap());
/// Matches a closing `</table>` tag, case-insensitively, anywhere in the
/// haystack (the pattern is not anchored).
static TABLE_END_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)</table>").unwrap());

/// Returns the text found beneath `handle`, with every run of whitespace
/// collapsed to a single space and the result trimmed at both ends.
///
/// The traversal itself is delegated to [`collect_text`].
fn node_text(handle: &Handle) -> String {
    let mut text = String::new();
    // Tracks whether whitespace was seen since the last emitted character.
    let mut pending_gap = false;
    collect_text(handle, &mut text, &mut pending_gap);
    let trimmed = text.trim();
    trimmed.to_owned()
}

/// Appends the text content of `handle` and its descendants to `out`.
///
/// Whitespace is never copied verbatim: seeing any whitespace character sets
/// `*last_space`, and a single `' '` is emitted just before the next
/// non-whitespace character, provided `out` already holds some text. The flag
/// is shared across the whole traversal, so a gap that ends one text node is
/// honoured at the start of the next.
///
/// Children of `script`, `style`, `noscript`, `template` and `head` elements
/// are not visited. Node kinds other than text, element and document
/// contribute nothing.
fn collect_text(handle: &Handle, out: &mut String, last_space: &mut bool) {
    // Elements whose subtree is skipped entirely.
    const SKIPPED: [&str; 5] = ["script", "style", "noscript", "template", "head"];

    match &handle.data {
        NodeData::Text { contents } => {
            for ch in contents.borrow().chars() {
                if ch.is_whitespace() {
                    *last_space = true;
                    continue;
                }
                // Reading the flag also resets it, ready for the next gap.
                let gap_pending = std::mem::take(last_space);
                if gap_pending && !out.is_empty() {
                    out.push(' ');
                }
                out.push(ch);
            }
            return;
        }
        NodeData::Element { name, .. } => {
            let tag = name.local.as_ref();
            if SKIPPED.iter().any(|skip| tag.eq_ignore_ascii_case(skip)) {
                return;
            }
        }
        NodeData::Document => {}
        _ => return,
    }

    // Only documents and non-skipped elements reach this point.
    for child in handle.children.borrow().iter() {
        collect_text(child, out, last_space);
    }
}

fn collect_tables(handle: &Handle, tables: &mut Vec<Handle>) {
if let NodeData::Element { name, .. } = &handle.data {
if name.local.as_ref() == "table" {
tables.push(handle.clone());
}
}
for child in handle.children.borrow().iter() {
collect_tables(child, tables);
}
}

fn collect_rows(handle: &Handle, rows: &mut Vec<Handle>) {
if let NodeData::Element { name, .. } = &handle.data {
if name.local.as_ref() == "tr" {
rows.push(handle.clone());
}
}
for child in handle.children.borrow().iter() {
collect_rows(child, rows);
}
}

/// Renders the rows found under `table` as pipe-delimited lines and passes
/// them through `crate::reflow_table`.
///
/// Each `tr` becomes `| a | b |`, built from the text of its direct `td`/`th`
/// children. When every cell of the first row is a `th`, a `---` separator
/// line with one column per first-row cell is inserted after that row.
/// Returns an empty vector when the table has no rows.
fn table_node_to_markdown(table: &Handle) -> Vec<String> {
    let mut trs = Vec::new();
    collect_rows(table, &mut trs);
    if trs.is_empty() {
        return Vec::new();
    }

    // `Some(width)` once the first row turns out to consist solely of `th`.
    let mut header_width: Option<usize> = None;
    let mut rendered = Vec::with_capacity(trs.len() + 1);

    for (idx, tr) in trs.iter().enumerate() {
        let mut texts = Vec::new();
        let mut only_th = true;
        for cell in tr.children.borrow().iter() {
            let NodeData::Element { name, .. } = &cell.data else {
                continue;
            };
            match name.local.as_ref() {
                "th" => {}
                "td" => only_th = false,
                _ => continue,
            }
            texts.push(node_text(cell));
        }

        if idx == 0 && only_th {
            header_width = Some(texts.len());
        }
        rendered.push(format!("| {} |", texts.join(" | ")));
    }

    if let Some(width) = header_width {
        let rule = vec!["---"; width].join(" | ");
        rendered.insert(1, format!("| {} |", rule));
    }

    crate::reflow_table(&rendered)
}

/// Parses `lines` as an HTML document and renders every table found in it as
/// Markdown.
///
/// The leading whitespace of the first line is prepended to each generated
/// line. When the parsed document contains no `table` element the input lines
/// are returned unchanged.
fn table_lines_to_markdown(lines: &[String]) -> Vec<String> {
    let indent: String = match lines.first() {
        Some(first) => first.chars().take_while(|c| c.is_whitespace()).collect(),
        None => String::new(),
    };

    // Re-assemble the fragment with trailing whitespace removed from each line.
    let mut html = String::new();
    for (idx, raw) in lines.iter().enumerate() {
        if idx > 0 {
            html.push('\n');
        }
        html.push_str(raw.trim_end());
    }

    let dom: RcDom = parse_document(RcDom::default(), ParseOpts::default()).one(html);

    let mut found = Vec::new();
    collect_tables(&dom.document, &mut found);
    if found.is_empty() {
        return lines.to_vec();
    }

    found
        .iter()
        .flat_map(|table| table_node_to_markdown(table))
        .map(|row| format!("{indent}{row}"))
        .collect()
}

/// Buffers one line of an HTML table and flushes the buffer once the table
/// is closed.
///
/// * `line` - the raw input line; it is stored with trailing whitespace removed.
/// * `buf` - lines of the table collected so far.
/// * `depth` - number of `<table>` elements currently open.
/// * `in_html` - cleared when the outermost table has been closed.
/// * `out` - receives the converted table when `depth` returns to zero.
fn push_html_line(
    line: &str,
    buf: &mut Vec<String>,
    depth: &mut usize,
    in_html: &mut bool,
    out: &mut Vec<String>,
) {
    buf.push(line.trim_end().to_string());

    // `TABLE_START_RE` is anchored with `^`, so it must be applied to the line
    // without its indentation. Counting on the raw line missed every indented
    // `<table>`, leaving `depth` at zero and ending the block at the first
    // `</table>` even when that tag closed a nested table.
    *depth += TABLE_START_RE.find_iter(line.trim_start()).count();

    let closes = TABLE_END_RE.find_iter(line).count();
    if closes == 0 {
        return;
    }

    // Saturating so that stray closing tags cannot underflow the counter.
    *depth = depth.saturating_sub(closes);
    if *depth == 0 {
        out.extend(html_table_to_markdown(buf));
        buf.clear();
        *in_html = false;
    }
}

/// Replaces HTML tables in `lines` with their Markdown rendering.
///
/// A table starts on a line whose text, ignoring leading whitespace, begins
/// with `<table`, and ends on the line that brings the open-table count back
/// to zero. Each completed table is converted with `table_lines_to_markdown`.
/// Lines outside a table are copied with trailing whitespace removed. A table
/// that is never closed is emitted unconverted at the end of the output.
pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
    let mut out = Vec::new();
    let mut buf = Vec::new();
    let mut depth = 0usize;

    for line in lines {
        // `TABLE_START_RE` is anchored with `^`; count on the trimmed line so an
        // indented `<table>` raises `depth`. Counting on the raw line left
        // `depth` at zero, so only the opening line was buffered and it ended up
        // appended after the rest of the table.
        let opens = TABLE_START_RE.find_iter(line.trim_start()).count();

        if depth == 0 && opens == 0 {
            out.push(line.trim_end().to_string());
            continue;
        }

        buf.push(line.trim_end().to_string());
        depth += opens;

        let closes = TABLE_END_RE.find_iter(line).count();
        if closes > 0 {
            // Saturating so that stray closing tags cannot underflow the counter.
            depth = depth.saturating_sub(closes);
            if depth == 0 {
                out.extend(table_lines_to_markdown(&buf));
                buf.clear();
            }
        }
    }

    // Anything still buffered belongs to an unterminated table; keep it as is.
    out.extend(buf);
    out
}

/// Converts HTML tables in `lines` to Markdown, leaving fenced code untouched.
///
/// Lines are processed in order:
///
/// * A fence line (as reported by `is_fence`) toggles code mode. If a table
///   was being buffered, the buffered lines are emitted unconverted first.
/// * Lines inside a fence are copied with trailing whitespace removed.
/// * A line whose text, ignoring leading whitespace, begins with `<table`
///   starts a table; it and the following lines are handed to
///   `push_html_line` until that function reports the table closed.
/// * Every other line is copied with trailing whitespace removed.
///
/// A table still open at the end of the input is emitted unconverted.
pub fn convert_html_tables(lines: &[String]) -> Vec<String> {
    let mut result = Vec::new();
    let mut pending = Vec::new();
    let mut nesting = 0usize;
    let mut inside_table = false;
    let mut inside_fence = false;

    for raw in lines {
        let fence = is_fence(raw);

        if fence {
            if inside_table {
                // A fence interrupts the table: give up on converting it.
                result.append(&mut pending);
                inside_table = false;
                nesting = 0;
            }
            inside_fence = !inside_fence;
        }

        if fence || inside_fence {
            result.push(raw.trim_end().to_string());
            continue;
        }

        if !inside_table && TABLE_START_RE.is_match(raw.trim_start()) {
            inside_table = true;
        }

        if inside_table {
            push_html_line(
                raw,
                &mut pending,
                &mut nesting,
                &mut inside_table,
                &mut result,
            );
        } else {
            result.push(raw.trim_end().to_string());
        }
    }

    // Extending with an empty buffer is a no-op, so no emptiness check is needed.
    result.extend(pending);
    result
}
Loading