diff --git a/Cargo.lock b/Cargo.lock index 936a8d0f..8d889b3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,6 +407,7 @@ dependencies = [ "regex", "rstest", "tempfile", + "textwrap", ] [[package]] @@ -726,6 +727,12 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "smawk" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" + [[package]] name = "string_cache" version = "0.8.9" @@ -798,12 +805,35 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" +[[package]] +name = "textwrap" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" +dependencies = [ + "smawk", + "unicode-linebreak", + "unicode-width", +] + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-linebreak" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" + +[[package]] +name = "unicode-width" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" + [[package]] name = "utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index ac0f5d6e..37dff9c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ clap = { version = "4", features = ["derive"] } regex = "1" html5ever = "0.27" markup5ever_rcdom = "0.3" +textwrap = "^0.16" [dev-dependencies] 
diff --git a/README.md b/README.md index eba25c58..cd6f3190 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # mdtablefix `mdtablefix` reflows Markdown tables so that each column has a uniform width. -It ignores fenced code blocks and respects escaped pipes (`\|`), +It also wraps paragraphs and list items to 80 columns. +The tool ignores fenced code blocks and respects escaped pipes (`\|`), making it safe for mixed content. ## Installation @@ -85,4 +86,3 @@ is organised using the [`rstest`](https://crates.io/crates/rstest) crate. This project is licensed under the ISC license. See the [LICENSE](LICENSE) file for details. - diff --git a/src/html.rs b/src/html.rs index 848966ae..647b8b01 100644 --- a/src/html.rs +++ b/src/html.rs @@ -176,7 +176,9 @@ fn table_lines_to_markdown(lines: &[String]) -> Vec { } /// Buffers a single line of HTML, updating nesting depth and emitting completed -/// tables when an end tag is encountered. +/// tables as Markdown once every opened table has been closed. +/// +/// Tracks the nesting depth of `<table>` tags, appending each line to the buffer. When all opened tables are closed (depth reaches zero), converts the buffered HTML table lines to Markdown and appends them to the output vector. Resets the buffer and updates the HTML state accordingly. fn push_html_line( line: &str, buf: &mut Vec, @@ -184,7 +186,7 @@ fn push_html_line( in_html: &mut bool, out: &mut Vec, ) { - buf.push(line.trim_end().to_string()); + buf.push(line.to_string()); *depth += TABLE_START_RE.find_iter(line).count(); if TABLE_END_RE.is_match(line) { *depth = depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); @@ -196,7 +198,27 @@ fn push_html_line( } } -/// Converts any HTML tables in `lines` to Markdown syntax. +/// Replaces HTML tables in the provided lines with equivalent Markdown table syntax. +/// +/// Scans the input lines for HTML `
` blocks, converts each detected table to Markdown using `table_lines_to_markdown`, and preserves all other content unchanged. Handles nested tables and maintains original line formatting outside of tables. +/// +/// # Arguments +/// +/// * `lines` - A slice of strings representing lines of Markdown, possibly containing HTML tables. +/// +/// # Returns +/// +/// A vector of strings with HTML tables replaced by Markdown tables, leaving other lines intact. +/// +/// # Examples +/// +/// ``` +/// let html_lines = vec![ +/// "
Header
Cell
".to_string() +/// ]; +/// let md_lines = html_table_to_markdown(&html_lines); +/// assert!(md_lines[0].starts_with("| Header |")); +/// ``` pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { let mut out = Vec::new(); let mut buf = Vec::new(); @@ -204,7 +226,7 @@ pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { for line in lines { if depth > 0 || TABLE_START_RE.is_match(line.trim_start()) { - buf.push(line.trim_end().to_string()); + buf.push(line.to_string()); depth += TABLE_START_RE.find_iter(line).count(); if TABLE_END_RE.is_match(line) { depth = depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); @@ -216,7 +238,7 @@ pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { continue; } - out.push(line.trim_end().to_string()); + out.push(line.to_string()); } if !buf.is_empty() { @@ -231,6 +253,22 @@ pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { /// Fenced code blocks are left untouched, allowing raw HTML examples to be /// documented without modification. #[must_use] +/// Converts HTML tables embedded in Markdown lines to Markdown table syntax. +/// +/// Scans the input lines, detects HTML table blocks outside of fenced code blocks, and replaces them with equivalent Markdown tables. Fenced code blocks are left unmodified. Handles nested tables and preserves original line formatting outside of tables. +/// +/// # Examples +/// +/// ``` +/// let lines = vec![ +/// "".to_string(), +/// " ".to_string(), +/// " ".to_string(), +/// "
Header
Cell
".to_string(), +/// ]; +/// let result = convert_html_tables(&lines); +/// assert!(result[0].starts_with("| Header |")); +/// ``` pub fn convert_html_tables(lines: &[String]) -> Vec { let mut out = Vec::new(); let mut buf = Vec::new(); @@ -246,12 +284,12 @@ pub fn convert_html_tables(lines: &[String]) -> Vec { depth = 0; } in_code = !in_code; - out.push(line.trim_end().to_string()); + out.push(line.to_string()); continue; } if in_code { - out.push(line.trim_end().to_string()); + out.push(line.to_string()); continue; } @@ -266,7 +304,7 @@ pub fn convert_html_tables(lines: &[String]) -> Vec { continue; } - out.push(line.trim_end().to_string()); + out.push(line.to_string()); } if !buf.is_empty() { diff --git a/src/lib.rs b/src/lib.rs index d8200344..4662ba64 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ pub use html::convert_html_tables; use regex::Regex; use std::fs; use std::path::Path; +use textwrap::fill; /// Splits a markdown table line into trimmed cell strings. /// @@ -245,11 +246,188 @@ pub fn reflow_table(lines: &[String]) -> Vec { static FENCE_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap()); +static BULLET_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)").unwrap()); + +/// Returns `true` if the line is a fenced code block delimiter (e.g., "```" or "~~~"). +/// +/// # Examples +/// +/// ``` +/// assert!(is_fence("```")); +/// assert!(is_fence("~~~")); +/// assert!(!is_fence("| foo | bar |")); +/// ``` pub(crate) fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } +/// Flushes a buffered paragraph to the output, wrapping text to the specified width and applying indentation. +/// +/// Concatenates buffered lines into a single paragraph, respecting hard line breaks, and writes the wrapped lines to the output vector with the given indentation. Lines are wrapped to the specified width minus the indentation length. 
Hard breaks in the buffer force a line break at that point.
+///
+/// The wrap width is `width` minus the byte length of `indent`. The subtraction
+/// saturates at zero, so an indent wider than `width` cannot underflow.
+fn flush_paragraph(out: &mut Vec<String>, buf: &[(String, bool)], indent: &str, width: usize) {
+    if buf.is_empty() {
+        return;
+    }
+    // Both flush sites below wrap to the same column budget, so compute it once.
+    let avail = width.saturating_sub(indent.len());
+    let mut segment = String::new();
+    for (text, hard_break) in buf {
+        if !segment.is_empty() {
+            segment.push(' ');
+        }
+        segment.push_str(text);
+        if *hard_break {
+            // A hard break ends the segment: emit what has accumulated so the
+            // next buffered line starts on a fresh output line.
+            for line in fill(&segment, avail).lines() {
+                out.push(format!("{indent}{line}"));
+            }
+            segment.clear();
+        }
+    }
+    if !segment.is_empty() {
+        for line in fill(&segment, avail).lines() {
+            out.push(format!("{indent}{line}"));
+        }
+    }
+}
+
+/// Wraps text lines to a specified width, preserving markdown structure.
+///
+/// Paragraphs and list items are reflowed to the given width, while code blocks, tables, headers, and blank lines are left unchanged. Indentation and bullet/numbered list prefixes are preserved. Hard line breaks (two spaces or `
` tags) are respected.
+///
+/// List item text is wrapped to `width` minus the byte length of the item
+/// prefix (saturating at zero); continuation lines are indented with spaces to
+/// the width of that prefix. A list item with no text is passed through
+/// unchanged.
+///
+/// # Parameters
+/// - `lines`: The input lines of markdown text.
+/// - `width`: The maximum line width for wrapping.
+///
+/// # Returns
+/// A vector of strings containing the wrapped and formatted markdown lines.
+///
+/// # Examples
+///
+/// The example is marked `ignore` because `wrap_text` is private and doctests
+/// are compiled against the crate's public API only.
+///
+/// ```rust,ignore
+/// let input = vec![
+///     "This is a long paragraph that should be wrapped to a shorter width.".to_string(),
+///     "".to_string(),
+///     "```".to_string(),
+///     "let x = 42;".to_string(),
+///     "```".to_string(),
+/// ];
+/// let wrapped = wrap_text(&input, 20);
+/// assert_eq!(wrapped[0], "This is a long");
+/// assert_eq!(wrapped[1], "paragraph that");
+/// assert_eq!(wrapped[2], "should be wrapped to");
+/// assert_eq!(wrapped[3], "a shorter width.");
+/// assert_eq!(wrapped[4], "");
+/// assert_eq!(wrapped[5], "```");
+/// assert_eq!(wrapped[6], "let x = 42;");
+/// assert_eq!(wrapped[7], "```");
+/// ```
+fn wrap_text(lines: &[String], width: usize) -> Vec<String> {
+    let mut out = Vec::new();
+    let mut buf: Vec<(String, bool)> = Vec::new();
+    let mut indent = String::new();
+    let mut in_code = false;
+
+    for line in lines {
+        // Fence delimiters end any pending paragraph and toggle code mode.
+        if FENCE_RE.is_match(line) {
+            flush_paragraph(&mut out, &buf, &indent, width);
+            buf.clear();
+            indent.clear();
+            in_code = !in_code;
+            out.push(line.clone());
+            continue;
+        }
+
+        if in_code {
+            out.push(line.clone());
+            continue;
+        }
+
+        // Table rows and separator rows are emitted verbatim.
+        if line.trim_start().starts_with('|') || SEP_RE.is_match(line.trim()) {
+            flush_paragraph(&mut out, &buf, &indent, width);
+            buf.clear();
+            indent.clear();
+            out.push(line.clone());
+            continue;
+        }
+
+        // Lines whose first non-blank character is `#` are emitted verbatim.
+        if line.trim_start().starts_with('#') {
+            flush_paragraph(&mut out, &buf, &indent, width);
+            buf.clear();
+            indent.clear();
+            out.push(line.clone());
+            continue;
+        }
+
+        if line.trim().is_empty() {
+            flush_paragraph(&mut out, &buf, &indent, width);
+            buf.clear();
+            indent.clear();
+            out.push(String::new());
+            continue;
+        }
+
+        if let Some(cap) = BULLET_RE.captures(line) {
+            flush_paragraph(&mut out, &buf, &indent, width);
+            buf.clear();
+            indent.clear();
+            let prefix = cap.get(1).unwrap().as_str();
+            let rest = cap.get(2).unwrap().as_str().trim();
+            if rest.is_empty() {
+                // Wrapping an empty string yields no lines, which would drop
+                // the list item from the output; keep it verbatim instead.
+                out.push(line.clone());
+                continue;
+            }
+            let spaces = " ".repeat(prefix.len());
+            // Saturate so a prefix wider than `width` cannot underflow.
+            let avail = width.saturating_sub(prefix.len());
+            for (i, l) in fill(rest, avail).lines().enumerate() {
+                if i == 0 {
+                    out.push(format!("{prefix}{l}"));
+                } else {
+                    out.push(format!("{spaces}{l}"));
+                }
+            }
+            continue;
+        }
+
+        // The first line of a paragraph fixes the indent for the whole paragraph.
+        if buf.is_empty() {
+            indent = line.chars().take_while(|c| c.is_whitespace()).collect();
+        }
+        let trimmed_end = line.trim_end();
+        // NOTE(review): the docs describe the hard-break marker as two trailing
+        // spaces, but this literal is a single space; confirm it was not
+        // whitespace-collapsed.
+        let hard_break = line.ends_with(" ")
+            || trimmed_end.ends_with("
") + || trimmed_end.ends_with("
") + || trimmed_end.ends_with("
"); + let text = trimmed_end + .trim_end_matches("
") + .trim_end_matches("
") + .trim_end_matches("
") + .trim_end_matches(' ') + .trim_start() + .to_string(); + buf.push((text, hard_break)); + } + + flush_paragraph(&mut out, &buf, &indent, width); + out +} + #[must_use] +/// Processes a stream of markdown lines, converting HTML tables, reflowing markdown tables, and wrapping text to 80 columns. +/// +/// Converts simple HTML tables to markdown, reflows markdown tables for consistent alignment, and wraps paragraphs and list items to 80 characters. Preserves code blocks, headers, and special markdown structures. +/// +/// # Returns +/// +/// A vector of processed markdown lines with tables fixed and text wrapped. +/// +/// # Examples +/// +/// ``` +/// let input = vec![ +/// "
foobar
".to_string(), +/// "| a | b |".to_string(), +/// "|---|---|".to_string(), +/// "| 1 | 2 |".to_string(), +/// "".to_string(), +/// "A paragraph that will be wrapped to fit within eighty columns. This sentence is intentionally long to demonstrate wrapping.".to_string(), +/// ]; +/// let output = process_stream(&input); +/// assert!(output.iter().any(|line| line.contains("| foo | bar |"))); +/// assert!(output.iter().any(|line| line.len() <= 80)); +/// ``` pub fn process_stream(lines: &[String]) -> Vec { let pre = html::convert_html_tables(lines); @@ -269,12 +447,12 @@ pub fn process_stream(lines: &[String]) -> Vec { buf.clear(); } in_code = !in_code; - out.push(line.trim_end().to_string()); + out.push(line.to_string()); continue; } if in_code { - out.push(line.trim_end().to_string()); + out.push(line.to_string()); continue; } @@ -296,7 +474,7 @@ pub fn process_stream(lines: &[String]) -> Vec { in_table = false; } - out.push(line.trim_end().to_string()); + out.push(line.to_string()); } if !buf.is_empty() { @@ -307,7 +485,7 @@ pub fn process_stream(lines: &[String]) -> Vec { } } - out + wrap_text(&out, 80) } /// Rewrite a file in place with fixed tables. diff --git a/tests/integration.rs b/tests/integration.rs index a3962bc4..cc723142 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -476,6 +476,10 @@ fn test_logical_type_table_output_matches() { } #[test] +/// Verifies that reflowing the option table input produces the expected output. +/// +/// Loads the input and expected output from external files and asserts that the +/// `reflow_table` function transforms the input table to match the expected result. fn test_option_table_output_matches() { let input: Vec = include_str!("data/option_table_input.txt") .lines() @@ -487,3 +491,68 @@ fn test_option_table_output_matches() { .collect(); assert_eq!(reflow_table(&input), expected); } + +#[test] +/// Tests that long paragraphs are wrapped at 80 columns by `process_stream`. 
+///
+/// Ensures that a single long paragraph is split into multiple lines, each not exceeding 80 characters.
+fn test_wrap_paragraph() {
+    let input = vec![
+        "This is a very long paragraph that should be wrapped at eighty columns \
+         so it needs to contain enough words to exceed that limit."
+            .to_string(),
+    ];
+    let output = process_stream(&input);
+    assert!(output.len() > 1);
+    assert!(output.iter().all(|l| l.len() <= 80));
+}
+
+#[test]
+/// Tests that an over-long bullet item is wrapped with a hanging indent.
+///
+/// The first output line keeps the `- ` marker, every continuation line is
+/// indented by two spaces (the width of that marker), and no line exceeds 80
+/// characters.
+fn test_wrap_list_item() {
+    let input = vec![
+        r"- This bullet item is exceptionally long and must be wrapped to keep prefix formatting intact."
+            .to_string(),
+    ];
+    let output = process_stream(&input);
+    assert!(output.len() > 1);
+    assert!(output[0].starts_with("- "));
+    for line in &output {
+        assert!(line.len() <= 80);
+    }
+    for line in output.iter().skip(1) {
+        assert!(line.starts_with("  "));
+    }
+}
+
+#[test]
+/// Verifies that short list items are not wrapped or altered by the stream processing logic.
+///
+/// Ensures that a single-line bullet list item remains unchanged after processing.
+///
+/// # Examples
+///
+/// ```
+/// let input = vec!["- short item".to_string()];
+/// let output = process_stream(&input);
+/// assert_eq!(output, input);
+/// ```
+fn test_wrap_short_list_item() {
+    let input = vec!["- short item".to_string()];
+    let output = process_stream(&input);
+    assert_eq!(output, input);
+}
+
+#[test]
+/// Tests that a Markdown hard line break (two trailing spaces) keeps the following line separate.
+///
+/// The two input lines must not be merged into a single paragraph. The
+/// trailing-space marker itself is stripped from the output, as the exact-text
+/// assertions below show.
+fn test_preserve_hard_line_breaks() {
+    let input = vec![
+        "Line one with break.  ".to_string(),
+        "Line two follows.".to_string(),
+    ];
+    let output = process_stream(&input);
+    assert_eq!(output.len(), 2);
+    assert_eq!(output[0], "Line one with break.");
+    assert_eq!(output[1], "Line two follows.");
+}