diff --git a/src/breaks.rs b/src/breaks.rs new file mode 100644 index 00000000..b24e3db5 --- /dev/null +++ b/src/breaks.rs @@ -0,0 +1,64 @@ +//! Thematic break formatting utilities. + +use regex::Regex; + +use crate::wrap::is_fence; + +pub const THEMATIC_BREAK_LEN: usize = 70; + +static THEMATIC_BREAK_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| { + Regex::new(r"^[ ]{0,3}((?:[ \t]*\*){3,}|(?:[ \t]*-){3,}|(?:[ \t]*_){3,})[ \t]*$").unwrap() +}); + +static THEMATIC_BREAK_LINE: std::sync::LazyLock = + std::sync::LazyLock::new(|| "_".repeat(THEMATIC_BREAK_LEN)); + +#[must_use] +pub fn format_breaks(lines: &[String]) -> Vec { + let mut out = Vec::with_capacity(lines.len()); + let mut in_code = false; + + for line in lines { + if is_fence(line) { + in_code = !in_code; + out.push(line.clone()); + continue; + } + + if !in_code && THEMATIC_BREAK_RE.is_match(line.trim_end()) { + out.push(THEMATIC_BREAK_LINE.clone()); + } else { + out.push(line.clone()); + } + } + + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_formatting() { + let input = vec!["foo", "***", "bar"] + .into_iter() + .map(str::to_string) + .collect::>(); + let expected = vec![ + "foo".to_string(), + "_".repeat(THEMATIC_BREAK_LEN), + "bar".to_string(), + ]; + assert_eq!(format_breaks(&input), expected); + } + + #[test] + fn ignores_fenced_code() { + let input = vec!["```", "---", "```"] + .into_iter() + .map(str::to_string) + .collect::>(); + assert_eq!(format_breaks(&input), input); + } +} diff --git a/src/html.rs b/src/html.rs index f86a7754..87e24c8b 100644 --- a/src/html.rs +++ b/src/html.rs @@ -12,7 +12,7 @@ use html5ever::{driver::ParseOpts, parse_document, tendril::TendrilSink}; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use regex::Regex; -use crate::is_fence; +use crate::wrap::is_fence; /// Matches the start of an HTML `` tag, ignoring case. static TABLE_START_RE: LazyLock = diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 00000000..e4c71c7a --- /dev/null +++ b/src/io.rs @@ -0,0 +1,54 @@ +//! File helpers for rewriting Markdown documents. + +use std::{fs, path::Path}; + +use crate::process::{process_stream, process_stream_no_wrap}; + +/// Rewrite a file in place with wrapped tables. +/// +/// # Errors +/// Returns an error if reading or writing the file fails. +pub fn rewrite(path: &Path) -> std::io::Result<()> { + let text = fs::read_to_string(path)?; + let lines: Vec = text.lines().map(str::to_string).collect(); + let fixed = process_stream(&lines); + fs::write(path, fixed.join("\n") + "\n") +} + +/// Rewrite a file in place without wrapping text. +/// +/// # Errors +/// Returns an error if reading or writing the file fails. +pub fn rewrite_no_wrap(path: &Path) -> std::io::Result<()> { + let text = fs::read_to_string(path)?; + let lines: Vec = text.lines().map(str::to_string).collect(); + let fixed = process_stream_no_wrap(&lines); + fs::write(path, fixed.join("\n") + "\n") +} + +#[cfg(test)] +mod tests { + use tempfile::tempdir; + + use super::*; + + #[test] + fn rewrite_roundtrip() { + let dir = tempdir().unwrap(); + let file = dir.path().join("sample.md"); + fs::write(&file, "|A|B|\n|1|2|").unwrap(); + rewrite(&file).unwrap(); + let out = fs::read_to_string(&file).unwrap(); + assert!(out.contains("| A | B |")); + } + + #[test] + fn rewrite_no_wrap_roundtrip() { + let dir = tempdir().unwrap(); + let file = dir.path().join("sample.md"); + fs::write(&file, "|A|B|\n|1|2|").unwrap(); + rewrite_no_wrap(&file).unwrap(); + let out = fs::read_to_string(&file).unwrap(); + assert_eq!(out, "| A | B |\n| 1 | 2 |\n"); + } +} diff --git a/src/lib.rs b/src/lib.rs index 8ecc9261..2096080a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,22 @@ -//! Library for fixing markdown tables. +//! Library for fixing Markdown tables and wrapping text. //! -//! Functions here reflow tables that were broken during formatting. -//! The [`convert_html_tables`] helper is re-exported at the crate root so -//! callers can convert simple HTML tables before reflowing. - +//! Modules: +//! - `html` for converting HTML tables. +//! - `table` for Markdown table alignment. +//! - `wrap` for paragraph wrapping. +//! - `lists` for renumbering ordered lists. +//! - `breaks` for thematic break formatting. +//! - `process` for stream processing. +//! - `io` for file helpers. + +pub mod breaks; mod html; +pub mod io; +pub mod lists; +pub mod process; mod reflow; +pub mod table; +pub mod wrap; #[doc(hidden)] #[must_use] @@ -13,956 +24,10 @@ pub fn html_table_to_markdown(lines: &[String]) -> Vec { html::html_table_to_markdown(lines) } -use std::{fs, path::Path}; - +pub use breaks::{THEMATIC_BREAK_LEN, format_breaks}; pub use html::convert_html_tables; -use regex::Regex; - -/// Splits a markdown table line into trimmed cell strings. -/// -/// Removes leading and trailing pipe characters, splits the line by pipes, trims whitespace from -/// each cell, and returns the resulting cell strings as a vector. -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::split_cells; -/// let line = "| cell1 | cell2 | cell3 |"; -/// let cells = split_cells(line); -/// assert_eq!(cells, vec!["cell1", "cell2", "cell3"]); -/// ``` -fn next_is_pipe(chars: &mut std::iter::Peekable>) -> bool { - chars.peek() == Some(&'|') -} -#[must_use] -pub fn split_cells(line: &str) -> Vec { - let mut s = line.trim(); - if let Some(stripped) = s.strip_prefix('|') { - s = stripped; - } - if let Some(stripped) = s.strip_suffix('|') { - s = stripped; - } - - let mut cells = Vec::new(); - let mut current = String::new(); - let mut chars = s.chars().peekable(); - while let Some(ch) = chars.next() { - if ch == '\\' { - if next_is_pipe(&mut chars) { - // `\|` escapes the pipe so it becomes part of the cell - chars.next(); - current.push('|'); - continue; - } - current.push(ch); - continue; - } - if ch == '|' { - cells.push(current.trim().to_string()); - current.clear(); - } else { - current.push(ch); - } - } - cells.push(current.trim().to_string()); - cells -} - -/// Formats the cells for a separator row based on column widths. -fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec { - if sep_cells.len() != widths.len() { - // A malformed separator row could cause a panic below when indexing - // `widths`. Return the cells unchanged so the caller can decide how to - // handle the mismatch gracefully. - return sep_cells.to_vec(); - } - - sep_cells - .iter() - .enumerate() - .map(|(i, cell)| { - let trimmed = cell.trim(); - let left = trimmed.starts_with(':'); - let right = trimmed.ends_with(':'); - let mut dashes = "-".repeat(widths[i].max(3)); - if left { - dashes.remove(0); - dashes.insert(0, ':'); - } - if right { - dashes.pop(); - dashes.push(':'); - } - dashes - }) - .collect() -} - -/// Returns the separator index if it lies within `len`. -fn sep_index_within(idx: Option, len: usize) -> Option { - match idx { - Some(i) if i < len => Some(i), - _ => None, - } -} - -/// Returns `true` if rows have mismatched lengths when not split within lines. -fn rows_mismatched(rows: &[Vec], split_within_line: bool) -> bool { - if split_within_line { - return false; - } - let Some(first_len) = rows.first().map(Vec::len) else { - return false; - }; - rows.iter() - .skip(1) - .any(|row| row.len() != first_len && !row.iter().all(|c| SEP_RE.is_match(c))) -} - -/// Reflow a broken markdown table. -/// -/// # Panics -/// Panics if the internal regex fails to compile. -/// Reflows a broken markdown table into properly aligned rows and columns. -/// -/// Takes a slice of strings representing lines of a markdown table, reconstructs the table by -/// splitting and aligning cells, and returns the reflowed table as a vector of strings. If the rows -/// have inconsistent numbers of non-empty columns, the original lines are returned unchanged. -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::reflow_table; -/// let lines = vec!["| a | b |".to_string(), "| c | d |".to_string()]; -/// let fixed = reflow_table(&lines); -/// assert_eq!( -/// fixed, -/// vec!["| a | b |".to_string(), "| c | d |".to_string(),] -/// ); -/// ``` -pub(crate) static SEP_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap()); - -#[must_use] -pub fn reflow_table(lines: &[String]) -> Vec { - if lines.is_empty() { - return Vec::new(); - } - - let indent: String = lines[0].chars().take_while(|c| c.is_whitespace()).collect(); - let mut trimmed: Vec = lines - .iter() - .map(|l| l.trim().to_string()) - .filter(|l| !l.trim_start().starts_with("\\-")) - .collect(); - let sep_idx = trimmed.iter().position(|l| SEP_RE.is_match(l)); - let sep_line = sep_idx.map(|idx| trimmed.remove(idx)); - - let (rows, split_within_line) = reflow::parse_rows(&trimmed); - - // Count every cell, even if it is empty, to preserve column - // positions when checking for consistency across rows. - let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0); - - let (sep_cells, sep_row_idx) = reflow::detect_separator(sep_line.as_ref(), &rows, max_cols); - - let cleaned = reflow::clean_rows(rows); - - let mut output_rows = cleaned.clone(); - if let Some(idx) = sep_index_within(sep_row_idx, output_rows.len()) { - output_rows.remove(idx); - } - - if rows_mismatched(&cleaned, split_within_line) { - return lines.to_vec(); - } - - let widths = reflow::calculate_widths(&cleaned, max_cols); - - let out = reflow::format_rows(output_rows, &widths, &indent); - - reflow::insert_separator(out, sep_cells, &widths, &indent) -} - -/// Processes a stream of markdown lines, reflowing tables while preserving code blocks and other -/// content. -/// -/// Detects fenced code blocks and avoids modifying their contents. Buffers lines that appear to be -/// part of a markdown table and reflows them when the table ends. Non-table lines and code blocks -/// are output unchanged. -/// -/// # Returns -/// -/// A vector of strings representing the processed markdown document with tables reflowed. -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::process_stream; -/// let input = vec![ -/// "| a | b |".to_string(), -/// "|---|---|".to_string(), -/// "| 1 | 2 |".to_string(), -/// "".to_string(), -/// "```".to_string(), -/// "code block".to_string(), -/// "```".to_string(), -/// ]; -/// let output = process_stream(&input); -/// assert_eq!(output[0], "| a | b |"); -/// assert_eq!(output[1], "| --- | --- |"); -/// assert_eq!(output[2], "| 1 | 2 |"); -/// assert_eq!(output[3], ""); -/// assert_eq!(output[4], "```"); -/// assert_eq!(output[5], "code block"); -/// assert_eq!(output[6], "```"); -/// ``` -static FENCE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap()); - -static BULLET_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)").unwrap()); - -static NUMBERED_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*)([1-9][0-9]*)\.(\s+)(.*)").unwrap()); - -static FOOTNOTE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$").unwrap()); - -static BLOCKQUOTE_RE: std::sync::LazyLock = - std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:>\s*)+)(.*)$").unwrap()); - -/// Parses a line beginning with a numbered list marker. -/// -/// Returns the indentation prefix, separator following the number, and the -/// remainder of the line if `line` matches the numbered list pattern. -#[doc(hidden)] -fn parse_numbered(line: &str) -> Option<(&str, &str, &str)> { - let cap = NUMBERED_RE.captures(line)?; - let indent = cap.get(1)?.as_str(); - let sep = cap.get(3)?.as_str(); - let rest = cap.get(4)?.as_str(); - Some((indent, sep, rest)) -} - -/// Returns the effective indentation length treating tabs as four spaces. -#[doc(hidden)] -fn indent_len(indent: &str) -> usize { - indent - .chars() - .fold(0, |acc, ch| acc + if ch == '\t' { 4 } else { 1 }) -} - -#[doc(hidden)] -fn drop_deeper(indent: usize, counters: &mut Vec<(usize, usize)>) { - while counters.last().is_some_and(|(d, _)| *d > indent) { - counters.pop(); - } -} - -fn tokenize_markdown(text: &str) -> Vec { - let mut tokens = Vec::new(); - let chars: Vec = text.chars().collect(); - let mut i = 0; - while i < chars.len() { - let c = chars[i]; - if c.is_whitespace() { - let start = i; - while i < chars.len() && chars[i].is_whitespace() { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } else if c == '`' { - let start = i; - let mut delim_len = 0; - while i < chars.len() && chars[i] == '`' { - i += 1; - delim_len += 1; - } - let mut end = i; - while end < chars.len() { - if chars[end] == '`' { - let mut j = end; - let mut count = 0; - while j < chars.len() && chars[j] == '`' { - j += 1; - count += 1; - } - if count == delim_len { - end = j; - break; - } - } - end += 1; - } - if end >= chars.len() { - tokens.push(chars[start..start + delim_len].iter().collect()); - i = start + delim_len; - } else { - tokens.push(chars[start..end].iter().collect()); - i = end; - } - } else { - let start = i; - while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { - i += 1; - } - tokens.push(chars[start..i].iter().collect()); - } - } - tokens -} - -/// Width of a normalised thematic break. -/// The width used when rewriting thematic breaks. -pub const THEMATIC_BREAK_LEN: usize = 70; - -static THEMATIC_BREAK_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| { - Regex::new(r"^[ ]{0,3}((?:[ \t]*\*){3,}|(?:[ \t]*-){3,}|(?:[ \t]*_){3,})[ \t]*$").unwrap() -}); - -static THEMATIC_BREAK_LINE: std::sync::LazyLock = - std::sync::LazyLock::new(|| "_".repeat(THEMATIC_BREAK_LEN)); - -fn wrap_preserving_code(text: &str, width: usize) -> Vec { - use unicode_width::UnicodeWidthStr; - - let mut lines = Vec::new(); - let mut current = String::new(); - let mut current_width = 0; - for token in tokenize_markdown(text) { - let token_width = UnicodeWidthStr::width(token.as_str()); - if current_width + token_width <= width { - current.push_str(&token); - current_width += token_width; - continue; - } - - let trimmed = current.trim_end(); - if !trimmed.is_empty() { - lines.push(trimmed.to_string()); - } - current.clear(); - current_width = 0; - - if !token.chars().all(char::is_whitespace) { - current.push_str(&token); - current_width = token_width; - } - } - let trimmed = current.trim_end(); - if !trimmed.is_empty() { - lines.push(trimmed.to_string()); - } - lines -} - -/// Returns `true` if the line is a fenced code block delimiter (e.g., three backticks or "~~~"). -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::is_fence; -/// assert!(is_fence("```")); -/// assert!(is_fence("~~~")); -/// assert!(!is_fence("| foo | bar |")); -/// ``` -#[doc(hidden)] -pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } - -/// Replaces spaces within inline code spans with non-breaking spaces. -/// -/// Inline code spans are delimited by matching pairs of backticks. This helper -/// replaces normal spaces inside those spans with `U+00A0` (non-breaking space) -/// so that the wrapping logic does not split them across lines. -/// Flushes a buffered paragraph to the output, wrapping text to the specified width and applying -/// indentation. -/// -/// Concatenates buffered lines into a single paragraph, respecting hard line breaks, and writes the -/// wrapped lines to the output vector with the given indentation. Lines are wrapped to the -/// specified width minus the indentation length. Hard breaks in the buffer force a line break at -/// that point. -fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { - if buf.is_empty() { - return; - } - let mut segment = String::new(); - for (text, hard_break) in buf { - if !segment.is_empty() { - segment.push(' '); - } - segment.push_str(text); - if *hard_break { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - segment.clear(); - } - } - if !segment.is_empty() { - for line in wrap_preserving_code(&segment, width - indent.len()) { - out.push(format!("{indent}{line}")); - } - } -} - -fn append_wrapped_with_prefix( - out: &mut Vec, - prefix: &str, - text: &str, - width: usize, - repeat_prefix: bool, -) { - use unicode_width::UnicodeWidthStr; - - let prefix_width = UnicodeWidthStr::width(prefix); - let available = width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(text, available); - if lines.is_empty() { - out.push(prefix.to_string()); - return; - } - - for (i, line) in lines.iter().enumerate() { - if i == 0 { - out.push(format!("{prefix}{line}")); - } else { - out.push(format!("{wrapped_indent}{line}")); - } - } -} - -fn handle_prefix_line( - out: &mut Vec, - buf: &mut Vec<(String, bool)>, - indent: &mut String, - width: usize, - prefix: &str, - rest: &str, - repeat_prefix: bool, -) { - flush_paragraph(out, buf, indent, width); - buf.clear(); - indent.clear(); - append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); -} - -/// Wraps text lines to a specified width, preserving markdown structure. -/// -/// Paragraphs and list items are reflowed to the given width, while code blocks, tables, headers, -/// and blank lines are left unchanged. Indentation and bullet/numbered list prefixes are preserved. -/// Hard line breaks (two spaces or `
` tags) are respected. -/// -/// # Parameters -/// - `lines`: The input lines of markdown text. -/// - `width`: The maximum line width for wrapping. -/// -/// # Returns -/// A vector of strings containing the wrapped and formatted markdown lines. -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::wrap_text; -/// let input = vec![ -/// "This is a long paragraph that should be wrapped to a shorter width.".to_string(), -/// "".to_string(), -/// "```".to_string(), -/// "let x = 42;".to_string(), -/// "```".to_string(), -/// ]; -/// let wrapped = wrap_text(&input, 20); -/// assert_eq!(wrapped[0], "This is a long"); -/// assert_eq!(wrapped[1], "paragraph that should"); -/// assert_eq!(wrapped[2], "be wrapped to a"); -/// assert_eq!(wrapped[3], "shorter width."); -/// assert_eq!(wrapped[4], ""); -/// assert_eq!(wrapped[5], "```"); -/// assert_eq!(wrapped[6], "let x = 42;"); -/// assert_eq!(wrapped[7], "```"); -/// ``` -#[doc(hidden)] -pub fn wrap_text(lines: &[String], width: usize) -> Vec { - let mut out = Vec::new(); - let mut buf: Vec<(String, bool)> = Vec::new(); - let mut indent = String::new(); - let mut in_code = false; - - for line in lines { - if FENCE_RE.is_match(line) { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - in_code = !in_code; - out.push(line.clone()); - continue; - } - - if in_code { - out.push(line.clone()); - continue; - } - - if line.trim_start().starts_with('|') || SEP_RE.is_match(line.trim()) { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - out.push(line.clone()); - continue; - } - - if line.trim_start().starts_with('#') { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - out.push(line.clone()); - continue; - } - - if line.trim().is_empty() { - flush_paragraph(&mut out, &buf, &indent, width); - buf.clear(); - indent.clear(); - out.push(String::new()); - continue; - } - - if let Some(cap) = BULLET_RE.captures(line) { - let prefix = cap.get(1).unwrap().as_str(); - let rest = cap.get(2).unwrap().as_str(); - handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, false); - continue; - } - - if let Some(cap) = FOOTNOTE_RE.captures(line) { - let indent_part = cap.get(1).unwrap().as_str(); - let label_part = cap.get(2).unwrap().as_str(); - let prefix = format!("{indent_part}{label_part}"); - let rest = cap.get(3).unwrap().as_str(); - handle_prefix_line(&mut out, &mut buf, &mut indent, width, &prefix, rest, false); - continue; - } - - if let Some(cap) = BLOCKQUOTE_RE.captures(line) { - let prefix = cap.get(1).unwrap().as_str(); - let rest = cap.get(2).unwrap().as_str(); - handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, true); - continue; - } - - if buf.is_empty() { - indent = line.chars().take_while(|c| c.is_whitespace()).collect(); - } - let trimmed_end = line.trim_end(); - let hard_break = line.ends_with(" ") - || trimmed_end.ends_with("
") - || trimmed_end.ends_with("
") - || trimmed_end.ends_with("
"); - let text = trimmed_end - .trim_end_matches("
") - .trim_end_matches("
") - .trim_end_matches("
") - .trim_end_matches(' ') - .trim_start() - .to_string(); - buf.push((text, hard_break)); - } - - flush_paragraph(&mut out, &buf, &indent, width); - out -} - -#[must_use] -/// Processes a stream of markdown lines, converting HTML tables, reflowing markdown tables, and -/// wrapping text to 80 columns. -/// -/// Converts simple HTML tables to markdown, reflows markdown tables for consistent alignment, and -/// wraps paragraphs and list items to 80 characters. Preserves code blocks, headers, and special -/// markdown structures. -/// -/// # Returns -/// -/// A vector of processed markdown lines with tables fixed and text wrapped. -/// -/// # Examples -/// -/// ```no_run -/// use mdtablefix::process_stream; -/// let input = vec![ -/// "
foobar
".to_string(), -/// "| a | b |".to_string(), -/// "|---|---|".to_string(), -/// "| 1 | 2 |".to_string(), -/// "".to_string(), -/// "A paragraph that will be wrapped to fit within eighty columns. This sentence is \ -/// intentionally long to demonstrate wrapping." -/// .to_string(), -/// ]; -/// let output = process_stream(&input); -/// assert!(output.iter().any(|line| line.contains("| foo | bar |"))); -/// assert!(output.iter().any(|line| line.len() <= 80)); -/// ``` -fn process_stream_inner(lines: &[String], wrap: bool) -> Vec { - let pre = html::convert_html_tables(lines); - - let mut out = Vec::new(); - let mut buf = Vec::new(); - let mut in_code = false; - let mut in_table = false; - - for line in &pre { - if FENCE_RE.is_match(line) { - if !buf.is_empty() { - if in_table { - out.extend(reflow_table(&buf)); - } else { - out.extend(buf.clone()); - } - buf.clear(); - } - in_code = !in_code; - out.push(line.to_string()); - continue; - } - - if in_code { - out.push(line.to_string()); - continue; - } - - if line.trim_start().starts_with('|') { - if !in_table { - in_table = true; - } - buf.push(line.trim_end().to_string()); - continue; - } - - if in_table && !line.trim().is_empty() { - buf.push(line.trim_end().to_string()); - continue; - } - - if !buf.is_empty() { - if in_table { - out.extend(reflow_table(&buf)); - } else { - out.extend(buf.clone()); - } - buf.clear(); - in_table = false; - } - - out.push(line.to_string()); - } - - if !buf.is_empty() { - if in_table { - out.extend(reflow_table(&buf)); - } else { - out.extend(buf); - } - } - - if wrap { wrap_text(&out, 80) } else { out } -} - -#[must_use] -/// Renumbers ordered list items in Markdown text. -/// -/// Lines matching `^\s*[1-9][0-9]*\.\s+` are renumbered sequentially within -/// their indentation level. Numbering continues across fenced code blocks -/// without resetting. -/// -/// # Examples -/// ``` -/// use mdtablefix::renumber_lists; -/// -/// let lines = vec!["1. foo", "4. bar"] -/// .into_iter() -/// .map(str::to_string) -/// .collect::>(); -/// assert_eq!( -/// renumber_lists(&lines), -/// vec!["1. foo", "2. bar"] -/// .into_iter() -/// .map(str::to_string) -/// .collect::>() -/// ); -/// ``` -/// -/// # Panics -/// Panics if the internal counter stack is empty when a numbered line is -/// encountered. This indicates a logic error. -pub fn renumber_lists(lines: &[String]) -> Vec { - let mut out = Vec::with_capacity(lines.len()); - let mut counters: Vec<(usize, usize)> = Vec::new(); - let mut in_code = false; - - for line in lines { - if FENCE_RE.is_match(line) { - in_code = !in_code; - out.push(line.clone()); - continue; - } - - if in_code { - out.push(line.clone()); - continue; - } - - if let Some((indent_str, sep, rest)) = parse_numbered(line) { - let indent = indent_len(indent_str); - drop_deeper(indent, &mut counters); - let current = match counters.last_mut() { - Some((d, cnt)) if *d == indent => { - *cnt += 1; - *cnt - } - _ => { - counters.push((indent, 1)); - 1 - } - }; - out.push(format!("{indent_str}{current}.{sep}{rest}")); - continue; - } - - // Avoid allocating when just measuring indentation - let indent_end = line - .char_indices() - .find(|&(_, c)| !c.is_whitespace()) - .map_or_else(|| line.len(), |(i, _)| i); - let indent_str = &line[..indent_end]; - let indent = indent_len(indent_str); - drop_deeper(indent, &mut counters); - out.push(line.clone()); - } - - out -} - -#[must_use] -/// Reformat thematic breaks as 70 underscores. -/// -/// Thematic breaks are lines composed of three or more matching `-`, `_`, or -/// `*` characters (optionally separated by spaces or tabs) with up to three -/// leading spaces. Lines inside fenced code blocks are ignored. -pub fn format_breaks(lines: &[String]) -> Vec { - let mut out = Vec::with_capacity(lines.len()); - let mut in_code = false; - - for line in lines { - if FENCE_RE.is_match(line) { - in_code = !in_code; - out.push(line.clone()); - continue; - } - - if !in_code && THEMATIC_BREAK_RE.is_match(line.trim_end()) { - out.push(THEMATIC_BREAK_LINE.clone()); - } else { - out.push(line.clone()); - } - } - - out -} - -#[must_use] -pub fn process_stream(lines: &[String]) -> Vec { process_stream_inner(lines, true) } - -#[must_use] -pub fn process_stream_no_wrap(lines: &[String]) -> Vec { - process_stream_inner(lines, false) -} - -/// Rewrite a file in place with fixed tables. -/// -/// # Errors -/// Reads a markdown file, reflows any broken tables within it, and writes the updated content back -/// to the same file. -/// -/// Returns an error if the file cannot be read or written. -/// -/// # Examples -/// -/// ```no_run -/// use std::path::Path; -/// -/// use mdtablefix::rewrite; -/// let path = Path::new("example.md"); -/// rewrite(path).unwrap(); -/// ``` -pub fn rewrite(path: &Path) -> std::io::Result<()> { - let text = fs::read_to_string(path)?; - let lines: Vec = text.lines().map(str::to_string).collect(); - let fixed = process_stream(&lines); - fs::write(path, fixed.join("\n") + "\n") -} - -/// Rewrite a file in place with fixed tables without wrapping text. -/// -/// # Errors -/// Returns an error if the file cannot be read or written. -pub fn rewrite_no_wrap(path: &Path) -> std::io::Result<()> { - let text = fs::read_to_string(path)?; - let lines: Vec = text.lines().map(str::to_string).collect(); - let fixed = process_stream_no_wrap(&lines); - fs::write(path, fixed.join("\n") + "\n") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn sep_index_within_bounds() { - assert_eq!(sep_index_within(Some(1), 3), Some(1)); - assert_eq!(sep_index_within(Some(3), 3), None); - assert_eq!(sep_index_within(None, 3), None); - } - - #[test] - fn detect_row_mismatch() { - let rows = vec![ - vec!["a".to_string(), "b".to_string()], - vec!["1".to_string(), "2".to_string()], - ]; - assert!(!rows_mismatched(&rows, false)); - - let mismatch = vec![ - vec!["a".to_string(), "b".to_string()], - vec!["1".to_string()], - ]; - assert!(rows_mismatched(&mismatch, false)); - - let with_sep = vec![ - vec!["a".to_string(), "b".to_string()], - vec!["---".to_string(), "---".to_string()], - vec!["1".to_string(), "2".to_string()], - ]; - assert!(!rows_mismatched(&with_sep, false)); - - assert!(!rows_mismatched(&mismatch, true)); - } - - #[test] - fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); - } - - #[test] - fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat." - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" - .to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." - .to_string(), - ] - ); - } - - #[test] - fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's `interpreter`".to_string(), - "field overrides the setting.".to_string(), - ] - ); - } - - #[test] - fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); - } - - #[test] - fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "Use `` `code` `` to".to_string(), - "quote backticks".to_string() - ] - ); - } - - #[test] - fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a `dangling".to_string(), "code span.".to_string()] - ); - } - - /// Validate that URLs are not broken by re-wrapping paragraphs containing hyperlinks. - #[test] - fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured" - .to_string(), - "approach to asynchronous WebSocket routing and background worker integration." - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!( - wrapped - .iter() - .any(|l| l.contains("https://falcon.readthedocs.io")) - ); - } -} +pub use io::{rewrite, rewrite_no_wrap}; +pub use lists::renumber_lists; +pub use process::{process_stream, process_stream_no_wrap}; +pub use table::{reflow_table, split_cells}; +pub use wrap::{is_fence, wrap_text}; diff --git a/src/lists.rs b/src/lists.rs new file mode 100644 index 00000000..5d001074 --- /dev/null +++ b/src/lists.rs @@ -0,0 +1,106 @@ +//! Ordered list renumbering utilities. + +use regex::Regex; + +use crate::wrap::is_fence; + +fn parse_numbered(line: &str) -> Option<(&str, &str, &str)> { + static NUMBERED_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(\s*)([1-9][0-9]*)\.(\s+)(.*)").unwrap()); + let cap = NUMBERED_RE.captures(line)?; + let indent = cap.get(1)?.as_str(); + let sep = cap.get(3)?.as_str(); + let rest = cap.get(4)?.as_str(); + Some((indent, sep, rest)) +} + +fn indent_len(indent: &str) -> usize { + indent + .chars() + .fold(0, |acc, ch| acc + if ch == '\t' { 4 } else { 1 }) +} + +fn drop_deeper(indent: usize, counters: &mut Vec<(usize, usize)>) { + while counters.last().is_some_and(|(d, _)| *d > indent) { + counters.pop(); + } +} + +#[must_use] +pub fn renumber_lists(lines: &[String]) -> Vec { + let mut out = Vec::with_capacity(lines.len()); + let mut counters: Vec<(usize, usize)> = Vec::new(); + let mut in_code = false; + + for line in lines { + if is_fence(line) { + in_code = !in_code; + out.push(line.clone()); + continue; + } + + if in_code { + out.push(line.clone()); + continue; + } + + if let Some((indent_str, sep, rest)) = parse_numbered(line) { + let indent = indent_len(indent_str); + drop_deeper(indent, &mut counters); + let current = match counters.last_mut() { + Some((d, cnt)) if *d == indent => { + *cnt += 1; + *cnt + } + _ => { + counters.push((indent, 1)); + 1 + } + }; + out.push(format!("{indent_str}{current}.{sep}{rest}")); + continue; + } + + let indent_end = line + .char_indices() + .find(|&(_, c)| !c.is_whitespace()) + .map_or_else(|| line.len(), |(i, _)| i); + let indent_str = &line[..indent_end]; + let indent = indent_len(indent_str); + drop_deeper(indent, &mut counters); + out.push(line.clone()); + } + + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_renumber() { + let input = vec!["1. a", "3. b"] + .into_iter() + .map(str::to_string) + .collect::>(); + let expected = vec!["1. a", "2. b"] + .into_iter() + .map(str::to_string) + .collect::>(); + assert_eq!(renumber_lists(&input), expected); + } + + #[test] + fn nested_renumber() { + let input = vec!["1. a", " 1. sub", " 3. sub2", "2. b"] + .into_iter() + .map(str::to_string) + .collect::>(); + let expected = vec!["1. a", " 1. sub", " 2. sub2", "2. b"] + .into_iter() + .map(str::to_string) + .collect::>(); + assert_eq!(renumber_lists(&input), expected); + } +} diff --git a/src/process.rs b/src/process.rs new file mode 100644 index 00000000..55e0294e --- /dev/null +++ b/src/process.rs @@ -0,0 +1,106 @@ +//! High-level Markdown stream processing. + +use crate::{ + html::convert_html_tables, + table::reflow_table, + wrap::{self, wrap_text}, +}; + +#[must_use] +pub fn process_stream_inner(lines: &[String], wrap: bool) -> Vec { + let pre = convert_html_tables(lines); + + let mut out = Vec::new(); + let mut buf = Vec::new(); + let mut in_code = false; + let mut in_table = false; + + for line in &pre { + if wrap::is_fence(line) { + if !buf.is_empty() { + if in_table { + out.extend(reflow_table(&buf)); + } else { + out.extend(buf.clone()); + } + buf.clear(); + } + in_code = !in_code; + out.push(line.to_string()); + continue; + } + + if in_code { + out.push(line.to_string()); + continue; + } + + if line.trim_start().starts_with('|') { + if !in_table { + in_table = true; + } + buf.push(line.trim_end().to_string()); + continue; + } + + if in_table && !line.trim().is_empty() { + buf.push(line.trim_end().to_string()); + continue; + } + + if !buf.is_empty() { + if in_table { + out.extend(reflow_table(&buf)); + } else { + out.extend(buf.clone()); + } + buf.clear(); + in_table = false; + } + + out.push(line.to_string()); + } + + if !buf.is_empty() { + if in_table { + out.extend(reflow_table(&buf)); + } else { + out.extend(buf); + } + } + + if wrap { wrap_text(&out, 80) } else { out } +} + +#[must_use] +pub fn process_stream(lines: &[String]) -> Vec { process_stream_inner(lines, true) } + +#[must_use] +pub fn process_stream_no_wrap(lines: &[String]) -> Vec { + process_stream_inner(lines, false) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn processes_html_and_tables() { + let input = vec![ + "
AB
".to_string(), + "| X | Y |".to_string(), + "|---|---|".to_string(), + "| 1 | 2 |".to_string(), + ]; + let output = process_stream(&input); + assert!(output.iter().any(|l| l.contains("| A | B |"))); + assert!(output.iter().any(|l| l.contains("| X | Y |"))); + } + + #[test] + fn no_wrap_option() { + let input = vec!["| a | b |".to_string(), "| 1 | 2 |".to_string()]; + let out = process_stream_no_wrap(&input); + assert_eq!(out, vec!["| a | b |", "| 1 | 2 |"]); + } +} diff --git a/src/reflow.rs b/src/reflow.rs index b63010b7..e9068fee 100644 --- a/src/reflow.rs +++ b/src/reflow.rs @@ -5,7 +5,7 @@ use regex::Regex; -use crate::{format_separator_cells, split_cells}; +use crate::table::{SEP_RE, format_separator_cells, split_cells}; static SENTINEL_RE: std::sync::LazyLock = std::sync::LazyLock::new(|| Regex::new(r"\|\s*\|\s*").unwrap()); @@ -134,5 +134,5 @@ fn should_use_second_row_as_separator(sep_invalid: bool, rows: &[Vec]) - } fn second_row_is_separator(rows: &[Vec]) -> bool { - rows.len() > 1 && rows[1].iter().all(|c| crate::SEP_RE.is_match(c)) + rows.len() > 1 && rows[1].iter().all(|c| SEP_RE.is_match(c)) } diff --git a/src/table.rs b/src/table.rs new file mode 100644 index 00000000..c8841df2 --- /dev/null +++ b/src/table.rs @@ -0,0 +1,168 @@ +//! Markdown table reflow utilities. +//! +//! Implements the algorithm outlined in `docs/html-table-support.md` lines 1-24. +//! Provides helpers used by the `reflow` module and `reflow_table` itself. + +use regex::Regex; + +fn next_is_pipe(chars: &mut std::iter::Peekable>) -> bool { + chars.peek() == Some(&'|') +} + +#[must_use] +pub fn split_cells(line: &str) -> Vec { + let mut s = line.trim(); + if let Some(stripped) = s.strip_prefix('|') { + s = stripped; + } + if let Some(stripped) = s.strip_suffix('|') { + s = stripped; + } + + let mut cells = Vec::new(); + let mut current = String::new(); + let mut chars = s.chars().peekable(); + while let Some(ch) = chars.next() { + if ch == '\\' { + if next_is_pipe(&mut chars) { + chars.next(); + current.push('|'); + continue; + } + current.push(ch); + continue; + } + if ch == '|' { + cells.push(current.trim().to_string()); + current.clear(); + } else { + current.push(ch); + } + } + cells.push(current.trim().to_string()); + cells +} + +pub(crate) fn format_separator_cells(widths: &[usize], sep_cells: &[String]) -> Vec { + if sep_cells.len() != widths.len() { + return sep_cells.to_vec(); + } + + sep_cells + .iter() + .enumerate() + .map(|(i, cell)| { + let trimmed = cell.trim(); + let left = trimmed.starts_with(':'); + let right = trimmed.ends_with(':'); + let mut dashes = "-".repeat(widths[i].max(3)); + if left { + dashes.remove(0); + dashes.insert(0, ':'); + } + if right { + dashes.pop(); + dashes.push(':'); + } + dashes + }) + .collect() +} + +fn sep_index_within(idx: Option, len: usize) -> Option { + match idx { + Some(i) if i < len => Some(i), + _ => None, + } +} + +fn rows_mismatched(rows: &[Vec], split_within_line: bool) -> bool { + if split_within_line { + return false; + } + let Some(first_len) = rows.first().map(Vec::len) else { + return false; + }; + rows.iter() + .skip(1) + .any(|row| row.len() != first_len && !row.iter().all(|c| SEP_RE.is_match(c))) +} + +pub(crate) static SEP_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap()); + +#[must_use] +pub fn reflow_table(lines: &[String]) -> Vec { + if lines.is_empty() { + return Vec::new(); + } + + let indent: String = lines[0].chars().take_while(|c| c.is_whitespace()).collect(); + let mut trimmed: Vec = lines + .iter() + .map(|l| l.trim().to_string()) + .filter(|l| !l.trim_start().starts_with("\\-")) + .collect(); + let sep_idx = trimmed.iter().position(|l| SEP_RE.is_match(l)); + let sep_line = sep_idx.map(|idx| trimmed.remove(idx)); + + let (rows, split_within_line) = crate::reflow::parse_rows(&trimmed); + + let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0); + + let (sep_cells, sep_row_idx) = + crate::reflow::detect_separator(sep_line.as_ref(), &rows, max_cols); + + let cleaned = crate::reflow::clean_rows(rows); + + let mut output_rows = cleaned.clone(); + if let Some(idx) = sep_index_within(sep_row_idx, output_rows.len()) { + output_rows.remove(idx); + } + + if rows_mismatched(&cleaned, split_within_line) { + return lines.to_vec(); + } + + let widths = crate::reflow::calculate_widths(&cleaned, max_cols); + + let out = crate::reflow::format_rows(output_rows, &widths, &indent); + + crate::reflow::insert_separator(out, sep_cells, &widths, &indent) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sep_index_within_bounds() { + assert_eq!(sep_index_within(Some(1), 3), Some(1)); + assert_eq!(sep_index_within(Some(3), 3), None); + assert_eq!(sep_index_within(None, 3), None); + } + + #[test] + fn detect_row_mismatch() { + let rows = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["1".to_string(), "2".to_string()], + ]; + assert!(!rows_mismatched(&rows, false)); + + let mismatch = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["1".to_string()], + ]; + assert!(rows_mismatched(&mismatch, false)); + + let with_sep = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["---".to_string(), "---".to_string()], + vec!["1".to_string(), "2".to_string()], + ]; + assert!(!rows_mismatched(&with_sep, false)); + + assert!(!rows_mismatched(&mismatch, true)); + } +} diff --git a/src/wrap.rs b/src/wrap.rs new file mode 100644 index 00000000..91253ae2 --- /dev/null +++ b/src/wrap.rs @@ -0,0 +1,389 @@ +//! Text wrapping utilities respecting inline code and prefixes. +//! +//! Unicode width handling follows `docs/unicode-width.md` lines 1-9 using the +//! `unicode-width` crate for accurate display calculations. + +use regex::Regex; + +static FENCE_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(```|~~~).*").unwrap()); + +static BULLET_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:[-*+]|\d+[.)])\s+)(.*)").unwrap()); + +static FOOTNOTE_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(\s*)(\[\^[^]]+\]:\s*)(.*)$").unwrap()); + +static BLOCKQUOTE_RE: std::sync::LazyLock = + std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:>\s*)+)(.*)$").unwrap()); + +pub(crate) fn tokenize_markdown(text: &str) -> Vec { + let mut tokens = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_whitespace() { + let start = i; + while i < chars.len() && chars[i].is_whitespace() { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } else if c == '`' { + let start = i; + let mut delim_len = 0; + while i < chars.len() && chars[i] == '`' { + i += 1; + delim_len += 1; + } + let mut end = i; + while end < chars.len() { + if chars[end] == '`' { + let mut j = end; + let mut count = 0; + while j < chars.len() && chars[j] == '`' { + j += 1; + count += 1; + } + if count == delim_len { + end = j; + break; + } + } + end += 1; + } + if end >= chars.len() { + tokens.push(chars[start..start + delim_len].iter().collect()); + i = start + delim_len; + } else { + tokens.push(chars[start..end].iter().collect()); + i = end; + } + } else { + let start = i; + while i < chars.len() && !chars[i].is_whitespace() && chars[i] != '`' { + i += 1; + } + tokens.push(chars[start..i].iter().collect()); + } + } + tokens +} + +fn wrap_preserving_code(text: &str, width: usize) -> Vec { + use unicode_width::UnicodeWidthStr; + + let mut lines = Vec::new(); + let mut current = String::new(); + let mut current_width = 0; + for token in tokenize_markdown(text) { + let token_width = UnicodeWidthStr::width(token.as_str()); + if current_width + token_width <= width { + current.push_str(&token); + current_width += token_width; + continue; + } + + let trimmed = current.trim_end(); + if !trimmed.is_empty() { + lines.push(trimmed.to_string()); + } + current.clear(); + current_width = 0; + + if !token.chars().all(char::is_whitespace) { + current.push_str(&token); + current_width = token_width; + } + } + let trimmed = current.trim_end(); + if !trimmed.is_empty() { + lines.push(trimmed.to_string()); + } + lines +} + +#[doc(hidden)] +pub fn is_fence(line: &str) -> bool { FENCE_RE.is_match(line) } + +fn flush_paragraph(out: &mut Vec, buf: &[(String, bool)], indent: &str, width: usize) { + if buf.is_empty() { + return; + } + let mut segment = String::new(); + for (text, hard_break) in buf { + if !segment.is_empty() { + segment.push(' '); + } + segment.push_str(text); + if *hard_break { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + segment.clear(); + } + } + if !segment.is_empty() { + for line in wrap_preserving_code(&segment, width - indent.len()) { + out.push(format!("{indent}{line}")); + } + } +} + +fn append_wrapped_with_prefix( + out: &mut Vec, + prefix: &str, + text: &str, + width: usize, + repeat_prefix: bool, +) { + use unicode_width::UnicodeWidthStr; + + let prefix_width = UnicodeWidthStr::width(prefix); + let available = width.saturating_sub(prefix_width).max(1); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let wrapped_indent = if repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + let lines = wrap_preserving_code(text, available); + if lines.is_empty() { + out.push(prefix.to_string()); + return; + } + + for (i, line) in lines.iter().enumerate() { + if i == 0 { + out.push(format!("{prefix}{line}")); + } else { + out.push(format!("{wrapped_indent}{line}")); + } + } +} + +fn handle_prefix_line( + out: &mut Vec, + buf: &mut Vec<(String, bool)>, + indent: &mut String, + width: usize, + prefix: &str, + rest: &str, + repeat_prefix: bool, +) { + flush_paragraph(out, buf, indent, width); + buf.clear(); + indent.clear(); + append_wrapped_with_prefix(out, prefix, rest, width, repeat_prefix); +} + +/// Wrap text lines to the given width. +/// +/// # Panics +/// Panics if regex captures fail unexpectedly. +#[must_use] +pub fn wrap_text(lines: &[String], width: usize) -> Vec { + let mut out = Vec::new(); + let mut buf: Vec<(String, bool)> = Vec::new(); + let mut indent = String::new(); + let mut in_code = false; + + for line in lines { + if FENCE_RE.is_match(line) { + flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + in_code = !in_code; + out.push(line.clone()); + continue; + } + + if in_code { + out.push(line.clone()); + continue; + } + + if line.trim_start().starts_with('|') || crate::table::SEP_RE.is_match(line.trim()) { + flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + out.push(line.clone()); + continue; + } + + if line.trim_start().starts_with('#') { + flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + out.push(line.clone()); + continue; + } + + if line.trim().is_empty() { + flush_paragraph(&mut out, &buf, &indent, width); + buf.clear(); + indent.clear(); + out.push(String::new()); + continue; + } + + if let Some(cap) = BULLET_RE.captures(line) { + let prefix = cap.get(1).unwrap().as_str(); + let rest = cap.get(2).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, false); + continue; + } + + if let Some(cap) = FOOTNOTE_RE.captures(line) { + let indent_part = cap.get(1).unwrap().as_str(); + let label_part = cap.get(2).unwrap().as_str(); + let prefix = format!("{indent_part}{label_part}"); + let rest = cap.get(3).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, &prefix, rest, false); + continue; + } + + if let Some(cap) = BLOCKQUOTE_RE.captures(line) { + let prefix = cap.get(1).unwrap().as_str(); + let rest = cap.get(2).unwrap().as_str(); + handle_prefix_line(&mut out, &mut buf, &mut indent, width, prefix, rest, true); + continue; + } + + if buf.is_empty() { + indent = line.chars().take_while(|c| c.is_whitespace()).collect(); + } + let trimmed_end = line.trim_end(); + let hard_break = line.ends_with(" ") + || trimmed_end.ends_with("
") + || trimmed_end.ends_with("
") + || trimmed_end.ends_with("
"); + let text = trimmed_end + .trim_end_matches("
") + .trim_end_matches("
") + .trim_end_matches("
") + .trim_end_matches(' ') + .trim_start() + .to_string(); + buf.push((text, hard_break)); + } + + flush_paragraph(&mut out, &buf, &indent, width); + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn wrap_text_preserves_hyphenated_words() { + let input = vec!["A word that is very-long-word indeed".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "A word that is".to_string(), + "very-long-word".to_string(), + "indeed".to_string(), + ] + ); + } + + #[test] + fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { + let input = vec![ + concat!( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", + "volutpat." + ) + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec![ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt" + .to_string(), + "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat." + .to_string(), + ] + ); + } + + #[test] + fn wrap_text_preserves_code_spans() { + let input = vec![ + "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ + unless the manifest's `interpreter` field overrides the setting." + .to_string(), + ]; + let wrapped = wrap_text(&input, 60); + assert_eq!( + wrapped, + vec![ + "with their own escaping rules. On Windows, scripts default".to_string(), + "to `powershell -Command` unless the manifest's `interpreter`".to_string(), + "field overrides the setting.".to_string(), + ] + ); + } + + #[test] + fn wrap_text_multiple_code_spans() { + let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; + let wrapped = wrap_text(&input, 25); + assert_eq!( + wrapped, + vec![ + "combine `foo bar` and".to_string(), + "`baz qux` in one line".to_string(), + ] + ); + } + + #[test] + fn wrap_text_nested_backticks() { + let input = vec!["Use `` `code` `` to quote backticks".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec![ + "Use `` `code` `` to".to_string(), + "quote backticks".to_string() + ] + ); + } + + #[test] + fn wrap_text_unmatched_backticks() { + let input = vec!["This has a `dangling code span.".to_string()]; + let wrapped = wrap_text(&input, 20); + assert_eq!( + wrapped, + vec!["This has a `dangling".to_string(), "code span.".to_string()] + ); + } + + #[test] + fn wrap_text_preserves_links() { + let input = vec![ + "`falcon-pachinko` is an extension library for the".to_string(), + "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured" + .to_string(), + "approach to asynchronous WebSocket routing and background worker integration." + .to_string(), + ]; + let wrapped = wrap_text(&input, 80); + let joined = wrapped.join("\n"); + assert_eq!(joined.matches("https://").count(), 1); + assert!( + wrapped + .iter() + .any(|l| l.contains("https://falcon.readthedocs.io")) + ); + } +}