diff --git a/Cargo.toml b/Cargo.toml index acb11865..4b6c0ae2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ the `--wrap` option is used, it also wraps paragraphs and list items to 80 colum [package.metadata.binstall.overrides.'cfg(all(target_os = "linux", any(target_arch = "x86_64", target_arch = "aarch64"), target_env = "gnu"))'] pkg-url = "{ repo }/releases/download/v{ version }/{ name }-{ version }-{ target }.tar.gz" -bin-dir = "{ bin }{ binary-ext }" +bin-dir = "." pkg-fmt = "tgz" [dependencies] diff --git a/src/code_emphasis.rs b/src/code_emphasis.rs index b5acd194..84d48b6c 100644 --- a/src/code_emphasis.rs +++ b/src/code_emphasis.rs @@ -231,4 +231,15 @@ mod tests { let input = vec!["before `code` after".to_string()]; assert_eq!(fix_code_emphasis(&input), input); } + + #[test] + fn consume_code_affixes_clears_mixed_pending_prefix() { + let mut tokens = vec![Token::Text("*lead*tail")].into_iter().peekable(); + let mut pending = "**"; + + let (prefix, suffix, modified) = consume_code_affixes(&mut tokens, &mut pending); + + assert_eq!((prefix, suffix, modified), ("", "", true)); + assert_eq!(tokens.next(), Some(Token::Text("lead*tail"))); + } } diff --git a/src/footnotes/renumber.rs b/src/footnotes/renumber.rs index 203c4d59..24acd57d 100644 --- a/src/footnotes/renumber.rs +++ b/src/footnotes/renumber.rs @@ -1,9 +1,23 @@ //! Sequential renumbering of footnote references and definitions. -use std::{collections::HashMap, fmt::Write, sync::LazyLock}; +use std::{collections::HashMap, sync::LazyLock}; use regex::{Captures, Match, Regex}; +mod definitions; +mod parsing { + pub(super) use super::super::parsing::DefinitionParts; +} + +#[cfg(test)] +use definitions::numeric_candidate_from_line; +use definitions::{ + DefinitionUpdates, + collect_definition_updates, + reorder_definition_block, + rewrite_definition_headers, +}; + use super::{ lists::{footnote_block_range, has_existing_footnote_block, trimmed_range}, parsing::{FOOTNOTE_LINE_RE, is_definition_continuation, parse_definition}, @@ -56,7 +70,9 @@ fn is_fence_line(line: &str) -> bool { fn rewrite_refs_in_segment(text: &str, mapping: &HashMap) -> String { FOOTNOTE_REF_RE .replace_all(text, |caps: &Captures| { - let mat = caps.get(0).expect("regex matched without capture"); + let Some(mat) = caps.get(0) else { + return String::new(); + }; if is_definition_like(text, &mat) { return caps[0].to_string(); } @@ -129,21 +145,6 @@ fn collect_reference_mapping_from_text( } } -#[derive(Clone)] -struct DefinitionLine { - index: usize, - new_number: usize, - line: String, -} - -struct NumericCandidate { - index: usize, - number: usize, - indent: String, - whitespace: String, - rest: String, -} - fn footnote_definition_block_range(lines: &[String]) -> Option<(usize, usize)> { let (mut start, end) = trimmed_range(lines, |line| { line.trim().is_empty() @@ -167,290 +168,6 @@ fn footnote_definition_block_range(lines: &[String]) -> Option<(usize, usize)> { } } -fn definition_segment_end(lines: &[String], start: usize, block_end: usize) -> usize { - let mut idx = start + 1; - while idx < block_end { - let line = &lines[idx]; - if parse_definition(line).is_some() { - break; - } - if is_definition_continuation(line) { - idx += 1; - continue; - } - if line.trim().is_empty() { - if idx + 1 < block_end && parse_definition(&lines[idx + 1]).is_some() { - break; - } - idx += 1; - continue; - } - break; - } - idx -} - -fn reorder_definition_block( - lines: &mut [String], - start: usize, - end: usize, - definitions: &[DefinitionLine], -) { - let header_positions: Vec = (start..end) - .filter(|&idx| parse_definition(&lines[idx]).is_some()) - .collect(); - if header_positions.len() <= 1 { - return; - } - - let def_lookup: HashMap = definitions - .iter() - .filter(|definition| (start..end).contains(&definition.index)) - .map(|definition| (definition.index, definition)) - .collect(); - if def_lookup.len() <= 1 { - return; - } - - let prefix_len = header_positions.first().map_or(0, |first| first - start); - let mut segments: Vec<(usize, usize, Vec)> = Vec::new(); - let mut consumed = start + prefix_len; - for &position in &header_positions { - let mut leading_start = position; - while leading_start > consumed - && lines[leading_start - 1].trim().is_empty() - && !is_definition_continuation(&lines[leading_start - 1]) - { - leading_start -= 1; - } - let next_bound = definition_segment_end(lines, position, end); - if let Some(definition) = def_lookup.get(&position) { - debug_assert!( - position >= leading_start, - "definition header {position} cannot precede leading segment start {leading_start}", - ); - let mut segment = Vec::with_capacity(next_bound.saturating_sub(leading_start).max(1)); - segment.extend(lines[leading_start..position].iter().cloned()); - segment.push(definition.line.clone()); - let tail_start = position.saturating_add(1); - if tail_start < next_bound { - segment.extend(lines[tail_start..next_bound].iter().cloned()); - } - segments.push((definition.new_number, definition.index, segment)); - } - consumed = next_bound; - } - - if segments.len() <= 1 { - return; - } - - segments.sort_by(|left, right| left.0.cmp(&right.0).then(left.1.cmp(&right.1))); - - let mut first_leading = Vec::new(); - if let Some((_, _, first_segment)) = segments.first_mut() { - while first_segment - .first() - .is_some_and(|line| line.trim().is_empty() && !is_definition_continuation(line)) - { - first_leading.push(first_segment.remove(0)); - } - } - - let mut reordered = Vec::with_capacity(end - start); - if prefix_len > 0 { - reordered.extend(lines[start..start + prefix_len].iter().cloned()); - } - - for (idx, (_, _, segment)) in segments.into_iter().enumerate() { - reordered.extend(segment); - if idx == 0 && !first_leading.is_empty() { - reordered.append(&mut first_leading); - } - } - - if reordered.len() == end - start { - lines[start..end].clone_from_slice(&reordered); - } -} - -struct DefinitionUpdates { - definitions: Vec, - is_definition_line: Vec, -} - -struct DefinitionScanContext<'a> { - mapping: &'a mut HashMap, - next_number: &'a mut usize, - numeric_list_range: Option<(usize, usize)>, - skip_numeric_conversion: bool, -} - -struct DefinitionAccumulator { - definitions: Vec, - is_definition_line: Vec, -} - -fn assign_new_number( - mapping: &mut HashMap, - number: usize, - next_number: &mut usize, -) -> usize { - if let Some(&mapped) = mapping.get(&number) { - mapped - } else { - let assigned = *next_number; - *next_number += 1; - mapping.insert(number, assigned); - assigned - } -} - -fn should_convert_numeric_line( - index: usize, - numeric_range: Option<(usize, usize)>, - skip_numeric_conversion: bool, -) -> bool { - if skip_numeric_conversion { - return false; - } - numeric_range.is_some_and(|(start, end)| index >= start && index < end) -} - -fn definition_line_from_parts( - index: usize, - parts: super::parsing::DefinitionParts<'_>, - mapping: &mut HashMap, - next_number: &mut usize, -) -> DefinitionLine { - let new_number = assign_new_number(mapping, parts.number, next_number); - let rewritten_rest = rewrite_tokens(parts.rest, mapping); - let mut line = String::with_capacity(parts.prefix.len() + rewritten_rest.len() + 8); - line.push_str(parts.prefix); - write!(&mut line, "[^{new_number}]:").expect("write to string cannot fail"); - line.push_str(&rewritten_rest); - DefinitionLine { - index, - new_number, - line, - } -} - -fn numeric_candidate_from_line(line: &str, index: usize) -> Option { - let caps = FOOTNOTE_LINE_RE.captures(line)?; - let number = caps["num"].parse::().ok()?; - let indent = caps.name("indent").map_or("", |m| m.as_str()).to_string(); - let rest = caps.name("rest").map_or("", |m| m.as_str()).to_string(); - let num_match = caps - .name("num") - .expect("numeric list capture missing number"); - let rest_match = caps - .name("rest") - .expect("numeric list capture missing rest"); - let whitespace = line[num_match.end() + 1..rest_match.start()].to_string(); - Some(NumericCandidate { - index, - number, - indent, - whitespace, - rest, - }) -} - -fn collect_scan_updates( - lines: &[String], - ctx: &mut DefinitionScanContext<'_>, -) -> (DefinitionAccumulator, Vec) { - let mut acc = DefinitionAccumulator { - definitions: Vec::new(), - is_definition_line: vec![false; lines.len()], - }; - let mut numeric_candidates = Vec::new(); - let mut in_fence = false; - - for (index, line) in lines.iter().enumerate() { - if is_fence_line(line) { - in_fence = !in_fence; - continue; - } - if in_fence { - continue; - } - - if let Some(parts) = parse_definition(line) { - acc.definitions.push(definition_line_from_parts( - index, - parts, - ctx.mapping, - ctx.next_number, - )); - acc.is_definition_line[index] = true; - continue; - } - - if !should_convert_numeric_line(index, ctx.numeric_list_range, ctx.skip_numeric_conversion) - { - continue; - } - if ctx.mapping.is_empty() && acc.definitions.is_empty() { - continue; - } - if let Some(candidate) = numeric_candidate_from_line(line, index) { - numeric_candidates.push(candidate); - } - } - - (acc, numeric_candidates) -} - -fn finalize_numeric_candidates( - numeric_candidates: Vec, - ctx: &mut DefinitionScanContext<'_>, - acc: &mut DefinitionAccumulator, -) { - for candidate in numeric_candidates.into_iter().rev() { - let new_number = assign_new_number(ctx.mapping, candidate.number, ctx.next_number); - let rewritten_rest = rewrite_tokens(&candidate.rest, ctx.mapping); - let mut line = String::with_capacity( - candidate.indent.len() + candidate.whitespace.len() + rewritten_rest.len() + 8, - ); - line.push_str(&candidate.indent); - write!(&mut line, "[^{new_number}]:").expect("write to string cannot fail"); - line.push_str(&candidate.whitespace); - line.push_str(&rewritten_rest); - acc.definitions.push(DefinitionLine { - index: candidate.index, - new_number, - line, - }); - acc.is_definition_line[candidate.index] = true; - } -} - -fn collect_definition_updates( - lines: &[String], - mapping: &mut HashMap, -) -> DefinitionUpdates { - let mut next_number = mapping.values().copied().max().unwrap_or(0) + 1; - let numeric_list_range = footnote_block_range(lines); - let skip_numeric_conversion = numeric_list_range - .as_ref() - .is_some_and(|(start, _)| has_existing_footnote_block(lines, *start)); - let mut ctx = DefinitionScanContext { - mapping, - next_number: &mut next_number, - numeric_list_range, - skip_numeric_conversion, - }; - let (mut acc, numeric_candidates) = collect_scan_updates(lines, &mut ctx); - finalize_numeric_candidates(numeric_candidates, &mut ctx, &mut acc); - - DefinitionUpdates { - definitions: acc.definitions, - is_definition_line: acc.is_definition_line, - } -} - fn apply_mapping_to_lines( lines: &mut [String], mapping: &HashMap, @@ -469,12 +186,6 @@ fn apply_mapping_to_lines( } } -fn rewrite_definition_headers(lines: &mut [String], definitions: &[DefinitionLine]) { - for definition in definitions { - lines[definition.index].clone_from(&definition.line); - } -} - pub(super) fn renumber_footnotes(lines: &mut [String]) { let mut mapping = collect_reference_mapping(lines); let DefinitionUpdates { @@ -498,3 +209,6 @@ pub(super) fn renumber_footnotes(lines: &mut [String]) { reorder_definition_block(lines, start, end, &definitions); } } + +#[cfg(test)] +mod tests; diff --git a/src/footnotes/renumber/definitions.rs b/src/footnotes/renumber/definitions.rs new file mode 100644 index 00000000..56d88ecb --- /dev/null +++ b/src/footnotes/renumber/definitions.rs @@ -0,0 +1,313 @@ +//! Definition scanning and reordering helpers for footnote renumbering. +//! +//! The parent module keeps the top-level rewrite flow, while this submodule +//! owns the detail-heavy definition parsing and reordering machinery so each +//! source file stays readable and within the repository size limit. + +use std::collections::HashMap; + +use super::{ + FOOTNOTE_LINE_RE, + footnote_block_range, + has_existing_footnote_block, + is_definition_continuation, + is_fence_line, + parse_definition, + rewrite_tokens, +}; + +#[derive(Clone)] +pub(super) struct DefinitionLine { + pub(super) index: usize, + pub(super) new_number: usize, + pub(super) line: String, +} + +pub(super) struct NumericCandidate { + index: usize, + number: usize, + indent: String, + whitespace: String, + rest: String, +} + +pub(super) struct DefinitionUpdates { + pub(super) definitions: Vec, + pub(super) is_definition_line: Vec, +} + +struct DefinitionScanState<'a> { + mapping: &'a mut HashMap, + next_number: &'a mut usize, + numeric_list_range: Option<(usize, usize)>, + skip_numeric_conversion: bool, + definitions: Vec, + is_definition_line: Vec, + numeric_candidates: Vec, +} + +fn definition_segment_end(lines: &[String], start: usize, block_end: usize) -> usize { + let mut idx = start + 1; + while idx < block_end { + let line = &lines[idx]; + if parse_definition(line).is_some() { + break; + } + if is_definition_continuation(line) { + idx += 1; + continue; + } + if line.trim().is_empty() { + if idx + 1 < block_end && parse_definition(&lines[idx + 1]).is_some() { + break; + } + idx += 1; + continue; + } + break; + } + idx +} + +fn assign_new_number( + mapping: &mut HashMap, + number: usize, + next_number: &mut usize, +) -> usize { + if let Some(&mapped) = mapping.get(&number) { + mapped + } else { + let assigned = *next_number; + *next_number += 1; + mapping.insert(number, assigned); + assigned + } +} + +fn should_convert_numeric_line( + index: usize, + numeric_range: Option<(usize, usize)>, + skip_numeric_conversion: bool, +) -> bool { + if skip_numeric_conversion { + return false; + } + numeric_range.is_some_and(|(start, end)| index >= start && index < end) +} + +fn definition_line_from_parts( + index: usize, + parts: super::parsing::DefinitionParts<'_>, + mapping: &mut HashMap, + next_number: &mut usize, +) -> DefinitionLine { + let new_number = assign_new_number(mapping, parts.number, next_number); + let rewritten_rest = rewrite_tokens(parts.rest, mapping); + let mut line = String::with_capacity(parts.prefix.len() + rewritten_rest.len() + 8); + line.push_str(parts.prefix); + let header = format!("[^{new_number}]:"); + line.push_str(&header); + line.push_str(&rewritten_rest); + DefinitionLine { + index, + new_number, + line, + } +} + +pub(super) fn numeric_candidate_from_line(line: &str, index: usize) -> Option { + let caps = FOOTNOTE_LINE_RE.captures(line)?; + let indent = caps.name("indent").map_or("", |m| m.as_str()).to_string(); + let num_match = caps.name("num")?; + let rest_match = caps.name("rest")?; + let number = num_match.as_str().parse::().ok()?; + let rest = rest_match.as_str().to_string(); + let whitespace = line[num_match.end() + 1..rest_match.start()].to_string(); + Some(NumericCandidate { + index, + number, + indent, + whitespace, + rest, + }) +} + +fn collect_scan_updates(lines: &[String], state: &mut DefinitionScanState<'_>) { + let mut in_fence = false; + + for (index, line) in lines.iter().enumerate() { + if is_fence_line(line) { + in_fence = !in_fence; + continue; + } + if in_fence { + continue; + } + + if let Some(parts) = parse_definition(line) { + state.definitions.push(definition_line_from_parts( + index, + parts, + state.mapping, + state.next_number, + )); + state.is_definition_line[index] = true; + continue; + } + + if !should_convert_numeric_line( + index, + state.numeric_list_range, + state.skip_numeric_conversion, + ) { + continue; + } + if state.mapping.is_empty() && state.definitions.is_empty() { + continue; + } + if let Some(candidate) = numeric_candidate_from_line(line, index) { + state.numeric_candidates.push(candidate); + } + } +} + +fn finalize_numeric_candidates(state: &mut DefinitionScanState<'_>) { + // Drain from the bottom so wrapped continuation lines stay attached to the + // correct definition when numeric candidates are later reordered by their + // assigned footnote numbers. + for candidate in state.numeric_candidates.drain(..).rev() { + let new_number = assign_new_number(state.mapping, candidate.number, state.next_number); + let rewritten_rest = rewrite_tokens(&candidate.rest, state.mapping); + let mut line = String::with_capacity( + candidate.indent.len() + candidate.whitespace.len() + rewritten_rest.len() + 8, + ); + line.push_str(&candidate.indent); + let header = format!("[^{new_number}]:"); + line.push_str(&header); + line.push_str(&candidate.whitespace); + line.push_str(&rewritten_rest); + state.definitions.push(DefinitionLine { + index: candidate.index, + new_number, + line, + }); + state.is_definition_line[candidate.index] = true; + } +} + +pub(super) fn collect_definition_updates( + lines: &[String], + mapping: &mut HashMap, +) -> DefinitionUpdates { + let mut next_number = mapping.values().copied().max().unwrap_or(0) + 1; + let numeric_list_range = footnote_block_range(lines); + let skip_numeric_conversion = numeric_list_range + .as_ref() + .is_some_and(|(start, _)| has_existing_footnote_block(lines, *start)); + let mut state = DefinitionScanState { + mapping, + next_number: &mut next_number, + numeric_list_range, + skip_numeric_conversion, + definitions: Vec::new(), + is_definition_line: vec![false; lines.len()], + numeric_candidates: Vec::new(), + }; + collect_scan_updates(lines, &mut state); + finalize_numeric_candidates(&mut state); + + DefinitionUpdates { + definitions: state.definitions, + is_definition_line: state.is_definition_line, + } +} + +pub(super) fn rewrite_definition_headers(lines: &mut [String], definitions: &[DefinitionLine]) { + for definition in definitions { + lines[definition.index].clone_from(&definition.line); + } +} + +pub(super) fn reorder_definition_block( + lines: &mut [String], + start: usize, + end: usize, + definitions: &[DefinitionLine], +) { + let header_positions: Vec = (start..end) + .filter(|&idx| parse_definition(&lines[idx]).is_some()) + .collect(); + if header_positions.len() <= 1 { + return; + } + + let def_lookup: HashMap = definitions + .iter() + .filter(|definition| (start..end).contains(&definition.index)) + .map(|definition| (definition.index, definition)) + .collect(); + if def_lookup.len() <= 1 { + return; + } + + let prefix_len = header_positions.first().map_or(0, |first| first - start); + let mut segments: Vec<(usize, usize, Vec)> = Vec::new(); + let mut consumed = start + prefix_len; + for &position in &header_positions { + let mut leading_start = position; + while leading_start > consumed + && lines[leading_start - 1].trim().is_empty() + && !is_definition_continuation(&lines[leading_start - 1]) + { + leading_start -= 1; + } + let next_bound = definition_segment_end(lines, position, end); + if let Some(definition) = def_lookup.get(&position) { + debug_assert!( + position >= leading_start, + "definition header {position} cannot precede leading segment start {leading_start}", + ); + let mut segment = Vec::with_capacity(next_bound.saturating_sub(leading_start).max(1)); + segment.extend(lines[leading_start..position].iter().cloned()); + segment.push(definition.line.clone()); + let tail_start = position.saturating_add(1); + if tail_start < next_bound { + segment.extend(lines[tail_start..next_bound].iter().cloned()); + } + segments.push((definition.new_number, definition.index, segment)); + } + consumed = next_bound; + } + + if segments.len() <= 1 { + return; + } + + segments.sort_by(|left, right| left.0.cmp(&right.0).then(left.1.cmp(&right.1))); + + let mut first_leading = Vec::new(); + if let Some((_, _, first_segment)) = segments.first_mut() { + while first_segment + .first() + .is_some_and(|line| line.trim().is_empty() && !is_definition_continuation(line)) + { + first_leading.push(first_segment.remove(0)); + } + } + + let mut reordered = Vec::with_capacity(end - start); + if prefix_len > 0 { + reordered.extend(lines[start..start + prefix_len].iter().cloned()); + } + + for (idx, (_, _, segment)) in segments.into_iter().enumerate() { + reordered.extend(segment); + if idx == 0 && !first_leading.is_empty() { + reordered.append(&mut first_leading); + } + } + + if reordered.len() == end - start { + lines[start..end].clone_from_slice(&reordered); + } +} diff --git a/src/footnotes/renumber/tests.rs b/src/footnotes/renumber/tests.rs new file mode 100644 index 00000000..577c58c8 --- /dev/null +++ b/src/footnotes/renumber/tests.rs @@ -0,0 +1,42 @@ +//! Unit tests for footnote renumbering helpers. +//! +//! These cases exercise the private numeric-candidate parsing and renumbering +//! helpers directly so the behaviour stays covered without bloating the main +//! module file. + +use rstest::rstest; + +use super::{numeric_candidate_from_line, renumber_footnotes}; + +fn strings(lines: &[&str]) -> Vec { lines.iter().map(|line| (*line).to_string()).collect() } + +#[rstest] +#[case("7.")] +#[case("7:")] +fn malformed_numeric_candidate_line_is_ignored(#[case] line: &str) { + assert!(numeric_candidate_from_line(line, 0).is_none()); +} + +#[test] +fn renumber_footnotes_rewrites_existing_definition_headers() { + let mut lines = strings(&["Reference.[^7]", "", "[^7]: Existing definition"]); + + renumber_footnotes(&mut lines); + + assert_eq!( + lines, + strings(&["Reference.[^1]", "", "[^1]: Existing definition"]) + ); +} + +#[test] +fn renumber_footnotes_rewrites_numeric_candidates() { + let mut lines = strings(&["Reference.[^7]", "", "7. Legacy footnote"]); + + renumber_footnotes(&mut lines); + + assert_eq!( + lines, + strings(&["Reference.[^1]", "", "[^1]: Legacy footnote"]) + ); +} diff --git a/src/html.rs b/src/html.rs index b8c6e326..5a13f728 100644 --- a/src/html.rs +++ b/src/html.rs @@ -208,43 +208,34 @@ fn table_lines_to_markdown(lines: &[String]) -> Vec { out } -fn append_html_table_line(line: &str, buf: &mut Vec, depth: &mut usize) { - buf.push(line.to_string()); - *depth += TABLE_START_RE.find_iter(line).count(); - if TABLE_END_RE.is_match(line) { - *depth = depth.saturating_sub(TABLE_END_RE.find_iter(line).count()); - } -} - -fn flush_completed_html_table(buf: &mut Vec, depth: usize, out: &mut Vec) -> bool { - if depth != 0 { - return false; - } - out.extend(table_lines_to_markdown(buf)); - buf.clear(); - true -} - #[derive(Default)] struct HtmlTableState { buf: Vec, depth: usize, - in_html: bool, } impl HtmlTableState { + fn in_html(&self) -> bool { !self.buf.is_empty() } + fn flush_raw(&mut self, out: &mut Vec) { if !self.buf.is_empty() { out.append(&mut self.buf); } self.depth = 0; - self.in_html = false; } fn push_html_line(&mut self, line: &str, out: &mut Vec) { - append_html_table_line(line, &mut self.buf, &mut self.depth); - if flush_completed_html_table(&mut self.buf, self.depth, out) { - self.in_html = false; + let trimmed = line.trim_start(); + self.buf.push(line.to_string()); + self.depth += TABLE_START_RE.find_iter(trimmed).count(); + if TABLE_END_RE.is_match(trimmed) { + self.depth = self + .depth + .saturating_sub(TABLE_END_RE.find_iter(trimmed).count()); + } + if self.depth == 0 { + out.extend(table_lines_to_markdown(&self.buf)); + self.buf.clear(); } } } @@ -274,22 +265,18 @@ impl HtmlTableState { /// ``` pub(crate) fn html_table_to_markdown(lines: &[String]) -> Vec { let mut out = Vec::new(); - let mut buf = Vec::new(); - let mut depth = 0usize; + let mut html_state = HtmlTableState::default(); for line in lines { - if depth > 0 || TABLE_START_RE.is_match(line.trim_start()) { - append_html_table_line(line, &mut buf, &mut depth); - let _ = flush_completed_html_table(&mut buf, depth, &mut out); + if html_state.in_html() || TABLE_START_RE.is_match(line.trim_start()) { + html_state.push_html_line(line, &mut out); continue; } out.push(line.clone()); } - if !buf.is_empty() { - out.extend(buf); - } + html_state.flush_raw(&mut out); out } @@ -325,7 +312,7 @@ pub fn convert_html_tables(lines: &[String]) -> Vec { for line in lines { if is_fence(line).is_some() { - if html_state.in_html { + if html_state.in_html() { html_state.flush_raw(&mut out); } in_code = !in_code; @@ -338,13 +325,12 @@ pub fn convert_html_tables(lines: &[String]) -> Vec { continue; } - if html_state.in_html { + if html_state.in_html() { html_state.push_html_line(line, &mut out); continue; } if TABLE_START_RE.is_match(line.trim_start()) { - html_state.in_html = true; html_state.push_html_line(line, &mut out); continue; } @@ -352,9 +338,7 @@ pub fn convert_html_tables(lines: &[String]) -> Vec { out.push(line.clone()); } - if !html_state.buf.is_empty() { - out.extend(html_state.buf); - } + html_state.flush_raw(&mut out); out } diff --git a/src/lists.rs b/src/lists.rs index 4fdd5748..98a44c4b 100644 --- a/src/lists.rs +++ b/src/lists.rs @@ -61,12 +61,18 @@ fn is_plain_paragraph_line(line: &str) -> bool { ) } +#[derive(Default)] struct ListState { indent_stack: Vec, counters: HashMap, } impl ListState { + fn reset(&mut self) { + self.indent_stack.clear(); + self.counters.clear(); + } + fn prune_deeper(&mut self, indent: usize, inclusive: bool) { prune_deeper( indent, @@ -76,6 +82,17 @@ impl ListState { ); } + fn next_number(&mut self, indent: usize) -> usize { + self.prune_deeper(indent, false); + if self.indent_stack.last().is_none_or(|&d| d < indent) { + self.indent_stack.push(indent); + } + let num = self.counters.entry(indent).or_insert(1); + let current = *num; + *num += 1; + current + } + fn handle_paragraph_restart(&mut self, indent: usize, line: &str, prev_blank: bool) -> bool { let inclusive = prev_blank && self @@ -97,10 +114,7 @@ impl ListState { #[must_use] pub fn renumber_lists(lines: &[String]) -> Vec { let mut out = Vec::with_capacity(lines.len()); - let mut state = ListState { - indent_stack: Vec::new(), - counters: HashMap::new(), - }; + let mut state = ListState::default(); // Track fenced code blocks consistently across list processing. let mut fences = FenceTracker::default(); #[allow(clippy::unnecessary_map_or)] @@ -123,13 +137,7 @@ pub fn renumber_lists(lines: &[String]) -> Vec { continue; } if let Some((indent, indent_str, sep, rest)) = parse_numbered(line) { - state.prune_deeper(indent, false); - if state.indent_stack.last().is_none_or(|&d| d < indent) { - state.indent_stack.push(indent); - } - let num = state.counters.entry(indent).or_insert(1); - let current = *num; - *num += 1; + let current = state.next_number(indent); out.push(format!("{indent_str}{current}.{sep}{rest}")); prev_blank = false; continue; @@ -141,8 +149,7 @@ pub fn renumber_lists(lines: &[String]) -> Vec { let indent_str = &line[..indent_end]; let indent = indent_len(indent_str); if HEADING_RE.is_match(line) || THEMATIC_BREAK_RE.is_match(line.trim_end()) { - state.indent_stack.clear(); - state.counters.clear(); + state.reset(); out.push(line.clone()); prev_blank = false; continue; diff --git a/src/wrap.rs b/src/wrap.rs index 4499eb62..7cab0952 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -19,7 +19,7 @@ mod tokenize; use block::{BLOCKQUOTE_RE, BULLET_RE, FOOTNOTE_RE}; pub(crate) use block::{BlockKind, classify_block}; pub use fence::{FenceTracker, is_fence}; -use paragraph::{ParagraphState, ParagraphWriter, PrefixLine}; +use paragraph::{ParagraphWriter, PrefixLine}; /// Token emitted by the `tokenize::segment_inline` parser and used by /// higher-level wrappers. /// @@ -48,24 +48,20 @@ fn is_indented_code_line(line: &str) -> bool { indent_width >= 4 && line.chars().any(|c| !c.is_whitespace()) } -fn is_table_or_separator(line: &str) -> bool { - line.trim_start().starts_with('|') || crate::table::SEP_RE.is_match(line.trim()) -} - fn is_passthrough_block(line: &str) -> bool { - is_table_or_separator(line) + line.trim_start().starts_with('|') + || crate::table::SEP_RE.is_match(line.trim()) || matches!( classify_block(line), Some(BlockKind::Heading | BlockKind::MarkdownlintDirective) ) - || line.trim().is_empty() || is_indented_code_line(line) } fn prefix_line(line: &str) -> Option> { if let Some(cap) = BULLET_RE.captures(line) { - let prefix = cap.get(1).expect("bullet regex capture").as_str(); - let rest = cap.get(2).expect("bullet regex remainder capture").as_str(); + let prefix = cap.get(1).map(|m| m.as_str())?; + let rest = cap.get(2).map(|m| m.as_str())?; return Some(PrefixLine { prefix: Cow::Borrowed(prefix), rest, @@ -74,12 +70,9 @@ fn prefix_line(line: &str) -> Option> { } if let Some(cap) = FOOTNOTE_RE.captures(line) { - let prefix = cap.get(1).expect("footnote prefix capture").as_str(); - let marker = cap.get(2).expect("footnote marker capture").as_str(); - let rest = cap - .get(3) - .expect("footnote regex remainder capture") - .as_str(); + let prefix = cap.get(1).map(|m| m.as_str())?; + let marker = cap.get(2).map(|m| m.as_str())?; + let rest = cap.get(3).map(|m| m.as_str())?; return Some(PrefixLine { prefix: Cow::Owned(format!("{prefix}{marker}")), rest, @@ -87,12 +80,12 @@ fn prefix_line(line: &str) -> Option> { }); } - BLOCKQUOTE_RE.captures(line).map(|cap| PrefixLine { - prefix: Cow::Borrowed(cap.get(1).expect("blockquote prefix capture").as_str()), - rest: cap - .get(2) - .expect("blockquote regex remainder capture") - .as_str(), + let cap = BLOCKQUOTE_RE.captures(line)?; + let prefix = cap.get(1).map(|m| m.as_str())?; + let rest = cap.get(2).map(|m| m.as_str())?; + Some(PrefixLine { + prefix: Cow::Borrowed(prefix), + rest, repeat_prefix: true, }) } @@ -117,43 +110,44 @@ fn line_break_parts(line: &str) -> (String, bool) { } /// Wrap text lines to the given width. -/// -/// # Panics -/// Panics if regex captures fail unexpectedly. #[must_use] pub fn wrap_text(lines: &[String], width: usize) -> Vec { let mut out = Vec::new(); - let mut state = ParagraphState::default(); let mut writer = ParagraphWriter::new(&mut out, width); // Track fenced code blocks so wrapping honours shared fence semantics. let mut fence_tracker = FenceTracker::default(); for line in lines { - if fence::handle_fence_line(line, &mut writer, &mut state, &mut fence_tracker) { + if fence::handle_fence_line(line, &mut writer, &mut fence_tracker) { continue; } if fence_tracker.in_fence() { - writer.push_verbatim(&mut state, line); + writer.push_verbatim(line); + continue; + } + + if line.trim().is_empty() { + writer.push_blank_line(); continue; } if is_passthrough_block(line) { - writer.push_verbatim(&mut state, line); + writer.push_verbatim(line); continue; } if let Some(prefix_line) = prefix_line(line) { - writer.handle_prefix_line(&mut state, &prefix_line); + writer.handle_prefix_line(&prefix_line); continue; } - state.note_indent(line); + writer.note_indent(line); let (text, hard_break) = line_break_parts(line); - state.push(text, hard_break); + writer.push_wrapped(text, hard_break); } - writer.flush_paragraph(&mut state); + writer.flush_paragraph(); out } diff --git a/src/wrap/fence.rs b/src/wrap/fence.rs index 1eaea4f4..259b8185 100644 --- a/src/wrap/fence.rs +++ b/src/wrap/fence.rs @@ -2,7 +2,7 @@ use regex::Regex; -use super::paragraph::{ParagraphState, ParagraphWriter}; +use super::paragraph::ParagraphWriter; pub(super) static FENCE_RE: std::sync::LazyLock = // Capture: indent, fence run of 3+ backticks/tilde, and the full info string (incl. leading @@ -45,14 +45,13 @@ pub fn is_fence(line: &str) -> Option<(&str, &str, &str)> { pub(crate) fn handle_fence_line( line: &str, writer: &mut ParagraphWriter<'_>, - state: &mut ParagraphState, tracker: &mut FenceTracker, ) -> bool { if !tracker.observe(line) { return false; } - writer.push_verbatim(state, line); + writer.push_verbatim(line); true } diff --git a/src/wrap/inline.rs b/src/wrap/inline.rs index 8b8027fd..d61cfe1e 100644 --- a/src/wrap/inline.rs +++ b/src/wrap/inline.rs @@ -230,10 +230,7 @@ pub(super) fn wrap_preserving_code(text: &str, width: usize) -> Vec { continue; } - let mut split = SplitContext { - lines: &mut lines, - width, - }; + let mut split = SplitContext::new(&mut lines, width); if buffer.split_with_span(&mut split, &tokens, span.clone()) { i = group_end; continue; diff --git a/src/wrap/line_buffer.rs b/src/wrap/line_buffer.rs index 0eafa3b2..7b4ef23d 100644 --- a/src/wrap/line_buffer.rs +++ b/src/wrap/line_buffer.rs @@ -19,6 +19,10 @@ pub(crate) struct SplitContext<'a> { pub(crate) width: usize, } +impl<'a> SplitContext<'a> { + pub(crate) fn new(lines: &'a mut Vec, width: usize) -> Self { Self { lines, width } } +} + impl LineBuffer { pub(crate) fn new() -> Self { Self::default() } diff --git a/src/wrap/paragraph.rs b/src/wrap/paragraph.rs index 358c946f..ddcce029 100644 --- a/src/wrap/paragraph.rs +++ b/src/wrap/paragraph.rs @@ -15,16 +15,21 @@ pub(super) struct PrefixLine<'a> { pub(super) repeat_prefix: bool, } -#[derive(Default)] -pub(super) struct ParagraphState { +pub(super) struct ParagraphWriter<'a> { + out: &'a mut Vec, + width: usize, buf: Vec<(String, bool)>, indent: String, } -impl ParagraphState { - pub(super) fn clear(&mut self) { - self.buf.clear(); - self.indent.clear(); +impl<'a> ParagraphWriter<'a> { + pub(super) fn new(out: &'a mut Vec, width: usize) -> Self { + Self { + out, + width, + buf: Vec::new(), + indent: String::new(), + } } pub(super) fn note_indent(&mut self, line: &str) { @@ -33,87 +38,78 @@ impl ParagraphState { } } - pub(super) fn push(&mut self, text: String, hard_break: bool) { + pub(super) fn push_wrapped(&mut self, text: String, hard_break: bool) { self.buf.push((text, hard_break)); } -} - -pub(super) struct ParagraphWriter<'a> { - out: &'a mut Vec, - width: usize, -} - -impl<'a> ParagraphWriter<'a> { - pub(super) fn new(out: &'a mut Vec, width: usize) -> Self { Self { out, width } } - - fn append_wrapped_with_prefix(&mut self, line: &PrefixLine<'_>) { - let prefix = line.prefix.as_ref(); - let prefix_width = UnicodeWidthStr::width(prefix); - let available = self.width.saturating_sub(prefix_width).max(1); - let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); - let indent_width = UnicodeWidthStr::width(indent_str.as_str()); - let wrapped_indent = if line.repeat_prefix { - prefix.to_string() - } else { - format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) - }; - - let lines = wrap_preserving_code(line.rest, available); - if lines.is_empty() { - self.out.push(prefix.to_string()); - return; - } - for (index, wrapped_line) in lines.iter().enumerate() { - if index == 0 { - self.out.push(format!("{prefix}{wrapped_line}")); - } else { - self.out.push(format!("{wrapped_indent}{wrapped_line}")); - } + fn push_wrapped_segment(&mut self, indent: &str, segment: &str) { + let indent_width = UnicodeWidthStr::width(indent); + let available = self.width.saturating_sub(indent_width).max(1); + for line in wrap_preserving_code(segment, available) { + self.out.push(format!("{indent}{line}")); } } - pub(super) fn flush_paragraph(&mut self, state: &mut ParagraphState) { - if state.buf.is_empty() { + pub(super) fn flush_paragraph(&mut self) { + if self.buf.is_empty() { return; } + let indent = std::mem::take(&mut self.indent); + let buf = std::mem::take(&mut self.buf); let mut segment = String::new(); - for (text, hard_break) in &state.buf { + for (text, hard_break) in &buf { if !segment.is_empty() { segment.push(' '); } segment.push_str(text); if *hard_break { - self.push_wrapped_segment(&state.indent, &segment); + self.push_wrapped_segment(&indent, &segment); segment.clear(); } } if !segment.is_empty() { - self.push_wrapped_segment(&state.indent, &segment); + self.push_wrapped_segment(&indent, &segment); } - - state.clear(); } - fn push_wrapped_segment(&mut self, indent: &str, segment: &str) { - for line in wrap_preserving_code(segment, self.width - indent.len()) { - self.out.push(format!("{indent}{line}")); - } + pub(super) fn push_verbatim(&mut self, line: &str) { + self.flush_paragraph(); + self.out.push(line.to_string()); } - pub(super) fn push_verbatim(&mut self, state: &mut ParagraphState, line: &str) { - self.flush_paragraph(state); - self.out.push(line.to_string()); + pub(super) fn push_blank_line(&mut self) { + self.flush_paragraph(); + self.out.push(String::new()); } - pub(super) fn handle_prefix_line( - &mut self, - state: &mut ParagraphState, - prefix_line: &PrefixLine<'_>, - ) { - self.flush_paragraph(state); - self.append_wrapped_with_prefix(prefix_line); + pub(super) fn handle_prefix_line(&mut self, prefix_line: &PrefixLine<'_>) { + self.flush_paragraph(); + + let prefix = prefix_line.prefix.as_ref(); + let prefix_width = UnicodeWidthStr::width(prefix); + let available = self.width.saturating_sub(prefix_width).max(1); + let indent_str: String = prefix.chars().take_while(|c| c.is_whitespace()).collect(); + let indent_width = UnicodeWidthStr::width(indent_str.as_str()); + let wrapped_indent = if prefix_line.repeat_prefix { + prefix.to_string() + } else { + format!("{}{}", indent_str, " ".repeat(prefix_width - indent_width)) + }; + + let lines = wrap_preserving_code(prefix_line.rest, available); + if lines.is_empty() { + self.out.push(prefix.to_string()); + return; + } + + for (index, wrapped_line) in lines.iter().enumerate() { + if index == 0 { + self.out.push(format!("{prefix}{wrapped_line}")); + } else { + self.out.push(format!("{wrapped_indent}{wrapped_line}")); + } + } } } diff --git a/src/wrap/tests.rs b/src/wrap/tests.rs index 6e244eae..83f89029 100644 --- a/src/wrap/tests.rs +++ b/src/wrap/tests.rs @@ -123,10 +123,7 @@ fn line_buffer_split_preserves_multi_space_lines() { buffer.push_span(&tokens, 0, 2); let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 8, - }; + let mut split = SplitContext::new(&mut lines, 8); assert!(buffer.split_with_span(&mut split, &tokens, 2..4)); assert_eq!(lines, vec!["alpha ".to_string()]); assert_eq!(buffer.text(), "beta "); @@ -143,10 +140,7 @@ fn line_buffer_split_trims_single_trailing_space() { buffer.push_span(&tokens, 0, 2); let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 5, - }; + let mut split = SplitContext::new(&mut lines, 5); assert!(buffer.split_with_span(&mut split, &tokens, 2..3)); assert_eq!(lines, vec!["alpha".to_string()]); assert_eq!(buffer.text(), "beta"); @@ -168,10 +162,7 @@ fn line_buffer_split_tracks_multiple_whitespace_tokens() { buffer.push_span(&tokens, 0, 3); let mut lines = Vec::new(); - let mut split = SplitContext { - lines: &mut lines, - width: 4, - }; + let mut split = SplitContext::new(&mut lines, 4); assert!(buffer.split_with_span(&mut split, &tokens, 3..4)); assert_eq!(lines, vec!["foo ".to_string()]); assert_eq!(buffer.text(), "bar"); @@ -256,111 +247,6 @@ fn wrap_preserving_code_preserves_carry_whitespace( assert_eq!(lines.concat(), input); } -#[test] -fn wrap_text_preserves_hyphenated_words() { - let input = vec!["A word that is very-long-word indeed".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "A word that is".to_string(), - "very-long-word".to_string(), - "indeed".to_string(), - ] - ); -} - -#[test] -fn wrap_text_does_not_insert_spaces_in_hyphenated_words() { - let input = vec![ - concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt ", - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur ", - "volutpat.", - ) - .to_string(), - ]; - let wrapped = wrap_text(&input, 80); - assert_eq!( - wrapped, - vec![ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt".to_string(), - "elit-sed fermentum congue. Vivamus dictum nulla sed consectetur volutpat.".to_string(), - ] - ); -} - -#[test] -fn wrap_text_preserves_code_spans() { - let input = vec![ - "with their own escaping rules. On Windows, scripts default to `powershell -Command` \ - unless the manifest's `interpreter` field overrides the setting." - .to_string(), - ]; - let wrapped = wrap_text(&input, 60); - assert_eq!( - wrapped, - vec![ - "with their own escaping rules. On Windows, scripts default".to_string(), - "to `powershell -Command` unless the manifest's".to_string(), - "`interpreter` field overrides the setting.".to_string(), - ] - ); -} - -#[test] -fn wrap_text_multiple_code_spans() { - let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()]; - let wrapped = wrap_text(&input, 25); - assert_eq!( - wrapped, - vec![ - "combine `foo bar` and".to_string(), - "`baz qux` in one line".to_string(), - ] - ); -} - -#[test] -fn wrap_text_nested_backticks() { - let input = vec!["Use `` `code` `` to quote backticks".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec![ - "Use `` `code` `` to".to_string(), - "quote backticks".to_string() - ], - ); -} - -#[test] -fn wrap_text_unmatched_backticks() { - let input = vec!["This has a `dangling code span.".to_string()]; - let wrapped = wrap_text(&input, 20); - assert_eq!( - wrapped, - vec!["This has a".to_string(), "`dangling code span.".to_string()], - ); -} - -#[test] -fn wrap_text_preserves_links() { - let input = vec![ - "`falcon-pachinko` is an extension library for the".to_string(), - "[Falcon](https://falcon.readthedocs.io) web framework. It adds a structured".to_string(), - "approach to asynchronous WebSocket routing and background worker integration.".to_string(), - ]; - let wrapped = wrap_text(&input, 80); - let joined = wrapped.join("\n"); - assert_eq!(joined.matches("https://").count(), 1); - assert!( - wrapped - .iter() - .any(|l| l.contains("https://falcon.readthedocs.io")) - ); -} - #[rstest] #[case("trail ", 80, &["trail "])] #[case("`code span` ", 12, &["`code span` "])] diff --git a/tests/table/convert_html.rs b/tests/table/convert_html.rs index 21f57e24..9b004d0a 100644 --- a/tests/table/convert_html.rs +++ b/tests/table/convert_html.rs @@ -77,3 +77,41 @@ fn preserves_trailing_spaces_in_cells() { ]; assert_eq!(convert_html_tables(&input), expected); } + +#[rstest( + input, + expected, + case::multiline( + lines_vec![ + " ", + " ", + " ", + "
AB
12
", + ], + lines_vec![ + " | A | B |", + " | --- | --- |", + " | 1 | 2 |", + ] + ), + case::surrounding_content( + lines_vec![ + "
before
", + " ", + " ", + " ", + "
AB
12
", + "

after

", + ], + lines_vec![ + "
before
", + " | A | B |", + " | --- | --- |", + " | 1 | 2 |", + "

after

", + ] + ), +)] +fn converts_indented_html_table_cases(input: Vec, expected: Vec) { + assert_eq!(convert_html_tables(&input), expected); +} diff --git a/tests/wrap_unit.rs b/tests/wrap_unit.rs index 7f127cf9..3d5b69b7 100644 --- a/tests/wrap_unit.rs +++ b/tests/wrap_unit.rs @@ -52,6 +52,30 @@ fn wrap_text_preserves_code_spans() { ); } +#[test] +fn wrap_text_normalizes_whitespace_only_lines() { + let input = vec![String::new(), " ".to_string(), "\t\t".to_string()]; + let wrapped = wrap_text(&input, 80); + assert_eq!(wrapped, vec![String::new(), String::new(), String::new()]); +} + +#[test] +fn wrap_text_treats_whitespace_only_lines_as_paragraph_breaks() { + let input = vec!["foo".to_string(), " ".to_string(), "bar".to_string()]; + let wrapped = wrap_text(&input, 80); + assert_eq!( + wrapped, + vec!["foo".to_string(), String::new(), "bar".to_string()] + ); +} + +#[test] +fn wrap_text_uses_display_width_for_unicode_indent() { + let input = vec![" a b".to_string()]; + let wrapped = wrap_text(&input, 4); + assert_eq!(wrapped, vec![" a".to_string(), " b".to_string()]); +} + #[test] fn wrap_text_multiple_code_spans() { let input = vec!["combine `foo bar` and `baz qux` in one line".to_string()];