diff --git a/AGENTS.md b/AGENTS.md index 06f65d12..0a41a4bb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -29,10 +29,9 @@ documentation should omit examples where the example serves only to reiterate the test logic. - **Keep file size managable.** No single code file may be longer than 400 - lines. - Long switch statements or dispatch tables should be broken up by feature and - constituents colocated with targets. Large blocks of test data should be - moved to external data files. + lines. Long switch statements or dispatch tables should be broken up by + feature and constituents colocated with targets. Large blocks of test data + should be moved to external data files. ## Documentation Maintenance diff --git a/docs/architecture.md b/docs/architecture.md index dbf2524e..22fc4d56 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -307,3 +307,16 @@ multibyte characters from causing unexpected wraps or truncation. Whenever wrapping logic examines the length of a token, it relies on `UnicodeWidthStr::width` to measure visible columns rather than byte length. + +## Link punctuation handling + +Trailing punctuation immediately following a Markdown link or image is +tokenized separately and grouped with the link when wrapping. This keeps +sentences like: + +```markdown +[link](path). +``` + +on a single line, rather than splitting the punctuation onto the next line when +wrapping occurs. diff --git a/src/wrap.rs b/src/wrap.rs index 9cf3e670..7d331bd3 100644 --- a/src/wrap.rs +++ b/src/wrap.rs @@ -114,6 +114,13 @@ fn parse_link_or_image(chars: &[char], mut i: usize) -> (String, usize) { (tok, start + 1) } +fn is_trailing_punctuation(c: char) -> bool { + matches!( + c, + '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '"' | '\'' + ) +} + fn tokenize_inline(text: &str) -> Vec { let mut tokens = Vec::new(); let chars: Vec = text.chars().collect(); @@ -157,8 +164,16 @@ fn tokenize_inline(text: &str) -> Vec { i = end; } } else if c == '[' || (c == '!' && i + 1 < chars.len() && chars[i + 1] == '[') { - let (tok, new_i) = parse_link_or_image(&chars, i); + let (tok, mut new_i) = parse_link_or_image(&chars, i); tokens.push(tok); + let mut punct = String::new(); + while new_i < chars.len() && is_trailing_punctuation(chars[new_i]) { + punct.push(chars[new_i]); + new_i += 1; + } + if !punct.is_empty() { + tokens.push(punct); + } i = new_i; } else { let start = i; @@ -251,11 +266,22 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { let mut current = String::new(); let mut current_width = 0; let mut last_split: Option = None; - for token in tokenize_inline(text) { - let token_width = UnicodeWidthStr::width(token.as_str()); + let tokens = tokenize_inline(text); + let mut i = 0; + while i < tokens.len() { + let mut j = i + 1; + let mut group_width = UnicodeWidthStr::width(tokens[i].as_str()); + + if tokens[i].contains("](") && tokens[i].ends_with(')') { + while j < tokens.len() && tokens[j].chars().all(is_trailing_punctuation) { + group_width += UnicodeWidthStr::width(tokens[j].as_str()); + j += 1; + } + } + if current.is_empty() - && token.len() == 1 - && ".?!,:;".contains(token.as_str()) + && tokens[i].len() == 1 + && ".?!,:;".contains(tokens[i].as_str()) && lines .last() .is_some_and(|l: &String| l.trim_end().ends_with('`')) @@ -263,19 +289,24 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { lines .last_mut() .expect("checked last line exists") - .push_str(&token); + .push_str(&tokens[i]); + i += 1; continue; } - if current_width + token_width <= width { - current.push_str(&token); - current_width += token_width; - if token.chars().all(char::is_whitespace) { - last_split = Some(current.len()); + + if current_width + group_width <= width { + for tok in &tokens[i..j] { + current.push_str(tok); + if tok.chars().all(char::is_whitespace) { + last_split = Some(current.len()); + } + current_width += UnicodeWidthStr::width(tok.as_str()); } + i = j; continue; } - if should_break_line(width, current_width + token_width, last_split) { + if should_break_line(width, current_width + group_width, last_split) { let pos = last_split.unwrap(); let line = current[..pos].to_string(); let mut rest = current[pos..].trim_start().to_string(); @@ -283,10 +314,12 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { if !trimmed.is_empty() { lines.push(trimmed.to_string()); } - rest.push_str(&token); + for tok in &tokens[i..j] { + rest.push_str(tok); + } current = rest; current_width = UnicodeWidthStr::width(current.as_str()); - last_split = if token.chars().all(char::is_whitespace) { + last_split = if tokens[j - 1].chars().all(char::is_whitespace) { Some(current.len()) } else { None @@ -297,6 +330,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { current_width = 0; last_split = None; } + i = j; continue; } @@ -306,11 +340,18 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec { } current.clear(); current_width = 0; + last_split = None; - if !token.chars().all(char::is_whitespace) { - current.push_str(&token); - current_width = token_width; + for tok in &tokens[i..j] { + if !tok.chars().all(char::is_whitespace) { + current.push_str(tok); + current_width += UnicodeWidthStr::width(tok.as_str()); + } + } + if j > i && tokens[j - 1].chars().all(char::is_whitespace) { + last_split = Some(current.len()); } + i = j; } let trimmed = current.trim_end(); if !trimmed.is_empty() { diff --git a/tests/wrap.rs b/tests/wrap.rs index 2ab02b19..c07a9768 100644 --- a/tests/wrap.rs +++ b/tests/wrap.rs @@ -475,6 +475,57 @@ fn test_wrap_paragraph_with_nested_link() { ); } +/// Ensures punctuation immediately following a link remains attached when +/// wrapping lines. +#[test] +fn test_wrap_link_with_trailing_punctuation() { + let input = lines_vec![ + "[`rust-multithreaded-logging-framework-for-python-design.md`](./\ + rust-multithreaded-logging-framework-for-python-design.md).", + ]; + let output = process_stream(&input); + assert_eq!(output, input); +} + +/// Test links followed by various punctuation marks remain on a single line. +#[rstest] +#[case(".")] +#[case(",")] +#[case(";")] +#[case(":")] +#[case("!")] +#[case("?")] +#[case("...")] +fn test_wrap_link_with_various_trailing_punctuation(#[case] punct: &str) { + let input = lines_vec![format!("[link](https://example.com){}", punct)]; + let output = process_stream(&input); + assert_eq!(output, input, "Failed for punctuation: {punct}"); +} + +/// Test a link at line end without trailing punctuation. +#[test] +fn test_wrap_link_at_line_end() { + let input = lines_vec!["Check out [link](https://example.com)"]; + let output = process_stream(&input); + assert_eq!(output, input); +} + +/// Test links containing punctuation within the link text. +#[test] +fn test_wrap_link_with_punctuation_in_text() { + let input = lines_vec!["[foo, bar!](https://example.com)"]; + let output = process_stream(&input); + assert_eq!(output, input); +} + +/// Test links containing punctuation inside the URL. +#[test] +fn test_wrap_link_with_punctuation_in_url() { + let input = lines_vec!["[link](https://example.com/foo,bar)"]; + let output = process_stream(&input); + assert_eq!(output, input); +} + /// Regression test for wrapping list items that end with a full stop. /// /// The period following the inline code span should remain on the same line