Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ cargo install --path .
## Command-line usage

```bash
mdtablefix [--wrap] [--renumber] [--breaks] [--in-place] [FILE...]
mdtablefix [--wrap] [--renumber] [--breaks] [--ellipsis] [--in-place] [FILE...]
```

- With file paths provided, the corrected tables are printed to stdout.
Expand All @@ -32,6 +32,9 @@ mdtablefix [--wrap] [--renumber] [--breaks] [--in-place] [FILE...]
`--renumber`.
- Use `--breaks` to normalize thematic breaks to a line of 70 underscores
(configurable via the `THEMATIC_BREAK_LEN` constant).
- Use `--ellipsis` to replace groups of three consecutive dots with the
ellipsis character. Longer runs are processed left-to-right, so leftover dots
remain unchanged.
- Use `--in-place` to overwrite files.
- If no files are supplied, input is read from stdin and results are written
to stdout.
Expand Down Expand Up @@ -62,19 +65,25 @@ The crate exposes helper functions so you can integrate the table reflow logic
in your own project.

```rust
use mdtablefix::{process_stream, rewrite};
use mdtablefix::{process_stream_opts, rewrite};
use std::path::Path;

fn main() -> std::io::Result<()> {
let lines = vec!["|A|B|".to_string(), "|1|2|".to_string()];
let fixed = process_stream(&lines);
let fixed = process_stream_opts(
&lines,
/* wrap = */ true,
/* ellipsis = */ true,
);
println!("{}", fixed.join("\n"));
rewrite(Path::new("table.md"))?;
Ok(())
}
```

- `process_stream(&[String]) -> Vec<String>` rewrites tables in memory.
- `process_stream_opts(lines: &[String], wrap: bool, ellipsis: bool) ->
Vec<String>` rewrites tables in memory with optional wrapping and ellipsis
replacement.
- `rewrite(&Path) -> std::io::Result<()>` updates a Markdown file on disk.

## HTML table support
Expand Down
12 changes: 10 additions & 2 deletions docs/module-relationships.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ classDiagram
+format_breaks()
+THEMATIC_BREAK_LEN
}
class ellipsis {
<<module>>
+replace_ellipsis()
}
class process {
<<module>>
+process_stream()
Expand All @@ -47,19 +51,23 @@ classDiagram
lib --> wrap
lib --> lists
lib --> breaks
lib --> ellipsis
lib --> process
lib --> io
html ..> wrap : uses is_fence
table ..> reflow : uses parse_rows, etc.
lists ..> wrap : uses is_fence
breaks ..> wrap : uses is_fence
ellipsis ..> wrap : uses tokenize_markdown
process ..> html : uses convert_html_tables
process ..> table : uses reflow_table
process ..> wrap : uses wrap_text, is_fence
process ..> ellipsis : uses replace_ellipsis
io ..> process : uses process_stream, process_stream_no_wrap
```

The `lib` module re-exports the public API from the other modules. The
`ellipsis` module performs text normalisation. The `process` module provides
streaming helpers that combine the lower-level functions, including ellipsis
replacement. The `io` module handles filesystem operations, delegating the text
processing to `process`.
105 changes: 105 additions & 0 deletions src/ellipsis.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
//! Replace sequences of three dots with the ellipsis character.
//!
//! Groups of three consecutive dots become a single Unicode ellipsis. Longer
//! runs are processed left-to-right so trailing dots that do not form a
//! complete triple remain. Fenced code blocks and inline code spans are left
//! untouched.

use regex::Regex;

use crate::wrap::{Token, tokenize_markdown};

static DOT_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"\.{3,}").unwrap());

/// Replace `...` with `…` outside code spans and fences.
#[must_use]
pub fn replace_ellipsis(lines: &[String]) -> Vec<String> {
Comment thread
leynos marked this conversation as resolved.
if lines.is_empty() {
return Vec::new();
}
let joined = lines.join("\n");
let mut out = String::new();
for token in tokenize_markdown(&joined) {
match token {
Token::Text(t) => {
let replaced = DOT_RE.replace_all(t, |caps: &regex::Captures<'_>| {
let len = caps[0].len();
let ellipses = "…".repeat(len / 3);
let leftover = ".".repeat(len % 3);
format!("{ellipses}{leftover}")
});
out.push_str(&replaced);
}
Token::Code(c) => {
out.push('`');
out.push_str(c);
out.push('`');
}
Token::Fence(f) => {
out.push_str(f);
}
Token::Newline => out.push('\n'),
}
}
out.split('\n').map(str::to_string).collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Vec<String>` from string slices for concise fixtures.
    fn lines(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|s| (*s).to_string()).collect()
    }

    #[test]
    fn replaces_simple_text() {
        assert_eq!(replace_ellipsis(&lines(&["wait..."])), lines(&["wait…"]));
    }

    #[test]
    fn ignores_code_spans() {
        let input = lines(&["a `b...` c"]);
        assert_eq!(replace_ellipsis(&input), input);
    }

    #[test]
    fn ignores_fenced_blocks() {
        let input = lines(&["```", "...", "```"]);
        assert_eq!(replace_ellipsis(&input), input);
    }

    #[test]
    fn replaces_long_sequences() {
        // Four through seven dots: each triple collapses, leftovers remain.
        assert_eq!(
            replace_ellipsis(&lines(&[".... ..... ...... ......."])),
            lines(&["…. ….. …… ……."])
        );
    }

    #[test]
    fn handles_empty_input() {
        assert_eq!(replace_ellipsis(&[]), Vec::<String>::new());
    }

    #[test]
    fn handles_multiple_fenced_blocks() {
        let input = lines(&[
            "text...",
            "```",
            "code...",
            "```",
            "more text...",
        ]);
        let expected = lines(&[
            "text…",
            "```",
            "code...",
            "```",
            "more text…",
        ]);
        assert_eq!(replace_ellipsis(&input), expected);
    }
}
5 changes: 4 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
//! - `wrap` for paragraph wrapping.
//! - `lists` for renumbering ordered lists.
//! - `breaks` for thematic break formatting.
//! - `ellipsis` for normalising textual ellipses.
//! - `process` for stream processing.
//! - `io` for file helpers.

pub mod breaks;
pub mod ellipsis;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
mod html;
pub mod io;
pub mod lists;
Expand All @@ -25,9 +27,10 @@ pub fn html_table_to_markdown(lines: &[String]) -> Vec<String> {
}

pub use breaks::{THEMATIC_BREAK_LEN, format_breaks};
pub use ellipsis::replace_ellipsis;
pub use html::convert_html_tables;
pub use io::{rewrite, rewrite_no_wrap};
pub use lists::renumber_lists;
pub use process::{process_stream, process_stream_no_wrap};
pub use process::{process_stream, process_stream_no_wrap, process_stream_opts};
pub use table::{reflow_table, split_cells};
pub use wrap::{is_fence, wrap_text};
12 changes: 6 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
};

use clap::Parser;
use mdtablefix::{format_breaks, process_stream, process_stream_no_wrap, renumber_lists};
use mdtablefix::{format_breaks, process_stream_opts, renumber_lists};

#[derive(Parser)]
#[command(about = "Reflow broken markdown tables")]
Expand All @@ -21,6 +21,7 @@ struct Cli {
}

#[derive(clap::Args, Clone, Copy)]
#[allow(clippy::struct_excessive_bools)] // CLI exposes four independent flags
struct FormatOpts {
/// Wrap paragraphs and list items to 80 columns
#[arg(long = "wrap")]
Expand All @@ -31,14 +32,13 @@ struct FormatOpts {
/// Reformat thematic breaks as underscores
#[arg(long = "breaks")]
breaks: bool,
/// Replace "..." with the ellipsis character
#[arg(long = "ellipsis")]
ellipsis: bool,
}

fn process_lines(lines: &[String], opts: FormatOpts) -> Vec<String> {
let mut out = if opts.wrap {
process_stream(lines)
} else {
process_stream_no_wrap(lines)
};
let mut out = process_stream_opts(lines, opts.wrap, opts.ellipsis);
if opts.renumber {
out = renumber_lists(&out);
}
Expand Down
18 changes: 14 additions & 4 deletions src/process.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
//! High-level Markdown stream processing.

use crate::{
ellipsis::replace_ellipsis,
html::convert_html_tables,
table::reflow_table,
wrap::{self, wrap_text},
};

#[must_use]
pub fn process_stream_inner(lines: &[String], wrap: bool) -> Vec<String> {
pub fn process_stream_inner(lines: &[String], wrap: bool, ellipsis: bool) -> Vec<String> {
let pre = convert_html_tables(lines);

let mut out = Vec::new();
Expand Down Expand Up @@ -69,15 +70,24 @@ pub fn process_stream_inner(lines: &[String], wrap: bool) -> Vec<String> {
}
}

if wrap { wrap_text(&out, 80) } else { out }
let mut out = if wrap { wrap_text(&out, 80) } else { out };
if ellipsis {
out = replace_ellipsis(&out);
}
out
}

#[must_use]
pub fn process_stream(lines: &[String]) -> Vec<String> { process_stream_inner(lines, true) }
pub fn process_stream(lines: &[String]) -> Vec<String> { process_stream_inner(lines, true, false) }

#[must_use]
pub fn process_stream_no_wrap(lines: &[String]) -> Vec<String> {
process_stream_inner(lines, false)
process_stream_inner(lines, false, false)
}

#[must_use]
pub fn process_stream_opts(lines: &[String], wrap: bool, ellipsis: bool) -> Vec<String> {
process_stream_inner(lines, wrap, ellipsis)
}

#[cfg(test)]
Expand Down
57 changes: 55 additions & 2 deletions src/wrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,20 @@ static FOOTNOTE_RE: std::sync::LazyLock<Regex> =
static BLOCKQUOTE_RE: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^(\s*(?:>\s*)+)(.*)$").unwrap());

pub(crate) fn tokenize_markdown(text: &str) -> Vec<String> {
/// Markdown token emitted by [`tokenize_markdown`].
#[derive(PartialEq, Debug)]
pub enum Token<'a> {
    /// A line belonging to a fenced code block, the fence line included.
    Fence(&'a str),
    /// The contents of an inline code span, backticks stripped.
    Code(&'a str),
    /// Ordinary text outside any code region.
    Text(&'a str),
    /// A line break between tokens.
    Newline,
}

fn tokenize_inline(text: &str) -> Vec<String> {
let mut tokens = Vec::new();
let chars: Vec<char> = text.chars().collect();
let mut i = 0;
Expand Down Expand Up @@ -70,6 +83,46 @@ pub(crate) fn tokenize_markdown(text: &str) -> Vec<String> {
tokens
}

/// Tokenise Markdown into fences, inline code and plain text.
pub(crate) fn tokenize_markdown(input: &str) -> Vec<Token<'_>> {
let mut out = Vec::new();
let mut in_fence = false;
for line in input.split_inclusive('\n') {
let trimmed = line.trim_end_matches('\n');
if FENCE_RE.is_match(trimmed) {
out.push(Token::Fence(trimmed));
out.push(Token::Newline);
in_fence = !in_fence;
continue;
}
if in_fence {
out.push(Token::Fence(trimmed));
out.push(Token::Newline);
continue;
}
let mut rest = trimmed;
while let Some(pos) = rest.find('`') {
if pos > 0 {
out.push(Token::Text(&rest[..pos]));
}
if let Some(end) = rest[pos + 1..].find('`') {
out.push(Token::Code(&rest[pos + 1..pos + 1 + end]));
rest = &rest[pos + end + 2..];
} else {
out.push(Token::Text(&rest[pos..]));
rest = "";
break;
}
}
if !rest.is_empty() {
out.push(Token::Text(rest));
}
out.push(Token::Newline);
}
out.pop();
out
}

/// Determine if the current line should break at the last whitespace.
///
/// Returns `true` if `current_width` exceeds `width` and a whitespace split
Expand All @@ -93,7 +146,7 @@ fn wrap_preserving_code(text: &str, width: usize) -> Vec<String> {
let mut current = String::new();
let mut current_width = 0;
let mut last_split: Option<usize> = None;
for token in tokenize_markdown(text) {
for token in tokenize_inline(text) {
let token_width = UnicodeWidthStr::width(token.as_str());
if current_width + token_width <= width {
current.push_str(&token);
Expand Down
Loading