From 394b08a0658efea767b9865b9709ae798fb44df0 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 00:22:16 +0100 Subject: [PATCH 1/6] Add parallel file processing with rayon --- Cargo.lock | 52 +++++++++++++++++++++++++++++ Cargo.toml | 1 + docs/parallel-processing-roadmap.md | 8 ++--- docs/rayon-concurrency.md | 6 ++++ src/main.rs | 35 ++++++++++++++----- 5 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 docs/rayon-concurrency.md diff --git a/Cargo.lock b/Cargo.lock index b29d29fd..0c640cf0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -158,6 +158,31 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "difflib" version = "0.4.0" @@ -170,6 +195,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "errno" version = "0.3.13" @@ -406,6 +437,7 @@ dependencies = [ "libc", 
"markup5ever_rcdom", "once_cell", + "rayon", "regex", "rstest", "tempfile", @@ -581,6 +613,26 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.13" diff --git a/Cargo.toml b/Cargo.toml index e025ec42..dd5b1d74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ anyhow = "1" clap = { version = "4", features = ["derive"] } regex = "1" once_cell = "1" +rayon = "1.10" html5ever = "0.27" markup5ever_rcdom = "0.3" unicode-width = ">=0.1, <0.2" diff --git a/docs/parallel-processing-roadmap.md b/docs/parallel-processing-roadmap.md index 4ab98234..d5856a33 100644 --- a/docs/parallel-processing-roadmap.md +++ b/docs/parallel-processing-roadmap.md @@ -4,15 +4,15 @@ The command-line tool currently processes input files sequentially. The steps below outline the work required to allow concurrent processing while preserving serial output order. -- [ ] **Adopt `rayon` for concurrency** +- [x] **Adopt `rayon` for concurrency** - Use `rayon` thread pools to spawn work for each file path. - Ensure the approach integrates cleanly with existing modules. -- [ ] **Add chosen crate to `Cargo.toml`** +- [x] **Add chosen crate to `Cargo.toml`** - Pin an explicit version and document the decision in `docs/`. 
-- [ ] **Refactor `main.rs` to launch parallel tasks** +- [x] **Refactor `main.rs` to launch parallel tasks** - Spawn a worker for each file path using the concurrency crate. - Maintain a list of handles so outputs can be gathered in order. -- [ ] **Collect results sequentially** +- [x] **Collect results sequentially** - Await or join handles in the same order the files were supplied. - Print each processed file or error message before moving to the next. - [ ] **Extend tests for parallel execution** diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md new file mode 100644 index 00000000..c8199a88 --- /dev/null +++ b/docs/rayon-concurrency.md @@ -0,0 +1,6 @@ +# Concurrency with `rayon` + +`mdtablefix` uses the `rayon` crate to process multiple files concurrently. +`rayon` provides a work-stealing thread pool and simple parallel iterators. The +version is pinned to `1.10` in `Cargo.toml` to avoid breaking changes from a +future major release. diff --git a/src/main.rs b/src/main.rs index fa2eabe4..0fa1f725 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use std::{ use clap::Parser; use mdtablefix::{Options, format_breaks, process_stream_opts, renumber_lists}; +use rayon::prelude::*; #[derive(Parser)] #[command(about = "Reflow broken markdown tables")] @@ -109,15 +110,31 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - for path in cli.files { - if cli.in_place { - rewrite_path(&path, cli.opts)?; - } else { - let content = fs::read_to_string(&path)?; - let lines: Vec<String> = content.lines().map(str::to_string).collect(); - let fixed = process_lines(&lines, cli.opts); - println!("{}", fixed.join("\n")); - } + let pool = rayon::ThreadPoolBuilder::new().build()?; + + if cli.in_place { + pool.install(|| { + cli.files + .par_iter() + .try_for_each(|p| rewrite_path(p, cli.opts)) + })?; + return Ok(()); + } + + let results: anyhow::Result<Vec<String>> = pool.install(|| { + cli.files + .par_iter() + .map(|p| -> anyhow::Result<String> { + let content = 
fs::read_to_string(p)?; + let lines: Vec<String> = content.lines().map(str::to_string).collect(); + let fixed = process_lines(&lines, cli.opts); + Ok(fixed.join("\n")) + }) + .collect() + }); + + for out in results? { + println!("{out}"); } Ok(()) From ba8b3927c28bdaae5cf4c13a0bb28d0320077173 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 00:53:36 +0100 Subject: [PATCH 2/6] Refactor parallel processing with Rayon Use the global Rayon pool and shared handle_file function. Update docs accordingly. --- docs/rayon-concurrency.md | 5 +++-- src/main.rs | 46 +++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md index c8199a88..028dffd8 100644 --- a/docs/rayon-concurrency.md +++ b/docs/rayon-concurrency.md @@ -2,5 +2,6 @@ `mdtablefix` uses the `rayon` crate to process multiple files concurrently. `rayon` provides a work-stealing thread pool and simple parallel iterators. The -version is pinned to `1.10` in `Cargo.toml` to avoid breaking changes from a -future major release. +tool relies on Rayon’s global thread pool so that no manual setup is required. +The version is pinned to `1.10` in `Cargo.toml` to avoid breaking changes from +a future major release. 
diff --git a/src/main.rs b/src/main.rs index 0fa1f725..94f2591f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -68,11 +68,17 @@ fn process_lines(lines: &[String], opts: FormatOpts) -> Vec<String> { out } -fn rewrite_path(path: &Path, opts: FormatOpts) -> std::io::Result<()> { +fn handle_file(path: &Path, in_place: bool, opts: FormatOpts) -> anyhow::Result<Option<String>> { let content = fs::read_to_string(path)?; let lines: Vec<String> = content.lines().map(str::to_string).collect(); - let fixed = process_lines(&lines, opts); - fs::write(path, fixed.join("\n") + "\n") + let fixed = process_lines(&lines, opts).join("\n"); + + if in_place { + fs::write(path, format!("{fixed}\n"))?; + Ok(None) + } else { + Ok(Some(fixed)) + } } /// Entry point for the command-line tool that reflows broken markdown tables. @@ -110,31 +116,23 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let pool = rayon::ThreadPoolBuilder::new().build()?; - if cli.in_place { - pool.install(|| { - cli.files - .par_iter() - .try_for_each(|p| rewrite_path(p, cli.opts)) - })?; - return Ok(()); - } - - let results: anyhow::Result<Vec<String>> = pool.install(|| { cli.files .par_iter() - .map(|p| -> anyhow::Result<String> { - let content = fs::read_to_string(p)?; - let lines: Vec<String> = content.lines().map(str::to_string).collect(); - let fixed = process_lines(&lines, cli.opts); - Ok(fixed.join("\n")) - }) - .collect() - }); + .try_for_each(|p| handle_file(p, true, cli.opts).map(|_| ()))?; + } else { + let outputs: Vec<String> = cli + .files + .par_iter() + .map(|p| handle_file(p, false, cli.opts)) + .collect::<anyhow::Result<Vec<_>>>()? + .into_iter() + .flatten() + .collect(); - for out in results? 
{ - println!("{out}"); + for out in outputs { + println!("{out}"); + } } Ok(()) From b40edc3c0b6f14c941fec6024b543e9ba7442a01 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 01:16:28 +0100 Subject: [PATCH 3/6] Update Rayon version and documentation --- Cargo.toml | 2 +- docs/parallel-processing-roadmap.md | 2 +- docs/rayon-concurrency.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dd5b1d74..2397537c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ anyhow = "1" clap = { version = "4", features = ["derive"] } regex = "1" once_cell = "1" -rayon = "1.10" +rayon = "1" html5ever = "0.27" markup5ever_rcdom = "0.3" unicode-width = ">=0.1, <0.2" diff --git a/docs/parallel-processing-roadmap.md b/docs/parallel-processing-roadmap.md index d5856a33..4a19c9b1 100644 --- a/docs/parallel-processing-roadmap.md +++ b/docs/parallel-processing-roadmap.md @@ -11,7 +11,7 @@ serial output order. - Pin an explicit version and document the decision in `docs/`. - [x] **Refactor `main.rs` to launch parallel tasks** - Spawn a worker for each file path using the concurrency crate. - - Maintain a list of handles so outputs can be gathered in order. + - Maintain a list of handles, so outputs can be gathered in order. - [x] **Collect results sequentially** - Await or join handles in the same order the files were supplied. - Print each processed file or error message before moving to the next. diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md index 028dffd8..ca8c49d4 100644 --- a/docs/rayon-concurrency.md +++ b/docs/rayon-concurrency.md @@ -3,5 +3,5 @@ `mdtablefix` uses the `rayon` crate to process multiple files concurrently. `rayon` provides a work-stealing thread pool and simple parallel iterators. The tool relies on Rayon’s global thread pool so that no manual setup is required. -The version is pinned to `1.10` in `Cargo.toml` to avoid breaking changes from -a future major release. 
+The dependency is specified as `1` in `Cargo.toml` to track stable API changes +within the same major release. From 57389862c840d8381c3cd089c063b29438530a3d Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 01:59:34 +0100 Subject: [PATCH 4/6] Document rayon usage --- Cargo.toml | 2 +- docs/rayon-concurrency.md | 4 ++-- src/main.rs | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2397537c..dad1e79b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ anyhow = "1" clap = { version = "4", features = ["derive"] } regex = "1" once_cell = "1" -rayon = "1" +rayon = "^1.0" html5ever = "0.27" markup5ever_rcdom = "0.3" unicode-width = ">=0.1, <0.2" diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md index ca8c49d4..7412f609 100644 --- a/docs/rayon-concurrency.md +++ b/docs/rayon-concurrency.md @@ -3,5 +3,5 @@ `mdtablefix` uses the `rayon` crate to process multiple files concurrently. `rayon` provides a work-stealing thread pool and simple parallel iterators. The tool relies on Rayon’s global thread pool so that no manual setup is required. -The dependency is specified as `1` in `Cargo.toml` to track stable API changes -within the same major release. +The dependency is specified as `^1.0` in `Cargo.toml` to track stable API +changes within the same major release. diff --git a/src/main.rs b/src/main.rs index 94f2591f..42266576 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,9 @@ +//! Command-line interface for the mdtablefix tool. +//! +//! This module provides the main entry point and CLI parsing for fixing +//! markdown table formatting. It supports concurrent processing of multiple +//! files using Rayon for improved performance. 
+ use std::{ borrow::Cow, fs, From 91413265048a5f33b6ab623ed4d5f4a7c882cba6 Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 02:59:08 +0100 Subject: [PATCH 5/6] Fix newline handling and update Rayon docs --- docs/rayon-concurrency.md | 2 +- src/main.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md index 7412f609..35129850 100644 --- a/docs/rayon-concurrency.md +++ b/docs/rayon-concurrency.md @@ -3,5 +3,5 @@ `mdtablefix` uses the `rayon` crate to process multiple files concurrently. `rayon` provides a work-stealing thread pool and simple parallel iterators. The tool relies on Rayon’s global thread pool so that no manual setup is required. -The dependency is specified as `^1.0` in `Cargo.toml` to track stable API +The dependency is specified as `>=1, <2` in `Cargo.toml` to track stable API changes within the same major release. diff --git a/src/main.rs b/src/main.rs index 42266576..75f1bbac 100644 --- a/src/main.rs +++ b/src/main.rs @@ -83,7 +83,7 @@ fn handle_file(path: &Path, in_place: bool, opts: FormatOpts) -> anyhow::Result< fs::write(path, format!("{fixed}\n"))?; Ok(None) } else { - Ok(Some(fixed)) + Ok(Some(format!("{fixed}\n"))) } } @@ -137,7 +137,7 @@ fn main() -> anyhow::Result<()> { .collect(); for out in outputs { - println!("{out}"); + print!("{out}"); } } From 734962341b784752680e1795661c0696fa7d9f6f Mon Sep 17 00:00:00 2001 From: Leynos Date: Mon, 21 Jul 2025 18:56:36 +0100 Subject: [PATCH 6/6] Use println for file outputs --- README.md | 10 +++++----- docs/rayon-concurrency.md | 2 +- src/main.rs | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 85cc9a2e..82d28942 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ cargo install --path . ## Command-line usage - ```bash mdtablefix [--wrap] [--renumber] [--breaks] [--ellipsis] [--fences] [--footnotes] [--in-place] [FILE...] 
``` @@ -111,8 +110,8 @@ A brief intermission for pizza. ## Library usage -The crate exposes helper functions for embedding the table-reflow logic in -Rust projects: +The crate exposes helper functions for embedding the table-reflow logic in Rust +projects: ```rust use mdtablefix::{process_stream_opts, rewrite, Options}; @@ -159,8 +158,9 @@ For an overview of how the crate's internal modules relate to each other, see ## Testing -The test suite is structured using the `rstest` crate. See [Rust testing with -rstest fixtures](docs/rust-testing-with-rstest-fixtures.md) for details. +The test suite is structured using the `rstest` crate. See +[Rust testing with rstest fixtures](docs/rust-testing-with-rstest-fixtures.md) +for details. ## License diff --git a/docs/rayon-concurrency.md b/docs/rayon-concurrency.md index 35129850..7412f609 100644 --- a/docs/rayon-concurrency.md +++ b/docs/rayon-concurrency.md @@ -3,5 +3,5 @@ `mdtablefix` uses the `rayon` crate to process multiple files concurrently. `rayon` provides a work-stealing thread pool and simple parallel iterators. The tool relies on Rayon’s global thread pool so that no manual setup is required. -The dependency is specified as `>=1, <2` in `Cargo.toml` to track stable API +The dependency is specified as `^1.0` in `Cargo.toml` to track stable API changes within the same major release. diff --git a/src/main.rs b/src/main.rs index 75f1bbac..42266576 100644 --- a/src/main.rs +++ b/src/main.rs @@ -83,7 +83,7 @@ fn handle_file(path: &Path, in_place: bool, opts: FormatOpts) -> anyhow::Result< fs::write(path, format!("{fixed}\n"))?; Ok(None) } else { - Ok(Some(format!("{fixed}\n"))) + Ok(Some(fixed)) } } @@ -137,7 +137,7 @@ fn main() -> anyhow::Result<()> { .collect(); for out in outputs { - print!("{out}"); + println!("{out}"); } }