diff --git a/.github/workflows/wasi.yml b/.github/workflows/wasi.yml index c4c08974de4..ba5e5ac3472 100644 --- a/.github/workflows/wasi.yml +++ b/.github/workflows/wasi.yml @@ -36,4 +36,7 @@ jobs: env: CARGO_TARGET_WASM32_WASIP1_RUNNER: wasmtime run: | - cargo test --target wasm32-wasip1 --no-default-features -p uu_echo -p uu_cut + # Get all utilities and exclude ones that don't compile for wasm32-wasip1 + EXCLUDE="dd|df|du|env|expr|mktemp|more|tac|test" + UTILS=$(./util/show-utils.sh | tr ' ' '\n' | grep -vE "^($EXCLUDE)$" | sed 's/^/-p uu_/' | tr '\n' ' ') + cargo test --target wasm32-wasip1 --no-default-features $UTILS diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index c808c622b81..ce6b2118be4 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -47,7 +47,7 @@ pub struct ChunkContents<'a> { pub line_count_hint: usize, } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct LineData<'a> { pub selections: Vec<&'a [u8]>, pub num_infos: Vec, @@ -431,3 +431,32 @@ fn read_to_buffer( } } } + +/// Parse a buffer into a `ChunkContents` suitable for `Chunk::try_new`. +/// Used by the WASI single-threaded sort path. +#[cfg(target_os = "wasi")] +pub fn parse_into_chunk<'a>( + buffer: &'a [u8], + separator: u8, + settings: &GlobalSettings, +) -> ChunkContents<'a> { + let mut lines = Vec::new(); + let mut line_data = LineData::default(); + let mut token_buffer = Vec::new(); + let mut line_count_hint = 0; + parse_lines( + buffer, + &mut lines, + &mut line_data, + &mut token_buffer, + &mut line_count_hint, + separator, + settings, + ); + ChunkContents { + lines, + line_data, + token_buffer, + line_count_hint, + } +} diff --git a/src/uu/sort/src/ext_sort/mod.rs b/src/uu/sort/src/ext_sort/mod.rs new file mode 100644 index 00000000000..099a4b72e62 --- /dev/null +++ b/src/uu/sort/src/ext_sort/mod.rs @@ -0,0 +1,20 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +//! External sort: sort large inputs that may not fit in memory. +//! +//! On most platforms this uses a multi-threaded chunked approach with +//! temporary files. On WASI (no threads) we fall back to an in-memory sort. + +#[cfg(not(target_os = "wasi"))] +mod threaded; +#[cfg(not(target_os = "wasi"))] +pub use threaded::ext_sort; + +#[cfg(target_os = "wasi")] +mod wasi; +#[cfg(target_os = "wasi")] +// `self::` needed to disambiguate from the `wasi` crate +pub use self::wasi::ext_sort; diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort/threaded.rs similarity index 94% rename from src/uu/sort/src/ext_sort.rs rename to src/uu/sort/src/ext_sort/threaded.rs index 59bf18a41a5..7dd089d0fe8 100644 --- a/src/uu/sort/src/ext_sort.rs +++ b/src/uu/sort/src/ext_sort/threaded.rs @@ -3,21 +3,15 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -//! Sort big files by using auxiliary files for storing intermediate chunks. -//! -//! Files are read into chunks of memory which are then sorted individually and -//! written to temporary files. There are two threads: One sorter, and one reader/writer. -//! The buffers for the individual chunks are recycled. There are two buffers. +//! Threaded external sort: read input in chunks, sort them in a background +//! thread, and spill to temporary files when memory is exceeded. use std::cmp::Ordering; use std::fs::File; -use std::io::{Write, stderr}; +use std::io::{Read, Write, stderr}; use std::path::PathBuf; -use std::{ - io::Read, - sync::mpsc::{Receiver, SyncSender}, - thread, -}; +use std::sync::mpsc::{Receiver, SyncSender}; +use std::thread; use itertools::Itertools; use uucore::error::{UResult, strip_errno}; @@ -29,17 +23,20 @@ use crate::merge::WriteablePlainTmpFile; use crate::merge::WriteableTmpFile; use crate::tmp_dir::TmpDirWrapper; use crate::{ - GlobalSettings, + GlobalSettings, Line, chunks::{self, Chunk}, - compare_by, merge, sort_by, + compare_by, merge, print_sorted, sort_by, }; -use crate::{Line, print_sorted}; // Note: update `test_sort::test_start_buffer` if this size is changed // Fixed to 8 KiB (equivalent to `std::sys::io::DEFAULT_BUF_SIZE` on most targets) const DEFAULT_BUF_SIZE: usize = 8 * 1024; /// Sort files by using auxiliary files for storing intermediate chunks (if needed), and output the result. +/// +/// Two threads cooperate: one reads input and writes temporary chunk files, +/// while the other sorts each chunk in memory. Once all chunks are written, +/// they are merged back together for final output. pub fn ext_sort( files: &mut impl Iterator>>, settings: &GlobalSettings, diff --git a/src/uu/sort/src/ext_sort/wasi.rs b/src/uu/sort/src/ext_sort/wasi.rs new file mode 100644 index 00000000000..50bd5f63033 --- /dev/null +++ b/src/uu/sort/src/ext_sort/wasi.rs @@ -0,0 +1,59 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +//! WASI single-threaded sort: read all input into memory, sort, and output. +//! Threads are not available on WASI, so we bypass the chunked/threaded path. + +use std::cmp::Ordering; +use std::io::Read; + +use itertools::Itertools; +use uucore::error::UResult; + +use crate::Output; +use crate::chunks::{self, Chunk}; +use crate::tmp_dir::TmpDirWrapper; +use crate::{GlobalSettings, compare_by, print_sorted, sort_by}; + +/// Sort files by reading all input into memory, sorting in a single thread, and outputting directly. +pub fn ext_sort( + files: &mut impl Iterator>>, + settings: &GlobalSettings, + output: Output, + _tmp_dir: &mut TmpDirWrapper, +) -> UResult<()> { + let separator = settings.line_ending.into(); + // Read all input into memory at once. Unlike the threaded path which uses + // chunked buffered reads, WASI has no threads so we accept the memory cost. + // Note: there is no size limit here — WASI targets are expected to handle + // moderately sized inputs; very large files may cause OOM. + let mut input = Vec::new(); + for file in files { + file?.read_to_end(&mut input)?; + } + if input.is_empty() { + return Ok(()); + } + let mut chunk = Chunk::try_new(input, |buffer| { + Ok::<_, Box>(chunks::parse_into_chunk( + buffer, separator, settings, + )) + })?; + chunk.with_dependent_mut(|_, contents| { + sort_by(&mut contents.lines, settings, &contents.line_data); + }); + if settings.unique { + print_sorted( + chunk.lines().iter().dedup_by(|a, b| { + compare_by(a, b, settings, chunk.line_data(), chunk.line_data()) == Ordering::Equal + }), + settings, + output, + )?; + } else { + print_sorted(chunk.lines().iter(), settings, output)?; + } + Ok(()) +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 6fe6e4ee7ff..02822cee56d 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -2148,11 +2148,22 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { settings.buffer_size_is_explicit = false; } - let mut tmp_dir = TmpDirWrapper::new( - matches - .get_one::(options::TMP_DIR) - .map_or_else(env::temp_dir, PathBuf::from), - ); + let mut tmp_dir = TmpDirWrapper::new(matches.get_one::(options::TMP_DIR).map_or_else( + || { + // WASI does not support std::env::temp_dir() — it panics with + // "no filesystem on wasm". Use /tmp as a nominal fallback; + // the WASI ext_sort path never actually creates temp files. + #[cfg(target_os = "wasi")] + { + PathBuf::from("/tmp") + } + #[cfg(not(target_os = "wasi"))] + { + env::temp_dir() + } + }, + PathBuf::from, + )); settings.compress_prog = matches .get_one::(options::COMPRESS_PROG)