diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt
index ef8a2026fc1..66216702150 100644
--- a/.vscode/cspell.dictionaries/jargon.wordlist.txt
+++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt
@@ -131,6 +131,7 @@ symlink
 symlinks
 syscall
 syscalls
+sysconf
 tokenize
 toolchain
 truthy
diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
index 069b68dc3db..347e88266a5 100644
--- a/fuzz/Cargo.lock
+++ b/fuzz/Cargo.lock
@@ -231,18 +231,18 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.49"
+version = "4.5.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f"
+checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623"
 dependencies = [
  "clap_builder",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.5.49"
+version = "4.5.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730"
+checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0"
 dependencies = [
  "anstream",
  "anstyle",
@@ -1538,9 +1538,9 @@ dependencies = [
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.19"
+version = "1.0.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"
+checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
 
 [[package]]
 name = "unicode-width"
diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml
index a44cc7570ae..c1b4c07084c 100644
--- a/src/uu/sort/Cargo.toml
+++ b/src/uu/sort/Cargo.toml
@@ -36,8 +36,6 @@ thiserror = { workspace = true }
 unicode-width = { workspace = true }
 uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] }
 fluent = { workspace = true }
-
-[target.'cfg(target_os = "linux")'.dependencies]
 nix = { workspace = true }
 
 [dev-dependencies]
diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs
new file mode 100644
index 00000000000..bb0ea754094
--- /dev/null
+++ b/src/uu/sort/src/buffer_hint.rs
@@ -0,0 +1,152 @@
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+//! Heuristics for determining buffer size for external sorting.
+use std::ffi::OsString;
+
+use crate::{
+    FALLBACK_AUTOMATIC_BUF_SIZE, MAX_AUTOMATIC_BUF_SIZE, MIN_AUTOMATIC_BUF_SIZE, STDIN_FILE,
+};
+
+// Heuristics to size the external sort buffer without overcommitting memory.
+pub(crate) fn automatic_buffer_size(files: &[OsString]) -> usize {
+    let file_hint = file_size_hint(files);
+    let mem_hint = available_memory_hint();
+
+    // Prefer the tighter bound when both hints exist, otherwise fall back to whichever hint is available.
+    match (file_hint, mem_hint) {
+        (Some(file), Some(mem)) => file.min(mem),
+        (Some(file), None) => file,
+        (None, Some(mem)) => mem,
+        (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE,
+    }
+}
+
+fn file_size_hint(files: &[OsString]) -> Option<usize> {
+    // Estimate total bytes across real files; non-regular inputs are skipped.
+    let mut total_bytes: u128 = 0;
+
+    for file in files {
+        if file == STDIN_FILE {
+            continue;
+        }
+
+        let Ok(metadata) = std::fs::metadata(file) else {
+            continue;
+        };
+
+        if !metadata.is_file() {
+            continue;
+        }
+
+        total_bytes = total_bytes.saturating_add(metadata.len() as u128);
+
+        if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 {
+            break;
+        }
+    }
+
+    if total_bytes == 0 {
+        return None;
+    }
+
+    let desired_bytes = desired_file_buffer_bytes(total_bytes);
+    Some(clamp_hint(desired_bytes))
+}
+
+fn available_memory_hint() -> Option<usize> {
+    #[cfg(target_os = "linux")]
+    if let Some(bytes) = uucore::parser::parse_size::available_memory_bytes() {
+        return Some(clamp_hint(bytes / 4));
+    }
+
+    physical_memory_bytes().map(|bytes| clamp_hint(bytes / 4))
+}
+
+fn clamp_hint(bytes: u128) -> usize {
+    let min = MIN_AUTOMATIC_BUF_SIZE as u128;
+    let max = MAX_AUTOMATIC_BUF_SIZE as u128;
+    let clamped = bytes.clamp(min, max);
+    clamped.min(usize::MAX as u128) as usize
+}
+
+fn desired_file_buffer_bytes(total_bytes: u128) -> u128 {
+    if total_bytes == 0 {
+        return 0;
+    }
+
+    let max = MAX_AUTOMATIC_BUF_SIZE as u128;
+
+    if total_bytes <= max {
+        return total_bytes.saturating_mul(12).clamp(total_bytes, max);
+    }
+
+    let quarter = total_bytes / 4;
+    quarter.max(max)
+}
+
+fn physical_memory_bytes() -> Option<u128> {
+    #[cfg(all(
+        target_family = "unix",
+        not(target_os = "redox"),
+        any(target_os = "linux", target_os = "android")
+    ))]
+    {
+        physical_memory_bytes_unix()
+    }
+
+    #[cfg(any(
+        not(target_family = "unix"),
+        target_os = "redox",
+        not(any(target_os = "linux", target_os = "android"))
+    ))]
+    {
+        // No portable or safe API is available here to detect total physical memory.
+        None
+    }
+}
+
+#[cfg(all(
+    target_family = "unix",
+    not(target_os = "redox"),
+    any(target_os = "linux", target_os = "android")
+))]
+fn physical_memory_bytes_unix() -> Option<u128> {
+    use nix::unistd::{SysconfVar, sysconf};
+
+    let pages = match sysconf(SysconfVar::_PHYS_PAGES) {
+        Ok(Some(pages)) if pages > 0 => u128::try_from(pages).ok()?,
+        _ => return None,
+    };
+
+    let page_size = match sysconf(SysconfVar::PAGE_SIZE) {
+        Ok(Some(page_size)) if page_size > 0 => u128::try_from(page_size).ok()?,
+        _ => return None,
+    };
+
+    Some(pages.saturating_mul(page_size))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn desired_buffer_matches_total_when_small() {
+        let six_mebibytes = 6 * 1024 * 1024;
+        let expected = ((six_mebibytes as u128) * 12)
+            .clamp(six_mebibytes as u128, crate::MAX_AUTOMATIC_BUF_SIZE as u128);
+        assert_eq!(desired_file_buffer_bytes(six_mebibytes as u128), expected);
+    }
+
+    #[test]
+    fn desired_buffer_caps_at_max_for_large_inputs() {
+        let large = 256 * 1024 * 1024; // 256 MiB
+        assert_eq!(
+            desired_file_buffer_bytes(large as u128),
+            crate::MAX_AUTOMATIC_BUF_SIZE as u128
+        );
+    }
+}
diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs
index af10a008844..837cb1fa95c 100644
--- a/src/uu/sort/src/chunks.rs
+++ b/src/uu/sort/src/chunks.rs
@@ -271,11 +271,12 @@ fn read_to_buffer(
         if max_buffer_size > buffer.len() {
             // we can grow the buffer
             let prev_len = buffer.len();
-            if buffer.len() < max_buffer_size / 2 {
-                buffer.resize(buffer.len() * 2, 0);
+            let target = if buffer.len() < max_buffer_size / 2 {
+                buffer.len().saturating_mul(2)
             } else {
-                buffer.resize(max_buffer_size, 0);
-            }
+                max_buffer_size
+            };
+            buffer.resize(target.min(max_buffer_size), 0);
             read_target = &mut buffer[prev_len..];
             continue;
         }
@@ -295,8 +296,8 @@ fn read_to_buffer(
 
             // We need to read more lines
             let len = buffer.len();
-            // resize the vector to 10 KB more
-            buffer.resize(len + 1024 * 10, 0);
+            let grow_by = (len / 2).max(1024 * 1024);
+            buffer.resize(len + grow_by, 0);
             read_target = &mut buffer[len..];
         } else {
             // This file has been fully read.
diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort.rs
index e43ad4b3a38..ddbc278d7e3 100644
--- a/src/uu/sort/src/ext_sort.rs
+++ b/src/uu/sort/src/ext_sort.rs
@@ -86,9 +86,15 @@ fn reader_writer<
 ) -> UResult<()> {
     let separator = settings.line_ending.into();
 
-    // Heuristically chosen: Dividing by 10 seems to keep our memory usage roughly
-    // around settings.buffer_size as a whole.
-    let buffer_size = settings.buffer_size / 10;
+    // Cap oversized buffer requests to avoid unnecessary allocations and give the automatic
+    // heuristic room to grow when the user does not provide an explicit value.
+    let mut buffer_size = match settings.buffer_size {
+        size if size <= 512 * 1024 * 1024 => size,
+        size => size / 2,
+    };
+    if !settings.buffer_size_is_explicit {
+        buffer_size = buffer_size.max(8 * 1024 * 1024);
+    }
     let read_result: ReadResult<I> = read_write_loop(
         files,
         tmp_dir,
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
index 1d4948186fe..c9d1bac97aa 100644
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@@ -9,6 +9,7 @@
 
 // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit
 
+mod buffer_hint;
 mod check;
 mod chunks;
 mod custom_str_cmp;
@@ -54,6 +55,7 @@ use uucore::show_error;
 use uucore::translate;
 use uucore::version_cmp::version_cmp;
 
+use crate::buffer_hint::automatic_buffer_size;
 use crate::tmp_dir::TmpDirWrapper;
 
 mod options {
@@ -115,10 +117,12 @@ const DECIMAL_PT: u8 = b'.';
 const NEGATIVE: &u8 = &b'-';
 const POSITIVE: &u8 = &b'+';
 
-// Choosing a higher buffer size does not result in performance improvements
-// (at least not on my machine). TODO: In the future, we should also take the amount of
-// available memory into consideration, instead of relying on this constant only.
-const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB
+// The automatic buffer heuristics clamp to this range to avoid
+// over-committing memory on constrained systems while still keeping
+// reasonably large chunks for typical workloads.
+const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB
+const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB
+const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB
 
 #[derive(Debug, Error)]
 pub enum SortError {
@@ -283,6 +287,7 @@ pub struct GlobalSettings {
     threads: String,
     line_ending: LineEnding,
     buffer_size: usize,
+    buffer_size_is_explicit: bool,
    compress_prog: Option<String>,
     merge_batch_size: usize,
     precomputed: Precomputed,
@@ -359,9 +364,10 @@ impl Default for GlobalSettings {
             separator: None,
             threads: String::new(),
             line_ending: LineEnding::Newline,
-            buffer_size: DEFAULT_BUF_SIZE,
+            buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE,
+            buffer_size_is_explicit: false,
             compress_prog: None,
-            merge_batch_size: 32,
+            merge_batch_size: default_merge_batch_size(),
             precomputed: Precomputed::default(),
         }
     }
@@ -1036,6 +1042,31 @@ fn get_rlimit() -> UResult<usize> {
 }
 
 const STDIN_FILE: &str = "-";
+#[cfg(target_os = "linux")]
+const LINUX_BATCH_DIVISOR: usize = 4;
+#[cfg(target_os = "linux")]
+const LINUX_BATCH_MIN: usize = 32;
+#[cfg(target_os = "linux")]
+const LINUX_BATCH_MAX: usize = 256;
+
+fn default_merge_batch_size() -> usize {
+    #[cfg(target_os = "linux")]
+    {
+        // Adjust merge batch size dynamically based on available file descriptors.
+        match get_rlimit() {
+            Ok(limit) => {
+                let usable_limit = limit.saturating_div(LINUX_BATCH_DIVISOR);
+                usable_limit.clamp(LINUX_BATCH_MIN, LINUX_BATCH_MAX)
+            }
+            Err(_) => 64,
+        }
+    }
+
+    #[cfg(not(target_os = "linux"))]
+    {
+        64
+    }
+}
 
 #[uucore::main]
 #[allow(clippy::cognitive_complexity)]
@@ -1157,14 +1188,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
         }
     }
 
-    settings.buffer_size =
-        matches
-            .get_one::<String>(options::BUF_SIZE)
-            .map_or(Ok(DEFAULT_BUF_SIZE), |s| {
-                GlobalSettings::parse_byte_count(s).map_err(|e| {
-                    USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))
-                })
-            })?;
+    if let Some(size_str) = matches.get_one::<String>(options::BUF_SIZE) {
+        settings.buffer_size = GlobalSettings::parse_byte_count(size_str).map_err(|e| {
+            USimpleError::new(2, format_error_message(&e, size_str, options::BUF_SIZE))
+        })?;
+        settings.buffer_size_is_explicit = true;
+    } else {
+        settings.buffer_size = automatic_buffer_size(&files);
+        settings.buffer_size_is_explicit = false;
+    }
 
     let mut tmp_dir = TmpDirWrapper::new(
         matches