From 810d0e64bab95a60e3e2359da7f1513a8648f1e0 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 17:01:01 +0900 Subject: [PATCH 01/15] feat(sort): auto-tune buffer sizing from available memory --- Cargo.lock | 1 + src/uu/sort/Cargo.toml | 1 + src/uu/sort/src/chunks.rs | 13 ++-- src/uu/sort/src/ext_sort.rs | 12 ++- src/uu/sort/src/sort.rs | 143 ++++++++++++++++++++++++++++++++---- 5 files changed, 147 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 694da9dcc75..ead64119886 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3986,6 +3986,7 @@ dependencies = [ "fluent", "fnv", "itertools 0.14.0", + "libc", "memchr", "nix 0.30.1", "rand 0.9.2", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index a44cc7570ae..aa8e0d650d1 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -36,6 +36,7 @@ thiserror = { workspace = true } unicode-width = { workspace = true } uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } fluent = { workspace = true } +libc = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] nix = { workspace = true } diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index af10a008844..837cb1fa95c 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -271,11 +271,12 @@ fn read_to_buffer( if max_buffer_size > buffer.len() { // we can grow the buffer let prev_len = buffer.len(); - if buffer.len() < max_buffer_size / 2 { - buffer.resize(buffer.len() * 2, 0); + let target = if buffer.len() < max_buffer_size / 2 { + buffer.len().saturating_mul(2) } else { - buffer.resize(max_buffer_size, 0); - } + max_buffer_size + }; + buffer.resize(target.min(max_buffer_size), 0); read_target = &mut buffer[prev_len..]; continue; } @@ -295,8 +296,8 @@ fn read_to_buffer( // We need to read more lines let len = buffer.len(); - // resize the vector to 10 KB more - buffer.resize(len + 1024 * 10, 0); + let grow_by = (len / 2).max(1024 * 1024); + buffer.resize(len + grow_by, 0); read_target = &mut buffer[len..]; } else { // This file has been fully read. diff --git a/src/uu/sort/src/ext_sort.rs b/src/uu/sort/src/ext_sort.rs index e43ad4b3a38..ddbc278d7e3 100644 --- a/src/uu/sort/src/ext_sort.rs +++ b/src/uu/sort/src/ext_sort.rs @@ -86,9 +86,15 @@ fn reader_writer< ) -> UResult<()> { let separator = settings.line_ending.into(); - // Heuristically chosen: Dividing by 10 seems to keep our memory usage roughly - // around settings.buffer_size as a whole. - let buffer_size = settings.buffer_size / 10; + // Cap oversized buffer requests to avoid unnecessary allocations and give the automatic + // heuristic room to grow when the user does not provide an explicit value. + let mut buffer_size = match settings.buffer_size { + size if size <= 512 * 1024 * 1024 => size, + size => size / 2, + }; + if !settings.buffer_size_is_explicit { + buffer_size = buffer_size.max(8 * 1024 * 1024); + } let read_result: ReadResult = read_write_loop( files, tmp_dir, diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 1d4948186fe..0c7965b42c2 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -115,10 +115,12 @@ const DECIMAL_PT: u8 = b'.'; const NEGATIVE: &u8 = &b'-'; const POSITIVE: &u8 = &b'+'; -// Choosing a higher buffer size does not result in performance improvements -// (at least not on my machine). TODO: In the future, we should also take the amount of -// available memory into consideration, instead of relying on this constant only. -const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB +// The automatic buffer heuristics clamp to this range to avoid +// over-committing memory on constrained systems while still keeping +// reasonably large chunks for typical workloads. +const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB +const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB +const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB #[derive(Debug, Error)] pub enum SortError { @@ -283,6 +285,7 @@ pub struct GlobalSettings { threads: String, line_ending: LineEnding, buffer_size: usize, + buffer_size_is_explicit: bool, compress_prog: Option, merge_batch_size: usize, precomputed: Precomputed, @@ -359,9 +362,10 @@ impl Default for GlobalSettings { separator: None, threads: String::new(), line_ending: LineEnding::Newline, - buffer_size: DEFAULT_BUF_SIZE, + buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE, + buffer_size_is_explicit: false, compress_prog: None, - merge_batch_size: 32, + merge_batch_size: default_merge_batch_size(), precomputed: Precomputed::default(), } } @@ -1037,6 +1041,116 @@ fn get_rlimit() -> UResult { const STDIN_FILE: &str = "-"; +fn default_merge_batch_size() -> usize { + #[cfg(target_os = "linux")] + { + match get_rlimit() { + Ok(limit) => limit.saturating_div(4).clamp(32, 256), + Err(_) => 64, + } + } + + #[cfg(not(target_os = "linux"))] + { + 64 + } +} + +fn automatic_buffer_size(files: &[OsString]) -> usize { + let file_hint = file_size_hint(files); + let mem_hint = available_memory_hint(); + + // Prefer the tighter bound when both hints exist, otherwise fall back to whichever hint is available. + match (file_hint, mem_hint) { + (Some(file), Some(mem)) => file.min(mem), + (Some(file), None) => file, + (None, Some(mem)) => mem, + (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE, + } +} + +fn file_size_hint(files: &[OsString]) -> Option { + let mut total_bytes: u128 = 0; + + for file in files { + if file == STDIN_FILE { + continue; + } + + let Ok(metadata) = std::fs::metadata(file) else { + continue; + }; + + if !metadata.is_file() { + continue; + } + + total_bytes = total_bytes.saturating_add(metadata.len() as u128); + + if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 { + break; + } + } + + if total_bytes == 0 { + return None; + } + + let desired_bytes = desired_file_buffer_bytes(total_bytes); + Some(clamp_hint(desired_bytes)) +} + +fn available_memory_hint() -> Option { + #[cfg(target_os = "linux")] + if let Some(bytes) = uucore::parser::parse_size::available_memory_bytes() { + return Some(clamp_hint(bytes / 4)); + } + + physical_memory_bytes().map(|bytes| clamp_hint(bytes / 4)) +} + +fn clamp_hint(bytes: u128) -> usize { + let min = MIN_AUTOMATIC_BUF_SIZE as u128; + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + let clamped = bytes.clamp(min, max); + clamped.min(usize::MAX as u128) as usize +} + +fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { + if total_bytes == 0 { + return 0; + } + + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + + if total_bytes <= max { + let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); + return expanded; + } + + let quarter = total_bytes / 4; + quarter.max(max) +} + +fn physical_memory_bytes() -> Option { + #[cfg(all(target_family = "unix", not(target_os = "redox")))] + { + let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; + let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + if pages <= 0 || page_size <= 0 { + return None; + } + let pages = u128::try_from(pages).ok()?; + let page_size = u128::try_from(page_size).ok()?; + Some(pages.saturating_mul(page_size)) + } + + #[cfg(any(not(target_family = "unix"), target_os = "redox"))] + { + None + } +} + #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { @@ -1157,14 +1271,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } } - settings.buffer_size = - matches - .get_one::(options::BUF_SIZE) - .map_or(Ok(DEFAULT_BUF_SIZE), |s| { - GlobalSettings::parse_byte_count(s).map_err(|e| { - USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE)) - }) - })?; + if let Some(size_str) = matches.get_one::(options::BUF_SIZE) { + settings.buffer_size = GlobalSettings::parse_byte_count(size_str).map_err(|e| { + USimpleError::new(2, format_error_message(&e, size_str, options::BUF_SIZE)) + })?; + settings.buffer_size_is_explicit = true; + } else { + settings.buffer_size = automatic_buffer_size(&files); + settings.buffer_size_is_explicit = false; + } let mut tmp_dir = TmpDirWrapper::new( matches From 7160f59e89bc6a4a5abeb96ca4a6824c5f108b60 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 18:23:34 +0900 Subject: [PATCH 02/15] docs: add 'sysconf' to jargon wordlist Added 'sysconf' to the CSpell jargon dictionary to include the system configuration function term, preventing false positives in spell-checking for technical code. --- .vscode/cspell.dictionaries/jargon.wordlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index ef8a2026fc1..66216702150 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -131,6 +131,7 @@ symlink symlinks syscall syscalls +sysconf tokenize toolchain truthy From e694ae61a3fc8023870d4a4f35d7d75cdf39996f Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:14:48 +0900 Subject: [PATCH 03/15] refactor(sort): extract buffer hint logic to separate module Moved buffer size calculation functions (e.g., automatic_buffer_size, file_size_hint) to a new buffer_hint module for better code organization and modularity. Removed the Linux-specific nix dependency as the memory hint functionality is now handled internally without external crates. --- src/uu/sort/Cargo.toml | 2 - src/uu/sort/src/buffer_hint.rs | 138 +++++++++++++++++++++++++++++++++ src/uu/sort/src/sort.rs | 97 +---------------------- 3 files changed, 140 insertions(+), 97 deletions(-) create mode 100644 src/uu/sort/src/buffer_hint.rs diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index aa8e0d650d1..2f76635096a 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -37,8 +37,6 @@ unicode-width = { workspace = true } uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } fluent = { workspace = true } libc = { workspace = true } - -[target.'cfg(target_os = "linux")'.dependencies] nix = { workspace = true } [dev-dependencies] diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs new file mode 100644 index 00000000000..ecff7cc8674 --- /dev/null +++ b/src/uu/sort/src/buffer_hint.rs @@ -0,0 +1,138 @@ +use std::ffi::OsString; + +use crate::{ + FALLBACK_AUTOMATIC_BUF_SIZE, MAX_AUTOMATIC_BUF_SIZE, MIN_AUTOMATIC_BUF_SIZE, STDIN_FILE, +}; + +// Heuristics to size the external sort buffer without overcommitting memory. +pub(crate) fn automatic_buffer_size(files: &[OsString]) -> usize { + let file_hint = file_size_hint(files); + let mem_hint = available_memory_hint(); + + // Prefer the tighter bound when both hints exist, otherwise fall back to whichever hint is available. + match (file_hint, mem_hint) { + (Some(file), Some(mem)) => file.min(mem), + (Some(file), None) => file, + (None, Some(mem)) => mem, + (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE, + } +} + +fn file_size_hint(files: &[OsString]) -> Option { + // Estimate total bytes across real files; non-regular inputs are skipped. + let mut total_bytes: u128 = 0; + + for file in files { + if file == STDIN_FILE { + continue; + } + + let Ok(metadata) = std::fs::metadata(file) else { + continue; + }; + + if !metadata.is_file() { + continue; + } + + total_bytes = total_bytes.saturating_add(metadata.len() as u128); + + if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 { + break; + } + } + + if total_bytes == 0 { + return None; + } + + let desired_bytes = desired_file_buffer_bytes(total_bytes); + Some(clamp_hint(desired_bytes)) +} + +fn available_memory_hint() -> Option { + #[cfg(target_os = "linux")] + if let Some(bytes) = uucore::parser::parse_size::available_memory_bytes() { + return Some(clamp_hint(bytes / 4)); + } + + physical_memory_bytes().map(|bytes| clamp_hint(bytes / 4)) +} + +fn clamp_hint(bytes: u128) -> usize { + let min = MIN_AUTOMATIC_BUF_SIZE as u128; + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + let clamped = bytes.clamp(min, max); + clamped.min(usize::MAX as u128) as usize +} + +fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { + if total_bytes == 0 { + return 0; + } + + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + + if total_bytes <= max { + let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); + return expanded; + } + + let quarter = total_bytes / 4; + quarter.max(max) +} + +fn physical_memory_bytes() -> Option { + #[cfg(all(target_family = "unix", not(target_os = "redox")))] + { + return physical_memory_bytes_unix(); + } + + #[cfg(any(not(target_family = "unix"), target_os = "redox"))] + { + None + } +} + +#[cfg(all(target_family = "unix", not(target_os = "redox"), any(target_os = "linux", target_os = "android")))] +fn physical_memory_bytes_unix() -> Option { + use nix::unistd::{sysconf, SysconfVar}; + + let pages = sysconf(SysconfVar::_PHYS_PAGES).ok().flatten()?.try_into().ok()?; + let page_size = sysconf(SysconfVar::PAGE_SIZE).ok().flatten()?.try_into().ok()?; + Some(pages.saturating_mul(page_size)) +} + +#[cfg(all( + target_family = "unix", + not(target_os = "redox"), + not(any(target_os = "linux", target_os = "android")) +))] +fn physical_memory_bytes_unix() -> Option { + // SAFETY: `nix` does not expose `_PHYS_PAGES` outside Linux/Android, so we fallback to libc. + let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; + let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; + (pages > 0 && page_size > 0).then(|| (pages as u128).saturating_mul(page_size as u128)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn desired_buffer_matches_total_when_small() { + let six_mebibytes = 6 * 1024 * 1024; + let expected = ((six_mebibytes as u128) * 12) + .clamp(six_mebibytes as u128, crate::MAX_AUTOMATIC_BUF_SIZE as u128); + assert_eq!(desired_file_buffer_bytes(six_mebibytes as u128), expected); + } + + #[test] + fn desired_buffer_caps_at_max_for_large_inputs() { + let large = 256 * 1024 * 1024; // 256 MiB + assert_eq!( + desired_file_buffer_bytes(large as u128), + crate::MAX_AUTOMATIC_BUF_SIZE as u128 + ); + } +} diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 0c7965b42c2..912008c5185 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -9,6 +9,7 @@ // spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit +mod buffer_hint; mod check; mod chunks; mod custom_str_cmp; @@ -54,6 +55,7 @@ use uucore::show_error; use uucore::translate; use uucore::version_cmp::version_cmp; +use crate::buffer_hint::automatic_buffer_size; use crate::tmp_dir::TmpDirWrapper; mod options { @@ -1056,101 +1058,6 @@ fn default_merge_batch_size() -> usize { } } -fn automatic_buffer_size(files: &[OsString]) -> usize { - let file_hint = file_size_hint(files); - let mem_hint = available_memory_hint(); - - // Prefer the tighter bound when both hints exist, otherwise fall back to whichever hint is available. - match (file_hint, mem_hint) { - (Some(file), Some(mem)) => file.min(mem), - (Some(file), None) => file, - (None, Some(mem)) => mem, - (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE, - } -} - -fn file_size_hint(files: &[OsString]) -> Option { - let mut total_bytes: u128 = 0; - - for file in files { - if file == STDIN_FILE { - continue; - } - - let Ok(metadata) = std::fs::metadata(file) else { - continue; - }; - - if !metadata.is_file() { - continue; - } - - total_bytes = total_bytes.saturating_add(metadata.len() as u128); - - if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 { - break; - } - } - - if total_bytes == 0 { - return None; - } - - let desired_bytes = desired_file_buffer_bytes(total_bytes); - Some(clamp_hint(desired_bytes)) -} - -fn available_memory_hint() -> Option { - #[cfg(target_os = "linux")] - if let Some(bytes) = uucore::parser::parse_size::available_memory_bytes() { - return Some(clamp_hint(bytes / 4)); - } - - physical_memory_bytes().map(|bytes| clamp_hint(bytes / 4)) -} - -fn clamp_hint(bytes: u128) -> usize { - let min = MIN_AUTOMATIC_BUF_SIZE as u128; - let max = MAX_AUTOMATIC_BUF_SIZE as u128; - let clamped = bytes.clamp(min, max); - clamped.min(usize::MAX as u128) as usize -} - -fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { - if total_bytes == 0 { - return 0; - } - - let max = MAX_AUTOMATIC_BUF_SIZE as u128; - - if total_bytes <= max { - let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); - return expanded; - } - - let quarter = total_bytes / 4; - quarter.max(max) -} - -fn physical_memory_bytes() -> Option { - #[cfg(all(target_family = "unix", not(target_os = "redox")))] - { - let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; - if pages <= 0 || page_size <= 0 { - return None; - } - let pages = u128::try_from(pages).ok()?; - let page_size = u128::try_from(page_size).ok()?; - Some(pages.saturating_mul(page_size)) - } - - #[cfg(any(not(target_family = "unix"), target_os = "redox"))] - { - None - } -} - #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { From 0ebf12be338459e26517c9f41e9a4bf8d360a73b Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:24:50 +0900 Subject: [PATCH 04/15] refactor(sort): Explicitly cast to u128 in physical_memory_bytes_unix Updated the `physical_memory_bytes_unix` function to explicitly cast `pages` and `page_size` to `u128` before multiplication, ensuring safe arithmetic and preventing potential overflow. Also added "libc" dependency to `fuzz/Cargo.lock` to support the changes. --- fuzz/Cargo.lock | 1 + src/uu/sort/src/buffer_hint.rs | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index d7477fde6d1..13e5cd1bbd3 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -1644,6 +1644,7 @@ dependencies = [ "fluent", "fnv", "itertools", + "libc", "memchr", "nix", "rand", diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index ecff7cc8674..ede232ca5ac 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -98,8 +98,16 @@ fn physical_memory_bytes() -> Option { fn physical_memory_bytes_unix() -> Option { use nix::unistd::{sysconf, SysconfVar}; - let pages = sysconf(SysconfVar::_PHYS_PAGES).ok().flatten()?.try_into().ok()?; - let page_size = sysconf(SysconfVar::PAGE_SIZE).ok().flatten()?.try_into().ok()?; + let pages: u128 = sysconf(SysconfVar::_PHYS_PAGES) + .ok() + .flatten()? + .try_into() + .ok()?; + let page_size: u128 = sysconf(SysconfVar::PAGE_SIZE) + .ok() + .flatten()? + .try_into() + .ok()?; Some(pages.saturating_mul(page_size)) } From e4d46f42cc75e58ed748a5be9b342dd670468ed9 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:28:22 +0900 Subject: [PATCH 05/15] refactor(sort): improve readability of cfg attribute for physical_memory_bytes_unix Reformatted the multi-line `#[cfg(...)]` attribute and reordered imports in `physical_memory_bytes_unix()` for better code clarity and consistency. --- src/uu/sort/src/buffer_hint.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index ede232ca5ac..2af8face79d 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -94,9 +94,14 @@ fn physical_memory_bytes() -> Option { } } -#[cfg(all(target_family = "unix", not(target_os = "redox"), any(target_os = "linux", target_os = "android")))] +#[cfg(all( + target_family = "unix", + not(target_os = "redox"), + any(target_os = "linux", target_os = "android") +))] fn physical_memory_bytes_unix() -> Option { - use nix::unistd::{sysconf, SysconfVar}; + use nix::unistd::{SysconfVar, sysconf}; + let pages: u128 = sysconf(SysconfVar::_PHYS_PAGES) .ok() From 280e12744b87ca6b16f7de6ffa05e634c5f2384f Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:29:20 +0900 Subject: [PATCH 06/15] style(buffer_hint): remove unnecessary blank line in physical_memory_bytes_unix Minor code style cleanup to improve readability and consistency in the buffer hint module. --- src/uu/sort/src/buffer_hint.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 2af8face79d..0d73b0d7b7b 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -102,7 +102,6 @@ fn physical_memory_bytes() -> Option { fn physical_memory_bytes_unix() -> Option { use nix::unistd::{SysconfVar, sysconf}; - let pages: u128 = sysconf(SysconfVar::_PHYS_PAGES) .ok() .flatten()? From ddb36bc758576f590c2b848dd035eb2f4c5d491f Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:32:33 +0900 Subject: [PATCH 07/15] refactor(sort): remove unnecessary return statement in physical_memory_bytes The explicit return is redundant in Rust, as the last expression in a block is implicitly returned, improving code style and adherence to idiomatic practices. --- src/uu/sort/src/buffer_hint.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 0d73b0d7b7b..bdd409c0fca 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -85,7 +85,7 @@ fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { fn physical_memory_bytes() -> Option { #[cfg(all(target_family = "unix", not(target_os = "redox")))] { - return physical_memory_bytes_unix(); + physical_memory_bytes_unix() } #[cfg(any(not(target_family = "unix"), target_os = "redox"))] From bfa172e91a75f3036d2870977175b0474aafb9fa Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 19:36:15 +0900 Subject: [PATCH 08/15] fix: correct typo in buffer_hint.rs comment Corrected "overcommitting" to "overcommit" in the comment for accurate spelling. --- src/uu/sort/src/buffer_hint.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index bdd409c0fca..6006d7aebfd 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -4,7 +4,7 @@ use crate::{ FALLBACK_AUTOMATIC_BUF_SIZE, MAX_AUTOMATIC_BUF_SIZE, MIN_AUTOMATIC_BUF_SIZE, STDIN_FILE, }; -// Heuristics to size the external sort buffer without overcommitting memory. +// Heuristics to size the external sort buffer without overcommit memory. pub(crate) fn automatic_buffer_size(files: &[OsString]) -> usize { let file_hint = file_size_hint(files); let mem_hint = available_memory_hint(); From 92a4574dd09ac244cea70405cddb7852473dcb0f Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 21 Oct 2025 16:20:29 +0900 Subject: [PATCH 09/15] docs: add license header to buffer_hint.rs Added standard copyright and license header to the buffer_hint.rs file in the sort utility to comply with project licensing requirements and ensure proper attribution. --- src/uu/sort/src/buffer_hint.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 6006d7aebfd..140921e50c9 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -1,3 +1,9 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +//! Heuristics for determining buffer size for external sorting. use std::ffi::OsString; use crate::{ From f8de88e3922cfaceec79403ab4a3948cff45ee7f Mon Sep 17 00:00:00 2001 From: mattsu <35655889+mattsu2020@users.noreply.github.com> Date: Tue, 21 Oct 2025 20:25:06 +0900 Subject: [PATCH 10/15] Update src/uu/sort/src/buffer_hint.rs Removal of unnecessary variables Co-authored-by: Sylvestre Ledru --- src/uu/sort/src/buffer_hint.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 140921e50c9..d6d04e22044 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -80,8 +80,7 @@ fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { let max = MAX_AUTOMATIC_BUF_SIZE as u128; if total_bytes <= max { - let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); - return expanded; + return total_bytes.saturating_mul(12).clamp(total_bytes, max); } let quarter = total_bytes / 4; From 5725d06c24634851156354b0872de87ac8837203 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 21 Oct 2025 20:28:03 +0900 Subject: [PATCH 11/15] docs(sort): add comment explaining memory detection limitation Added a comment in the `physical_memory_bytes` function for non-Unix or Redox targets to clarify why `None` is returned, improving code readability and documenting the absence of a portable API for detecting total physical memory. --- src/uu/sort/src/buffer_hint.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index d6d04e22044..2826d8e8bd8 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -95,6 +95,7 @@ fn physical_memory_bytes() -> Option { #[cfg(any(not(target_family = "unix"), target_os = "redox"))] { + // No portable or safe API is available here to detect total physical memory. None } } From f941f1c4b255dfca6c8321de557bc92be79f76eb Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 21 Oct 2025 20:54:05 +0900 Subject: [PATCH 12/15] refactor(sort): enhance physical memory detection for Unix systems Updated cfg conditions in `physical_memory_bytes` to explicitly handle Linux and Android targets. Refined `physical_memory_bytes_unix` with improved error handling for sysconf calls, ensuring safer memory page and size calculations while removing fallback libc implementation for non-Linux/Android Unix systems. --- src/uu/sort/src/buffer_hint.rs | 48 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 2826d8e8bd8..43d54831d53 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -88,12 +88,20 @@ fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { } fn physical_memory_bytes() -> Option { - #[cfg(all(target_family = "unix", not(target_os = "redox")))] + #[cfg(all( + target_family = "unix", + not(target_os = "redox"), + any(target_os = "linux", target_os = "android") + ))] { physical_memory_bytes_unix() } - #[cfg(any(not(target_family = "unix"), target_os = "redox"))] + #[cfg(any( + not(target_family = "unix"), + target_os = "redox", + not(any(target_os = "linux", target_os = "android")) + ))] { // No portable or safe API is available here to detect total physical memory. None @@ -106,31 +114,19 @@ fn physical_memory_bytes() -> Option { any(target_os = "linux", target_os = "android") ))] fn physical_memory_bytes_unix() -> Option { - use nix::unistd::{SysconfVar, sysconf}; - - let pages: u128 = sysconf(SysconfVar::_PHYS_PAGES) - .ok() - .flatten()? - .try_into() - .ok()?; - let page_size: u128 = sysconf(SysconfVar::PAGE_SIZE) - .ok() - .flatten()? - .try_into() - .ok()?; - Some(pages.saturating_mul(page_size)) -} + use nix::unistd::{sysconf, SysconfVar}; -#[cfg(all( - target_family = "unix", - not(target_os = "redox"), - not(any(target_os = "linux", target_os = "android")) -))] -fn physical_memory_bytes_unix() -> Option { - // SAFETY: `nix` does not expose `_PHYS_PAGES` outside Linux/Android, so we fallback to libc. - let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; - let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }; - (pages > 0 && page_size > 0).then(|| (pages as u128).saturating_mul(page_size as u128)) + let pages = match sysconf(SysconfVar::_PHYS_PAGES) { + Ok(Some(pages)) if pages > 0 => u128::try_from(pages).ok()?, + _ => return None, + }; + + let page_size = match sysconf(SysconfVar::PAGE_SIZE) { + Ok(Some(page_size)) if page_size > 0 => u128::try_from(page_size).ok()?, + _ => return None, + }; + + Some(pages.saturating_mul(page_size)) } #[cfg(test)] From 5586e8a1c75326a77b955c557edf0c81ceaf15c1 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 21 Oct 2025 21:15:19 +0900 Subject: [PATCH 13/15] refactor(uu/sort): remove libc dependency and use named constants for batch size Removed the libc crate dependency from Cargo.toml and Cargo.lock to reduce unnecessary dependencies. Refactored default_merge_batch_size() in sort.rs to use named constants (LINUX_BATCH_DIVISOR, LINUX_BATCH_MIN, LINUX_BATCH_MAX) for better code readability and maintainability, while preserving the same dynamic batch size calculation logic based on file descriptor limits. --- Cargo.lock | 1 - src/uu/sort/Cargo.toml | 1 - src/uu/sort/src/sort.rs | 12 +++++++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cc6db41435..4b150520ebf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3967,7 +3967,6 @@ dependencies = [ "fluent", "fnv", "itertools 0.14.0", - "libc", "memchr", "nix", "rand 0.9.2", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 2f76635096a..c1b4c07084c 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -36,7 +36,6 @@ thiserror = { workspace = true } unicode-width = { workspace = true } uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } fluent = { workspace = true } -libc = { workspace = true } nix = { workspace = true } [dev-dependencies] diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 912008c5185..c9d1bac97aa 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -1042,12 +1042,22 @@ fn get_rlimit() -> UResult { } const STDIN_FILE: &str = "-"; +#[cfg(target_os = "linux")] +const LINUX_BATCH_DIVISOR: usize = 4; +#[cfg(target_os = "linux")] +const LINUX_BATCH_MIN: usize = 32; +#[cfg(target_os = "linux")] +const LINUX_BATCH_MAX: usize = 256; fn default_merge_batch_size() -> usize { #[cfg(target_os = "linux")] { + // Adjust merge batch size dynamically based on available file descriptors. match get_rlimit() { - Ok(limit) => limit.saturating_div(4).clamp(32, 256), + Ok(limit) => { + let usable_limit = limit.saturating_div(LINUX_BATCH_DIVISOR); + usable_limit.clamp(LINUX_BATCH_MIN, LINUX_BATCH_MAX) + } Err(_) => 64, } } From c7298c9346d41bc18a1296a46c4744df232ebdec Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 21 Oct 2025 21:17:23 +0900 Subject: [PATCH 14/15] refactor(sort): reorder imports in buffer_hint.rs for consistency Reordered the imports in the `use nix::unistd` statement to place `SysconfVar` before `sysconf`, improving code readability and adhering to alphabetical import ordering conventions. --- src/uu/sort/src/buffer_hint.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sort/src/buffer_hint.rs b/src/uu/sort/src/buffer_hint.rs index 43d54831d53..bb0ea754094 100644 --- a/src/uu/sort/src/buffer_hint.rs +++ b/src/uu/sort/src/buffer_hint.rs @@ -114,7 +114,7 @@ fn physical_memory_bytes() -> Option { any(target_os = "linux", target_os = "android") ))] fn physical_memory_bytes_unix() -> Option { - use nix::unistd::{sysconf, SysconfVar}; + use nix::unistd::{SysconfVar, sysconf}; let pages = match sysconf(SysconfVar::_PHYS_PAGES) { Ok(Some(pages)) if pages > 0 => u128::try_from(pages).ok()?, From 7fd534e6c94ce703d8322d666d803fbd67f93685 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 20 Oct 2025 10:33:57 +0000 Subject: [PATCH 15/15] fix Cargo.lock linux enviroments --- fuzz/Cargo.lock | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 13e5cd1bbd3..e638c34bee4 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -231,18 +231,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.49" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.49" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstream", "anstyle", @@ -1513,9 +1513,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "unicode-width" @@ -1644,7 +1644,6 @@ dependencies = [ "fluent", "fnv", "itertools", - "libc", "memchr", "nix", "rand",