From a0e77d95a0e811a5c566579f2336eaa32daa4ba3 Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:05:00 +0900 Subject: [PATCH 1/8] feat(sort): implement automatic buffer size calculation Replaced fixed 1GB buffer size with dynamic calculation based on input file sizes and available memory. Added heuristics to clamp buffer size within 512 KiB to 128 MiB range, preventing overcommitment on constrained systems while optimizing for typical workloads. Updated dependencies and imports to use libc directly for better portability. --- Cargo.lock | 1 + src/uu/sort/Cargo.toml | 1 + src/uu/sort/src/sort.rs | 137 +++++++++++++++++++++++++++++++++++----- 3 files changed, 124 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c40ccccf981..c2c949d29a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3943,6 +3943,7 @@ dependencies = [ "fluent", "fnv", "itertools 0.14.0", + "libc", "memchr", "nix 0.30.1", "rand 0.9.2", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index a44cc7570ae..aa8e0d650d1 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -36,6 +36,7 @@ thiserror = { workspace = true } unicode-width = { workspace = true } uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } fluent = { workspace = true } +libc = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] nix = { workspace = true } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 1d4948186fe..d0c1b8a5662 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -24,8 +24,10 @@ use clap::{Arg, ArgAction, Command}; use custom_str_cmp::custom_str_cmp; use ext_sort::ext_sort; use fnv::FnvHasher; +#[cfg(target_family = "unix")] +use libc::{_SC_PAGESIZE, _SC_PHYS_PAGES}; #[cfg(target_os = "linux")] -use nix::libc::{RLIMIT_NOFILE, getrlimit, rlimit}; +use libc::{RLIMIT_NOFILE, rlimit}; use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp}; use rand::{Rng, rng}; use rayon::prelude::*; @@ -115,10 +117,12 @@ const DECIMAL_PT: u8 = b'.'; const NEGATIVE: &u8 = &b'-'; const POSITIVE: &u8 = &b'+'; -// Choosing a higher buffer size does not result in performance improvements -// (at least not on my machine). TODO: In the future, we should also take the amount of -// available memory into consideration, instead of relying on this constant only. -const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB +// NOTE: The automatic buffer heuristics clamp to this range to avoid +// overcommitting memory on constrained systems while still keeping +// reasonably large chunks for typical workloads. +const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB +const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 4 * 1024 * 1024; // 4 MiB +const MAX_AUTOMATIC_BUF_SIZE: usize = 128 * 1024 * 1024; // 128 MiB #[derive(Debug, Error)] pub enum SortError { @@ -359,7 +363,7 @@ impl Default for GlobalSettings { separator: None, threads: String::new(), line_ending: LineEnding::Newline, - buffer_size: DEFAULT_BUF_SIZE, + buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE, compress_prog: None, merge_batch_size: 32, precomputed: Precomputed::default(), @@ -1029,7 +1033,7 @@ fn get_rlimit() -> UResult { rlim_cur: 0, rlim_max: 0, }; - match unsafe { getrlimit(RLIMIT_NOFILE, &raw mut limit) } { + match unsafe { libc::getrlimit(RLIMIT_NOFILE, &mut limit) } { 0 => Ok(limit.rlim_cur as usize), _ => Err(UUsageError::new(2, translate!("sort-failed-fetch-rlimit"))), } @@ -1037,6 +1041,110 @@ fn get_rlimit() -> UResult { const STDIN_FILE: &str = "-"; +fn automatic_buffer_size(files: &[OsString]) -> usize { + let file_hint = file_size_hint(files); + let mem_hint = available_memory_hint(); + + match (file_hint, mem_hint) { + (Some(file), Some(mem)) => file.min(mem), + (Some(file), None) => file, + (None, Some(mem)) => mem, + (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE, + } +} + +fn file_size_hint(files: &[OsString]) -> Option { + let mut total_bytes: u128 = 0; + + for file in files { + if file == STDIN_FILE { + continue; + } + + let Ok(metadata) = std::fs::metadata(file) else { + continue; + }; + + if !metadata.is_file() { + continue; + } + + total_bytes = total_bytes.saturating_add(metadata.len() as u128); + + if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 4 { + break; + } + } + + if total_bytes == 0 { + return None; + } + + Some(clamp_hint(total_bytes / 4)) +} + +fn available_memory_hint() -> Option { + #[cfg(target_os = "linux")] + if let Some(bytes) = available_memory_bytes() { + return Some(clamp_hint(bytes / 8)); + } + + physical_memory_bytes().map(|bytes| clamp_hint(bytes / 8)) +} + +fn clamp_hint(bytes: u128) -> usize { + let min = MIN_AUTOMATIC_BUF_SIZE as u128; + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + let clamped = bytes.clamp(min, max); + clamped.min(usize::MAX as u128) as usize +} + +#[cfg(target_os = "linux")] +fn available_memory_bytes() -> Option { + let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?; + let mut mem_available = None; + let mut mem_total = None; + + for line in meminfo.lines() { + if let Some(value) = line.strip_prefix("MemAvailable:") { + mem_available = parse_meminfo_value(value); + } else if let Some(value) = line.strip_prefix("MemTotal:") { + mem_total = parse_meminfo_value(value); + } + + if mem_available.is_some() && mem_total.is_some() { + break; + } + } + + mem_available.or(mem_total) +} + +#[cfg(target_os = "linux")] +fn parse_meminfo_value(value: &str) -> Option { + let amount_kib = value.split_whitespace().next()?.parse::().ok()?; + Some(amount_kib * 1024) +} + +fn physical_memory_bytes() -> Option { + #[cfg(target_family = "unix")] + { + let pages = unsafe { libc::sysconf(_SC_PHYS_PAGES) }; + let page_size = unsafe { libc::sysconf(_SC_PAGESIZE) }; + if pages <= 0 || page_size <= 0 { + return None; + } + let pages = u128::try_from(pages).ok()?; + let page_size = u128::try_from(page_size).ok()?; + Some(pages.saturating_mul(page_size)) + } + + #[cfg(not(target_family = "unix"))] + { + None + } +} + #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { @@ -1157,14 +1265,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } } - settings.buffer_size = - matches - .get_one::(options::BUF_SIZE) - .map_or(Ok(DEFAULT_BUF_SIZE), |s| { - GlobalSettings::parse_byte_count(s).map_err(|e| { - USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE)) - }) - })?; + settings.buffer_size = matches.get_one::(options::BUF_SIZE).map_or_else( + || Ok(automatic_buffer_size(&files)), + |s| { + GlobalSettings::parse_byte_count(s) + .map_err(|e| USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))) + }, + )?; let mut tmp_dir = TmpDirWrapper::new( matches From 59fc5302f840a95404aaf009274e93ddb55bb0b3 Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:18:15 +0900 Subject: [PATCH 2/8] style: update codespell config and refine comment in sort.rs - Add sysconf to codespell ignore list to prevent false positives - Remove "NOTE:" from buffer heuristics comment for cleaner style --- .codespell.rc | 3 +++ src/uu/sort/src/sort.rs | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.codespell.rc b/.codespell.rc index 914ca295128..b612f6ab7c6 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,3 +1,6 @@ [codespell] ignore-words-list = crate skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** + +[default.extend-words] +sysconf = "sysconf" \ No newline at end of file diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index d0c1b8a5662..ef590ec35ab 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -117,8 +117,8 @@ const DECIMAL_PT: u8 = b'.'; const NEGATIVE: &u8 = &b'-'; const POSITIVE: &u8 = &b'+'; -// NOTE: The automatic buffer heuristics clamp to this range to avoid -// overcommitting memory on constrained systems while still keeping +// The automatic buffer heuristics clamp to this range to avoid +// over-committing memory on constrained systems while still keeping // reasonably large chunks for typical workloads. const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 4 * 1024 * 1024; // 4 MiB From c9db85ea3c2aec71720636249b1af5d2a9eb5978 Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:25:04 +0900 Subject: [PATCH 3/8] fix --- .codespell.rc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.codespell.rc b/.codespell.rc index b612f6ab7c6..e20b133a2f0 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,6 +1,4 @@ [codespell] -ignore-words-list = crate +ignore-words-list = crate,sysconf skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** -[default.extend-words] -sysconf = "sysconf" \ No newline at end of file From 5b2950f6da94f7fbc6154394b28af4bcd53ef2df Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:30:31 +0900 Subject: [PATCH 4/8] fix --- .codespell.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespell.rc b/.codespell.rc index e20b133a2f0..5584b58fccc 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,4 +1,4 @@ [codespell] ignore-words-list = crate,sysconf -skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** +skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**,.codespell.rc From e1d92d10b1eb596bf1c1a1d449c739e6fdde196f Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:32:51 +0900 Subject: [PATCH 5/8] fix --- .codespell.rc | 4 ++-- .vscode/cSpell.json | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.codespell.rc b/.codespell.rc index 5584b58fccc..bded6f5f819 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,4 +1,4 @@ [codespell] -ignore-words-list = crate,sysconf -skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**,.codespell.rc +ignore-words-list = crate +skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**, diff --git a/.vscode/cSpell.json b/.vscode/cSpell.json index 5d3e3524b03..27b38090366 100644 --- a/.vscode/cSpell.json +++ b/.vscode/cSpell.json @@ -42,5 +42,7 @@ "ignoreWords": [], // words to always consider correct - "words": [] + "words": [ + "sysconf" + ] } From dc88a11029c344e64e08470f3a86c5d5be398df8 Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 10:58:16 +0900 Subject: [PATCH 6/8] fix --- .codespell.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespell.rc b/.codespell.rc index bded6f5f819..47f36b4a8b0 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,4 +1,4 @@ [codespell] ignore-words-list = crate -skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**, +skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** From e77a20c59524d4290997825811692beaedf26e50 Mon Sep 17 00:00:00 2001 From: mattsu Date: Sat, 4 Oct 2025 11:03:19 +0900 Subject: [PATCH 7/8] fix --- .codespell.rc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.codespell.rc b/.codespell.rc index 47f36b4a8b0..7a811dee3ad 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,4 +1,3 @@ [codespell] ignore-words-list = crate -skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** - +skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** \ No newline at end of file From 0c5ee5ffa0a74c64d43b4edd5357bb376ec1d8d2 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 6 Oct 2025 17:05:56 +0900 Subject: [PATCH 8/8] fix --- .vscode/cSpell.json | 4 +- .../cspell.dictionaries/jargon.wordlist.txt | 1 + src/uu/sort/src/sort.rs | 40 ++++++- src/uu/sort/src/tmp_dir.rs | 110 +++++++++++++----- 4 files changed, 123 insertions(+), 32 deletions(-) diff --git a/.vscode/cSpell.json b/.vscode/cSpell.json index 27b38090366..5d3e3524b03 100644 --- a/.vscode/cSpell.json +++ b/.vscode/cSpell.json @@ -42,7 +42,5 @@ "ignoreWords": [], // words to always consider correct - "words": [ - "sysconf" - ] + "words": [] } diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index d1f36618c87..975e8dbabef 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -130,6 +130,7 @@ symlink symlinks syscall syscalls +sysconf tokenize toolchain truthy diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index ef590ec35ab..bd0f0f05a8a 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -122,7 +122,7 @@ const POSITIVE: &u8 = &b'+'; // reasonably large chunks for typical workloads. const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 4 * 1024 * 1024; // 4 MiB -const MAX_AUTOMATIC_BUF_SIZE: usize = 128 * 1024 * 1024; // 128 MiB +const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB #[derive(Debug, Error)] pub enum SortError { @@ -1071,7 +1071,7 @@ fn file_size_hint(files: &[OsString]) -> Option { total_bytes = total_bytes.saturating_add(metadata.len() as u128); - if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 4 { + if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 { break; } } @@ -1080,7 +1080,8 @@ fn file_size_hint(files: &[OsString]) -> Option { return None; } - Some(clamp_hint(total_bytes / 4)) + let desired_bytes = desired_file_buffer_bytes(total_bytes); + Some(clamp_hint(desired_bytes)) } fn available_memory_hint() -> Option { @@ -1099,6 +1100,22 @@ fn clamp_hint(bytes: u128) -> usize { clamped.min(usize::MAX as u128) as usize } +fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { + if total_bytes == 0 { + return 0; + } + + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + + if total_bytes <= max { + let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); + return expanded; + } + + let quarter = total_bytes / 4; + quarter.max(max) +} + #[cfg(target_os = "linux")] fn available_memory_bytes() -> Option { let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?; @@ -2090,6 +2107,23 @@ mod tests { use super::*; + #[test] + fn desired_buffer_matches_total_when_small() { + let six_mebibytes = 6 * 1024 * 1024; + let expected = ((six_mebibytes as u128) * 12) + .clamp(six_mebibytes as u128, MAX_AUTOMATIC_BUF_SIZE as u128); + assert_eq!(desired_file_buffer_bytes(six_mebibytes as u128), expected); + } + + #[test] + fn desired_buffer_caps_at_max_for_large_inputs() { + let large = 256 * 1024 * 1024; // 256 MiB + assert_eq!( + desired_file_buffer_bytes(large as u128), + MAX_AUTOMATIC_BUF_SIZE as u128 + ); + } + fn tokenize_helper(line: &[u8], separator: Option) -> Vec { let mut buffer = vec![]; tokenize(line, separator, &mut buffer); diff --git a/src/uu/sort/src/tmp_dir.rs b/src/uu/sort/src/tmp_dir.rs index 474e01ae2f3..7d8222d86d2 100644 --- a/src/uu/sort/src/tmp_dir.rs +++ b/src/uu/sort/src/tmp_dir.rs @@ -2,10 +2,11 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use std::sync::atomic::{AtomicBool, Ordering}; use std::{ fs::File, path::{Path, PathBuf}, - sync::{Arc, Mutex}, + sync::{Arc, Mutex, OnceLock}, }; use tempfile::TempDir; @@ -29,6 +30,64 @@ pub struct TmpDirWrapper { lock: Arc>, } +#[derive(Default, Clone)] +struct HandlerRegistration { + lock: Option>>, + path: Option, +} + +fn handler_state() -> Arc> { + static HANDLER_STATE: OnceLock>> = OnceLock::new(); + HANDLER_STATE + .get_or_init(|| Arc::new(Mutex::new(HandlerRegistration::default()))) + .clone() +} + +fn ensure_signal_handler_installed(state: Arc>) -> UResult<()> { + static HANDLER_INSTALLED: AtomicBool = AtomicBool::new(false); + + if HANDLER_INSTALLED + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return Ok(()); + } + + let handler_state = state.clone(); + if let Err(e) = ctrlc::set_handler(move || { + let (lock, path) = { + let state = handler_state.lock().unwrap(); + (state.lock.clone(), state.path.clone()) + }; + + if let Some(lock) = lock { + let _guard = lock.lock().unwrap(); + if let Some(path) = path { + if let Err(e) = remove_tmp_dir(&path) { + show_error!( + "{}", + translate!( + "sort-failed-to-delete-temporary-directory", + "error" => e + ) + ); + } + } + } + + std::process::exit(2) + }) { + HANDLER_INSTALLED.store(false, Ordering::Release); + return Err(USimpleError::new( + 2, + translate!("sort-failed-to-set-up-signal-handler", "error" => e), + ) + .into()); + } + + Ok(()) +} + impl TmpDirWrapper { pub fn new(path: PathBuf) -> Self { Self { @@ -52,31 +111,14 @@ impl TmpDirWrapper { ); let path = self.temp_dir.as_ref().unwrap().path().to_owned(); - let lock = self.lock.clone(); - ctrlc::set_handler(move || { - // Take the lock so that `next_file_path` returns no new file path, - // and the program doesn't terminate before the handler has finished - let _lock = lock.lock().unwrap(); - if let Err(e) = remove_tmp_dir(&path) { - show_error!( - "{}", - translate!( - "sort-failed-to-delete-temporary-directory", - "error" => e - ) - ); - } - std::process::exit(2) - }) - .map_err(|e| { - USimpleError::new( - 2, - translate!( - "sort-failed-to-set-up-signal-handler", - "error" => e - ), - ) - }) + let state = handler_state(); + { + let mut guard = state.lock().unwrap(); + guard.lock = Some(self.lock.clone()); + guard.path = Some(path); + } + + ensure_signal_handler_installed(state) } pub fn next_file(&mut self) -> UResult<(File, PathBuf)> { @@ -100,6 +142,22 @@ impl TmpDirWrapper { } } +impl Drop for TmpDirWrapper { + fn drop(&mut self) { + let state = handler_state(); + let mut guard = state.lock().unwrap(); + + if guard + .lock + .as_ref() + .is_some_and(|current| Arc::ptr_eq(current, &self.lock)) + { + guard.lock = None; + guard.path = None; + } + } +} + /// Remove the directory at `path` by deleting its child files and then itself. /// Errors while deleting child files are ignored. fn remove_tmp_dir(path: &Path) -> std::io::Result<()> {