Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
810d0e6
feat(sort): auto-tune buffer sizing from available memory
mattsu2020 Oct 20, 2025
e197e64
Merge branch 'uutils:main' into sort-memory-functions
mattsu2020 Oct 20, 2025
7160f59
docs: add 'sysconf' to jargon wordlist
mattsu2020 Oct 20, 2025
e694ae6
refactor(sort): extract buffer hint logic to separate module
mattsu2020 Oct 20, 2025
0ebf12b
refactor(sort): Explicitly cast to u128 in physical_memory_bytes_unix
mattsu2020 Oct 20, 2025
e4d46f4
refactor(sort): improve readability of cfg attribute for physical_mem…
mattsu2020 Oct 20, 2025
280e127
style(buffer_hint): remove unnecessary blank line in physical_memory_…
mattsu2020 Oct 20, 2025
ddb36bc
refactor(sort): remove unnecessary return statement in physical_memor…
mattsu2020 Oct 20, 2025
bfa172e
fix: correct typo in buffer_hint.rs comment
mattsu2020 Oct 20, 2025
d273a69
Merge branch 'uutils:main' into sort-memory-functions
mattsu2020 Oct 21, 2025
92a4574
docs: add license header to buffer_hint.rs
mattsu2020 Oct 21, 2025
f8de88e
Update src/uu/sort/src/buffer_hint.rs
mattsu2020 Oct 21, 2025
5725d06
docs(sort): add comment explaining memory detection limitation
mattsu2020 Oct 21, 2025
f941f1c
refactor(sort): enhance physical memory detection for Unix systems
mattsu2020 Oct 21, 2025
5586e8a
refactor(uu/sort): remove libc dependency and use named constants for…
mattsu2020 Oct 21, 2025
c7298c9
refactor(sort): reorder imports in buffer_hint.rs for consistency
mattsu2020 Oct 21, 2025
7fd534e
fix Cargo.lock linux enviroments
mattsu2020 Oct 20, 2025
08a9548
Merge branch 'uutils:main' into sort-memory-functions
mattsu2020 Oct 22, 2025
19fd282
Merge branch 'uutils:main' into sort-memory-functions
mattsu2020 Oct 22, 2025
72201b2
Merge branch 'uutils:main' into sort-memory-functions
mattsu2020 Oct 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/cspell.dictionaries/jargon.wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ symlink
symlinks
syscall
syscalls
sysconf
tokenize
toolchain
truthy
Expand Down
12 changes: 6 additions & 6 deletions fuzz/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ thiserror = { workspace = true }
unicode-width = { workspace = true }
uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] }
fluent = { workspace = true }

[target.'cfg(target_os = "linux")'.dependencies]
nix = { workspace = true }

[dev-dependencies]
Expand Down
152 changes: 152 additions & 0 deletions src/uu/sort/src/buffer_hint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

//! Heuristics for determining buffer size for external sorting.
use std::ffi::OsString;

use crate::{
FALLBACK_AUTOMATIC_BUF_SIZE, MAX_AUTOMATIC_BUF_SIZE, MIN_AUTOMATIC_BUF_SIZE, STDIN_FILE,
};

/// Chooses the external-sort buffer size when no explicit size was requested,
/// without over-committing memory.
///
/// Two independent estimates are consulted: one derived from the sizes of the
/// input `files` and one derived from the memory the system reports. When both
/// are known the smaller one wins; when only one is known it is used as is;
/// when neither is known `FALLBACK_AUTOMATIC_BUF_SIZE` is returned.
pub(crate) fn automatic_buffer_size(files: &[OsString]) -> usize {
    let from_files = file_size_hint(files);
    let from_memory = available_memory_hint();

    match from_files {
        // Both hints present: take the tighter bound. Only the file hint: use it.
        Some(file) => from_memory.map_or(file, |mem| file.min(mem)),
        // No file hint: fall back to the memory hint, then to the constant.
        None => from_memory.unwrap_or(FALLBACK_AUTOMATIC_BUF_SIZE),
    }
}

/// Estimates a buffer size from the combined length of the regular files in `files`.
///
/// The stdin placeholder, paths whose metadata cannot be read, and anything
/// that is not a regular file contribute nothing. Returns `None` when the
/// counted total is zero, i.e. when the inputs carry no usable size information.
fn file_size_hint(files: &[OsString]) -> Option<usize> {
    // Summation stops once the running total reaches eight times the maximum
    // automatic buffer size.
    let saturation_point = (MAX_AUTOMATIC_BUF_SIZE as u128) * 8;

    // The chain is lazy: metadata is only queried for entries the loop reaches.
    let regular_file_sizes = files
        .iter()
        .filter(|path| *path != STDIN_FILE)
        .filter_map(|path| std::fs::metadata(path).ok())
        .filter(|metadata| metadata.is_file())
        .map(|metadata| u128::from(metadata.len()));

    let mut total_bytes: u128 = 0;
    for size in regular_file_sizes {
        total_bytes = total_bytes.saturating_add(size);
        if total_bytes >= saturation_point {
            break;
        }
    }

    (total_bytes != 0).then(|| clamp_hint(desired_file_buffer_bytes(total_bytes)))
}

fn available_memory_hint() -> Option<usize> {
#[cfg(target_os = "linux")]
if let Some(bytes) = uucore::parser::parse_size::available_memory_bytes() {
return Some(clamp_hint(bytes / 4));
}

physical_memory_bytes().map(|bytes| clamp_hint(bytes / 4))
}

fn clamp_hint(bytes: u128) -> usize {
let min = MIN_AUTOMATIC_BUF_SIZE as u128;
let max = MAX_AUTOMATIC_BUF_SIZE as u128;
let clamped = bytes.clamp(min, max);
clamped.min(usize::MAX as u128) as usize
}

/// Computes the buffer size, in bytes, that inputs totalling `total_bytes` ask for.
///
/// * `0` maps to `0`.
/// * Totals up to `MAX_AUTOMATIC_BUF_SIZE` ask for twelve times their size,
///   capped at `MAX_AUTOMATIC_BUF_SIZE`.
/// * Larger totals ask for a quarter of their size, but never less than
///   `MAX_AUTOMATIC_BUF_SIZE`.
///
/// The result can exceed `MAX_AUTOMATIC_BUF_SIZE`; no final clamping happens here.
fn desired_file_buffer_bytes(total_bytes: u128) -> u128 {
    let cap = MAX_AUTOMATIC_BUF_SIZE as u128;

    match total_bytes {
        0 => 0,
        small if small <= cap => small.saturating_mul(12).clamp(small, cap),
        large => (large / 4).max(cap),
    }
}

/// Reports the total physical memory in bytes, when this target supports detecting it.
///
/// Delegates to `physical_memory_bytes_unix` on the targets that function is
/// compiled for and returns `None` everywhere else.
fn physical_memory_bytes() -> Option<u128> {
    #[cfg(all(
        target_family = "unix",
        not(target_os = "redox"),
        any(target_os = "linux", target_os = "android")
    ))]
    {
        physical_memory_bytes_unix()
    }

    // Exact complement of the predicate above, so exactly one block is compiled.
    #[cfg(not(all(
        target_family = "unix",
        not(target_os = "redox"),
        any(target_os = "linux", target_os = "android")
    )))]
    {
        // No portable or safe API is available here to detect total physical memory.
        None
    }
}

/// Computes total physical memory as `_PHYS_PAGES * PAGE_SIZE` using `sysconf`.
///
/// Returns `None` if either query fails, reports no value, or reports a value
/// that is not strictly positive. The multiplication saturates instead of
/// overflowing.
#[cfg(all(
    target_family = "unix",
    not(target_os = "redox"),
    any(target_os = "linux", target_os = "android")
))]
fn physical_memory_bytes_unix() -> Option<u128> {
    use nix::unistd::{SysconfVar, sysconf};

    // Reads one `sysconf` variable and keeps it only when it is a positive number.
    let positive = |var: SysconfVar| -> Option<u128> {
        sysconf(var)
            .ok()
            .flatten()
            .filter(|value| *value > 0)
            .and_then(|value| u128::try_from(value).ok())
    };

    let pages = positive(SysconfVar::_PHYS_PAGES)?;
    let page_size = positive(SysconfVar::PAGE_SIZE)?;

    Some(pages.saturating_mul(page_size))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Upper bound of the automatic range, widened for comparisons.
    const MAX: u128 = crate::MAX_AUTOMATIC_BUF_SIZE as u128;
    const MIB: u128 = 1024 * 1024;

    #[test]
    fn desired_buffer_is_zero_for_empty_input() {
        assert_eq!(desired_file_buffer_bytes(0), 0);
    }

    // Small inputs ask for twelve times their size (not their size itself),
    // limited to the automatic maximum.
    #[test]
    fn desired_buffer_scales_small_inputs_twelvefold() {
        let six_mebibytes = 6 * MIB;
        let expected = (six_mebibytes * 12).clamp(six_mebibytes, MAX);
        assert_eq!(desired_file_buffer_bytes(six_mebibytes), expected);
    }

    #[test]
    fn desired_buffer_caps_at_max_for_large_inputs() {
        let large = 256 * MIB;
        assert_eq!(desired_file_buffer_bytes(large), MAX);
    }

    // Inputs above the maximum ask for a quarter of their size, but never
    // less than the maximum.
    #[test]
    fn desired_buffer_uses_quarter_beyond_max() {
        assert_eq!(desired_file_buffer_bytes(MAX + 1), MAX);
        assert_eq!(desired_file_buffer_bytes(MAX * 8), MAX * 2);
    }
}
13 changes: 7 additions & 6 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,11 +271,12 @@ fn read_to_buffer<T: Read>(
if max_buffer_size > buffer.len() {
// we can grow the buffer
let prev_len = buffer.len();
if buffer.len() < max_buffer_size / 2 {
buffer.resize(buffer.len() * 2, 0);
let target = if buffer.len() < max_buffer_size / 2 {
buffer.len().saturating_mul(2)
} else {
buffer.resize(max_buffer_size, 0);
}
max_buffer_size
};
buffer.resize(target.min(max_buffer_size), 0);
read_target = &mut buffer[prev_len..];
continue;
}
Expand All @@ -295,8 +296,8 @@ fn read_to_buffer<T: Read>(

// We need to read more lines
let len = buffer.len();
// resize the vector to 10 KB more
buffer.resize(len + 1024 * 10, 0);
let grow_by = (len / 2).max(1024 * 1024);
buffer.resize(len + grow_by, 0);
read_target = &mut buffer[len..];
} else {
// This file has been fully read.
Expand Down
12 changes: 9 additions & 3 deletions src/uu/sort/src/ext_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,15 @@ fn reader_writer<
) -> UResult<()> {
let separator = settings.line_ending.into();

// Heuristically chosen: Dividing by 10 seems to keep our memory usage roughly
// around settings.buffer_size as a whole.
let buffer_size = settings.buffer_size / 10;
// Cap oversized buffer requests to avoid unnecessary allocations and give the automatic
// heuristic room to grow when the user does not provide an explicit value.
let mut buffer_size = match settings.buffer_size {
size if size <= 512 * 1024 * 1024 => size,
size => size / 2,
};
if !settings.buffer_size_is_explicit {
buffer_size = buffer_size.max(8 * 1024 * 1024);
}
let read_result: ReadResult<Tmp> = read_write_loop(
files,
tmp_dir,
Expand Down
60 changes: 46 additions & 14 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit

mod buffer_hint;
mod check;
mod chunks;
mod custom_str_cmp;
Expand Down Expand Up @@ -54,6 +55,7 @@ use uucore::show_error;
use uucore::translate;
use uucore::version_cmp::version_cmp;

use crate::buffer_hint::automatic_buffer_size;
use crate::tmp_dir::TmpDirWrapper;

mod options {
Expand Down Expand Up @@ -115,10 +117,12 @@ const DECIMAL_PT: u8 = b'.';
const NEGATIVE: &u8 = &b'-';
const POSITIVE: &u8 = &b'+';

// Choosing a higher buffer size does not result in performance improvements
// (at least not on my machine). TODO: In the future, we should also take the amount of
// available memory into consideration, instead of relying on this constant only.
const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB
// The automatic buffer heuristics clamp to this range to avoid
// over-committing memory on constrained systems while still keeping
// reasonably large chunks for typical workloads.
const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB
const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 32 * 1024 * 1024; // 32 MiB
const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB

#[derive(Debug, Error)]
pub enum SortError {
Expand Down Expand Up @@ -283,6 +287,7 @@ pub struct GlobalSettings {
threads: String,
line_ending: LineEnding,
buffer_size: usize,
buffer_size_is_explicit: bool,
compress_prog: Option<String>,
merge_batch_size: usize,
precomputed: Precomputed,
Expand Down Expand Up @@ -359,9 +364,10 @@ impl Default for GlobalSettings {
separator: None,
threads: String::new(),
line_ending: LineEnding::Newline,
buffer_size: DEFAULT_BUF_SIZE,
buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE,
buffer_size_is_explicit: false,
compress_prog: None,
merge_batch_size: 32,
merge_batch_size: default_merge_batch_size(),
precomputed: Precomputed::default(),
}
}
Expand Down Expand Up @@ -1036,6 +1042,31 @@ fn get_rlimit() -> UResult<usize> {
}

const STDIN_FILE: &str = "-";
#[cfg(target_os = "linux")]
const LINUX_BATCH_DIVISOR: usize = 4;
#[cfg(target_os = "linux")]
const LINUX_BATCH_MIN: usize = 32;
#[cfg(target_os = "linux")]
const LINUX_BATCH_MAX: usize = 256;

/// Picks the default number of files merged in one batch.
///
/// On Linux the limit returned by `get_rlimit` is divided by
/// `LINUX_BATCH_DIVISOR` and kept within `LINUX_BATCH_MIN..=LINUX_BATCH_MAX`;
/// if the limit cannot be read, 64 is used. All other targets use a fixed 64.
fn default_merge_batch_size() -> usize {
    #[cfg(target_os = "linux")]
    {
        // Adjust merge batch size dynamically based on available file descriptors.
        get_rlimit().map_or(64, |limit| {
            (limit / LINUX_BATCH_DIVISOR).clamp(LINUX_BATCH_MIN, LINUX_BATCH_MAX)
        })
    }

    #[cfg(not(target_os = "linux"))]
    {
        64
    }
}

#[uucore::main]
#[allow(clippy::cognitive_complexity)]
Expand Down Expand Up @@ -1157,14 +1188,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}
}

settings.buffer_size =
matches
.get_one::<String>(options::BUF_SIZE)
.map_or(Ok(DEFAULT_BUF_SIZE), |s| {
GlobalSettings::parse_byte_count(s).map_err(|e| {
USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))
})
})?;
if let Some(size_str) = matches.get_one::<String>(options::BUF_SIZE) {
settings.buffer_size = GlobalSettings::parse_byte_count(size_str).map_err(|e| {
USimpleError::new(2, format_error_message(&e, size_str, options::BUF_SIZE))
})?;
settings.buffer_size_is_explicit = true;
} else {
settings.buffer_size = automatic_buffer_size(&files);
settings.buffer_size_is_explicit = false;
}

let mut tmp_dir = TmpDirWrapper::new(
matches
Expand Down
Loading