diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index 66216702150..cacaad1af19 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -6,6 +6,7 @@ autogenerated autogenerates bitmask bitwise +bufferram bytewise canonicalization canonicalize @@ -45,6 +46,7 @@ fileio filesystem filesystems flamegraph +freeram fsxattr fullblock getfacl @@ -123,6 +125,7 @@ shortcode shortcodes siginfo sigusr +strcasecmp subcommand subexpression submodule @@ -134,6 +137,7 @@ syscalls sysconf tokenize toolchain +totalram truthy ucase unbuffered diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 347e88266a5..b7c7e85cf2e 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -806,9 +806,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -1018,9 +1018,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "onig" @@ -1142,9 +1142,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "8e0f6df8eaa422d97d72edcd152e1451618fed47fabbdbd5a8864167b1d4aff7" dependencies = [ "unicode-ident", ] @@ -1303,9 +1303,9 @@ checksum = 
"b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "self_cell" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749" +checksum = "16c2f82143577edb4921b71ede051dac62ca3c16084e918bf7b40c96ae10eb33" [[package]] name = "serde" @@ -1421,9 +1421,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.107" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index c9d1bac97aa..22c96a436ed 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -301,6 +301,8 @@ struct Precomputed { num_infos_per_line: usize, floats_per_line: usize, selections_per_line: usize, + fast_lexicographic: bool, + fast_ascii_insensitive: bool, } impl GlobalSettings { @@ -341,6 +343,47 @@ impl GlobalSettings { .iter() .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric)) .count(); + + self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic(); + self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive(); + } + + /// Returns true when the fast lexicographic path can be used safely. 
+ fn can_use_fast_lexicographic(&self) -> bool { + self.mode == SortMode::Default + && !self.ignore_case + && !self.dictionary_order + && !self.ignore_non_printing + && !self.ignore_leading_blanks + && self.selectors.len() == 1 + && { + let selector = &self.selectors[0]; + !selector.needs_selection + && matches!(selector.settings.mode, SortMode::Default) + && !selector.settings.ignore_case + && !selector.settings.dictionary_order + && !selector.settings.ignore_non_printing + && !selector.settings.ignore_blanks + } + } + + /// Returns true when the ASCII case-insensitive fast path is valid. + fn can_use_fast_ascii_insensitive(&self) -> bool { + self.mode == SortMode::Default + && self.ignore_case + && !self.dictionary_order + && !self.ignore_non_printing + && !self.ignore_leading_blanks + && self.selectors.len() == 1 + && { + let selector = &self.selectors[0]; + !selector.needs_selection + && matches!(selector.settings.mode, SortMode::Default) + && selector.settings.ignore_case + && !selector.settings.dictionary_order + && !selector.settings.ignore_non_printing + && !selector.settings.ignore_blanks + } } } @@ -1643,6 +1686,26 @@ fn compare_by<'a>( a_line_data: &LineData<'a>, b_line_data: &LineData<'a>, ) -> Ordering { + if global_settings.precomputed.fast_lexicographic { + let cmp = a.line.cmp(b.line); + return if global_settings.reverse { + cmp.reverse() + } else { + cmp + }; + } + + if global_settings.precomputed.fast_ascii_insensitive { + let cmp = ascii_case_insensitive_cmp(a.line, b.line); + if cmp != Ordering::Equal || a.line == b.line { + return if global_settings.reverse { + cmp.reverse() + } else { + cmp + }; + } + } + let mut selection_index = 0; let mut num_info_index = 0; let mut parsed_float_index = 0; @@ -1754,6 +1817,26 @@ fn compare_by<'a>( } } +/// Compare two byte slices in ASCII case-insensitive order without allocating. 
+/// We lower each byte on the fly so that binary input (including `NUL`) stays +/// untouched and we avoid locale-sensitive routines such as `strcasecmp`. +fn ascii_case_insensitive_cmp(a: &[u8], b: &[u8]) -> Ordering { + #[inline] + fn lower(byte: u8) -> u8 { + byte.to_ascii_lowercase() + } + + for (lhs, rhs) in a.iter().copied().zip(b.iter().copied()) { + let l = lower(lhs); + let r = lower(rhs); + if l != r { + return l.cmp(&r); + } + } + + a.len().cmp(&b.len()) +} + // This function cleans up the initial comparison done by leading_num_common for a general numeric compare. // In contrast to numeric compare, GNU general numeric/FP sort *should* recognize positive signs and // scientific notation, so we strip those lines only after the end of the following numeric string. diff --git a/src/uu/sort/src/tmp_dir.rs b/src/uu/sort/src/tmp_dir.rs index 474e01ae2f3..815ba510970 100644 --- a/src/uu/sort/src/tmp_dir.rs +++ b/src/uu/sort/src/tmp_dir.rs @@ -2,10 +2,11 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use std::sync::atomic::{AtomicBool, Ordering}; use std::{ fs::File, path::{Path, PathBuf}, - sync::{Arc, Mutex}, + sync::{Arc, Mutex, OnceLock}, }; use tempfile::TempDir; @@ -29,6 +30,70 @@ pub struct TmpDirWrapper { lock: Arc<Mutex<()>>, } +#[derive(Default, Clone)] +struct HandlerRegistration { + lock: Option<Arc<Mutex<()>>>, + path: Option<PathBuf>, +} + +fn handler_state() -> Arc<Mutex<HandlerRegistration>> { + // Lazily create the global HandlerRegistration so all TmpDirWrapper instances and the + // SIGINT handler operate on the same lock/path snapshot.
+ static HANDLER_STATE: OnceLock<Arc<Mutex<HandlerRegistration>>> = OnceLock::new(); + HANDLER_STATE + .get_or_init(|| Arc::new(Mutex::new(HandlerRegistration::default()))) + .clone() +} + +fn ensure_signal_handler_installed(state: Arc<Mutex<HandlerRegistration>>) -> UResult<()> { + // This shared state must originate from `handler_state()` so the handler always sees + // the current lock/path pair and can clean up the active temp directory on SIGINT. + // Install a shared SIGINT handler so the active temp directory is deleted when the user aborts. + // Guard to ensure the SIGINT handler is registered once per process and reused. + static HANDLER_INSTALLED: AtomicBool = AtomicBool::new(false); + + if HANDLER_INSTALLED + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return Ok(()); + } + + let handler_state = state.clone(); + if let Err(e) = ctrlc::set_handler(move || { + // Load the latest lock/path snapshot so the handler cleans the active temp dir. + let (lock, path) = { + let state = handler_state.lock().unwrap(); + (state.lock.clone(), state.path.clone()) + }; + + if let Some(lock) = lock { + let _guard = lock.lock().unwrap(); + if let Some(path) = path { + if let Err(e) = remove_tmp_dir(&path) { + show_error!( + "{}", + translate!( + "sort-failed-to-delete-temporary-directory", + "error" => e + ) + ); + } + } + } + + std::process::exit(2) + }) { + HANDLER_INSTALLED.store(false, Ordering::Release); + return Err(USimpleError::new( + 2, + translate!("sort-failed-to-set-up-signal-handler", "error" => e), + )); + } + + Ok(()) +} + impl TmpDirWrapper { pub fn new(path: PathBuf) -> Self { Self { @@ -52,31 +117,14 @@ impl TmpDirWrapper { ); let path = self.temp_dir.as_ref().unwrap().path().to_owned(); - let lock = self.lock.clone(); - ctrlc::set_handler(move || { - // Take the lock so that `next_file_path` returns no new file path, - // and the program doesn't terminate before the handler has finished - let _lock = lock.lock().unwrap(); - if let Err(e) = remove_tmp_dir(&path) { -
show_error!( - "{}", - translate!( - "sort-failed-to-delete-temporary-directory", - "error" => e - ) - ); - } - std::process::exit(2) - }) - .map_err(|e| { - USimpleError::new( - 2, - translate!( - "sort-failed-to-set-up-signal-handler", - "error" => e - ), - ) - }) + let state = handler_state(); + { + let mut guard = state.lock().unwrap(); + guard.lock = Some(self.lock.clone()); + guard.path = Some(path); + } + + ensure_signal_handler_installed(state) } pub fn next_file(&mut self) -> UResult<(File, PathBuf)> { @@ -100,6 +148,22 @@ impl TmpDirWrapper { } } +impl Drop for TmpDirWrapper { + fn drop(&mut self) { + let state = handler_state(); + let mut guard = state.lock().unwrap(); + + if guard + .lock + .as_ref() + .is_some_and(|current| Arc::ptr_eq(current, &self.lock)) + { + guard.lock = None; + guard.path = None; + } + } +} + /// Remove the directory at `path` by deleting its child files and then itself. /// Errors while deleting child files are ignored. fn remove_tmp_dir(path: &Path) -> std::io::Result<()> {