From 7f0758346b52bde43eb403e2cbcee9ee98f5cad7 Mon Sep 17 00:00:00 2001 From: Alvaro Guimaraes Date: Sun, 22 Feb 2026 20:38:45 -0500 Subject: [PATCH 1/3] uniq: fix -w to count bytes in C locale --- src/uu/uniq/src/uniq.rs | 18 +++++++++++++++++- tests/by-util/test_uniq.rs | 12 ++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index 3288241e4ef..25c06f2e573 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -186,6 +186,17 @@ impl Uniq { } } + fn is_c_locale() -> bool { + for key in ["LC_ALL", "LC_CTYPE", "LANG"] { + if let Ok(v) = std::env::var(key) { + if !v.is_empty() { + return v == "C" || v == "POSIX"; + } + } + } + true + } + fn key_end_index(&self, line: &[u8], key_start: usize) -> usize { let remainder = &line[key_start..]; match self.slice_stop { @@ -194,10 +205,15 @@ impl Uniq { if remainder.is_empty() { return key_start; } - if let Ok(valid) = std::str::from_utf8(remainder) { + if Self::is_c_locale() { + // for C or POSIX we count bytes + key_start + remainder.len().min(limit) + } else if let Ok(valid) = std::str::from_utf8(remainder) { + // for UTF-8 we count characters let prefix_len = Self::char_prefix_len(valid, limit); key_start + prefix_len } else { + // for invalid UTF-8 we count bytes key_start + remainder.len().min(limit) } } diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs index 6c9107ce527..8e940a743cd 100644 --- a/tests/by-util/test_uniq.rs +++ b/tests/by-util/test_uniq.rs @@ -1176,11 +1176,23 @@ fn test_stdin_w1_multibyte() { let input = "à\ná\n"; new_ucmd!() .args(&["-w1"]) + .env("LC_ALL", "en_US.UTF-8") .pipe_in(input) .succeeds() .stdout_is("à\ná\n"); } +#[test] +fn test_c_locale_counts_bytes() { + let input = "가나다라마\n가나다바사\n"; + new_ucmd!() + .args(&["-w", "4"]) + .env("LC_ALL", "C") + .pipe_in(input) + .succeeds() + .stdout_is("가나다라마\n"); +} + #[cfg(target_os = "linux")] #[test] fn test_failed_write_is_reported() { From 85fa5b9799aec922f41633ba3620545c60430269 Mon Sep 17 00:00:00 2001 From: Alvaro Guimaraes Date: Sun, 22 Feb 2026 20:54:36 -0500 Subject: [PATCH 2/3] uniq: add CTYPE to spellchecker ignore list --- src/uu/uniq/src/uniq.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index 25c06f2e573..febe0566122 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore badoption +// spell-checker:ignore badoption CTYPE use clap::{ Arg, ArgAction, ArgMatches, Command, builder::ValueParser, error::ContextKind, error::Error, error::ErrorKind, From 1bc1507f46e5a674d4fc117079031c4ce24a9ff2 Mon Sep 17 00:00:00 2001 From: Alvaro Guimaraes Date: Sun, 22 Feb 2026 23:09:42 -0500 Subject: [PATCH 3/3] uniq: avoid String allocation by using std::env::var_os() --- src/uu/uniq/src/uniq.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs index febe0566122..6f324929cc3 100644 --- a/src/uu/uniq/src/uniq.rs +++ b/src/uu/uniq/src/uniq.rs @@ -188,7 +188,7 @@ impl Uniq { fn is_c_locale() -> bool { for key in ["LC_ALL", "LC_CTYPE", "LANG"] { - if let Ok(v) = std::env::var(key) { + if let Some(v) = std::env::var_os(key) { if !v.is_empty() { return v == "C" || v == "POSIX"; }