uniq: with -w, count bytes instead of characters in the C/POSIX locale.

The locale is taken from the first non-empty variable among LC_ALL, LC_CTYPE
and LANG; when none of them is set, the C locale is assumed. In any other
locale valid UTF-8 keys are limited by characters, invalid UTF-8 by bytes.

diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
index 3288241e4ef..6f324929cc3 100644
--- a/src/uu/uniq/src/uniq.rs
+++ b/src/uu/uniq/src/uniq.rs
@@ -2,7 +2,7 @@
 //
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
-// spell-checker:ignore badoption
+// spell-checker:ignore badoption CTYPE
 use clap::{
     Arg, ArgAction, ArgMatches, Command, builder::ValueParser, error::ContextKind, error::Error,
     error::ErrorKind,
@@ -186,6 +186,17 @@ impl Uniq {
         }
     }
 
+    fn is_c_locale() -> bool {
+        for key in ["LC_ALL", "LC_CTYPE", "LANG"] {
+            if let Some(v) = std::env::var_os(key) {
+                if !v.is_empty() {
+                    return v == "C" || v == "POSIX";
+                }
+            }
+        }
+        true
+    }
+
     fn key_end_index(&self, line: &[u8], key_start: usize) -> usize {
         let remainder = &line[key_start..];
         match self.slice_stop {
@@ -194,10 +205,15 @@ impl Uniq {
                 if remainder.is_empty() {
                     return key_start;
                 }
-                if let Ok(valid) = std::str::from_utf8(remainder) {
+                if Self::is_c_locale() {
+                    // for C or POSIX we count bytes
+                    key_start + remainder.len().min(limit)
+                } else if let Ok(valid) = std::str::from_utf8(remainder) {
+                    // for UTF-8 we count characters
                     let prefix_len = Self::char_prefix_len(valid, limit);
                     key_start + prefix_len
                 } else {
+                    // for invalid UTF-8 we count bytes
                     key_start + remainder.len().min(limit)
                 }
             }
diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs
index 6c9107ce527..8e940a743cd 100644
--- a/tests/by-util/test_uniq.rs
+++ b/tests/by-util/test_uniq.rs
@@ -1176,11 +1176,23 @@ fn test_stdin_w1_multibyte() {
     let input = "à\ná\n";
     new_ucmd!()
         .args(&["-w1"])
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in(input)
         .succeeds()
         .stdout_is("à\ná\n");
 }
 
+#[test]
+fn test_c_locale_counts_bytes() {
+    let input = "가나다라마\n가나다바사\n";
+    new_ucmd!()
+        .args(&["-w", "4"])
+        .env("LC_ALL", "C")
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is("가나다라마\n");
+}
+
 #[cfg(target_os = "linux")]
 #[test]
 fn test_failed_write_is_reported() {