From bf040967c4bee91b50eadcb94d24aa93c8adf2d6 Mon Sep 17 00:00:00 2001 From: naoNao89 <90588855+naoNao89@users.noreply.github.com> Date: Wed, 18 Feb 2026 11:24:20 +0700 Subject: [PATCH 1/2] fix(wc): respect C/POSIX locale for character counting Modify wc -m to count bytes instead of UTF-8 characters when LC_ALL, LC_CTYPE, or LANG is set to C or POSIX. This matches GNU coreutils behavior where MB_CUR_MAX == 1 in these locales. Changes: - Add is_c_or_posix_locale() helper in count_fast.rs - Export and reuse function in wc.rs to avoid duplication - Update fast path and UTF-8 decoding path - Add regression tests with Vietnamese text Fixes #9712, fixes #5831. --- src/uu/wc/src/count_fast.rs | 38 +++++++-- src/uu/wc/src/wc.rs | 14 +++- tests/by-util/test_wc.rs | 158 +++++++++++++++++++++++++++++++++++- 3 files changed, 201 insertions(+), 9 deletions(-) diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs index 5e2ecd080a0..7a79e96a8f7 100644 --- a/src/uu/wc/src/count_fast.rs +++ b/src/uu/wc/src/count_fast.rs @@ -3,12 +3,30 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// cSpell:ignore sysconf +// spell-checker:ignore sysconf CTYPE use crate::{wc_simd_allowed, word_count::WordCount}; use uucore::hardware::SimdPolicy; use super::WordCountable; +/// Check if the current locale is C or POSIX (where characters == bytes). +/// This follows GNU coreutils behavior where MB_CUR_MAX == 1 in these locales. +pub(crate) fn is_c_or_posix_locale() -> bool { + // Check LC_ALL, LC_CTYPE, and LANG in order of precedence + let locale_val = ["LC_ALL", "LC_CTYPE", "LANG"] + .iter() + .find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty())); + + if let Some(locale) = locale_val { + // Extract the base locale name (before any '.' 
or '@') + let base_locale = locale.split(&['.', '@']).next().unwrap_or(&locale); + base_locale == "C" || base_locale == "POSIX" + } else { + // No locale set, default to POSIX behavior (chars == bytes) + true + } +} + #[cfg(any(target_os = "linux", target_os = "android"))] use std::fs::OpenOptions; use std::io::{self, ErrorKind, Read}; @@ -235,6 +253,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast< let buf: &mut [u8] = &mut AlignedBuffer::default().data; let policy = SimdPolicy::detect(); let simd_allowed = wc_simd_allowed(policy); + + // In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1). + // This follows GNU coreutils behavior. + let chars_are_bytes = is_c_or_posix_locale(); + loop { match handle.read(buf) { Ok(0) => return (total, None), @@ -243,11 +266,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast< total.bytes += n; } if COUNT_CHARS { - total.chars += if simd_allowed { - bytecount::num_chars(&buf[..n]) + if chars_are_bytes { + // In C/POSIX locale, count bytes instead of UTF-8 chars + total.chars += n; } else { - bytecount::naive_num_chars(&buf[..n]) - }; + total.chars += if simd_allowed { + bytecount::num_chars(&buf[..n]) + } else { + bytecount::naive_num_chars(&buf[..n]) + }; + } } if COUNT_LINES { total.lines += if simd_allowed { diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 3c762353ca3..a9689f02fc7 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-// cSpell:ignore ilog wc wc's +// spell-checker:ignore ilog wc wc's mod count_fast; mod countable; @@ -37,7 +37,7 @@ use uucore::{ }; use crate::{ - count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast}, + count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale}, countable::WordCountable, word_count::WordCount, }; @@ -580,6 +580,7 @@ fn process_chunk< current_len: &mut usize, in_word: &mut bool, posixly_correct: bool, + chars_are_bytes: bool, ) { for ch in text.chars() { if SHOW_WORDS { @@ -615,12 +616,17 @@ fn process_chunk< if SHOW_LINES && ch == '\n' { total.lines += 1; } - if SHOW_CHARS { + if SHOW_CHARS && !chars_are_bytes { total.chars += 1; } } total.bytes += text.len(); + // In C/POSIX locale, chars count equals bytes count + if SHOW_CHARS && chars_are_bytes { + total.chars += text.len(); + } + total.max_line_length = max(*current_len, total.max_line_length); } @@ -656,6 +662,7 @@ fn word_count_from_reader_specialized< let mut in_word = false; let mut current_len = 0; let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some(); + let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale(); while let Some(chunk) = reader.next_strict() { match chunk { Ok(text) => { @@ -665,6 +672,7 @@ fn word_count_from_reader_specialized< &mut current_len, &mut in_word, posixly_correct, + chars_are_bytes, ); } Err(e) => { diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index d62c1da6e8e..f4e35f96879 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -8,7 +8,8 @@ use uutests::at_and_ucmd; use uutests::new_ucmd; use uutests::util::vec_of_size; -// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir +// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE +// spell-checker:ignore 
(Vietnamese) Tiếng Việt chào #[test] fn test_invalid_arg() { new_ucmd!().arg("--definitely-invalid").fails_with_code(1); @@ -61,8 +62,10 @@ fn test_stdin_explicit() { #[test] fn test_utf8() { + // Requires UTF-8 locale for character counting new_ucmd!() .args(&["-lwmcL"]) + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_test.txt") .succeeds() .stdout_is(" 303 2178 22457 23025 79\n"); @@ -88,8 +91,10 @@ fn test_utf8_line_length_words() { #[test] fn test_utf8_line_length_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-Lm") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 442 48\n"); @@ -97,8 +102,10 @@ fn test_utf8_line_length_chars() { #[test] fn test_utf8_line_length_chars_words() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-Lmw") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 89 442 48\n"); @@ -106,8 +113,10 @@ fn test_utf8_line_length_chars_words() { #[test] fn test_utf8_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-m") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is("442\n"); @@ -115,8 +124,10 @@ fn test_utf8_chars() { #[test] fn test_utf8_bytes_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-cm") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 442 513\n"); @@ -133,8 +144,10 @@ fn test_utf8_bytes_lines() { #[test] fn test_utf8_bytes_chars_lines() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-cml") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 25 442 513\n"); @@ -142,8 +155,10 @@ fn test_utf8_bytes_chars_lines() { #[test] fn test_utf8_chars_words() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-mw") + .env("LC_ALL", "en_US.UTF-8") 
.pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 89 442\n"); @@ -169,8 +184,10 @@ fn test_utf8_line_length_lines_words() { #[test] fn test_utf8_lines_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-ml") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 25 442\n"); @@ -178,8 +195,10 @@ fn test_utf8_lines_chars() { #[test] fn test_utf8_lines_words_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-mlw") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 25 89 442\n"); @@ -187,8 +206,10 @@ fn test_utf8_lines_words_chars() { #[test] fn test_utf8_line_length_lines_chars() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-Llm") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 25 442 48\n"); @@ -196,8 +217,10 @@ fn test_utf8_line_length_lines_chars() { #[test] fn test_utf8_all() { + // Requires UTF-8 locale for character counting new_ucmd!() .arg("-lwmcL") + .env("LC_ALL", "en_US.UTF-8") .pipe_in_fixture("UTF_8_weirdchars.txt") .succeeds() .stdout_is(" 25 89 442 513 48\n"); @@ -921,3 +944,136 @@ fn test_posixly_correct_whitespace() { .succeeds() .stdout_is("1\n"); } + +#[test] +fn test_wc_chars_c_locale() { + // In C/POSIX locale, wc -m should count bytes, not UTF-8 characters + // Vietnamese "Tiếng Việt" uses letters with diacritics (3 bytes each in UTF-8) + // "Tiếng" = 5 chars, 7 bytes ("ế" is 3 bytes) + let vietnamese_text = "Tiếng"; + + // With LC_ALL=C, chars should equal bytes (7) + new_ucmd!() + .arg("-m") + .env("LC_ALL", "C") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is("7\n"); + + // Same with LC_ALL=POSIX + new_ucmd!() + .arg("-m") + .env("LC_ALL", "POSIX") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is("7\n"); + + // Test combined with bytes flag - should show same count + new_ucmd!() + .args(&["-cm"]) + 
.env("LC_ALL", "C") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is(" 7 7\n"); +} + +#[test] +fn test_wc_chars_utf8_locale() { + // In UTF-8 locale, wc -m should count UTF-8 characters + // Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes) + let vietnamese_text = "Tiếng"; + + // With vi_VN.UTF-8 locale, chars should be 5 (not 7) + new_ucmd!() + .arg("-m") + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is("5\n"); + + // Test combined with bytes flag - should show different counts + // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is(" 5 7\n"); +} + +#[test] +fn test_wc_chars_default_locale() { + // When no locale is set (empty LC_ALL), it defaults to POSIX (chars == bytes) + // This ensures backward compatibility + let vietnamese_text = "Tiếng"; + + new_ucmd!() + .arg("-m") + .env("LC_ALL", "") + .env("LC_CTYPE", "") + .env("LANG", "") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is("7\n"); +} + +#[test] +fn test_wc_multibyte_c_locale() { + // Issue #9712 and #5831: Test various multibyte characters in C locale + // All should be counted as bytes + + // Vietnamese text with multiple diacritics: "Tiếng Việt" + // 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "C") + .pipe_in("Tiếng Việt") + .succeeds() + .stdout_is(" 14 14\n"); + + // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "C") + .pipe_in("ệ") + .succeeds() + .stdout_is(" 3 3\n"); + + // Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "C") + .pipe_in("Xin chào") + .succeeds() + .stdout_is(" 9 9\n"); +} + +#[test] +fn test_wc_multibyte_utf8_locale() { + // In UTF-8 locale, multibyte characters 
should be counted correctly + // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats) + + // Vietnamese "Tiếng Việt": 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in("Tiếng Việt") + .succeeds() + .stdout_is(" 10 14\n"); + + // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in("ệ") + .succeeds() + .stdout_is(" 1 3\n"); + + // Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes) + new_ucmd!() + .args(&["-cm"]) + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in("Xin chào") + .succeeds() + .stdout_is(" 8 9\n"); +} From 224212a3cb4a3a2f9bc9d9f188bfadce7c4b915d Mon Sep 17 00:00:00 2001 From: naoNao89 <90588855+naoNao89@users.noreply.github.com> Date: Wed, 18 Feb 2026 20:53:35 +0700 Subject: [PATCH 2/2] test(wc): add tests for wc.rs locale path Add tests with -w flag to ensure both count_fast.rs and wc.rs paths are tested for locale-aware character counting. 
--- tests/by-util/test_wc.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs index f4e35f96879..51d284317cb 100644 --- a/tests/by-util/test_wc.rs +++ b/tests/by-util/test_wc.rs @@ -975,6 +975,15 @@ fn test_wc_chars_c_locale() { .pipe_in(vietnamese_text) .succeeds() .stdout_is(" 7 7\n"); + + // Test with -w to trigger wc.rs path (word_count_from_reader_specialized) + // Order: words, chars, bytes + new_ucmd!() + .args(&["-cmw"]) + .env("LC_ALL", "C") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is(" 1 7 7\n"); } #[test] @@ -999,6 +1008,15 @@ fn test_wc_chars_utf8_locale() { .pipe_in(vietnamese_text) .succeeds() .stdout_is(" 5 7\n"); + + // Test with -w to trigger wc.rs path (word_count_from_reader_specialized) + // Order: words, chars, bytes + new_ucmd!() + .args(&["-cmw"]) + .env("LC_ALL", "vi_VN.UTF-8") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is(" 1 5 7\n"); } #[test] @@ -1015,6 +1033,17 @@ fn test_wc_chars_default_locale() { .pipe_in(vietnamese_text) .succeeds() .stdout_is("7\n"); + + // Test with -w to trigger wc.rs path (word_count_from_reader_specialized) + // Order: words, chars + new_ucmd!() + .args(&["-mw"]) + .env("LC_ALL", "") + .env("LC_CTYPE", "") + .env("LANG", "") + .pipe_in(vietnamese_text) + .succeeds() + .stdout_is(" 1 7\n"); } #[test]