From bf040967c4bee91b50eadcb94d24aa93c8adf2d6 Mon Sep 17 00:00:00 2001
From: naoNao89 <90588855+naoNao89@users.noreply.github.com>
Date: Wed, 18 Feb 2026 11:24:20 +0700
Subject: [PATCH 1/2] fix(wc): respect C/POSIX locale for character counting

Modify wc -m to count bytes instead of UTF-8 characters when the
effective locale is C or POSIX: the first non-empty variable among
LC_ALL, LC_CTYPE, and LANG names C or POSIX, or none of them is set.
This matches GNU coreutils behavior where MB_CUR_MAX == 1 in these
locales.

Changes:
- Add is_c_or_posix_locale() helper in count_fast.rs
- Export and reuse function in wc.rs to avoid duplication
- Update fast path and UTF-8 decoding path
- Add regression tests with Vietnamese text

Fixes #9712, fixes #5831.
---
 src/uu/wc/src/count_fast.rs |  38 +++++++--
 src/uu/wc/src/wc.rs         |  14 +++-
 tests/by-util/test_wc.rs    | 158 +++++++++++++++++++++++++++++++++++-
 3 files changed, 201 insertions(+), 9 deletions(-)

diff --git a/src/uu/wc/src/count_fast.rs b/src/uu/wc/src/count_fast.rs
index 5e2ecd080a0..7a79e96a8f7 100644
--- a/src/uu/wc/src/count_fast.rs
+++ b/src/uu/wc/src/count_fast.rs
@@ -3,12 +3,30 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// cSpell:ignore sysconf
+// spell-checker:ignore sysconf CTYPE
 use crate::{wc_simd_allowed, word_count::WordCount};
 use uucore::hardware::SimdPolicy;
 
 use super::WordCountable;
 
+/// Check if the current locale is C or POSIX (where characters == bytes).
+/// This follows GNU coreutils behavior where MB_CUR_MAX == 1 in these locales.
+/// Values naming a UTF-8 encoding (e.g. `C.UTF-8`) are multibyte and do not match.
+pub(crate) fn is_c_or_posix_locale() -> bool {
+    // The first non-empty of LC_ALL, LC_CTYPE, LANG decides; none set means "C".
+    // `var_os` keeps a non-Unicode value from being skipped in favor of a lower one.
+    let locale = ["LC_ALL", "LC_CTYPE", "LANG"]
+        .iter()
+        .find_map(|&var| std::env::var_os(var).filter(|v| !v.is_empty()))
+        .unwrap_or_else(|| "C".into());
+    let locale = locale.to_string_lossy();
+    // Drop any '@modifier', then split the name from its optional '.encoding'.
+    let name = locale.split('@').next().unwrap_or("");
+    let (base, enc) = name.split_once('.').unwrap_or((name, ""));
+    let multibyte = enc.eq_ignore_ascii_case("UTF-8") || enc.eq_ignore_ascii_case("UTF8");
+    (base == "C" || base == "POSIX") && !multibyte
+}
+
 #[cfg(any(target_os = "linux", target_os = "android"))]
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind, Read};
@@ -235,6 +253,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
     let buf: &mut [u8] = &mut AlignedBuffer::default().data;
     let policy = SimdPolicy::detect();
     let simd_allowed = wc_simd_allowed(policy);
+
+    // In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
+    // This follows GNU coreutils behavior.
+    let chars_are_bytes = is_c_or_posix_locale();
+
     loop {
         match handle.read(buf) {
             Ok(0) => return (total, None),
@@ -243,11 +266,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
                     total.bytes += n;
                 }
                 if COUNT_CHARS {
-                    total.chars += if simd_allowed {
-                        bytecount::num_chars(&buf[..n])
+                    if chars_are_bytes {
+                        // In C/POSIX locale, count bytes instead of UTF-8 chars
+                        total.chars += n;
                     } else {
-                        bytecount::naive_num_chars(&buf[..n])
-                    };
+                        total.chars += if simd_allowed {
+                            bytecount::num_chars(&buf[..n])
+                        } else {
+                            bytecount::naive_num_chars(&buf[..n])
+                        };
+                    }
                 }
                 if COUNT_LINES {
                     total.lines += if simd_allowed {
diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs
index 3c762353ca3..a9689f02fc7 100644
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// cSpell:ignore ilog wc wc's
+// spell-checker:ignore ilog wc wc's
 
 mod count_fast;
 mod countable;
@@ -37,7 +37,7 @@ use uucore::{
 };
 
 use crate::{
-    count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
+    count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
     countable::WordCountable,
     word_count::WordCount,
 };
@@ -580,6 +580,7 @@ fn process_chunk<
     current_len: &mut usize,
     in_word: &mut bool,
     posixly_correct: bool,
+    chars_are_bytes: bool,
 ) {
     for ch in text.chars() {
         if SHOW_WORDS {
@@ -615,12 +616,17 @@ fn process_chunk<
         if SHOW_LINES && ch == '\n' {
             total.lines += 1;
         }
-        if SHOW_CHARS {
+        if SHOW_CHARS && !chars_are_bytes {
             total.chars += 1;
         }
     }
     total.bytes += text.len();
 
+    // In C/POSIX locale, chars count equals bytes count
+    if SHOW_CHARS && chars_are_bytes {
+        total.chars += text.len();
+    }
+
     total.max_line_length = max(*current_len, total.max_line_length);
 }
 
@@ -656,6 +662,7 @@ fn word_count_from_reader_specialized<
     let mut in_word = false;
     let mut current_len = 0;
     let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some();
+    let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
     while let Some(chunk) = reader.next_strict() {
         match chunk {
             Ok(text) => {
@@ -665,6 +672,7 @@ fn word_count_from_reader_specialized<
                     &mut current_len,
                     &mut in_word,
                     posixly_correct,
+                    chars_are_bytes,
                 );
             }
             Err(e) => {
diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs
index d62c1da6e8e..f4e35f96879 100644
--- a/tests/by-util/test_wc.rs
+++ b/tests/by-util/test_wc.rs
@@ -8,7 +8,8 @@ use uutests::at_and_ucmd;
 use uutests::new_ucmd;
 use uutests::util::vec_of_size;
 
-// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir
+// spell-checker:ignore (flags) lwmcL clmwL ; (path) bogusfile emptyfile manyemptylines moby notrailingnewline onelongemptyline onelongword weirdchars ioerrdir CTYPE
+// spell-checker:ignore (Vietnamese) Tiếng Việt chào
 #[test]
 fn test_invalid_arg() {
     new_ucmd!().arg("--definitely-invalid").fails_with_code(1);
@@ -61,8 +62,10 @@ fn test_stdin_explicit() {
 
 #[test]
 fn test_utf8() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .args(&["-lwmcL"])
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_test.txt")
         .succeeds()
         .stdout_is("    303    2178   22457   23025      79\n");
@@ -88,8 +91,10 @@ fn test_utf8_line_length_words() {
 
 #[test]
 fn test_utf8_line_length_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-Lm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("    442      48\n");
@@ -97,8 +102,10 @@ fn test_utf8_line_length_chars() {
 
 #[test]
 fn test_utf8_line_length_chars_words() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-Lmw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     89     442      48\n");
@@ -106,8 +113,10 @@ fn test_utf8_line_length_chars_words() {
 
 #[test]
 fn test_utf8_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-m")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("442\n");
@@ -115,8 +124,10 @@ fn test_utf8_chars() {
 
 #[test]
 fn test_utf8_bytes_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-cm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("    442     513\n");
@@ -133,8 +144,10 @@ fn test_utf8_bytes_lines() {
 
 #[test]
 fn test_utf8_bytes_chars_lines() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-cml")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442     513\n");
@@ -142,8 +155,10 @@ fn test_utf8_bytes_chars_lines() {
 
 #[test]
 fn test_utf8_chars_words() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-mw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     89     442\n");
@@ -169,8 +184,10 @@ fn test_utf8_line_length_lines_words() {
 
 #[test]
 fn test_utf8_lines_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-ml")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442\n");
@@ -178,8 +195,10 @@ fn test_utf8_lines_chars() {
 
 #[test]
 fn test_utf8_lines_words_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-mlw")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25      89     442\n");
@@ -187,8 +206,10 @@ fn test_utf8_lines_words_chars() {
 
 #[test]
 fn test_utf8_line_length_lines_chars() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-Llm")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25     442      48\n");
@@ -196,8 +217,10 @@ fn test_utf8_line_length_lines_chars() {
 
 #[test]
 fn test_utf8_all() {
+    // Requires UTF-8 locale for character counting
     new_ucmd!()
         .arg("-lwmcL")
+        .env("LC_ALL", "en_US.UTF-8")
         .pipe_in_fixture("UTF_8_weirdchars.txt")
         .succeeds()
         .stdout_is("     25      89     442     513      48\n");
@@ -921,3 +944,136 @@ fn test_posixly_correct_whitespace() {
         .succeeds()
         .stdout_is("1\n");
 }
+
+#[test]
+fn test_wc_chars_c_locale() {
+    // In C/POSIX locale, wc -m should count bytes, not UTF-8 characters
+    // Vietnamese "Tiếng Việt" uses diacritics ("ế" and "ệ" are 3 bytes each in UTF-8)
+    // "Tiếng" = 5 chars, 7 bytes (4 ASCII bytes + 3 bytes for "ế")
+    let vietnamese_text = "Tiếng";
+
+    // With LC_ALL=C, chars should equal bytes (7)
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "C")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+
+    // Same with LC_ALL=POSIX
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "POSIX")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+
+    // Test combined with bytes flag - should show same count
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      7       7\n");
+}
+
+#[test]
+fn test_wc_chars_utf8_locale() {
+    // In UTF-8 locale, wc -m should count UTF-8 characters
+    // Vietnamese "Tiếng" is 7 bytes in UTF-8 but 5 characters ("ế" is 3 bytes)
+    let vietnamese_text = "Tiếng";
+
+    // With vi_VN.UTF-8 locale, chars should be 5 (not 7)
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("5\n");
+
+    // Test combined with bytes flag - should show different counts
+    // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      5       7\n");
+}
+
+#[test]
+fn test_wc_chars_default_locale() {
+    // When no locale is set (LC_ALL, LC_CTYPE and LANG all empty), wc falls back to
+    // C/POSIX behavior (chars == bytes), so the 5-char "Tiếng" is reported as 7
+    let vietnamese_text = "Tiếng";
+
+    new_ucmd!()
+        .arg("-m")
+        .env("LC_ALL", "")
+        .env("LC_CTYPE", "")
+        .env("LANG", "")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("7\n");
+}
+
+#[test]
+fn test_wc_multibyte_c_locale() {
+    // Issue #9712 and #5831: Test various multibyte characters in C locale
+    // All should be counted as bytes
+
+    // Vietnamese text with multiple diacritics: "Tiếng Việt"
+    // 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("Tiếng Việt")
+        .succeeds()
+        .stdout_is("     14      14\n");
+
+    // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("ệ")
+        .succeeds()
+        .stdout_is("      3       3\n");
+
+    // Mixed ASCII and Vietnamese: "Xin chào" = 8 chars, 9 bytes ("à" is 2 bytes)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "C")
+        .pipe_in("Xin chào")
+        .succeeds()
+        .stdout_is("      9       9\n");
+}
+
+#[test]
+fn test_wc_multibyte_utf8_locale() {
+    // In UTF-8 locale, multibyte characters should be counted correctly
+    // Order is: chars, bytes (since show_chars comes before show_bytes in print_stats)
+
+    // Vietnamese "Tiếng Việt": 10 chars, 14 bytes ("ế" and "ệ" are 3 bytes each)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("Tiếng Việt")
+        .succeeds()
+        .stdout_is("     10      14\n");
+
+    // Single Vietnamese character "ệ" = 1 char, 3 bytes in UTF-8 (e1 bb 87)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("ệ")
+        .succeeds()
+        .stdout_is("      1       3\n");
+
+    // Mixed ASCII and Vietnamese "Xin chào": 8 chars, 9 bytes ("à" is 2 bytes)
+    new_ucmd!()
+        .args(&["-cm"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in("Xin chào")
+        .succeeds()
+        .stdout_is("      8       9\n");
+}

From 224212a3cb4a3a2f9bc9d9f188bfadce7c4b915d Mon Sep 17 00:00:00 2001
From: naoNao89 <90588855+naoNao89@users.noreply.github.com>
Date: Wed, 18 Feb 2026 20:53:35 +0700
Subject: [PATCH 2/2] test(wc): add tests for wc.rs locale path

Add tests with -w flag to ensure both count_fast.rs and wc.rs
paths are tested for locale-aware character counting.
---
 tests/by-util/test_wc.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/by-util/test_wc.rs b/tests/by-util/test_wc.rs
index f4e35f96879..51d284317cb 100644
--- a/tests/by-util/test_wc.rs
+++ b/tests/by-util/test_wc.rs
@@ -975,6 +975,15 @@ fn test_wc_chars_c_locale() {
         .pipe_in(vietnamese_text)
         .succeeds()
         .stdout_is("      7       7\n");
+
+    // Test with -w to trigger wc.rs path (word_count_from_reader_specialized)
+    // Order: words, chars, bytes
+    new_ucmd!()
+        .args(&["-cmw"])
+        .env("LC_ALL", "C")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      1       7       7\n");
 }
 
 #[test]
@@ -999,6 +1008,15 @@ fn test_wc_chars_utf8_locale() {
         .pipe_in(vietnamese_text)
         .succeeds()
         .stdout_is("      5       7\n");
+
+    // Test with -w to trigger wc.rs path (word_count_from_reader_specialized)
+    // Order: words, chars, bytes
+    new_ucmd!()
+        .args(&["-cmw"])
+        .env("LC_ALL", "vi_VN.UTF-8")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      1       5       7\n");
 }
 
 #[test]
@@ -1015,6 +1033,17 @@ fn test_wc_chars_default_locale() {
         .pipe_in(vietnamese_text)
         .succeeds()
         .stdout_is("7\n");
+
+    // Test with -w to trigger wc.rs path (word_count_from_reader_specialized)
+    // Order: words, chars
+    new_ucmd!()
+        .args(&["-mw"])
+        .env("LC_ALL", "")
+        .env("LC_CTYPE", "")
+        .env("LANG", "")
+        .pipe_in(vietnamese_text)
+        .succeeds()
+        .stdout_is("      1       7\n");
 }
 
 #[test]