Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 33 additions & 5 deletions src/uu/wc/src/count_fast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,30 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// cSpell:ignore sysconf
// spell-checker:ignore sysconf CTYPE
use crate::{wc_simd_allowed, word_count::WordCount};
use uucore::hardware::SimdPolicy;

use super::WordCountable;

/// Check whether the effective character-type locale is the plain `C` or
/// `POSIX` locale, in which every byte counts as exactly one character.
///
/// The locale name is taken from the first of `LC_ALL`, `LC_CTYPE` and `LANG`
/// (in that order of precedence) that is set to a non-empty value; a value
/// that is not valid Unicode is treated as unset. When none of them yields a
/// name, the function returns `true` (POSIX default: chars == bytes).
///
/// Returns `false` for any other locale, including `C`/`POSIX` combined with
/// a UTF-8 codeset such as `C.UTF-8`, because that is a multibyte locale.
pub(crate) fn is_c_or_posix_locale() -> bool {
    let locale_val = ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .find_map(|&var| std::env::var(var).ok().filter(|v| !v.is_empty()));

    match locale_val {
        Some(locale) => locale_name_is_single_byte_c(&locale),
        // No locale set, default to POSIX behavior (chars == bytes)
        None => true,
    }
}

/// Decide whether a locale name of the form `name[.codeset][@modifier]`
/// denotes the single-byte `C`/`POSIX` locale.
fn locale_name_is_single_byte_c(locale: &str) -> bool {
    // Drop the optional `@modifier` first so it cannot be mistaken for part
    // of the codeset.
    let without_modifier = locale.split('@').next().unwrap_or(locale);

    let mut parts = without_modifier.splitn(2, '.');
    let base = parts.next().unwrap_or(without_modifier);
    if base != "C" && base != "POSIX" {
        return false;
    }

    match parts.next() {
        // `C.UTF-8` / `C.utf8` are UTF-8 locales: characters may span
        // several bytes, so they must not be treated as chars == bytes.
        Some(codeset) => {
            !(codeset.eq_ignore_ascii_case("UTF-8") || codeset.eq_ignore_ascii_case("UTF8"))
        }
        None => true,
    }
}

#[cfg(any(target_os = "linux", target_os = "android"))]
use std::fs::OpenOptions;
use std::io::{self, ErrorKind, Read};
Expand Down Expand Up @@ -235,6 +253,11 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
let buf: &mut [u8] = &mut AlignedBuffer::default().data;
let policy = SimdPolicy::detect();
let simd_allowed = wc_simd_allowed(policy);

// In C/POSIX locale, characters are equivalent to bytes (MB_CUR_MAX == 1).
// This follows GNU coreutils behavior.
let chars_are_bytes = is_c_or_posix_locale();

loop {
match handle.read(buf) {
Ok(0) => return (total, None),
Expand All @@ -243,11 +266,16 @@ pub(crate) fn count_bytes_chars_and_lines_fast<
total.bytes += n;
}
if COUNT_CHARS {
total.chars += if simd_allowed {
bytecount::num_chars(&buf[..n])
if chars_are_bytes {
// In C/POSIX locale, count bytes instead of UTF-8 chars
total.chars += n;
} else {
bytecount::naive_num_chars(&buf[..n])
};
total.chars += if simd_allowed {
bytecount::num_chars(&buf[..n])
} else {
bytecount::naive_num_chars(&buf[..n])
};
}
}
if COUNT_LINES {
total.lines += if simd_allowed {
Expand Down
14 changes: 11 additions & 3 deletions src/uu/wc/src/wc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// cSpell:ignore ilog wc wc's
// spell-checker:ignore ilog wc wc's

mod count_fast;
mod countable;
Expand Down Expand Up @@ -37,7 +37,7 @@ use uucore::{
};

use crate::{
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast},
count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast, is_c_or_posix_locale},
countable::WordCountable,
word_count::WordCount,
};
Expand Down Expand Up @@ -580,6 +580,7 @@ fn process_chunk<
current_len: &mut usize,
in_word: &mut bool,
posixly_correct: bool,
chars_are_bytes: bool,
) {
for ch in text.chars() {
if SHOW_WORDS {
Expand Down Expand Up @@ -615,12 +616,17 @@ fn process_chunk<
if SHOW_LINES && ch == '\n' {
total.lines += 1;
}
if SHOW_CHARS {
if SHOW_CHARS && !chars_are_bytes {
total.chars += 1;
}
}
total.bytes += text.len();

// In C/POSIX locale, chars count equals bytes count
if SHOW_CHARS && chars_are_bytes {
total.chars += text.len();
}

total.max_line_length = max(*current_len, total.max_line_length);
}

Expand Down Expand Up @@ -656,6 +662,7 @@ fn word_count_from_reader_specialized<
let mut in_word = false;
let mut current_len = 0;
let posixly_correct = env::var_os("POSIXLY_CORRECT").is_some();
let chars_are_bytes = SHOW_CHARS && is_c_or_posix_locale();
while let Some(chunk) = reader.next_strict() {
match chunk {
Ok(text) => {
Expand All @@ -665,6 +672,7 @@ fn word_count_from_reader_specialized<
&mut current_len,
&mut in_word,
posixly_correct,
chars_are_bytes,
);
}
Err(e) => {
Expand Down
Loading
Loading