diff --git a/src/uu/paste/Cargo.toml b/src/uu/paste/Cargo.toml index ca0efa67c28..f78837e334c 100644 --- a/src/uu/paste/Cargo.toml +++ b/src/uu/paste/Cargo.toml @@ -19,7 +19,7 @@ path = "src/paste.rs" [dependencies] clap = { workspace = true } -uucore = { workspace = true } +uucore = { workspace = true, features = ["i18n-charmap"] } fluent = { workspace = true } [[bin]] diff --git a/src/uu/paste/src/paste.rs b/src/uu/paste/src/paste.rs index 5d6ae5ed93d..b0f51e8ced6 100644 --- a/src/uu/paste/src/paste.rs +++ b/src/uu/paste/src/paste.rs @@ -14,6 +14,7 @@ use std::rc::Rc; use std::slice::Iter; use uucore::error::{UResult, USimpleError}; use uucore::format_usage; +use uucore::i18n::charmap::mb_char_len; use uucore::line_ending::LineEnding; use uucore::translate; @@ -29,7 +30,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; let serial = matches.get_flag(options::SERIAL); - let delimiters = matches.get_one::(options::DELIMITER).unwrap(); + let delimiters = matches.get_one::(options::DELIMITER).unwrap(); let files = matches .get_many::(options::FILE) .unwrap() @@ -61,7 +62,8 @@ pub fn uu_app() -> Command { .help(translate!("paste-help-delimiter")) .value_name("LIST") .default_value("\t") - .hide_default_value(true), + .hide_default_value(true) + .value_parser(clap::value_parser!(OsString)), ) .arg( Arg::new(options::FILE) @@ -84,7 +86,7 @@ pub fn uu_app() -> Command { fn paste( filenames: Vec, serial: bool, - delimiters: &str, + delimiters: &OsString, line_ending: LineEnding, ) -> UResult<()> { let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?; @@ -185,65 +187,44 @@ fn paste( Ok(()) } -fn parse_delimiters(delimiters: &str) -> UResult]>> { - /// A single backslash char - const BACKSLASH: char = '\\'; - - fn add_one_byte_single_char_delimiter(vec: &mut Vec>, byte: u8) { - vec.push(Box::new([byte])); - } - - // a buffer of length four is large enough to encode any 
char - let mut buffer = [0; 4]; - - let mut add_single_char_delimiter = |vec: &mut Vec>, ch: char| { - let delimiter_encoded = ch.encode_utf8(&mut buffer); - - vec.push(Box::<[u8]>::from(delimiter_encoded.as_bytes())); - }; - - let mut vec = Vec::>::with_capacity(delimiters.len()); - - let mut chars = delimiters.chars(); - - // Unescape all special characters - while let Some(char) = chars.next() { - match char { - BACKSLASH => match chars.next() { - // "Empty string (not a null character)" - // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html - Some('0') => { - vec.push(Box::<[u8; 0]>::new([])); - } - // "\\" to "\" (U+005C) - Some(BACKSLASH) => { - add_one_byte_single_char_delimiter(&mut vec, b'\\'); - } - // "\n" to U+000A - Some('n') => { - add_one_byte_single_char_delimiter(&mut vec, b'\n'); - } - // "\t" to U+0009 - Some('t') => { - add_one_byte_single_char_delimiter(&mut vec, b'\t'); - } - Some(other_char) => { - // "If any other characters follow the , the results are unspecified." 
- // https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html - // However, other implementations remove the backslash - // See "test_posix_unspecified_delimiter" - add_single_char_delimiter(&mut vec, other_char); - } - None => { - return Err(USimpleError::new( - 1, - translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters), - )); +fn parse_delimiters(delimiters: &OsString) -> UResult]>> { + let bytes = uucore::os_str_as_bytes(delimiters)?; + let mut vec = Vec::>::with_capacity(bytes.len()); + let mut i = 0; + + while i < bytes.len() { + if bytes[i] == b'\\' { + i += 1; + if i >= bytes.len() { + return Err(USimpleError::new( + 1, + translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()), + )); + } + match bytes[i] { + b'0' => vec.push(Box::new([])), + b'\\' => vec.push(Box::new([b'\\'])), + b'n' => vec.push(Box::new([b'\n'])), + b't' => vec.push(Box::new([b'\t'])), + b'b' => vec.push(Box::new([b'\x08'])), + b'f' => vec.push(Box::new([b'\x0C'])), + b'r' => vec.push(Box::new([b'\r'])), + b'v' => vec.push(Box::new([b'\x0B'])), + _ => { + // Unknown escape: strip backslash, use the following character(s) + let remaining = &bytes[i..]; + let len = mb_char_len(remaining).min(remaining.len()); + vec.push(Box::from(&bytes[i..i + len])); + i += len; + continue; } - }, - non_backslash_char => { - add_single_char_delimiter(&mut vec, non_backslash_char); } + i += 1; + } else { + let remaining = &bytes[i..]; + let len = mb_char_len(remaining).min(remaining.len()); + vec.push(Box::from(&bytes[i..i + len])); + i += len; } } diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index d18d0630ed5..9d24ed920c7 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -150,7 +150,8 @@ format = [ "quoting-style", "unit-prefix", ] -i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"] +i18n-all = ["i18n-charmap", "i18n-collator", "i18n-decimal", "i18n-datetime"] +i18n-charmap = 
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata

// Locale-aware multi-byte character length detection via `LC_CTYPE`.
//
// The encoding is derived once per process from the locale environment
// variables; the per-encoding helpers then only look at byte patterns, so they
// never allocate and never decode to code points.

use std::sync::OnceLock;

/// Multi-byte encodings whose character boundaries this module can detect.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MbEncoding {
    Utf8,
    Gb18030,
    EucJp,
    EucKr,
    EucTw,
    Big5,
}

/// Maps a codeset name (already lower-cased by the caller) to an encoding.
///
/// Any name that is not listed falls back to [`MbEncoding::Utf8`].
fn encoding_from_name(enc: &str) -> MbEncoding {
    match enc {
        "gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
        "euc-jp" | "eucjp" => MbEncoding::EucJp,
        "euc-kr" | "euckr" => MbEncoding::EucKr,
        // EUC-TW is not Big5: it has a 4-byte form introduced by 0x8E.
        "euc-tw" | "euctw" => MbEncoding::EucTw,
        "big5" | "big5-hkscs" | "big5hkscs" => MbEncoding::Big5,
        _ => MbEncoding::Utf8,
    }
}

/// Derives the encoding from a locale string such as `zh_CN.GB18030@modifier`.
///
/// * `C` and `POSIX` are treated as UTF-8.
/// * When a `.codeset` part is present it is lower-cased (after dropping any
///   `@modifier`) and looked up with [`encoding_from_name`].
/// * Without a codeset, a few bare locale names get the defaults listed in
///   glibc's `localedata/SUPPORTED`; everything else is UTF-8.
fn encoding_from_locale(locale: &str) -> MbEncoding {
    if locale == "C" || locale == "POSIX" {
        return MbEncoding::Utf8;
    }
    match locale.split('.').nth(1) {
        Some(codeset) => {
            let codeset = codeset.split('@').next().unwrap_or(codeset);
            encoding_from_name(&codeset.to_ascii_lowercase())
        }
        None => match locale.split('@').next().unwrap_or(locale) {
            "zh_CN" | "zh_SG" => MbEncoding::Gb18030,
            "zh_TW" | "zh_HK" => MbEncoding::Big5,
            _ => MbEncoding::Utf8,
        },
    }
}

/// Returns the process-wide encoding, computed on first use and then cached.
///
/// The first non-empty value among `LC_ALL`, `LC_CTYPE` and `LANG` (in that
/// order) decides; values that are unset, empty, or not valid Unicode are
/// skipped. If none qualifies the result is UTF-8.
fn get_encoding() -> &'static MbEncoding {
    static ENCODING: OnceLock<MbEncoding> = OnceLock::new();
    ENCODING.get_or_init(|| {
        ["LC_ALL", "LC_CTYPE", "LANG"]
            .iter()
            .find_map(|&key| std::env::var(key).ok().filter(|v| !v.is_empty()))
            .map_or(MbEncoding::Utf8, |locale| encoding_from_locale(&locale))
    })
}

/// Byte length of the first character in `bytes` under the current locale encoding.
///
/// Returns `0` for an empty slice and `1` for an ASCII byte. For any other
/// lead byte the result is the length of the multi-byte sequence it starts, or
/// `1` when the bytes do not form a complete, well-formed sequence in the
/// detected encoding (so callers can always advance by the returned length
/// when the input is non-empty).
pub fn mb_char_len(bytes: &[u8]) -> usize {
    let Some(&b0) = bytes.first() else {
        return 0;
    };
    if b0 <= 0x7F {
        return 1;
    }
    match get_encoding() {
        MbEncoding::Utf8 => utf8_len(bytes, b0),
        MbEncoding::Gb18030 => gb18030_len(bytes, b0),
        MbEncoding::EucJp => eucjp_len(bytes, b0),
        MbEncoding::EucKr => euckr_len(bytes, b0),
        MbEncoding::EucTw => euctw_len(bytes, b0),
        MbEncoding::Big5 => big5_len(bytes, b0),
    }
}

// All helpers below assume b0 > 0x7F (ASCII already handled by caller) and
// that `b0 == b[0]`.

/// UTF-8: the lead byte fixes the expected length; the sequence counts only if
/// it is well-formed (no overlong forms, no surrogates, nothing above U+10FFFF).
fn utf8_len(b: &[u8], b0: u8) -> usize {
    let n = match b0 {
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => return 1,
    };
    // `from_utf8` applies the full well-formedness rules, not just the
    // `10xxxxxx` shape of the continuation bytes.
    match b.get(..n) {
        Some(seq) if std::str::from_utf8(seq).is_ok() => n,
        _ => 1,
    }
}

// 2-byte: [81-FE][40-7E,80-FE]   4-byte: [81-FE][30-39][81-FE][30-39]
fn gb18030_len(b: &[u8], b0: u8) -> usize {
    if !(0x81..=0xFE).contains(&b0) {
        return 1;
    }
    match b {
        [_, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39, ..] => 4,
        [_, 0x40..=0x7E | 0x80..=0xFE, ..] => 2,
        _ => 1,
    }
}

// 3-byte: [8F][A1-FE][A1-FE]   2-byte: [8E][A1-DF] or [A1-FE][A1-FE]
fn eucjp_len(b: &[u8], b0: u8) -> usize {
    match (b0, b) {
        (0x8F, [_, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 3,
        (0x8E, [_, 0xA1..=0xDF, ..]) => 2,
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}

// 2-byte: [A1-FE][A1-FE]
fn euckr_len(b: &[u8], b0: u8) -> usize {
    match (b0, b) {
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}

// 4-byte: [8E][A1-B0][A1-FE][A1-FE]   2-byte: [A1-FE][A1-FE]
fn euctw_len(b: &[u8], b0: u8) -> usize {
    match (b0, b) {
        (0x8E, [_, 0xA1..=0xB0, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 4,
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}

// 2-byte: [81-FE][40-7E,A1-FE]
fn big5_len(b: &[u8], b0: u8) -> usize {
    match (b0, b) {
        (0x81..=0xFE, [_, 0x40..=0x7E | 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}
a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index e8e0f3f3c5d..13629710ca0 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -7,6 +7,8 @@ use std::sync::OnceLock; use icu_locale::{Locale, locale}; +#[cfg(feature = "i18n-charmap")] +pub mod charmap; #[cfg(feature = "i18n-collator")] pub mod collator; #[cfg(feature = "i18n-datetime")] diff --git a/tests/by-util/test_paste.rs b/tests/by-util/test_paste.rs index a87f2159883..11767e82a7e 100644 --- a/tests/by-util/test_paste.rs +++ b/tests/by-util/test_paste.rs @@ -135,6 +135,30 @@ const EXAMPLE_DATA: &[TestData] = &[ ins: &["1 \na \n", "2\t\nb\t\n"], out: "1 |2\t\na |b\t\n", }, + TestData { + name: "utf8-2byte-delim", + args: &["-d", "\u{00A2}"], + ins: &["1\n2\n", "a\nb\n"], + out: "1\u{00A2}a\n2\u{00A2}b\n", + }, + TestData { + name: "utf8-3byte-delim", + args: &["-d", "\u{20AC}"], + ins: &["1\n2\n", "a\nb\n"], + out: "1\u{20AC}a\n2\u{20AC}b\n", + }, + TestData { + name: "utf8-4byte-delim", + args: &["-d", "\u{1F600}", "-s"], + ins: &["1\n2\n3\n"], + out: "1\u{1F600}2\u{1F600}3\n", + }, + TestData { + name: "utf8-multi-delim-cycle", + args: &["-d", "\u{00A2}\u{20AC}"], + ins: &["a\nb\nc\n", "1\n2\n3\n", "x\ny\nz\n"], + out: "a\u{00A2}1\u{20AC}x\nb\u{00A2}2\u{20AC}y\nc\u{00A2}3\u{20AC}z\n", + }, ]; #[test] @@ -334,6 +358,19 @@ fn test_backslash_zero_delimiter() { } } +#[test] +fn test_gnu_escape_sequences() { + let cases: &[(&str, u8)] = &[(r"\b", 0x08), (r"\f", 0x0C), (r"\r", 0x0D), (r"\v", 0x0B)]; + for &(esc, byte) in cases { + let expected = [b'1', byte, b'2', byte, b'3', b'\n']; + new_ucmd!() + .args(&["-s", "-d", esc]) + .pipe_in("1\n2\n3\n") + .succeeds() + .stdout_only_bytes(expected); + } +} + // As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle // multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not. 
#[test] @@ -378,6 +415,21 @@ fn test_data() { } } +#[test] +#[cfg(target_os = "linux")] +fn test_non_utf8_delimiter() { + let (at, mut ucmd) = at_and_ucmd!(); + at.write("f1", "1\n2\n"); + at.write("f2", "a\nb\n"); + let delim = std::ffi::OsString::from_vec(vec![0xA2, 0xE3]); + ucmd.env("LC_ALL", "zh_CN.gb18030") + .arg("-d") + .arg(&delim) + .args(&["f1", "f2"]) + .succeeds() + .stdout_only_bytes(b"1\xA2\xE3a\n2\xA2\xE3b\n"); +} + #[test] #[cfg(target_os = "linux")] fn test_paste_non_utf8_paths() { diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 70886e5e912..46da5b8248d 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -162,6 +162,9 @@ fi grep -rl 'path_prepend_' tests/* | xargs -r "${SED}" -i 's| path_prepend_ ./src||' # path_prepend_ sets $abs_path_dir_: set it manually instead. grep -rl '\$abs_path_dir_' tests/*/*.sh | xargs -r "${SED}" -i "s|\$abs_path_dir_|${UU_BUILD_DIR//\//\\/}|g" +# Some tests use $abs_top_builddir/src for shebangs: point them to the uutils build dir. +grep -rl '\$abs_top_builddir/src' tests/*/*.sh tests/*/*.pl | xargs -r "${SED}" -i "s|\$abs_top_builddir/src|${UU_BUILD_DIR//\//\\/}|g" +grep -rl '\$ENV{abs_top_builddir}/src' tests/*/*.pl | xargs -r "${SED}" -i "s|\$ENV{abs_top_builddir}/src|${UU_BUILD_DIR//\//\\/}|g" # We can't build runcon and chcon without libselinux. But GNU no longer builds dummies of them. So consider they are SELinux specific. sed -i 's/^print_ver_.*/require_selinux_/' tests/runcon/runcon-compute.sh