Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 99 additions & 83 deletions src/uu/tac/src/tac.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ mod error;
use clap::{Arg, ArgAction, Command};
use memchr::memmem;
use memmap2::Mmap;
use std::ffi::OsString;
use std::ffi::{OsStr, OsString};
use std::io::{BufWriter, Read, Write, stdin, stdout};
use std::{
fs::{File, read},
io::copy,
path::Path,
};

#[cfg(unix)]
use uucore::error::set_exit_code;
use uucore::error::{UError, UResult};
Expand All @@ -40,10 +41,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let before = matches.get_flag(options::BEFORE);
let regex = matches.get_flag(options::REGEX);
let raw_separator = matches
.get_one::<String>(options::SEPARATOR)
.map_or("\n", |s| s.as_str());
.get_one::<OsString>(options::SEPARATOR)
.map_or(OsStr::new("\n"), |s| s.as_os_str());

let separator = if raw_separator.is_empty() {
"\0"
OsStr::new("\0")
} else {
raw_separator
};
Expand Down Expand Up @@ -82,6 +84,7 @@ pub fn uu_app() -> Command {
.short('s')
.long(options::SEPARATOR)
.help(translate!("tac-help-separator"))
.value_parser(clap::value_parser!(OsString))
.value_name("STRING"),
)
.arg(
Expand Down Expand Up @@ -147,7 +150,9 @@ fn buffer_tac_regex(
// Determine if there is a match for `pattern` starting at index
// `i` in `data`. Only search up to the line ending that was
// found previously.
if let Some(match_) = pattern.find_at(&data[..this_line_end], i) {
if let Some(match_) = pattern.find_at(&data[..this_line_end], i)
&& match_.start() == i
{
// Record this index as the ending of the current line.
this_line_end = i;

Expand Down Expand Up @@ -183,7 +188,7 @@ fn buffer_tac_regex(
/// If `before` is `true`, then this function assumes that the
/// `separator` appears at the beginning of each line, as in
/// `"/abc/def"`.
fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()> {
fn buffer_tac(data: &[u8], before: bool, separator: &OsStr) -> std::io::Result<()> {
let out = stdout();
let mut out = BufWriter::new(out.lock());

Expand All @@ -206,7 +211,7 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
// The `before` flag controls whether the line separator appears at
// the end of the line (as in "abc\ndef\n") or at the beginning of
// the line (as in "/abc/def").
for i in memmem::rfind_iter(data, separator) {
for i in memmem::rfind_iter(data, separator.as_encoded_bytes()) {
if before {
out.write_all(&data[i..following_line_start])?;
following_line_start = i;
Expand All @@ -228,91 +233,102 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
/// Concretely:
/// - Toggle escaping of (), |, {}
/// - Escape ^ and $ when not at edges
/// - Leave expressions inside [] unchanged
fn translate_regex_flavor(regex: &str) -> String {
let mut result = String::new();
let mut chars = regex.chars().peekable();
/// - Inside [], copy ASCII bytes unchanged and drop non-ASCII bytes
/// - Escape non-ASCII bytes as `(?-u:\xFF)` outside []
fn translate_regex_flavor(bytes: &[u8]) -> String {
let mut result = Vec::new();
let mut i = 0;
let mut inside_brackets = false;
let mut prev_was_backslash = false;
let mut last_char: Option<char> = None;
let mut last_byte: Option<u8> = None;

while let Some(c) = chars.next() {
while let Some(b) = bytes.get(i) {
let is_escaped = prev_was_backslash;
prev_was_backslash = false;

match c {
match b {
_ if inside_brackets && !b.is_ascii() => {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL GNU also ignores non-ASCII bytes inside bracket expressions

i += 1;
continue;
}
// Unescape escaped (), |, {} when not inside brackets
'\\' if !inside_brackets && !is_escaped => {
if let Some(&next) = chars.peek() {
if matches!(next, '(' | ')' | '|' | '{' | '}') {
result.push(next);
last_char = Some(next);
chars.next();
b'\\' if !inside_brackets && !is_escaped => {
if let Some(next) = bytes.get(i + 1) {
if matches!(next, b'(' | b')' | b'|' | b'{' | b'}') {
result.push(*next);
last_byte = Some(*next);
i += 2;
continue;
}
}

result.push('\\');
last_char = Some('\\');
result.push(b'\\');
last_byte = Some(b'\\');
prev_was_backslash = true;
}
// Bracket tracking
'[' => {
b'[' => {
inside_brackets = true;
result.push(c);
last_char = Some(c);
result.push(*b);
last_byte = Some(*b);
}
']' => {
b']' => {
inside_brackets = false;
result.push(c);
last_char = Some(c);
result.push(*b);
last_byte = Some(*b);
}
// Escape (), |, {} when not escaped and outside brackets
'(' | ')' | '|' | '{' | '}' if !inside_brackets && !is_escaped => {
result.push('\\');
result.push(c);
last_char = Some(c);
b'(' | b')' | b'|' | b'{' | b'}' if !inside_brackets && !is_escaped => {
result.push(b'\\');
result.push(*b);
last_byte = Some(*b);
}
'^' if !inside_brackets && !is_escaped => {
let is_anchor_position = result.is_empty() || matches!(last_char, Some('(' | '|'));
b'^' if !inside_brackets && !is_escaped => {
let is_anchor_position =
result.is_empty() || matches!(last_byte, Some(b'(' | b'|'));
if !is_anchor_position {
result.push('\\');
result.push(b'\\');
}
result.push(c);
last_char = Some(c);
result.push(*b);
last_byte = Some(*b);
}
'$' if !inside_brackets && !is_escaped => {
let next_is_anchor_position = match chars.peek() {
b'$' if !inside_brackets && !is_escaped => {
let next_is_anchor_position = match bytes.get(i + 1) {
None => true,
Some(&')' | &'|') => true,
Some(&'\\') => {
Some(b')' | b'|') => true,
Some(b'\\') => {
// Peek two ahead to see if it's \) or \|
let chars_vec: Vec<char> = chars.clone().take(2).collect();
matches!(chars_vec.get(1), Some(&')' | &'|'))
matches!(bytes.get(i + 2), Some(b')' | b'|'))
}
_ => false,
};
if !next_is_anchor_position {
result.push('\\');
result.push(b'\\');
}
result.push(c);
last_char = Some(c);
result.push(*b);
last_byte = Some(*b);
}
_ if !b.is_ascii() => {
let _ = write!(result, r"(?-u:\x{b:02x})");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great find!

last_byte = None;
}
_ => {
result.push(c);
last_char = Some(c);
result.push(*b);
last_byte = Some(*b);
}
}

i += 1;
}

result
String::from_utf8(result).expect("produces ASCII bytes")
}

#[allow(clippy::cognitive_complexity)]
fn tac(filenames: &[OsString], before: bool, regex: bool, separator: &str) -> UResult<()> {
fn tac(filenames: &[OsString], before: bool, regex: bool, separator: &OsStr) -> UResult<()> {
// Compile the regular expression pattern if it is provided.
let maybe_pattern = if regex {
match regex::bytes::RegexBuilder::new(&translate_regex_flavor(separator))
match regex::bytes::RegexBuilder::new(&translate_regex_flavor(separator.as_encoded_bytes()))
.multi_line(true)
.build()
{
Expand Down Expand Up @@ -458,81 +474,81 @@ mod tests_hybrid_flavor {

#[test]
fn test_grouping_and_alternation() {
assert_eq!(translate_regex_flavor(r"\(abc\)"), r"(abc)");
assert_eq!(translate_regex_flavor(br"\(abc\)"), r"(abc)");

assert_eq!(translate_regex_flavor(r"(abc)"), r"\(abc\)");
assert_eq!(translate_regex_flavor(br"(abc)"), r"\(abc\)");

assert_eq!(translate_regex_flavor(r"a\|b"), r"a|b");
assert_eq!(translate_regex_flavor(br"a\|b"), r"a|b");

assert_eq!(translate_regex_flavor(r"a|b"), r"a\|b");
assert_eq!(translate_regex_flavor(br"a|b"), r"a\|b");
}

#[test]
fn test_quantifiers() {
assert_eq!(translate_regex_flavor("a+"), "a+");
assert_eq!(translate_regex_flavor(b"a+"), "a+");

assert_eq!(translate_regex_flavor("a*"), "a*");
assert_eq!(translate_regex_flavor(b"a*"), "a*");

assert_eq!(translate_regex_flavor("a?"), "a?");
assert_eq!(translate_regex_flavor(b"a?"), "a?");

assert_eq!(translate_regex_flavor(r"a\+"), r"a\+");
assert_eq!(translate_regex_flavor(br"a\+"), r"a\+");

assert_eq!(translate_regex_flavor(r"a\*"), r"a\*");
assert_eq!(translate_regex_flavor(br"a\*"), r"a\*");

assert_eq!(translate_regex_flavor(r"a\?"), r"a\?");
assert_eq!(translate_regex_flavor(br"a\?"), r"a\?");
}

#[test]
fn test_intervals() {
assert_eq!(translate_regex_flavor(r"a\{1,3\}"), r"a{1,3}");
assert_eq!(translate_regex_flavor(br"a\{1,3\}"), r"a{1,3}");

assert_eq!(translate_regex_flavor(r"a{1,3}"), r"a\{1,3\}");
assert_eq!(translate_regex_flavor(br"a{1,3}"), r"a\{1,3\}");
}

#[test]
fn test_anchors_context() {
assert_eq!(translate_regex_flavor(r"^abc$"), r"^abc$");
assert_eq!(translate_regex_flavor(br"^abc$"), r"^abc$");

assert_eq!(translate_regex_flavor(r"a^b"), r"a\^b");
assert_eq!(translate_regex_flavor(r"a$b"), r"a\$b");
assert_eq!(translate_regex_flavor(br"a^b"), r"a\^b");
assert_eq!(translate_regex_flavor(br"a$b"), r"a\$b");

// Anchors inside groups (reset by \(...\) regardless of position)
assert_eq!(translate_regex_flavor(r"\(^abc\)"), r"(^abc)");
assert_eq!(translate_regex_flavor(r"z\(^abc\)"), r"z(^abc)");
assert_eq!(translate_regex_flavor(r"\(abc$\)"), r"(abc$)");
assert_eq!(translate_regex_flavor(r"\(abc$\)z"), r"(abc$)z");
assert_eq!(translate_regex_flavor(br"\(^abc\)"), r"(^abc)");
assert_eq!(translate_regex_flavor(br"z\(^abc\)"), r"z(^abc)");
assert_eq!(translate_regex_flavor(br"\(abc$\)"), r"(abc$)");
assert_eq!(translate_regex_flavor(br"\(abc$\)z"), r"(abc$)z");

// Anchors inside alternation (reset by \| regardless of position)
assert_eq!(translate_regex_flavor(r"^a\|^b"), r"^a|^b");
assert_eq!(translate_regex_flavor(r"x\|^b"), r"x|^b");
assert_eq!(translate_regex_flavor(r"a$\|b$"), r"a$|b$");
assert_eq!(translate_regex_flavor(br"^a\|^b"), r"^a|^b");
assert_eq!(translate_regex_flavor(br"x\|^b"), r"x|^b");
assert_eq!(translate_regex_flavor(br"a$\|b$"), r"a$|b$");
}

#[test]
fn test_character_classes() {
assert_eq!(translate_regex_flavor(r"[a-z]"), r"[a-z]");
assert_eq!(translate_regex_flavor(br"[a-z]"), r"[a-z]");

assert_eq!(translate_regex_flavor(r"[.]"), r"[.]");
assert_eq!(translate_regex_flavor(r"[+]"), r"[+]");
assert_eq!(translate_regex_flavor(br"[.]"), r"[.]");
assert_eq!(translate_regex_flavor(br"[+]"), r"[+]");

assert_eq!(translate_regex_flavor(r"[]abc]"), r"[]abc]");
assert_eq!(translate_regex_flavor(br"[]abc]"), r"[]abc]");

assert_eq!(translate_regex_flavor(r"[^]abc]"), r"[^]abc]");
assert_eq!(translate_regex_flavor(br"[^]abc]"), r"[^]abc]");
}

#[test]
fn test_complex_strings() {
assert_eq!(translate_regex_flavor(r"(\d+)[+*]"), r"\(\d+\)[+*]");
assert_eq!(translate_regex_flavor(br"(\d+)[+*]"), r"\(\d+\)[+*]");

assert_eq!(translate_regex_flavor(r"\(\d+\)\{2\}"), r"(\d+){2}");
assert_eq!(translate_regex_flavor(br"\(\d+\)\{2\}"), r"(\d+){2}");
}

#[test]
fn test_edge_cases() {
assert_eq!(translate_regex_flavor(r"abc\"), r"abc\");
assert_eq!(translate_regex_flavor(br"abc\"), r"abc\");

assert_eq!(translate_regex_flavor(r"\\"), r"\\");
assert_eq!(translate_regex_flavor(br"\\"), r"\\");

assert_eq!(translate_regex_flavor(r"\^"), r"\^");
assert_eq!(translate_regex_flavor(br"\^"), r"\^");
}
}
Loading
Loading