Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 14 additions & 80 deletions src/sed/fast_regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk;

/// REs requiring the fancy_regex capabilities rather than the
/// faster regex::bytes engine
// Consider . as one character that requires fancy_regex,
// because it can match more than one byte when matching a
// two or more byte Unicode UTF-8 representation.
// It is an RE . rather than a literal one in the following
// example situations.
// . First character of the line
// [^\\]. Second character after non \
//
// \*. A consumed backslash anywhere on the line
// \\. An escaped backslash anywhere on the line
// xx. A non-escaped sequence anywhere on the line
// But the following are literal dots and can be captured by bytes:
// \. escaped at the beginning of the line
// x\. escaped after a non escaped \ anywhere on the line
//
// The following RE captures these situations.
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| {
regex::Regex::new(
r"(?x) # Turn on verbose mode
( # An ASCII-incompatible RE
( ^ # Non-escaped: i.e. at BOL
| ^[^\\] # or after a BOL non \
| [^\\] {2} # or after two non \ characters
| \\. # or after a consumed or escaped \
)
( # A potentially incompatible match
\. # . matches any Unicode character
| \[\^ # Bracketed -ve character class
| \(\?i # (Unicode) case insensitive
| \\[WwDdSsBbPp] # Unicode classes
| \\[0-9] # Back-references need fancy
)
)
| [^\x01-\x7f] # Any non-ASCII character
",
)
.unwrap()
});
// False positives only result in a small performance pessimization,
// so this is just a maximally sensitive, good-enough approximation.
// For example, r"\\1" and r"[\1]" will match, whereas only a number
// after an odd number of backslashes and outside a character class
// should match.
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| regex::Regex::new(r"\\[1-9]").unwrap());

/// All characters signifying that the match must be handled by an RE
/// rather than by plain string pattern matching.
Expand Down Expand Up @@ -476,52 +444,22 @@ mod tests {
#[test]
fn test_needs_fancy_re_matches() {
let should_match = [
// Unicode classes BOL
r"\p{L}+", // Unicode letter class
r"\W", // \W is Unicode-aware.
r"\S+", // \S is Unicode-aware.
r"\d", // \d includes all Unicode digits.
// Unicode classes non-BOL
r"x\p{L}+", // Unicode letter class
r"x\W", // \W is Unicode-aware.
r"x\S+", // \S is Unicode-aware.
r"x\d", // \d includes all Unicode digits.
// .
r".",
r"x.",
r"xx.",
// Consumed \
r"\*.",
r"x\*.",
// Escaped \
r"\\.",
r"x\\.",
// Inline flags
r"(?i)abc", // Unicode case-insensitive
r"x(?i)abc", // Unicode case-insensitive
r"(\w+):\1", // back-reference \1
// Non-ASCII literals
"naïve", // Contains literal non-ASCII.
"café", // Contains literal non-ASCII.
];

for pat in &should_match {
assert!(
NEEDS_FANCY_RE.is_match(pat),
"Expected NEEDS_FANCY_RE to match: {:?}",
pat
"Expected NEEDS_FANCY_RE to match: {pat:?}"
);
}
}

#[test]
fn test_needs_fancy_re_does_not_match() {
let should_not_match = [
r"\.", // Escaped . at BOL
r"x\.", // Escaped . at non BOL
r"\[^x]", // Escaped character class
r"\(?i\)", // Escaped case insensitive flag
r"\\w", // Escaped Unicode class
r"\ 1", // Non-adjacent
r"\0", // Only \[1-9]
// Simple ASCII
r"foo",
r"foo|bar",
Expand All @@ -531,8 +469,7 @@ mod tests {
for pat in &should_not_match {
assert!(
!NEEDS_FANCY_RE.is_match(pat),
"Expected NEEDS_FANCY_RE to NOT match: {:?}",
pat
"Expected NEEDS_FANCY_RE to NOT match: {pat:?}"
);
}
}
Expand All @@ -558,8 +495,7 @@ mod tests {
for pat in &should_match {
assert!(
NEEDS_RE.is_match(pat),
"Expected NEEDS_RE to match: {:?}",
pat
"Expected NEEDS_RE to match: {pat:?}"
);
}
}
Expand All @@ -579,8 +515,7 @@ mod tests {
for pat in &should_not_match {
assert!(
!NEEDS_RE.is_match(pat),
"Expected NEEDS_RE to NOT match: {:?}",
pat
"Expected NEEDS_RE to NOT match: {pat:?}"
);
}
}
Expand All @@ -594,7 +529,7 @@ mod tests {

#[test]
fn assert_fancy() {
let re = Regex::new(r"\d").unwrap();
let re = Regex::new(r"(.)\1").unwrap();
assert!(matches!(re, Regex::Fancy(_)));
}

Expand All @@ -609,8 +544,7 @@ mod tests {
let err = Regex::new("(").unwrap_err().to_string();
assert!(
err.contains("unclosed group") || err.contains("error parsing"),
"Unexpected error: {}",
err
"Unexpected error: {err:?}"
);
}

Expand Down
15 changes: 15 additions & 0 deletions tests/by-util/test_sed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,21 @@ check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]);
check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]);
check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]);

// Check appropriate selection and behavior of the fast_regex matcher
// Literal matcher
check_output!(subst_literal_start, ["-e", r"s/^l1/L1/", LINES1]);
check_output!(subst_literal_end, ["-e", r"s/2$/TWO/", LINES1]);
check_output!(subst_literal, ["-e", r"s/_/-/", LINES1]);

// Fancy matcher
check_output!(subst_backref, ["-e", r"s/l\(.\)_\1/same-number/", LINES1]);

// Bytes matcher with Unicode
check_output!(subst_greek, ["-e", r"s/[α-ω]/G/g", "input/unicode"]);
check_output!(subst_any_unicode, ["-e", r"s/.$/:-)/", "input/unicode"]);
check_output!(subst_lcase, ["-e", r"s/κ/*/gi", "input/unicode"]);
check_output!(subst_word, ["-E", "-e", r"s/\w+/WORD/g", "input/unicode"]);

#[test]
fn subst_write_file() -> std::io::Result<()> {
let temp = NamedTempFile::new()?;
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_any_unicode
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or Καλημέρα κόσμε or こんにちは 世界 :-)
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_backref
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
same-number
l1_2
l1_3
l1_4
l1_5
l1_6
l1_7
l1_8
l1_9
same-number0
same-number1
same-number2
same-number3
same-number4
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_greek
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_lcase
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or *αλημέρα *όσμε or こんにちは 世界 😀
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
l1-1
l1-2
l1-3
l1-4
l1-5
l1-6
l1-7
l1-8
l1-9
l1-10
l1-11
l1-12
l1-13
l1-14
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal_end
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
l1_1
l1_TWO
l1_3
l1_4
l1_5
l1_6
l1_7
l1_8
l1_9
l1_10
l1_11
l1_1TWO
l1_13
l1_14
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal_start
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
L1_1
L1_2
L1_3
L1_4
L1_5
L1_6
L1_7
L1_8
L1_9
L1_10
L1_11
L1_12
L1_13
L1_14
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_word
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WORD WORD WORD WORD WORD WORD WORD WORD 😀
21 changes: 11 additions & 10 deletions util/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ echo 'command,mean,stddev,median,user,system,min,max' >"$OUT"
awk 'BEGIN { for (i = 0; i < 50000000; i++) { print i } }' > lines.txt

# No operation
bench_run no-op-short "$PROG '' lines.txt"
bench_run no-op-short "$PROG "'"" lines.txt'

# Log file processing

Expand All @@ -58,7 +58,7 @@ create_access_log()
create_access_log 5000000

# No operation
bench_run access-log-no-op "$PROG '' access.log"
bench_run access-log-no-op "$PROG "'"" access.log'

# No substitution
bench_run access-log-no-subst "$PROG s/Chrome/Chromium/ access.log"
Expand Down Expand Up @@ -93,9 +93,10 @@ awk 'BEGIN {
}
}' > legacy_input.txt

bench_run remove-cr "$PROG 's/\r$//' legacy_input.txt"
echo 's/\r$//' >script.sed
bench_run remove-cr "$PROG -f script.sed legacy_input.txt"

rm legacy_input.txt
rm -f legacy_input.txt script.sed

# Genomic data cleanup

Expand All @@ -106,10 +107,10 @@ awk 'BEGIN {
}
}' > genome.tsv

CMD='/^#/d; s/\t\./\tNA/g; s/\.$/NA/'
bench_run genome-subst "$PROG '$CMD' genome.tsv"
echo '/^#/d; s/\t\./\tNA/g; s/\.$/NA/' >script.sed
bench_run genome-subst "$PROG -f script.sed genome.tsv"

rm -f genome.tsv
rm -f genome.tsv script.sed

# Number fixups: remove thousands separator, change , into .
awk 'BEGIN {
Expand All @@ -122,10 +123,10 @@ awk 'BEGIN {
}
}' > finance.csv

CMD='s/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g'
bench_run number-fix "$PROG '$CMD' finance.csv"
echo 's/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' >script.sed
bench_run number-fix "$PROG -f script.sed finance.csv"

rm -f finance.csv
rm -f finance.csv script.sed

# Long script compilation
for i in $(seq 1 99) ; do
Expand Down
Loading