diff --git a/src/sed/fast_regex.rs b/src/sed/fast_regex.rs index 942d1113..0c20a253 100644 --- a/src/sed/fast_regex.rs +++ b/src/sed/fast_regex.rs @@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk; /// REs requiring the fancy_regex capabilities rather than the /// faster regex::bytes engine -// Consider . as one character that requires fancy_regex, -// because it can match more than one byte when matching a -// two or more byte Unicode UTF-8 representation. -// It is an RE . rather than a literal one in the following -// example sitations. -// . First character of the line -// [^\\]. Second character after non \ -// -// \*. A consumed backslash anywhere on the line -// \\. An escaped backslash anywhere on the line -// xx. A non-escaped sequence anywhere on the line -// But the following are literal dots and can be captured by bytes: -// \. escaped at the beginning of the line -// x\. escaped after a non escaped \ anywhere on the line -// -// The following RE captures these situations. -static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { - regex::Regex::new( - r"(?x) # Turn on verbose mode - ( # An ASCII-incompatible RE - ( ^ # Non-escaped: i.e. at BOL - | ^[^\\] # or after a BOL non \ - | [^\\] {2} # or after two non \ characters - | \\. # or after a consumed or escaped \ - ) - ( # A potentially incompatible match - \. # . matches any Unicode character - | \[\^ # Bracketed -ve character class - | \(\?i # (Unicode) case insensitive - | \\[WwDdSsBbPp] # Unicode classes - | \\[0-9] # Back-references need fancy - ) - ) - | [^\x01-\x7f] # Any non-ASCII character - ", - ) - .unwrap() -}); +// False positives only result in a small performance pessimization, +// so this is just a maximally sensitive, good-enough approximation. +// For example, r"\\1" and r"[\1]" will match, whereas only a number +// after an odd number of backslashes and outside a character class +// should match. 
+static NEEDS_FANCY_RE: Lazy<regex::Regex> = Lazy::new(|| regex::Regex::new(r"\\[1-9]").unwrap()); /// All characters signifying that the match must be handled by an RE /// rather than by plain string pattern matching. @@ -476,40 +444,13 @@ mod tests { #[test] fn test_needs_fancy_re_matches() { let should_match = [ - // Unicode classes BOL - r"\p{L}+", // Unicode letter class - r"\W", // \W is Unicode-aware. - r"\S+", // \S is Unicode-aware. - r"\d", // \d includes all Unicode digits. - // Unicode classes non-BOL - r"x\p{L}+", // Unicode letter class - r"x\W", // \W is Unicode-aware. - r"x\S+", // \S is Unicode-aware. - r"x\d", // \d includes all Unicode digits. - // . - r".", - r"x.", - r"xx.", - // Consumed \ - r"\*.", - r"x\*.", - // Escaped \ - r"\\.", - r"x\\.", - // Inline flags - r"(?i)abc", // Unicode case-insensitive - r"x(?i)abc", // Unicode case-insensitive r"(\w+):\1", // back-reference \1 - // Non-ASCII literals - "naïve", // Contains literal non-ASCII. - "café", // Contains literal non-ASCII. ]; for pat in &should_match { assert!( NEEDS_FANCY_RE.is_match(pat), - "Expected NEEDS_FANCY_RE to match: {:?}", - pat + "Expected NEEDS_FANCY_RE to match: {pat:?}" ); } } @@ -517,11 +458,8 @@ mod tests { #[test] fn test_needs_fancy_re_does_not_match() { let should_not_match = [ - r"\.", // Escaped . at BOL - r"x\.", // Escaped .
at non BOL - r"\[^x]", // Escaped character class - r"\(?i\)", // Escaped case insesitive flag - r"\\w", // Escaped Unicode class + r"\ 1", // Non-adjacent + r"\0", // Only \[1-9] // Simple ASCII r"foo", r"foo|bar", @@ -531,8 +469,7 @@ for pat in &should_not_match { assert!( !NEEDS_FANCY_RE.is_match(pat), - "Expected NEEDS_FANCY_RE to NOT match: {:?}", - pat + "Expected NEEDS_FANCY_RE to NOT match: {pat:?}" ); } } @@ -558,8 +495,7 @@ mod tests { for pat in &should_match { assert!( NEEDS_RE.is_match(pat), - "Expected NEEDS_RE to match: {:?}", - pat + "Expected NEEDS_RE to match: {pat:?}" ); } } @@ -579,8 +515,7 @@ for pat in &should_not_match { assert!( !NEEDS_RE.is_match(pat), - "Expected NEEDS_RE to NOT match: {:?}", - pat + "Expected NEEDS_RE to NOT match: {pat:?}" ); } } @@ -594,7 +529,7 @@ mod tests { #[test] fn assert_fancy() { - let re = Regex::new(r"\d").unwrap(); + let re = Regex::new(r"(.)\1").unwrap(); assert!(matches!(re, Regex::Fancy(_))); } @@ -609,8 +544,7 @@ let err = Regex::new("(").unwrap_err().to_string(); assert!( err.contains("unclosed group") || err.contains("error parsing"), - "Unexpected error: {}", - err + "Unexpected error: {err:?}" ); } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index b12095d1..d00ece1d 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -295,6 +295,21 @@ check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]); check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); +// Check appropriate selection and behavior of fast_regex matcher +// Literal matcher +check_output!(subst_literal_start, ["-e", r"s/^l1/L1/", LINES1]); +check_output!(subst_literal_end, ["-e", r"s/2$/TWO/", LINES1]); +check_output!(subst_literal, ["-e", r"s/_/-/", LINES1]); + +// Fancy matcher +check_output!(subst_backref, ["-e", r"s/l\(.\)_\1/same-number/", LINES1]); + +// Bytes
matcher with Unicode +check_output!(subst_greek, ["-e", r"s/[α-ω]/G/g", "input/unicode"]); +check_output!(subst_any_unicode, ["-e", r"s/.$/:-)/", "input/unicode"]); +check_output!(subst_lcase, ["-e", r"s/κ/*/gi", "input/unicode"]); +check_output!(subst_word, ["-E", "-e", r"s/\w+/WORD/g", "input/unicode"]); + #[test] fn subst_write_file() -> std::io::Result<()> { let temp = NamedTempFile::new()?; diff --git a/tests/fixtures/sed/output/subst_any_unicode b/tests/fixtures/sed/output/subst_any_unicode new file mode 100644 index 00000000..ec33993b --- /dev/null +++ b/tests/fixtures/sed/output/subst_any_unicode @@ -0,0 +1 @@ +Hello World or Καλημέρα κόσμε or こんにちは 世界 :-) diff --git a/tests/fixtures/sed/output/subst_backref b/tests/fixtures/sed/output/subst_backref new file mode 100644 index 00000000..75953034 --- /dev/null +++ b/tests/fixtures/sed/output/subst_backref @@ -0,0 +1,14 @@ +same-number +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +same-number0 +same-number1 +same-number2 +same-number3 +same-number4 diff --git a/tests/fixtures/sed/output/subst_greek b/tests/fixtures/sed/output/subst_greek new file mode 100644 index 00000000..2b918eb9 --- /dev/null +++ b/tests/fixtures/sed/output/subst_greek @@ -0,0 +1 @@ +Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀 diff --git a/tests/fixtures/sed/output/subst_lcase b/tests/fixtures/sed/output/subst_lcase new file mode 100644 index 00000000..69296ee6 --- /dev/null +++ b/tests/fixtures/sed/output/subst_lcase @@ -0,0 +1 @@ +Hello World or *αλημέρα *όσμε or こんにちは 世界 😀 diff --git a/tests/fixtures/sed/output/subst_literal b/tests/fixtures/sed/output/subst_literal new file mode 100644 index 00000000..55fb7f95 --- /dev/null +++ b/tests/fixtures/sed/output/subst_literal @@ -0,0 +1,14 @@ +l1-1 +l1-2 +l1-3 +l1-4 +l1-5 +l1-6 +l1-7 +l1-8 +l1-9 +l1-10 +l1-11 +l1-12 +l1-13 +l1-14 diff --git a/tests/fixtures/sed/output/subst_literal_end b/tests/fixtures/sed/output/subst_literal_end new file mode 100644 index 00000000..1e8bc96e --- /dev/null 
+++ b/tests/fixtures/sed/output/subst_literal_end @@ -0,0 +1,14 @@ +l1_1 +l1_TWO +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_1TWO +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/subst_literal_start b/tests/fixtures/sed/output/subst_literal_start new file mode 100644 index 00000000..20529388 --- /dev/null +++ b/tests/fixtures/sed/output/subst_literal_start @@ -0,0 +1,14 @@ +L1_1 +L1_2 +L1_3 +L1_4 +L1_5 +L1_6 +L1_7 +L1_8 +L1_9 +L1_10 +L1_11 +L1_12 +L1_13 +L1_14 diff --git a/tests/fixtures/sed/output/subst_word b/tests/fixtures/sed/output/subst_word new file mode 100644 index 00000000..4dc20ad8 --- /dev/null +++ b/tests/fixtures/sed/output/subst_word @@ -0,0 +1 @@ +WORD WORD WORD WORD WORD WORD WORD WORD 😀 diff --git a/util/benchmark.sh b/util/benchmark.sh index 17d405c6..77cfd0f5 100755 --- a/util/benchmark.sh +++ b/util/benchmark.sh @@ -40,7 +40,7 @@ echo 'command,mean,stddev,median,user,system,min,max' >"$OUT" awk 'BEGIN { for (i = 0; i < 50000000; i++) { print i } }' > lines.txt # No operation -bench_run no-op-short "$PROG '' lines.txt" +bench_run no-op-short "$PROG "'"" lines.txt' # Log file processing @@ -58,7 +58,7 @@ create_access_log() create_access_log 5000000 # No operation -bench_run access-log-no-op "$PROG '' access.log" +bench_run access-log-no-op "$PROG "'"" access.log' # No substitution bench_run access-log-no-subst "$PROG s/Chrome/Chromium/ access.log" @@ -93,9 +93,10 @@ awk 'BEGIN { } }' > legacy_input.txt -bench_run remove-cr "$PROG 's/\r$//' legacy_input.txt" +echo 's/\r$//' >script.sed +bench_run remove-cr "$PROG -f script.sed legacy_input.txt" -rm legacy_input.txt +rm -f legacy_input.txt script.sed # Genomic data cleanup @@ -106,10 +107,10 @@ awk 'BEGIN { } }' > genome.tsv -CMD='/^#/d; s/\t\./\tNA/g; s/\.$/NA/' -bench_run genome-subst "$PROG '$CMD' genome.tsv" +echo '/^#/d; s/\t\./\tNA/g; s/\.$/NA/' >script.sed +bench_run genome-subst "$PROG -f script.sed genome.tsv" -rm -f genome.tsv +rm -f genome.tsv script.sed # Number 
fixups: remove thousands separator, change , into . awk 'BEGIN { @@ -122,10 +123,10 @@ awk 'BEGIN { } }' > finance.csv -CMD='s/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' -bench_run number-fix "$PROG '$CMD' finance.csv" +echo 's/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' >script.sed +bench_run number-fix "$PROG -f script.sed finance.csv" -rm -f finance.csv +rm -f finance.csv script.sed # Long script compilation for i in $(seq 1 99) ; do