Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 14 additions & 80 deletions src/sed/fast_regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,44 +27,12 @@ use crate::sed::fast_io::IOChunk;

/// REs requiring the fancy_regex capabilities rather than the
/// faster regex::bytes engine
// Consider . as one character that requires fancy_regex,
// because it can match more than one byte when matching a
// two or more byte Unicode UTF-8 representation.
// It is an RE . rather than a literal one in the following
// example situations.
// . First character of the line
// [^\\]. Second character after non \
//
// \*. A consumed backslash anywhere on the line
// \\. An escaped backslash anywhere on the line
// xx. A non-escaped sequence anywhere on the line
// But the following are literal dots and can be captured by bytes:
// \. escaped at the beginning of the line
// x\. escaped after a non escaped \ anywhere on the line
//
// The following RE captures these situations.
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| {
regex::Regex::new(
r"(?x) # Turn on verbose mode
( # An ASCII-incompatible RE
( ^ # Non-escaped: i.e. at BOL
| ^[^\\] # or after a BOL non \
| [^\\] {2} # or after two non \ characters
| \\. # or after a consumed or escaped \
)
( # A potentially incompatible match
\. # . matches any Unicode character
| \[\^ # Bracketed -ve character class
| \(\?i # (Unicode) case insensitive
| \\[WwDdSsBbPp] # Unicode classes
| \\[0-9] # Back-references need fancy
)
)
| [^\x01-\x7f] # Any non-ASCII character
",
)
.unwrap()
});
// False positives only result in a small performance pessimization,
// so this is just a maximally sensitive, good-enough approximation.
// For example, r"\\1" and r"[\1]" will match, whereas only a number
// after an odd number of backslashes and outside a character class
// should match.
static NEEDS_FANCY_RE: Lazy<RustRegex> = Lazy::new(|| regex::Regex::new(r"\\[1-9]").unwrap());

/// All characters signifying that the match must be handled by an RE
/// rather than by plain string pattern matching.
Expand Down Expand Up @@ -476,52 +444,22 @@ mod tests {
#[test]
fn test_needs_fancy_re_matches() {
let should_match = [
// Unicode classes BOL
r"\p{L}+", // Unicode letter class
r"\W", // \W is Unicode-aware.
r"\S+", // \S is Unicode-aware.
r"\d", // \d includes all Unicode digits.
// Unicode classes non-BOL
r"x\p{L}+", // Unicode letter class
r"x\W", // \W is Unicode-aware.
r"x\S+", // \S is Unicode-aware.
r"x\d", // \d includes all Unicode digits.
// .
r".",
r"x.",
r"xx.",
// Consumed \
r"\*.",
r"x\*.",
// Escaped \
r"\\.",
r"x\\.",
// Inline flags
r"(?i)abc", // Unicode case-insensitive
r"x(?i)abc", // Unicode case-insensitive
r"(\w+):\1", // back-reference \1
// Non-ASCII literals
"naïve", // Contains literal non-ASCII.
"café", // Contains literal non-ASCII.
];

for pat in &should_match {
assert!(
NEEDS_FANCY_RE.is_match(pat),
"Expected NEEDS_FANCY_RE to match: {:?}",
pat
"Expected NEEDS_FANCY_RE to match: {pat:?}"
);
}
}

#[test]
fn test_needs_fancy_re_does_not_match() {
let should_not_match = [
r"\.", // Escaped . at BOL
r"x\.", // Escaped . at non BOL
r"\[^x]", // Escaped character class
r"\(?i\)", // Escaped case insensitive flag
r"\\w", // Escaped Unicode class
r"\ 1", // Non-adjacent
r"\0", // Only \[1-9]
// Simple ASCII
r"foo",
r"foo|bar",
Expand All @@ -531,8 +469,7 @@ mod tests {
for pat in &should_not_match {
assert!(
!NEEDS_FANCY_RE.is_match(pat),
"Expected NEEDS_FANCY_RE to NOT match: {:?}",
pat
"Expected NEEDS_FANCY_RE to NOT match: {pat:?}"
);
}
}
Expand All @@ -558,8 +495,7 @@ mod tests {
for pat in &should_match {
assert!(
NEEDS_RE.is_match(pat),
"Expected NEEDS_RE to match: {:?}",
pat
"Expected NEEDS_RE to match: {pat:?}"
);
}
}
Expand All @@ -579,8 +515,7 @@ mod tests {
for pat in &should_not_match {
assert!(
!NEEDS_RE.is_match(pat),
"Expected NEEDS_RE to NOT match: {:?}",
pat
"Expected NEEDS_RE to NOT match: {pat:?}"
);
}
}
Expand All @@ -594,7 +529,7 @@ mod tests {

#[test]
fn assert_fancy() {
let re = Regex::new(r"\d").unwrap();
let re = Regex::new(r"(.)\1").unwrap();
assert!(matches!(re, Regex::Fancy(_)));
}

Expand All @@ -609,8 +544,7 @@ mod tests {
let err = Regex::new("(").unwrap_err().to_string();
assert!(
err.contains("unclosed group") || err.contains("error parsing"),
"Unexpected error: {}",
err
"Unexpected error: {err:?}"
);
}

Expand Down
15 changes: 15 additions & 0 deletions tests/by-util/test_sed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,21 @@ check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]);
check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]);
check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]);

// Check appropriate selection and behavior of the fast_regex matcher
// Literal matcher
check_output!(subst_literal_start, ["-e", r"s/^l1/L1/", LINES1]);
check_output!(subst_literal_end, ["-e", r"s/2$/TWO/", LINES1]);
check_output!(subst_literal, ["-e", r"s/_/-/", LINES1]);

// Fancy matcher
check_output!(subst_backref, ["-e", r"s/l\(.\)_\1/same-number/", LINES1]);

// Bytes matcher with Unicode
check_output!(subst_greek, ["-e", r"s/[α-ω]/G/g", "input/unicode"]);
check_output!(subst_any_unicode, ["-e", r"s/.$/:-)/", "input/unicode"]);
check_output!(subst_lcase, ["-e", r"s/κ/*/gi", "input/unicode"]);
check_output!(subst_word, ["-E", "-e", r"s/\w+/WORD/g", "input/unicode"]);

#[test]
fn subst_write_file() -> std::io::Result<()> {
let temp = NamedTempFile::new()?;
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_any_unicode
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or Καλημέρα κόσμε or こんにちは 世界 :-)
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_backref
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
same-number
l1_2
l1_3
l1_4
l1_5
l1_6
l1_7
l1_8
l1_9
same-number0
same-number1
same-number2
same-number3
same-number4
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_greek
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or ΚGGGGέGG GόGGG or こんにちは 世界 😀
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_lcase
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World or *αλημέρα *όσμε or こんにちは 世界 😀
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
l1-1
l1-2
l1-3
l1-4
l1-5
l1-6
l1-7
l1-8
l1-9
l1-10
l1-11
l1-12
l1-13
l1-14
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal_end
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
l1_1
l1_TWO
l1_3
l1_4
l1_5
l1_6
l1_7
l1_8
l1_9
l1_10
l1_11
l1_1TWO
l1_13
l1_14
14 changes: 14 additions & 0 deletions tests/fixtures/sed/output/subst_literal_start
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
L1_1
L1_2
L1_3
L1_4
L1_5
L1_6
L1_7
L1_8
L1_9
L1_10
L1_11
L1_12
L1_13
L1_14
1 change: 1 addition & 0 deletions tests/fixtures/sed/output/subst_word
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WORD WORD WORD WORD WORD WORD WORD WORD 😀
21 changes: 11 additions & 10 deletions util/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ echo 'command,mean,stddev,median,user,system,min,max' >"$OUT"
awk 'BEGIN { for (i = 0; i < 50000000; i++) { print i } }' > lines.txt

# No operation
bench_run no-op-short "$PROG '' lines.txt"
bench_run no-op-short "$PROG "'"" lines.txt'

# Log file processing

Expand All @@ -58,7 +58,7 @@ create_access_log()
create_access_log 5000000

# No operation
bench_run access-log-no-op "$PROG '' access.log"
bench_run access-log-no-op "$PROG "'"" access.log'

# No substitution
bench_run access-log-no-subst "$PROG s/Chrome/Chromium/ access.log"
Expand Down Expand Up @@ -93,9 +93,10 @@ awk 'BEGIN {
}
}' > legacy_input.txt

bench_run remove-cr "$PROG 's/\r$//' legacy_input.txt"
echo 's/\r$//' >script.sed
bench_run remove-cr "$PROG -f script.sed legacy_input.txt"

rm legacy_input.txt
rm -f legacy_input.txt script.sed

# Genomic data cleanup

Expand All @@ -106,10 +107,10 @@ awk 'BEGIN {
}
}' > genome.tsv

CMD='/^#/d; s/\t\./\tNA/g; s/\.$/NA/'
bench_run genome-subst "$PROG '$CMD' genome.tsv"
echo '/^#/d; s/\t\./\tNA/g; s/\.$/NA/' >script.sed
bench_run genome-subst "$PROG -f script.sed genome.tsv"

rm -f genome.tsv
rm -f genome.tsv script.sed

# Number fixups: remove thousands separator, change , into .
awk 'BEGIN {
Expand All @@ -122,10 +123,10 @@ awk 'BEGIN {
}
}' > finance.csv

CMD='s/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g'
bench_run number-fix "$PROG '$CMD' finance.csv"
echo 's/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' >script.sed
bench_run number-fix "$PROG -f script.sed finance.csv"

rm -f finance.csv
rm -f finance.csv script.sed

# Long script compilation
for i in $(seq 1 99) ; do
Expand Down
Loading