From a082b2d429692d298ee5fa27a8c77db2efcc3403 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 13 Oct 2024 23:51:55 +0900 Subject: [PATCH 1/2] CRuby: optimize strscan_do_scan() method It shows String as a pattern is 1.23x faster than Regexp as a pattern. ``` $ benchmark-driver benchmark/check_until.yaml Warming up -------------------------------------- regexp 9.300M i/s - 9.509M times in 1.022507s (107.53ns/i) regexp_var 9.110M i/s - 9.262M times in 1.016682s (109.76ns/i) string 9.051M i/s - 9.304M times in 1.028047s (110.49ns/i) string_var 11.187M i/s - 11.722M times in 1.047826s (89.39ns/i) Calculating ------------------------------------- regexp 10.197M i/s - 27.899M times in 2.735904s (98.06ns/i) regexp_var 10.198M i/s - 27.331M times in 2.680120s (98.06ns/i) string 10.089M i/s - 27.152M times in 2.691312s (99.12ns/i) string_var 12.530M i/s - 33.562M times in 2.678533s (79.81ns/i) Comparison: string_var: 12529824.3 i/s regexp_var: 10197773.2 i/s - 1.23x slower regexp: 10197371.0 i/s - 1.23x slower string: 10088701.3 i/s - 1.24x slower ``` See: https://github.com/ruby/ruby/blob/cf8388f76c4c2ff2f46d0d2aa2cf5186e05ff606/re.c#L251-L256 --- ext/strscan/strscan.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 24cc5d734e..62a7123499 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -709,19 +709,20 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } else { StringValue(pattern); - rb_enc_check(p->str, pattern); - if (S_RESTLEN(p) < RSTRING_LEN(pattern)) { - return Qnil; - } + rb_encoding *enc = rb_enc_check(p->str, pattern); if (headonly) { + if (S_RESTLEN(p) < RSTRING_LEN(pattern)) { + return Qnil; + } if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { return Qnil; } set_registers(p, RSTRING_LEN(pattern)); - } else { + } + else { long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern), - CURPTR(p), S_RESTLEN(p), rb_enc_get(pattern)); + CURPTR(p), S_RESTLEN(p), enc); if (pos == -1) { return Qnil; } From 7bf4cf02ec1fa254676ba70bfc703d37b9fef4ff Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 14 Oct 2024 11:41:09 +0900 Subject: [PATCH 2/2] JRuby: optimize scan() method It shows String as a pattern is 2.43x faster than Regexp as a pattern. ``` $ benchmark-driver benchmark/check_until.yaml Warming up -------------------------------------- regexp 7.371M i/s - 7.352M times in 0.997443s (135.67ns/i) regexp_var 7.303M i/s - 7.262M times in 0.994284s (136.92ns/i) string 13.596M i/s - 13.535M times in 0.995475s (73.55ns/i) string_var 15.032M i/s - 14.942M times in 0.994038s (66.53ns/i) Calculating ------------------------------------- regexp 9.120M i/s - 22.113M times in 2.424781s (109.65ns/i) regexp_var 8.914M i/s - 21.910M times in 2.458050s (112.19ns/i) string 22.174M i/s - 40.789M times in 1.839495s (45.10ns/i) string_var 19.994M i/s - 45.095M times in 2.255454s (50.02ns/i) Comparison: string: 22174077.0 i/s string_var: 19993967.8 i/s - 1.11x slower regexp: 9119635.2 i/s - 2.43x slower regexp_var: 8913743.3 i/s - 2.49x slower ``` See: https://github.com/jruby/jruby/blob/be7815ec02356a58891c8727bb448f0c6a826d96/core/src/main/java/org/jruby/util/StringSupport.java#L1706-L1720 --- .../jruby/ext/strscan/RubyStringScanner.java | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java index db33717881..1322deffa3 100644 --- a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java +++ b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java @@ -263,20 +263,17 @@ private IRubyObject extractBegLen(Ruby runtime, int beg, int len) { private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succptr, boolean getstr, boolean headonly) { final Ruby runtime = context.runtime; check(context); - - ByteList strBL = str.getByteList(); - int strBeg = strBL.getBegin(); - clearMatched(); if (restLen() < 0) { return context.nil; } + ByteList strBL = str.getByteList(); + int currPtr = currPtr(); + if (regex instanceof RubyRegexp) { pattern = ((RubyRegexp) regex).preparePattern(str); - - int currPtr = currPtr(); int range = currPtr + restLen(); Matcher matcher = pattern.matcher(strBL.getUnsafeBytes(), matchTarget(), range); @@ -300,23 +297,20 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp if (ret < 0) return context.nil; } else { RubyString pattern = regex.convertToString(); - Encoding patternEnc = str.checkEncoding(pattern); - - if (restLen() < pattern.size()) { - return context.nil; - } - ByteList patternBL = pattern.getByteList(); int patternSize = patternBL.realSize(); if (headonly) { - if (ByteList.memcmp(strBL.unsafeBytes(), strBeg + curr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) { + if (restLen() < pattern.size()) { + return context.nil; + } + if (ByteList.memcmp(strBL.unsafeBytes(), currPtr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) { return context.nil; } setRegisters(patternSize); } else { - int pos = StringSupport.index(strBL, patternBL, strBeg + curr, patternEnc); + int pos = StringSupport.index(strBL, patternBL, currPtr, patternEnc); if (pos == -1) { return context.nil; }