From 5b950caaaaeb5c4dab65fa15cb84e58e054717b5 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 26 Nov 2024 12:23:16 +0100 Subject: [PATCH] StringScanner#scan_integer support base 16 integers Followup: https://github.com/ruby/strscan/pull/115 `scan_integer` is now implemented in Ruby so as to efficiently handle keyword arguments without allocating a Hash. Given the goal of `scan_integer` is to more efficiently parse integers without having to allocate an intermediary object, using `rb_scan_args` would defeat the purpose. Additionally, the C implementation now uses `rb_isdigit` and `rb_isxdigit`, because on Windows `isdigit` is locale dependent. --- ext/jruby/lib/strscan.rb | 1 + .../jruby/ext/strscan/RubyStringScanner.java | 50 ++++++++++- ext/strscan/strscan.c | 90 ++++++++++++++----- lib/strscan/strscan.rb | 25 ++++++ strscan.gemspec | 9 +- test/strscan/test_stringscanner.rb | 75 ++++++++++++++++ 6 files changed, 221 insertions(+), 29 deletions(-) create mode 100644 lib/strscan/strscan.rb diff --git a/ext/jruby/lib/strscan.rb b/ext/jruby/lib/strscan.rb index 4f796c25cd..420d7d501d 100644 --- a/ext/jruby/lib/strscan.rb +++ b/ext/jruby/lib/strscan.rb @@ -1,2 +1,3 @@ require 'strscan.jar' JRuby::Util.load_ext("org.jruby.ext.strscan.StringScannerLibrary") +require "strscan/strscan" diff --git a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java index b59d7a0a65..d90f905410 100644 --- a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java +++ b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java @@ -557,8 +557,8 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) { return peek(context, length); } - @JRubyMethod(name = "scan_integer") - public IRubyObject scan_integer(ThreadContext context) { + @JRubyMethod(name = "scan_base10_integer", visibility = PRIVATE) + public IRubyObject scan_base10_integer(ThreadContext context) { final Ruby runtime = context.runtime; check(context); 
clearMatched(); @@ -598,6 +598,52 @@ public IRubyObject scan_integer(ThreadContext context) { return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true); } + @JRubyMethod(name = "scan_base16_integer", visibility = PRIVATE) + public IRubyObject scan_base16_integer(ThreadContext context) { + final Ruby runtime = context.runtime; + check(context); + clearMatched(); + + if (!str.getEncoding().isAsciiCompatible()) { + throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding()); + } + + + ByteList bytes = str.getByteList(); + int curr = this.curr; + + int bite = bytes.get(curr); + if (bite == '-' || bite == '+') { + curr++; + bite = bytes.get(curr); + } + + if (bite == '0' && bytes.get(curr + 1) == 'x') { + curr += 2; + bite = bytes.get(curr); + } + + if (!((bite >= '0' && bite <= '9') || (bite >= 'a' && bite <= 'f') || (bite >= 'A' && bite <= 'F'))) { + return context.nil; + } + + while ((bite >= '0' && bite <= '9') || (bite >= 'a' && bite <= 'f') || (bite >= 'A' && bite <= 'F')) { + curr++; + if (curr >= bytes.getRealSize()) { + break; + } + bite = bytes.get(curr); + } + + int length = curr - this.curr; + prev = this.curr; + this.curr = curr; + setMatched(); + adjustRegisters(); + + return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 16, true); + } + @JRubyMethod(name = "unscan") public IRubyObject unscan(ThreadContext context) { check(context); diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index d0cae0f425..165cd4271b 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -20,7 +20,6 @@ extern size_t onig_region_memsize(const struct re_registers *regs); #endif -#include #include #define STRSCAN_VERSION "3.1.1" @@ -116,7 +115,7 @@ static VALUE strscan_get_byte _((VALUE self)); static VALUE strscan_getbyte _((VALUE self)); static VALUE strscan_peek _((VALUE self, VALUE len)); static VALUE strscan_peep _((VALUE self, VALUE len)); -static VALUE strscan_scan_integer _((VALUE self)); 
+static VALUE strscan_scan_base10_integer _((VALUE self)); static VALUE strscan_unscan _((VALUE self)); static VALUE strscan_bol_p _((VALUE self)); static VALUE strscan_eos_p _((VALUE self)); @@ -1268,21 +1267,26 @@ strscan_peep(VALUE self, VALUE vlen) return strscan_peek(self, vlen); } -/* - * call-seq: - * scan_integer - * - * Equivalent to #scan with a [+-]?\d+ pattern, and returns an Integer or nil. - * - * The scanned string must be encoded with an ASCII compatible encoding, otherwise - * Encoding::CompatibilityError will be raised. - */ static VALUE -strscan_scan_integer(VALUE self) +strscan_parse_integer(struct strscanner *p, int base, long len) { - char *ptr, *buffer; - long len = 0; VALUE buffer_v, integer; + + char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); + + MEMCPY(buffer, CURPTR(p), char, len); + buffer[len] = '\0'; + integer = rb_cstr2inum(buffer, base); + RB_ALLOCV_END(buffer_v); + p->curr += len; + return integer; +} + +static VALUE +strscan_scan_base10_integer(VALUE self) +{ + char *ptr; + long len = 0; struct strscanner *p; GET_SCANNER(self, p); @@ -1302,25 +1306,60 @@ strscan_scan_integer(VALUE self) len++; } - if (!isdigit(ptr[len])) { + if (!rb_isdigit(ptr[len])) { return Qnil; } MATCHED(p); p->prev = p->curr; - while (len < remaining_len && isdigit(ptr[len])) { + while (len < remaining_len && rb_isdigit(ptr[len])) { len++; } - buffer = RB_ALLOCV_N(char, buffer_v, len + 1); + return strscan_parse_integer(p, 10, len); +} - MEMCPY(buffer, CURPTR(p), char, len); - buffer[len] = '\0'; - integer = rb_cstr2inum(buffer, 10); - RB_ALLOCV_END(buffer_v); - p->curr += len; - return integer; +static VALUE +strscan_scan_base16_integer(VALUE self) +{ + char *ptr; + long len = 0; + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + + rb_must_asciicompat(p->str); + + ptr = CURPTR(p); + + long remaining_len = S_RESTLEN(p); + + if (remaining_len <= 0) { + return Qnil; + } + + if (ptr[len] == '-' || ptr[len] == '+') { + len++; + } + 
+ if ((remaining_len >= (len + 2)) && ptr[len] == '0' && ptr[len + 1] == 'x') { + len += 2; + } + + if (len >= remaining_len || !rb_isxdigit(ptr[len])) { + return Qnil; + } + + MATCHED(p); + p->prev = p->curr; + + while (len < remaining_len && rb_isxdigit(ptr[len])) { + len++; + } + + return strscan_parse_integer(p, 16, len); } /* @@ -2261,7 +2300,8 @@ Init_strscan(void) rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0); rb_define_method(StringScanner, "peep", strscan_peep, 1); - rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0); + rb_define_private_method(StringScanner, "scan_base10_integer", strscan_scan_base10_integer, 0); + rb_define_private_method(StringScanner, "scan_base16_integer", strscan_scan_base16_integer, 0); rb_define_method(StringScanner, "unscan", strscan_unscan, 0); @@ -2290,4 +2330,6 @@ Init_strscan(void) rb_define_method(StringScanner, "fixed_anchor?", strscan_fixed_anchor_p, 0); rb_define_method(StringScanner, "named_captures", strscan_named_captures, 0); + + rb_require("strscan/strscan"); } diff --git a/lib/strscan/strscan.rb b/lib/strscan/strscan.rb new file mode 100644 index 0000000000..3581f2e939 --- /dev/null +++ b/lib/strscan/strscan.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +class StringScanner + # call-seq: + # scan_integer(base: 10) + # + # If `base` isn't provided or is `10`, then it is equivalent to calling `#scan` with a `[+-]?\d+` pattern, + # and returns an Integer or nil. + # + # If `base` is `16`, then it is equivalent to calling `#scan` with a `[+-]?(0x)?[0-9a-fA-F]+` pattern, + # and returns an Integer or nil. + # + # The scanned string must be encoded with an ASCII compatible encoding, otherwise + # Encoding::CompatibilityError will be raised. 
+ def scan_integer(base: 10) + case base + when 10 + scan_base10_integer + when 16 + scan_base16_integer + else + raise ArgumentError, "Unsupported integer base: #{base.inspect}, expected 10 or 16" + end + end +end diff --git a/strscan.gemspec b/strscan.gemspec index 925edcd2d3..47180bb8d8 100644 --- a/strscan.gemspec +++ b/strscan.gemspec @@ -19,14 +19,17 @@ Gem::Specification.new do |s| files = [ "COPYING", "LICENSE.txt", + "lib/strscan/strscan.rb" ] + + s.require_paths = %w{lib} + if RUBY_ENGINE == "jruby" - s.require_paths = %w{ext/jruby/lib lib} - files << "ext/jruby/lib/strscan.rb" files << "lib/strscan.jar" + files << "ext/jruby/lib/strscan.rb" + s.require_paths += %w{ext/jruby/lib} s.platform = "java" else - s.require_paths = %w{lib} files << "ext/strscan/extconf.rb" files << "ext/strscan/strscan.c" s.rdoc_options << "-idoc" diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index ae05254d49..a032765661 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -945,6 +945,81 @@ def test_scan_integer_encoding s.scan_integer end end + + def test_scan_integer_base_16 + omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby" + + s = create_string_scanner('0') + assert_equal 0x0, s.scan_integer(base: 16) + assert_equal 1, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('abc') + assert_equal 0xabc, s.scan_integer(base: 16) + assert_equal 3, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('123abc') + assert_equal 0x123abc, s.scan_integer(base: 16) + assert_equal 6, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('0x123abc') + assert_equal 0x123abc, s.scan_integer(base: 16) + assert_equal 8, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('0x123ABC') + assert_equal 0x123abc, s.scan_integer(base: 16) + assert_equal 8, s.pos + assert_predicate s, :matched? 
+ + s = create_string_scanner('-0x123ABC') + assert_equal -0x123abc, s.scan_integer(base: 16) + assert_equal 9, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('+0x123ABC') + assert_equal +0x123abc, s.scan_integer(base: 16) + assert_equal 9, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('0x') + assert_nil s.scan_integer(base: 16) + assert_equal 0, s.pos + refute_predicate s, :matched? + + s = create_string_scanner('-0x') + assert_nil s.scan_integer(base: 16) + assert_equal 0, s.pos + refute_predicate s, :matched? + + s = create_string_scanner('+0x') + assert_nil s.scan_integer(base: 16) + assert_equal 0, s.pos + refute_predicate s, :matched? + + s = create_string_scanner('-123abc') + assert_equal -0x123abc, s.scan_integer(base: 16) + assert_equal 7, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('+123') + assert_equal 0x123, s.scan_integer(base: 16) + assert_equal 4, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('-abc') + assert_equal -0xabc, s.scan_integer(base: 16) + assert_equal 4, s.pos + assert_predicate s, :matched? + + huge_integer = 'F' * 2_000 + s = create_string_scanner(huge_integer) + assert_equal huge_integer.to_i(16), s.scan_integer(base: 16) + assert_equal 2_000, s.pos + assert_predicate s, :matched? + end end class TestStringScanner < Test::Unit::TestCase