Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.jruby.util.StringSupport;
import org.jruby.util.ConvertBytes;

import java.util.Iterator;

Expand Down Expand Up @@ -556,6 +557,47 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) {
return peek(context, length);
}

@JRubyMethod(name = "scan_integer")
public IRubyObject scan_integer(ThreadContext context) {
final Ruby runtime = context.runtime;
check(context);
clearMatched();

if (!str.getEncoding().isAsciiCompatible()) {
throw getRuntime().newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
}


ByteList bytes = str.getByteList();
int curr = this.curr;

int bite = bytes.get(curr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

byte?

Copy link
Contributor

@olleolleolle olleolleolle Nov 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for asking that, I now re-learned that in Java, there are certain reserved words: https://en.wikipedia.org/wiki/List_of_Java_keywords and byte is one of them.

EDIT: Oh,

Of these 68 keywords, 17 of them are only contextually reserved, and can sometimes be used as an identifier, unlike standard reserved words

(Ah, right, that list of contextually-reserved didn't include byte.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!!!

if (bite == '-' || bite == '+') {
curr++;
bite = bytes.get(curr);
}

if (!(bite >= '0' && bite <= '9')) {
return runtime.getNil();
}

while (bite >= '0' && bite <= '9') {
curr++;
if (curr >= bytes.getRealSize()) {
break;
}
bite = bytes.get(curr);
}

int length = curr - this.curr;
prev = this.curr;
this.curr = curr;
setMatched();
adjustRegisters();

return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
}

@JRubyMethod(name = "unscan")
public IRubyObject unscan(ThreadContext context) {
check(context);
Expand Down
53 changes: 53 additions & 0 deletions ext/strscan/strscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
extern size_t onig_region_memsize(const struct re_registers *regs);
#endif

#include <ctype.h>
#include <stdbool.h>

#define STRSCAN_VERSION "3.1.1"
Expand Down Expand Up @@ -115,6 +116,7 @@ static VALUE strscan_get_byte _((VALUE self));
static VALUE strscan_getbyte _((VALUE self));
static VALUE strscan_peek _((VALUE self, VALUE len));
static VALUE strscan_peep _((VALUE self, VALUE len));
static VALUE strscan_scan_integer _((VALUE self));
static VALUE strscan_unscan _((VALUE self));
static VALUE strscan_bol_p _((VALUE self));
static VALUE strscan_eos_p _((VALUE self));
Expand Down Expand Up @@ -1266,6 +1268,55 @@ strscan_peep(VALUE self, VALUE vlen)
return strscan_peek(self, vlen);
}

/*
* call-seq:
* scan_integer
*
* Equivalent to #scan with a [+-]?\d+ pattern, and returns an Integer or nil.
*
* The scanned string must be encoded with an ASCII compatible encoding, otherwise
* Encoding::CompatibilityError will be raised.
*/
static VALUE
strscan_scan_integer(VALUE self)
{
char *ptr, *buffer;
long len = 0;
VALUE buffer_v, integer;
struct strscanner *p;

GET_SCANNER(self, p);
CLEAR_MATCH_STATUS(p);

rb_must_asciicompat(p->str);

ptr = CURPTR(p);

if (ptr[len] == '-' || ptr[len] == '+') {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is ptr[len] safe when S_RESTLEN() is 0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, because (for now?) Ruby strings are guaranteed to be NUL-terminated. So if we're at EOF, ptr[len] is \0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that a String created by rb_str_new_static() may not be terminated by \0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically you can but the documentation clearly states:

Identical to rb_str_new(), except it takes a C string literal.

So it assumes it's a literal, and hence NUL-terminated.

I can add the checks if you feel strongly about it, but it's a lot of busy code for something I think isn't supposed to happen.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the extra checks in #115 nonetheless.

len++;
}

if (!isdigit(ptr[len])) {
return Qnil;
}

MATCHED(p);
p->prev = p->curr;

while (isdigit(ptr[len])) {
len++;
}

buffer = ALLOCV_N(char, buffer_v, len + 1);

MEMCPY(buffer, CURPTR(p), char, len);
buffer[len] = '\0';
integer = rb_cstr2inum(buffer, 10);
Comment on lines +1310 to +1314
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, we want to avoid this copy...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, Ruby doesn't provide an API that doesn't require a NUL-terminated string. But if you want I can implement a fast path like I did in ruby/json: ruby/json@3a4dc9e

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. We can use rb_cstr2inum() for now. Could you propose the fast path to Ruby itself? It may be useful for extensions.

RB_GC_GUARD(buffer_v);
p->curr += len;
return integer;
}

/*
* :markup: markdown
* :include: strscan/link_refs.txt
Expand Down Expand Up @@ -2204,6 +2255,8 @@ Init_strscan(void)
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
rb_define_method(StringScanner, "peep", strscan_peep, 1);

rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0);

rb_define_method(StringScanner, "unscan", strscan_unscan, 0);

rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
Expand Down
55 changes: 55 additions & 0 deletions test/strscan/test_stringscanner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,61 @@ def test_named_captures
assert_equal(9, scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/))
assert_equal({"f" => "foo", "r" => "bar", "z" => "baz"}, scan.named_captures)
end

# Checks the return value, the resulting scan position and the matched flag
# of StringScanner#scan_integer for unsigned, signed, non-numeric and very
# long decimal inputs.
def test_scan_integer
  omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"

  # No leading digit: nil is returned, nothing is consumed, no match is set.
  s = create_string_scanner('abc')
  assert_nil s.scan_integer
  assert_equal 0, s.pos
  refute_predicate s, :matched?

  # Plain digits followed by other text: only the digits are consumed.
  s = create_string_scanner('123abc')
  assert_equal 123, s.scan_integer
  assert_equal 3, s.pos
  assert_predicate s, :matched?

  # Leading minus sign. The call is parenthesized so the negative literal is
  # not parsed ambiguously (Ruby warns about `assert_equal -123, ...`).
  s = create_string_scanner('-123abc')
  assert_equal(-123, s.scan_integer)
  assert_equal 4, s.pos
  assert_predicate s, :matched?

  # Leading plus sign, with the integer running to the end of the string.
  s = create_string_scanner('+123')
  assert_equal 123, s.scan_integer
  assert_equal 4, s.pos
  assert_predicate s, :matched?

  # A sign that is not followed by a digit: the sign must not be consumed.
  s = create_string_scanner('-abc')
  assert_nil s.scan_integer
  assert_equal 0, s.pos
  refute_predicate s, :matched?

  # A 2,000-digit input must still be returned as the exact Integer value.
  huge_integer = '1' * 2_000
  s = create_string_scanner(huge_integer)
  assert_equal huge_integer.to_i, s.scan_integer
  assert_equal 2_000, s.pos
  assert_predicate s, :matched?
end

# Checks that #unscan after a successful #scan_integer puts the scan
# position back where it was before the integer was consumed.
def test_scan_integer_unmatch
  if RUBY_ENGINE == "truffleruby"
    omit "scan_integer isn't implemented on TruffleRuby yet"
  end

  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)

  # Undo the scan; the position is expected to return to the start.
  scanner.unscan
  assert_equal(0, scanner.pos)
end

def test_scan_integer_encoding
omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"

s = create_string_scanner('123abc'.encode(Encoding::UTF_32LE))
assert_raise(Encoding::CompatibilityError) do
s.scan_integer
end
end
end

class TestStringScanner < Test::Unit::TestCase
Expand Down