Skip to content

Commit 6a3c74b

Browse files
authored
Implement #scan_integer to efficiently parse Integer (#115)
Fix: #113. This allows parsing an Integer directly from a String without first allocating a substring. Notes: The implementation is limited by design; it's meant as a first step, and only the most straightforward case, base 10 integers, is supported.
1 parent 81a80a1 commit 6a3c74b

File tree

3 files changed

+156
-0
lines changed

3 files changed

+156
-0
lines changed

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.jruby.runtime.builtin.IRubyObject;
5555
import org.jruby.util.ByteList;
5656
import org.jruby.util.StringSupport;
57+
import org.jruby.util.ConvertBytes;
5758

5859
import java.util.Iterator;
5960

@@ -556,6 +557,47 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) {
556557
return peek(context, length);
557558
}
558559

560+
@JRubyMethod(name = "scan_integer")
561+
public IRubyObject scan_integer(ThreadContext context) {
562+
final Ruby runtime = context.runtime;
563+
check(context);
564+
clearMatched();
565+
566+
if (!str.getEncoding().isAsciiCompatible()) {
567+
throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
568+
}
569+
570+
571+
ByteList bytes = str.getByteList();
572+
int curr = this.curr;
573+
574+
int bite = bytes.get(curr);
575+
if (bite == '-' || bite == '+') {
576+
curr++;
577+
bite = bytes.get(curr);
578+
}
579+
580+
if (!(bite >= '0' && bite <= '9')) {
581+
return context.nil;
582+
}
583+
584+
while (bite >= '0' && bite <= '9') {
585+
curr++;
586+
if (curr >= bytes.getRealSize()) {
587+
break;
588+
}
589+
bite = bytes.get(curr);
590+
}
591+
592+
int length = curr - this.curr;
593+
prev = this.curr;
594+
this.curr = curr;
595+
setMatched();
596+
adjustRegisters();
597+
598+
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
599+
}
600+
559601
@JRubyMethod(name = "unscan")
560602
public IRubyObject unscan(ThreadContext context) {
561603
check(context);

ext/strscan/strscan.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
extern size_t onig_region_memsize(const struct re_registers *regs);
2121
#endif
2222

23+
#include <ctype.h>
2324
#include <stdbool.h>
2425

2526
#define STRSCAN_VERSION "3.1.1"
@@ -115,6 +116,7 @@ static VALUE strscan_get_byte _((VALUE self));
115116
static VALUE strscan_getbyte _((VALUE self));
116117
static VALUE strscan_peek _((VALUE self, VALUE len));
117118
static VALUE strscan_peep _((VALUE self, VALUE len));
119+
static VALUE strscan_scan_integer _((VALUE self));
118120
static VALUE strscan_unscan _((VALUE self));
119121
static VALUE strscan_bol_p _((VALUE self));
120122
static VALUE strscan_eos_p _((VALUE self));
@@ -1266,6 +1268,61 @@ strscan_peep(VALUE self, VALUE vlen)
12661268
return strscan_peek(self, vlen);
12671269
}
12681270

1271+
/*
1272+
* call-seq:
1273+
* scan_integer
1274+
*
1275+
* Equivalent to #scan with a [+-]?\d+ pattern, and returns an Integer or nil.
1276+
*
1277+
* The scanned string must be encoded with an ASCII compatible encoding, otherwise
1278+
* Encoding::CompatibilityError will be raised.
1279+
*/
1280+
static VALUE
1281+
strscan_scan_integer(VALUE self)
1282+
{
1283+
char *ptr, *buffer;
1284+
long len = 0;
1285+
VALUE buffer_v, integer;
1286+
struct strscanner *p;
1287+
1288+
GET_SCANNER(self, p);
1289+
CLEAR_MATCH_STATUS(p);
1290+
1291+
rb_must_asciicompat(p->str);
1292+
1293+
ptr = CURPTR(p);
1294+
1295+
long remaining_len = S_RESTLEN(p);
1296+
1297+
if (remaining_len <= 0) {
1298+
return Qnil;
1299+
}
1300+
1301+
if (ptr[len] == '-' || ptr[len] == '+') {
1302+
len++;
1303+
}
1304+
1305+
if (!isdigit(ptr[len])) {
1306+
return Qnil;
1307+
}
1308+
1309+
MATCHED(p);
1310+
p->prev = p->curr;
1311+
1312+
while (len < remaining_len && isdigit(ptr[len])) {
1313+
len++;
1314+
}
1315+
1316+
buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
1317+
1318+
MEMCPY(buffer, CURPTR(p), char, len);
1319+
buffer[len] = '\0';
1320+
integer = rb_cstr2inum(buffer, 10);
1321+
RB_ALLOCV_END(buffer_v);
1322+
p->curr += len;
1323+
return integer;
1324+
}
1325+
12691326
/*
12701327
* :markup: markdown
12711328
* :include: strscan/link_refs.txt
@@ -2204,6 +2261,8 @@ Init_strscan(void)
22042261
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
22052262
rb_define_method(StringScanner, "peep", strscan_peep, 1);
22062263

2264+
rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0);
2265+
22072266
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
22082267

22092268
rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);

test/strscan/test_stringscanner.rb

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,61 @@ def test_named_captures
890890
assert_equal(9, scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/))
891891
assert_equal({"f" => "foo", "r" => "bar", "z" => "baz"}, scan.named_captures)
892892
end
893+
894+
# Exercises scan_integer on non-numeric input, unsigned and signed integers,
# a sign that is not followed by digits, and a 2000-digit integer, checking
# the returned value, the resulting position and the matched? flag each time.
def test_scan_integer
  omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"

  # No digits at the current position: nothing is consumed.
  scanner = create_string_scanner('abc')
  assert_nil(scanner.scan_integer)
  assert_equal(0, scanner.pos)
  refute_predicate(scanner, :matched?)

  # Plain digits followed by other characters.
  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)
  assert_predicate(scanner, :matched?)

  # Leading minus sign.
  scanner = create_string_scanner('-123abc')
  assert_equal(-123, scanner.scan_integer)
  assert_equal(4, scanner.pos)
  assert_predicate(scanner, :matched?)

  # Leading plus sign, digits running to the end of the string.
  scanner = create_string_scanner('+123')
  assert_equal(123, scanner.scan_integer)
  assert_equal(4, scanner.pos)
  assert_predicate(scanner, :matched?)

  # A sign without digits is not an integer.
  scanner = create_string_scanner('-abc')
  assert_nil(scanner.scan_integer)
  assert_equal(0, scanner.pos)
  refute_predicate(scanner, :matched?)

  # An integer with 2000 digits.
  digits = '1' * 2_000
  scanner = create_string_scanner(digits)
  assert_equal(digits.to_i, scanner.scan_integer)
  assert_equal(2_000, scanner.pos)
  assert_predicate(scanner, :matched?)
end
928+
929+
# Checks that #unscan after a successful scan_integer puts the position
# back to where it was before the scan.
def test_scan_integer_unmatch
  omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"

  scanner = create_string_scanner('123abc')
  assert_equal(123, scanner.scan_integer)
  assert_equal(3, scanner.pos)

  scanner.unscan
  assert_equal(0, scanner.pos)
end
939+
940+
def test_scan_integer_encoding
941+
omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"
942+
943+
s = create_string_scanner('123abc'.encode(Encoding::UTF_32LE))
944+
assert_raise(Encoding::CompatibilityError) do
945+
s.scan_integer
946+
end
947+
end
893948
end
894949

895950
class TestStringScanner < Test::Unit::TestCase

0 commit comments

Comments
 (0)