Skip to content

Commit 873aba2

Browse files
tenderlove and kou authored
Add a method for peeking and reading bytes as integers (#89)
This commit adds `scan_byte` and `peek_byte`. `scan_byte` will scan the current byte, return it as an integer, and advance the cursor. `peek_byte` will return the current byte as an integer without advancing the cursor. Currently `StringScanner#get_byte` returns a string, but I want to get the current byte without allocating a string. I think this will help with writing high performance lexers. --------- Co-authored-by: Sutou Kouhei <kou@clear-code.com>
1 parent 338d870 commit 873aba2

File tree

4 files changed

+106
-0
lines changed

4 files changed

+106
-0
lines changed

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,33 @@ public IRubyObject getbyte(ThreadContext context) {
508508
return get_byte(context);
509509
}
510510

511+
@JRubyMethod(name = "scan_byte")
512+
public IRubyObject scan_byte(ThreadContext context) {
513+
Ruby runtime = context.runtime;
514+
check(context);
515+
clearMatched();
516+
if (curr >= str.getByteList().getRealSize()) return context.nil;
517+
518+
byte[] bytes = str.getBytes();
519+
520+
byte bite = bytes[curr];
521+
prev = curr;
522+
curr++;
523+
524+
setMatched();
525+
adjustRegisters();
526+
return RubyFixnum.newFixnum(context.runtime, bite & 0xff);
527+
}
528+
529+
@JRubyMethod(name = "peek_byte")
530+
public IRubyObject peek_byte(ThreadContext context) {
531+
Ruby runtime = context.runtime;
532+
check(context);
533+
if (curr >= str.getByteList().getRealSize()) return context.nil;
534+
535+
return RubyFixnum.newFixnum(context.runtime, (str.getBytes()[curr]) & 0xff);
536+
}
537+
511538
@JRubyMethod(name = "peek")
512539
public IRubyObject peek(ThreadContext context, IRubyObject length) {
513540
check(context);

ext/strscan/strscan.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,57 @@ strscan_getch(VALUE self)
902902
adjust_register_position(p, p->regs.end[0]));
903903
}
904904

905+
/*
906+
* Scans one byte and returns it as an integer.
907+
* This method is not multibyte character sensitive.
908+
* See also: #getch.
909+
*
910+
* s = StringScanner.new('ab')
911+
* s.scan_byte # => 97
912+
* s.scan_byte # => 98
913+
* s.scan_byte # => nil
914+
*
915+
* s = StringScanner.new("\244\242".force_encoding("euc-jp"))
916+
* s.scan_byte # => 0xA4
917+
* s.scan_byte # => 0xA2
918+
* s.scan_byte # => nil
919+
*/
920+
static VALUE
921+
strscan_scan_byte(VALUE self)
922+
{
923+
struct strscanner *p;
924+
925+
GET_SCANNER(self, p);
926+
CLEAR_MATCH_STATUS(p);
927+
if (EOS_P(p))
928+
return Qnil;
929+
930+
VALUE byte = INT2FIX((unsigned char)*CURPTR(p));
931+
p->prev = p->curr;
932+
p->curr++;
933+
MATCHED(p);
934+
adjust_registers_to_matched(p);
935+
return byte;
936+
}
937+
938+
/*
939+
* Peeks at the current byte and returns it as an integer.
940+
*
941+
* s = StringScanner.new('ab')
942+
* s.peek_byte # => 97
943+
*/
944+
static VALUE
945+
strscan_peek_byte(VALUE self)
946+
{
947+
struct strscanner *p;
948+
949+
GET_SCANNER(self, p);
950+
if (EOS_P(p))
951+
return Qnil;
952+
953+
return INT2FIX((unsigned char)*CURPTR(p));
954+
}
955+
905956
/*
906957
* Scans one byte and returns it.
907958
* This method is not multibyte character sensitive.
@@ -1605,6 +1656,7 @@ strscan_named_captures(VALUE self)
16051656
*
16061657
* - #getch
16071658
* - #get_byte
1659+
* - #scan_byte
16081660
* - #scan
16091661
* - #scan_until
16101662
* - #skip
@@ -1617,6 +1669,7 @@ strscan_named_captures(VALUE self)
16171669
* - #exist?
16181670
* - #match?
16191671
* - #peek
1672+
* - #peek_byte
16201673
*
16211674
* === Finding Where we Are
16221675
*
@@ -1708,7 +1761,9 @@ Init_strscan(void)
17081761
rb_define_method(StringScanner, "getch", strscan_getch, 0);
17091762
rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0);
17101763
rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0);
1764+
rb_define_method(StringScanner, "scan_byte", strscan_scan_byte, 0);
17111765
rb_define_method(StringScanner, "peek", strscan_peek, 1);
1766+
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
17121767
rb_define_method(StringScanner, "peep", strscan_peep, 1);
17131768

17141769
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);

run-test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env ruby
22

3+
gem 'strscan'
34
require 'strscan'
45
puts "Loaded strscan from #{$".grep(/\/strscan\./).join(', ')}"
56
puts "Gem from #{Gem.loaded_specs["strscan"]&.full_gem_path}"

test/strscan/test_stringscanner.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,29 @@
88
require 'test/unit'
99

1010
module StringScannerTests
11+
# peek_byte must report the upcoming byte without consuming it: each peek is
# followed by a scan_byte that returns the same value, and both return nil
# once the input is exhausted.
def test_peek_byte
  scanner = create_string_scanner('ab')
  [97, 98].each do |expected|
    assert_equal expected, scanner.peek_byte
    assert_equal expected, scanner.scan_byte
  end
  assert_nil scanner.peek_byte
  assert_nil scanner.scan_byte
end
20+
21+
# scan_byte returns each byte as an Integer and nil at end of input. The
# second half checks that a multibyte (EUC-JP) string is consumed one raw
# byte at a time rather than one character at a time.
def test_scan_byte
  s = create_string_scanner('ab')
  assert_equal 97, s.scan_byte
  assert_equal 98, s.scan_byte
  assert_nil s.scan_byte

  str = "\244\242".dup.force_encoding("euc-jp")
  # Build the scanner through create_string_scanner, like the other tests,
  # so the multibyte case also goes through the shared factory hook.
  s = create_string_scanner(str)
  assert_equal str.getbyte(s.pos), s.scan_byte
  assert_equal str.getbyte(s.pos), s.scan_byte
  assert_nil s.scan_byte
end
33+
1134
def test_s_new
1235
s = create_string_scanner('test string')
1336
assert_instance_of StringScanner, s

0 commit comments

Comments
 (0)