Skip to content

Commit 11302ee

Browse files
committed
Deduplicate Regexp literals
Ruby ticket: https://bugs.ruby-lang.org/issues/16557 Real-world applications contain many duplicated Regexp literals. From a rails/console in Redmine: ``` >> ObjectSpace.each_object(Regexp).count => 6828 >> ObjectSpace.each_object(Regexp).uniq.count => 4162 >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum => 4611957 # 4.4 MB total >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum - ObjectSpace.each_object(Regexp).to_a.uniq.map { |r| ObjectSpace.memsize_of(r) }.sum => 1490601 # 1.42 MB could be saved ``` Here are the top 10 duplicated regexps in Redmine: ``` 147: /"/ 107: /\s+/ 103: // 89: /\n/ 83: /'/ 76: /\s+/m 37: /\d+/ 35: /\[/ 33: /./ 33: /\\./ ``` Any empty Rails application will have a similar number of regexps. Since https://bugs.ruby-lang.org/issues/16377 made literal regexps frozen, it is possible to deduplicate literal regexps without changing any semantics. This patch is heavily inspired by the `frozen_strings` table, but applied to literal regexps.
1 parent ad1ebef commit 11302ee

File tree

3 files changed

+47
-5
lines changed

3 files changed

+47
-5
lines changed

re.c

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "ruby/encoding.h"
2424
#include "ruby/re.h"
2525
#include "ruby/util.h"
26+
#include "vm_core.h"
2627

2728
VALUE rb_eRegexpError;
2829

@@ -2956,19 +2957,50 @@ rb_reg_new(const char *s, long len, int options)
29562957
return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
29572958
}
29582959

2960+
/*
 * Look up a previously cached literal Regexp.
 *
 * The VM-wide regexp_literals_cache is a two-level hash: the outer level is
 * keyed by the Regexp options (as a Fixnum), the inner level by the source
 * string.
 *
 * str     - source string of the literal (used as the inner hash key)
 * options - Regexp option flags (used as the outer hash key)
 *
 * Returns the result of the inner lookup when a per-options table exists,
 * otherwise Qnil.
 * NOTE(review): rb_hash_lookup is expected to yield Qnil on a miss, so the
 * caller can treat any falsy result as "not cached" -- confirm against the
 * Ruby C API.
 */
static VALUE
2961+
rb_reg_lookup_literal(VALUE str, int options)
2962+
{
2963+
VALUE cache = GET_VM()->regexp_literals_cache;
2964+
VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options));
2965+
/* Only consult the inner table if one exists for this options value. */
if (RTEST(options_cache)) {
2966+
return rb_hash_lookup(options_cache, str);
2967+
}
2968+
return Qnil;
2969+
}
2970+
2971+
/*
 * Record a compiled literal Regexp in the VM-wide regexp_literals_cache.
 *
 * str     - source string of the literal (inner hash key)
 * options - Regexp option flags (outer hash key, stored as a Fixnum)
 * re      - the Regexp object to associate with (options, str)
 *
 * The per-options inner table is created on first use.
 * NOTE(review): the inner table comes from rb_ident_hash_new(), which
 * presumably compares keys by object identity; that relies on callers
 * passing a canonical string object for equal sources (the visible caller
 * passes the result of rb_fstring) -- confirm.
 */
static void
2972+
rb_reg_cache_literal(VALUE str, int options, VALUE re)
2973+
{
2974+
VALUE cache = GET_VM()->regexp_literals_cache;
2975+
VALUE options_cache = rb_hash_lookup(cache, INT2FIX(options));
2976+
/* Lazily create the inner table for this options value. */
if (!RTEST(options_cache)) {
2977+
options_cache = rb_ident_hash_new();
2978+
rb_hash_aset(cache, INT2FIX(options), options_cache);
2979+
}
2980+
rb_hash_aset(options_cache, str, re);
2981+
}
2982+
29592983
VALUE
29602984
rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
29612985
{
2962-
VALUE re = rb_reg_alloc();
2963-
onig_errmsg_buffer err = "";
2964-
29652986
if (!str) str = rb_str_new(0,0);
2987+
str = rb_fstring(str);
2988+
2989+
VALUE re = rb_reg_lookup_literal(str, options);
2990+
if (RTEST(re)) {
2991+
return re;
2992+
}
2993+
2994+
re = rb_reg_alloc();
2995+
onig_errmsg_buffer err = "";
29662996
if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
2967-
rb_set_errinfo(rb_reg_error_desc(str, options, err));
2968-
return Qnil;
2997+
rb_set_errinfo(rb_reg_error_desc(str, options, err));
2998+
return Qnil;
29692999
}
29703000
FL_SET(re, REG_LITERAL);
29713001
rb_obj_freeze(re);
3002+
rb_reg_cache_literal(str, options, re);
3003+
29723004
return re;
29733005
}
29743006

@@ -4111,4 +4143,6 @@ Init_Regexp(void)
41114143
rb_define_method(rb_cMatch, "hash", match_hash, 0);
41124144
rb_define_method(rb_cMatch, "eql?", match_equal, 1);
41134145
rb_define_method(rb_cMatch, "==", match_equal, 1);
4146+
4147+
rb_gc_register_mark_object(GET_VM()->regexp_literals_cache = rb_hash_new());
41144148
}

test/ruby/test_regexp.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@ def test_assert_normal_exit
6262
Regexp.union("a", "a")
6363
end
6464

65+
# Regexp literals with the same source and options must be the very same
# object; a literal with different options, or a Regexp built with
# Regexp.new, must be a distinct object (the latter still compares equal).
def test_literal_deduplication
66+
assert_same(/a/, /a/)
67+
refute_same(/a/, /a/m)
68+
refute_same(/a/, Regexp.new('a'))
69+
assert_equal(/a/, Regexp.new('a'))
70+
end
71+
6572
def test_to_s
6673
assert_equal '(?-mix:\x00)', Regexp.new("\0").to_s
6774

vm_core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,7 @@ typedef struct rb_vm_struct {
634634

635635
VALUE *defined_strings;
636636
st_table *frozen_strings;
637+
VALUE regexp_literals_cache;
637638

638639
const struct rb_builtin_function *builtin_function_table;
639640
int builtin_inline_index;

0 commit comments

Comments
 (0)