From 14989869bdc6df00120b1fe49f1114256f6d32e7 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 4 Jan 2024 21:52:08 +0900
Subject: [PATCH 1/7] delete unuse method

---
 lib/rexml/source.rb | 72 ---------------------------------------------
 test/test_core.rb   |  2 +-
 2 files changed, 1 insertion(+), 73 deletions(-)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 90b370b9..54eea54b 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -58,46 +58,9 @@ def encoding=(enc)
       encoding_updated
     end
 
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
-    end
-
     def read
     end
 
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
-    end
-
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
-    end
-
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
-    end
-
     def match(pattern, cons=false)
       md = pattern.match(@buffer)
       @buffer = $' if cons and md
@@ -109,10 +72,6 @@ def empty?
       @buffer == ""
     end
 
-    def position
-      @orig.index( @buffer )
-    end
-
     # @return the current line in the source
     def current_line
       lines = @orig.split
@@ -181,29 +140,6 @@ def initialize(arg, block_size=500, encoding=nil)
       end
     end
 
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
-          end
-        end
-        rv = super
-      end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
-    end
-
     def read
       begin
         @buffer << readline
@@ -212,10 +148,6 @@ def read
       end
     end
 
-    def consume( pattern )
-      match( pattern, true )
-    end
-
     def match( pattern, cons=false )
       rv = pattern.match(@buffer)
       @buffer = $' if cons and rv
@@ -236,10 +168,6 @@ def empty?
       super and ( @source.nil? || @source.eof? )
     end
 
-    def position
-      @er_source.pos rescue 0
-    end
-
     # @return the current line in the source
     def current_line
       begin
diff --git a/test/test_core.rb b/test/test_core.rb
index 7c18c03f..8c33d834 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -727,7 +727,7 @@ def test_iso_8859_1_output_function
       koln_iso_8859_1 = "K\xF6ln"
       koln_utf8 = "K\xc3\xb6ln"
       source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
-      results = source.scan(/.*/)[0]
+      results = source.match(/.*/)[0]
       koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
       assert_equal koln_utf8, results
       output << results

From 55cc84b1b681e310cd191ade7055289901099397 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 7 Jan 2024 11:16:39 +0900
Subject: [PATCH 2/7] use StringScanner with match method

[Why]
Using StringScanner reduces the string copying process and speeds up the process.
---
 lib/rexml/parsers/baseparser.rb |  4 ++--
 lib/rexml/source.rb             | 24 ++++++++++++++----------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 305b1207..112b8f4d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -96,7 +96,7 @@ class BaseParser
       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
 
       NOTATIONDECL_START = /\A\s*<!NOTATION/um
       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -274,7 +274,7 @@ def pull_event
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
 
           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
+            match = @source.match( ENTITYDECL, true ).compact
             match[0] = :entitydecl
             ref = false
             if match[1] == '%'
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 54eea54b..729475fb 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -62,9 +62,11 @@ def read
     end
 
     def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+      @scanner = StringScanner.new(@buffer)
+      @scanner.scan(pattern)
+      @buffer = @scanner.rest if cons and @scanner.matched?
+
+      @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil
     end
 
     # @return true if the Source is exhausted
@@ -149,19 +151,21 @@ def read
     end
 
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
+      @scanner = StringScanner.new(@buffer)
+      @scanner.scan(pattern)
+      @buffer = @scanner.rest if cons and @scanner.matched?
+      while !@scanner.matched? and @source
         begin
           @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
+          @scanner.string = @buffer
+          @scanner.scan(pattern)
+          @buffer = @scanner.rest if cons and @scanner.matched?
         rescue
           @source = nil
         end
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+
+      @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil
     end
 
     def empty?

From ff9dabfc6e69e8c78e205a37a7abcb9354c93789 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 7 Jan 2024 12:11:20 +0900
Subject: [PATCH 3/7] Change StringScanner.new to a call in the initialize
 method.

---
 lib/rexml/parsers/baseparser.rb | 14 ++++++--------
 lib/rexml/source.rb             |  5 +++--
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 112b8f4d..13084685 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -392,6 +392,7 @@ def pull_event
               unless md
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
+              tag = md[1]
               @document_status = :in_element
               prefixes = Set.new
               prefixes << md[2] if md[2]
@@ -405,23 +406,20 @@ def pull_event
               end
 
               if closed
-                @closed = md[1]
+                @closed = tag
                 @nsstack.shift
               else
-                @tags.push( md[1] )
+                @tags.push( tag )
               end
-              return [ :start_element, md[1], attributes ]
+              return [ :start_element, tag, attributes ]
             end
           else
             md = @source.match( TEXT_PATTERN, true )
+            text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
             end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
+            return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
           raise
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 729475fb..360624be 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -42,6 +42,7 @@ class Source
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
       @orig = @buffer = arg
+      @scanner = StringScanner.new(@buffer)
       if encoding
         self.encoding = encoding
       else
@@ -62,7 +63,7 @@ def read
     end
 
     def match(pattern, cons=false)
-      @scanner = StringScanner.new(@buffer)
+      @scanner.string = @buffer
       @scanner.scan(pattern)
       @buffer = @scanner.rest if cons and @scanner.matched?
 
@@ -151,7 +152,7 @@ def read
     end
 
     def match( pattern, cons=false )
-      @scanner = StringScanner.new(@buffer)
+      @scanner.string = @buffer
       @scanner.scan(pattern)
       @buffer = @scanner.rest if cons and @scanner.matched?
       while !@scanner.matched? and @source

From 88bde3324e3ea83353722e2109ea475135eebae9 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 7 Jan 2024 14:11:05 +0900
Subject: [PATCH 4/7] Remove @buffer and process only use @scanner.

Removed `attr_reader :buffer` and added similar `def buffer` and `def buffer_encoding=` interfaces.
---
 lib/rexml/parsers/baseparser.rb |  2 +-
 lib/rexml/source.rb             | 74 +++++++++++++++++++--------------
 2 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 13084685..5ce7b8d9 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -259,7 +259,7 @@ def pull_event
           else
             @document_status = :after_doctype
             if @source.encoding == "UTF-8"
-              @source.buffer.force_encoding(::Encoding::UTF_8)
+              @source.buffer_encoding = ::Encoding::UTF_8
             end
           end
         end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 360624be..4de68731 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
@@ -41,8 +39,8 @@ class Source
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
-      @scanner = StringScanner.new(@buffer)
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -51,6 +49,14 @@ def initialize(arg, encoding=nil)
       @line = 0
     end
 
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
 
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -63,55 +69,54 @@ def read
     end
 
     def match(pattern, cons=false)
-      @scanner.string = @buffer
-      @scanner.scan(pattern)
-      @buffer = @scanner.rest if cons and @scanner.matched?
-
+      if cons
+        @scanner.scan(pattern)
+      else
+        @scanner.check(pattern)
+      end
       @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil
     end
 
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
+      @scanner.eos?
     end
 
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
 
     private
+
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
 
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -134,7 +139,7 @@ def initialize(arg, block_size=500, encoding=nil)
       end
 
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -145,22 +150,29 @@ def initialize(arg, block_size=500, encoding=nil)
 
     def read
       begin
-        @buffer << readline
+        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
+        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
+        # `@scanner.string = @scanner.rest + readline` frees memory and avoids this problem.
+        @scanner.string = @scanner.rest + readline
       rescue Exception, NameError
         @source = nil
       end
     end
 
     def match( pattern, cons=false )
-      @scanner.string = @buffer
-      @scanner.scan(pattern)
-      @buffer = @scanner.rest if cons and @scanner.matched?
+      if cons
+        @scanner.scan(pattern)
+      else
+        @scanner.check(pattern)
+      end
       while !@scanner.matched? and @source
         begin
-          @buffer << readline
-          @scanner.string = @buffer
-          @scanner.scan(pattern)
-          @buffer = @scanner.rest if cons and @scanner.matched?
+          @scanner << readline
+          if cons
+            @scanner.scan(pattern)
+          else
+            @scanner.check(pattern)
+          end
         rescue
           @source = nil
         end
@@ -223,7 +235,7 @@ def encoding_updated
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end

From 2b97e16ef00c1201ae1988c24f151d4452ea0daf Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 13 Jan 2024 14:08:42 +0900
Subject: [PATCH 5/7] Added strscan 3.0.8 dependency.

---
 benchmark/parse.yaml | 4 ++++
 rexml.gemspec        | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml
index e7066fcb..8818b50c 100644
--- a/benchmark/parse.yaml
+++ b/benchmark/parse.yaml
@@ -5,6 +5,8 @@ contexts:
     require: false
     prelude: require 'rexml'
   - name: master
+    gems:
+      strscan: 3.0.8
     prelude: |
       $LOAD_PATH.unshift(File.expand_path("lib"))
       require 'rexml'
@@ -16,6 +18,8 @@ contexts:
       require 'rexml'
       RubyVM::YJIT.enable
   - name: master(YJIT)
+    gems:
+      strscan: 3.0.8
     prelude: |
       $LOAD_PATH.unshift(File.expand_path("lib"))
       require 'rexml'
diff --git a/rexml.gemspec b/rexml.gemspec
index b51df33b..2ba1c64d 100644
--- a/rexml.gemspec
+++ b/rexml.gemspec
@@ -55,6 +55,8 @@ Gem::Specification.new do |spec|
 
   spec.required_ruby_version = '>= 2.5.0'
 
+  spec.add_runtime_dependency("strscan", ">= 3.0.8")
+
   spec.add_development_dependency "benchmark_driver"
   spec.add_development_dependency "bundler"
   spec.add_development_dependency "rake"

From eeb45e153d047d1e123d24a5a3ba783958dfb709 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 14 Jan 2024 21:48:27 +0900
Subject: [PATCH 6/7] Change `@scanner.match` to respond `nil`/`@scanner` in
 order to improve processing speed.

---
 lib/rexml/parsers/baseparser.rb       |  3 +--
 lib/rexml/source.rb                   | 17 ++++++-------
 test/parse/test_entity_declaration.rb | 36 +++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 test/parse/test_entity_declaration.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5ce7b8d9..65bad260 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -274,8 +274,7 @@ def pull_event
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
 
           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).compact
-            match[0] = :entitydecl
+            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
             ref = false
             if match[1] == '%'
               ref = true
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 4de68731..390d0ad5 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -70,11 +70,10 @@ def read
 
     def match(pattern, cons=false)
       if cons
-        @scanner.scan(pattern)
+        @scanner.scan(pattern).nil? ? nil : @scanner
       else
-        @scanner.check(pattern)
+        @scanner.check(pattern).nil? ? nil : @scanner
       end
-      @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil
     end
 
     # @return true if the Source is exhausted
@@ -161,24 +160,24 @@ def read
 
     def match( pattern, cons=false )
       if cons
-        @scanner.scan(pattern)
+        md = @scanner.scan(pattern)
       else
-        @scanner.check(pattern)
+        md = @scanner.check(pattern)
       end
-      while !@scanner.matched? and @source
+      while md.nil? and @source
         begin
           @scanner << readline
           if cons
-            @scanner.scan(pattern)
+            md = @scanner.scan(pattern)
           else
-            @scanner.check(pattern)
+            md = @scanner.check(pattern)
           end
         rescue
           @source = nil
         end
       end
 
-      @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil
+      md.nil? ? nil : @scanner
     end
 
     def empty?
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
new file mode 100644
index 00000000..e15deec6
--- /dev/null
+++ b/test/parse/test_entity_declaration.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: false
+require 'test/unit'
+require 'rexml/document'
+
+module REXMLTests
+  class TestParseEntityDeclaration < Test::Unit::TestCase
+    private
+    def xml(internal_subset)
+      <<-XML
+<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
+#{internal_subset}
+]>
+<r/>
+      XML
+    end
+
+    def parse(internal_subset)
+      REXML::Document.new(xml(internal_subset)).doctype
+    end
+
+    def test_empty
+      exception = assert_raise(REXML::ParseException) do
+        parse(<<-INTERNAL_SUBSET)
+<!ENTITY>
+        INTERNAL_SUBSET
+      end
+      assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed notation declaration: name is missing
+Line: 5
+Position: 72
+Last 80 unconsumed characters:
+ <!ENTITY>  ]> <r/>
+      DETAIL
+    end
+  end
+end

From ea8e4bb05c98c07350fd0868ed0f78d4b0a4acc2 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@cozmixng.org>
Date: Wed, 17 Jan 2024 10:55:47 +0900
Subject: [PATCH 7/7] Add more description

---
 lib/rexml/source.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 390d0ad5..71b08f99 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -151,7 +151,8 @@ def read
       begin
         # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
         # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
-        # `@scanner.string = @scanner.rest + readline` frees memory and avoids this problem.
+        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
+        # and avoids this problem.
         @scanner.string = @scanner.rest + readline
       rescue Exception, NameError
         @source = nil