From 14989869bdc6df00120b1fe49f1114256f6d32e7 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 4 Jan 2024 21:52:08 +0900 Subject: [PATCH 1/7] delete unuse method --- lib/rexml/source.rb | 72 --------------------------------------------- test/test_core.rb | 2 +- 2 files changed, 1 insertion(+), 73 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 90b370b9..54eea54b 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -58,46 +58,9 @@ def encoding=(enc) encoding_updated end - # Scans the source for a given pattern. Note, that this is not your - # usual scan() method. For one thing, the pattern argument has some - # requirements; for another, the source can be consumed. You can easily - # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexps on the fly; however, this was - # computationally expensive and slowed down the entire REXML package - # considerably, since this is by far the most commonly called method. - # @param pattern must be a Regexp, and must be in the form of - # /^\s*(#{your pattern, with no groups})(.*)/. The first group - # will be returned; the second group is used if the consume flag is - # set. - # @param consume if true, the pattern returned will be consumed, leaving - # everything after it in the Source. - # @return the pattern, if found, or nil if the Source is empty or the - # pattern is not found. - def scan(pattern, cons=false) - return nil if @buffer.nil? - rv = @buffer.scan(pattern) - @buffer = $' if cons and rv.size>0 - rv - end - def read end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) - end - - def match_to( char, pattern ) - return pattern.match(@buffer) - end - - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md - end - def match(pattern, cons=false) md = pattern.match(@buffer) @buffer = $' if cons and md @@ -109,10 +72,6 @@ def empty? @buffer == "" end - def position - @orig.index( @buffer ) - end - # @return the current line in the source def current_line lines = @orig.split @@ -181,29 +140,6 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil - end - end - rv = super - end - rv.taint if RUBY_VERSION < '2.7' - rv - end - def read begin @buffer << readline @@ -212,10 +148,6 @@ def read end end - def consume( pattern ) - match( pattern, true ) - end - def match( pattern, cons=false ) rv = pattern.match(@buffer) @buffer = $' if cons and rv @@ -236,10 +168,6 @@ def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin diff --git a/test/test_core.rb b/test/test_core.rb index 7c18c03f..8c33d834 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -727,7 +727,7 @@ def test_iso_8859_1_output_function koln_iso_8859_1 = "K\xF6ln" koln_utf8 = "K\xc3\xb6ln" source = Source.new( koln_iso_8859_1, 'iso-8859-1' ) - results = source.scan(/.*/)[0] + results = source.match(/.*/)[0] koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding) assert_equal koln_utf8, results output << results From 55cc84b1b681e310cd191ade7055289901099397 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Jan 2024 11:16:39 +0900 Subject: [PATCH 2/7] use StringScanner with match method [Why] Using StringScanner reduces the string copying process and speeds up the process. --- lib/rexml/parsers/baseparser.rb | 4 ++-- lib/rexml/source.rb | 24 ++++++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..112b8f4d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -96,7 +96,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s* Date: Sun, 7 Jan 2024 12:11:20 +0900 Subject: [PATCH 3/7] Change StringScanner.new to a call in the initialize method. --- lib/rexml/parsers/baseparser.rb | 14 ++++++-------- lib/rexml/source.rb | 5 +++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 112b8f4d..13084685 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -392,6 +392,7 @@ def pull_event unless md raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + tag = md[1] @document_status = :in_element prefixes = Set.new prefixes << md[2] if md[2] @@ -405,23 +406,20 @@ def pull_event end if closed - @closed = md[1] + @closed = tag @nsstack.shift else - @tags.push( md[1] ) + @tags.push( tag ) end - return [ :start_element, md[1], attributes ] + return [ :start_element, tag, attributes ] end else md = @source.match( TEXT_PATTERN, true ) + text = md[1] if md[0].length == 0 @source.match( /(\s+)/, true ) end - #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 - #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] + return [ :text, text ] end rescue REXML::UndefinedNamespaceException raise diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 729475fb..360624be 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -42,6 +42,7 @@ class Source # value, overriding all encoding detection def initialize(arg, encoding=nil) @orig = @buffer = arg + @scanner = StringScanner.new(@buffer) if encoding self.encoding = encoding else @@ -62,7 +63,7 @@ def read end def match(pattern, cons=false) - @scanner = StringScanner.new(@buffer) + @scanner.string = @buffer @scanner.scan(pattern) @buffer = @scanner.rest if cons and @scanner.matched? @@ -151,7 +152,7 @@ def read end def match( pattern, cons=false ) - @scanner = StringScanner.new(@buffer) + @scanner.string = @buffer @scanner.scan(pattern) @buffer = @scanner.rest if cons and @scanner.matched? while !@scanner.matched? and @source From 88bde3324e3ea83353722e2109ea475135eebae9 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 7 Jan 2024 14:11:05 +0900 Subject: [PATCH 4/7] Remove @buffer and process only use @scanner. Removed `attr_reader :buffer` and added similar `def buffer` and `def buffer_encoding=` interfaces. --- lib/rexml/parsers/baseparser.rb | 2 +- lib/rexml/source.rb | 74 +++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 13084685..5ce7b8d9 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -259,7 +259,7 @@ def pull_event else @document_status = :after_doctype if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + @source.buffer_encoding = ::Encoding::UTF_8 end end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 360624be..4de68731 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -30,8 +30,6 @@ def SourceFactory::create_from(arg) # objects and provides consumption of text class Source include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer # The line number of the last consumed text attr_reader :line attr_reader :encoding @@ -41,8 +39,8 @@ class Source # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg - @scanner = StringScanner.new(@buffer) + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -51,6 +49,14 @@ def initialize(arg, encoding=nil) @line = 0 end + # The current buffer (what we're going to read next) + def buffer + @scanner.rest + end + + def buffer_encoding=(encoding) + @scanner.string.force_encoding(encoding) + end # Inherited from Encoding # Overridden to support optimized en/decoding @@ -63,55 +69,54 @@ def read end def match(pattern, cons=false) - @scanner.string = @buffer - @scanner.scan(pattern) - @buffer = @scanner.rest if cons and @scanner.matched? - + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil end # @return true if the Source is exhausted def empty? - @buffer == "" + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -134,7 +139,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -145,22 +150,29 @@ def initialize(arg, block_size=500, encoding=nil) def read begin - @buffer << readline + # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, + # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. + # `@scanner.string = @scanner.rest + readline` frees memory and avoids this problem. + @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil end end def match( pattern, cons=false ) - @scanner.string = @buffer - @scanner.scan(pattern) - @buffer = @scanner.rest if cons and @scanner.matched? + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end while !@scanner.matched? and @source begin - @buffer << readline - @scanner.string = @buffer - @scanner.scan(pattern) - @buffer = @scanner.rest if cons and @scanner.matched? + @scanner << readline + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end rescue @source = nil end @@ -223,7 +235,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end From 2b97e16ef00c1201ae1988c24f151d4452ea0daf Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 13 Jan 2024 14:08:42 +0900 Subject: [PATCH 5/7] Added strscan 3.0.8 dependency. --- benchmark/parse.yaml | 4 ++++ rexml.gemspec | 2 ++ 2 files changed, 6 insertions(+) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index e7066fcb..8818b50c 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -5,6 +5,8 @@ contexts: require: false prelude: require 'rexml' - name: master + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' @@ -16,6 +18,8 @@ contexts: require 'rexml' RubyVM::YJIT.enable - name: master(YJIT) + gems: + strscan: 3.0.8 prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' diff --git a/rexml.gemspec b/rexml.gemspec index b51df33b..2ba1c64d 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -55,6 +55,8 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' + spec.add_runtime_dependency("strscan", ">= 3.0.8") + spec.add_development_dependency "benchmark_driver" spec.add_development_dependency "bundler" spec.add_development_dependency "rake" From eeb45e153d047d1e123d24a5a3ba783958dfb709 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 14 Jan 2024 21:48:27 +0900 Subject: [PATCH 6/7] Change `@scanner.match` to respond `nil`/`@scanner` in order to improve processing speed. --- lib/rexml/parsers/baseparser.rb | 3 +-- lib/rexml/source.rb | 17 ++++++------- test/parse/test_entity_declaration.rb | 36 +++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 test/parse/test_entity_declaration.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 5ce7b8d9..65bad260 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -274,8 +274,7 @@ def pull_event return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] when ENTITY_START - match = @source.match( ENTITYDECL, true ).compact - match[0] = :entitydecl + match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact] ref = false if match[1] == '%' ref = true diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 4de68731..390d0ad5 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -70,11 +70,10 @@ def read def match(pattern, cons=false) if cons - @scanner.scan(pattern) + @scanner.scan(pattern).nil? ? nil : @scanner else - @scanner.check(pattern) + @scanner.check(pattern).nil? ? nil : @scanner end - @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil end # @return true if the Source is exhausted @@ -161,24 +160,24 @@ def read def match( pattern, cons=false ) if cons - @scanner.scan(pattern) + md = @scanner.scan(pattern) else - @scanner.check(pattern) + md = @scanner.check(pattern) end - while !@scanner.matched? and @source + while md.nil? and @source begin @scanner << readline if cons - @scanner.scan(pattern) + md = @scanner.scan(pattern) else - @scanner.check(pattern) + md = @scanner.check(pattern) end rescue @source = nil end end - @scanner.matched? ? [@scanner.matched, *@scanner.captures] : nil + md.nil? ? nil : @scanner end def empty? diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..e15deec6 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: false +require 'test/unit' +require 'rexml/document' + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed notation declaration: name is missing +Line: 5 +Position: 72 +Last 80 unconsumed characters: + ]> + DETAIL + end + end +end From ea8e4bb05c98c07350fd0868ed0f78d4b0a4acc2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jan 2024 10:55:47 +0900 Subject: [PATCH 7/7] Add more description --- lib/rexml/source.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 390d0ad5..71b08f99 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -151,7 +151,8 @@ def read begin # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. - # `@scanner.string = @scanner.rest + readline` frees memory and avoids this problem. + # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed + # and avoids this problem. @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil