From 1c694d1e7f72d31fd11dcd13a0d7918384e320c9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 22 Aug 2024 10:06:03 +0900 Subject: [PATCH 01/17] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 99d574b3..37331199 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.6" + VERSION = "3.3.7" REVISION = "" Copyright = COPYRIGHT From caec1879433e86914755245116d4acb416864e0d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 26 Aug 2024 18:49:02 +0900 Subject: [PATCH 02/17] Add local entity expansion limit methods (#202) GitHub: fix GH-192 Add local entity expansion limit methods. - `REXML::Document#entity_expansion_limit=` - `REXML::Document#entity_expansion_text_limit=` - `REXML::Parsers::SAX2Parser#entity_expansion_limit=` - `REXML::Parsers::SAX2Parser#entity_expansion_text_limit=` - `REXML::Parsers::StreamParser#entity_expansion_limit=` - `REXML::Parsers::StreamParser#entity_expansion_text_limit=` - `REXML::Parsers::PullParser#entity_expansion_limit=` - `REXML::Parsers::PullParser#entity_expansion_text_limit=` --------- Co-authored-by: Sutou Kouhei --- lib/rexml/attribute.rb | 5 +++-- lib/rexml/document.rb | 6 ++++- lib/rexml/entity.rb | 7 ++++-- lib/rexml/parsers/baseparser.rb | 8 +++++-- lib/rexml/parsers/pullparser.rb | 8 +++++++ lib/rexml/parsers/sax2parser.rb | 8 +++++++ lib/rexml/parsers/streamparser.rb | 8 +++++++ lib/rexml/text.rb | 8 ++++--- test/test_document.rb | 22 +++++------------- test/test_pullparser.rb | 27 +++++++--------------- test/test_sax.rb | 27 +++++++--------------- test/test_stream.rb | 37 ++++++++++++------------------- 12 files changed, 83 insertions(+), 88 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 11893a95..fe48745c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -148,8 +148,9 @@ def to_s # have been expanded to their values def value return @unnormalized if @unnormalized - @unnormalized = Text::unnormalize( @normalized, doctype ) - @unnormalized + + @unnormalized = Text::unnormalize(@normalized, doctype, + entity_expansion_text_limit: @element&.document&.entity_expansion_text_limit) end # The normalized value of this attribute. That is, the attribute with diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index b1caa020..d1747dd4 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -91,6 +91,8 @@ class Document < Element # def initialize( source = nil, context = {} ) @entity_expansion_count = 0 + @entity_expansion_limit = Security.entity_expansion_limit + @entity_expansion_text_limit = Security.entity_expansion_text_limit super() @context = context return if source.nil? @@ -431,10 +433,12 @@ def Document::entity_expansion_text_limit end attr_reader :entity_expansion_count + attr_writer :entity_expansion_limit + attr_accessor :entity_expansion_text_limit def record_entity_expansion @entity_expansion_count += 1 - if @entity_expansion_count > Security.entity_expansion_limit + if @entity_expansion_count > @entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end end diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 12bbad3f..1ba5a7bb 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -71,9 +71,12 @@ def Entity::matches? string # Evaluates to the unnormalized value of this entity; that is, replacing # &ent; entities. def unnormalized - document.record_entity_expansion unless document.nil? + document&.record_entity_expansion + return nil if @value.nil? - @unnormalized = Text::unnormalize(@value, parent) + + @unnormalized = Text::unnormalize(@value, parent, + entity_expansion_text_limit: document&.entity_expansion_text_limit) end #once :unnormalized diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index d11c2766..89a9d0b6 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -164,6 +164,8 @@ def initialize( source ) @listeners = [] @prefixes = Set.new @entity_expansion_count = 0 + @entity_expansion_limit = Security.entity_expansion_limit + @entity_expansion_text_limit = Security.entity_expansion_text_limit end def add_listener( listener ) @@ -172,6 +174,8 @@ def add_listener( listener ) attr_reader :source attr_reader :entity_expansion_count + attr_writer :entity_expansion_limit + attr_writer :entity_expansion_text_limit def stream=( source ) @source = SourceFactory.create_from( source ) @@ -585,7 +589,7 @@ def unnormalize( string, entities=nil, filter=nil ) end re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) - if rv.bytesize > Security.entity_expansion_text_limit + if rv.bytesize > @entity_expansion_text_limit raise "entity expansion has grown too large" end else @@ -627,7 +631,7 @@ def pop_namespaces_restore def record_entity_expansion(delta=1) @entity_expansion_count += delta - if @entity_expansion_count > Security.entity_expansion_limit + if @entity_expansion_count > @entity_expansion_limit raise "number of entity expansions exceeded, processing aborted." end end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index 36b45953..a331eff5 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -51,6 +51,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index cec9d2fc..5452d4b8 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -26,6 +26,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def add_listener( listener ) @parser.add_listener( listener ) end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 7781fe44..6c64d978 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -18,6 +18,14 @@ def entity_expansion_count @parser.entity_expansion_count end + def entity_expansion_limit=( limit ) + @parser.entity_expansion_limit = limit + end + + def entity_expansion_text_limit=( limit ) + @parser.entity_expansion_text_limit = limit + end + def parse # entity string while true diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 7e0befe9..997f77d3 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -268,7 +268,8 @@ def inspect # u = Text.new( "sean russell", false, nil, true ) # u.value #-> "sean russell" def value - @unnormalized ||= Text::unnormalize( @string, doctype ) + @unnormalized ||= Text::unnormalize(@string, doctype, + entity_expansion_text_limit: document&.entity_expansion_text_limit) end # Sets the contents of this text node. This expects the text to be @@ -411,11 +412,12 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) end # Unescapes all possible entities - def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil ) + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil ) + entity_expansion_text_limit ||= Security.entity_expansion_text_limit sum = 0 string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { s = Text.expand($&, doctype, filter) - if sum + s.bytesize > Security.entity_expansion_text_limit + if sum + s.bytesize > entity_expansion_text_limit raise "entity expansion has grown too large" else sum += s.bytesize diff --git a/test/test_document.rb b/test/test_document.rb index 25a8828f..cda4354f 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -31,16 +31,6 @@ def test_new end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value xml = < XML - REXML::Security.entity_expansion_limit = 4 doc = REXML::Document.new(xml) + doc.entity_expansion_limit = 4 assert_equal("\na\na a\n<\n", doc.root.children.first.value) - REXML::Security.entity_expansion_limit = 3 doc = REXML::Document.new(xml) + doc.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do doc.root.children.first.value end @@ -142,8 +130,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 doc = REXML::Document.new(xml) + doc.entity_expansion_text_limit = 90 assert_equal(90, doc.root.children.first.value.bytesize) end end diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 005a106a..bdf8be17 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -157,16 +157,6 @@ def test_peek end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value source = <<-XML @@ -206,14 +196,13 @@ def test_empty_value XML - REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 100000 while parser.has_next? parser.pull end assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::PullParser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? @@ -221,7 +210,7 @@ def test_empty_value end end assert do - parser.entity_expansion_count > @default_entity_expansion_limit + parser.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -239,14 +228,14 @@ def test_with_default_entity XML - REXML::Security.entity_expansion_limit = 4 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 4 while parser.has_next? parser.pull end - REXML::Security.entity_expansion_limit = 3 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do while parser.has_next? parser.pull @@ -255,7 +244,7 @@ def test_with_default_entity end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -276,11 +265,11 @@ def test_with_only_default_entities end end - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, events['member'].strip) assert_equal(0, parser.entity_expansion_count) assert do - events['member'].bytesize > @default_entity_expansion_text_limit + events['member'].bytesize > REXML::Security.entity_expansion_text_limit end end @@ -296,8 +285,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 parser = REXML::Parsers::PullParser.new(source) + parser.entity_expansion_text_limit = 90 events = {} element_name = '' while parser.has_next? diff --git a/test/test_sax.rb b/test/test_sax.rb index ae17e364..6aaeb618 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -100,16 +100,6 @@ def test_sax2 end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - class GeneralEntityTest < self def test_have_value source = <<-XML @@ -147,18 +137,17 @@ def test_empty_value
XML - REXML::Security.entity_expansion_limit = 100000 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 100000 sax.parse assert_equal(11111, sax.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit sax = REXML::Parsers::SAX2Parser.new(source) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end assert do - sax.entity_expansion_count > @default_entity_expansion_limit + sax.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -176,19 +165,19 @@ def test_with_default_entity XML - REXML::Security.entity_expansion_limit = 4 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 4 sax.parse - REXML::Security.entity_expansion_limit = 3 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do sax.parse end end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -203,11 +192,11 @@ def test_with_only_default_entities end sax.parse - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, text_value.strip) assert_equal(0, sax.entity_expansion_count) assert do - text_value.bytesize > @default_entity_expansion_text_limit + text_value.bytesize > REXML::Security.entity_expansion_text_limit end end @@ -223,8 +212,8 @@ def test_entity_expansion_text_limit &a; XML - REXML::Security.entity_expansion_text_limit = 90 sax = REXML::Parsers::SAX2Parser.new(source) + sax.entity_expansion_text_limit = 90 text_size = nil sax.listen(:characters, ["member"]) do |text| text_size = text.size diff --git a/test/test_stream.rb b/test/test_stream.rb index 782066c2..7917760a 100644 --- a/test/test_stream.rb +++ b/test/test_stream.rb @@ -126,16 +126,6 @@ def text(text) end class EntityExpansionLimitTest < Test::Unit::TestCase - def setup - @default_entity_expansion_limit = REXML::Security.entity_expansion_limit - @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit - end - - def teardown - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit - REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit - end - def test_have_value source = <<-XML @@ -172,18 +162,17 @@ def test_empty_value XML listener = MyListener.new - REXML::Security.entity_expansion_limit = 100000 parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 100000 parser.parse assert_equal(11111, parser.entity_expansion_count) - REXML::Security.entity_expansion_limit = @default_entity_expansion_limit parser = REXML::Parsers::StreamParser.new( source, listener ) assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do parser.parse end assert do - parser.entity_expansion_count > @default_entity_expansion_limit + parser.entity_expansion_count > REXML::Security.entity_expansion_limit end end @@ -202,17 +191,19 @@ def test_with_default_entity XML listener = MyListener.new - REXML::Security.entity_expansion_limit = 4 - REXML::Document.parse_stream(source, listener) + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 4 + parser.parse - REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_limit = 3 assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do - REXML::Document.parse_stream(source, listener) + parser.parse end end def test_with_only_default_entities - member_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>" + member_value = "<p>#{'A' * REXML::Security.entity_expansion_text_limit}</p>" source = <<-XML @@ -231,11 +222,11 @@ def text(text) parser = REXML::Parsers::StreamParser.new( source, listener ) parser.parse - expected_value = "

#{'A' * @default_entity_expansion_text_limit}

" + expected_value = "

#{'A' * REXML::Security.entity_expansion_text_limit}

" assert_equal(expected_value, listener.text_value.strip) assert_equal(0, parser.entity_expansion_count) assert do - listener.text_value.bytesize > @default_entity_expansion_text_limit + listener.text_value.bytesize > REXML::Security.entity_expansion_text_limit end end @@ -259,9 +250,9 @@ def text(text) end end listener.text_value = "" - REXML::Security.entity_expansion_text_limit = 90 - REXML::Document.parse_stream(source, listener) - + parser = REXML::Parsers::StreamParser.new( source, listener ) + parser.entity_expansion_text_limit = 90 + parser.parse assert_equal(90, listener.text_value.size) end end From ad02f99c616385bca1b84e161b93a144a99f71bf Mon Sep 17 00:00:00 2001 From: Bo Anderson Date: Wed, 4 Sep 2024 04:03:39 +0100 Subject: [PATCH 03/17] Remove strscan dependency declaration from gemspec (#204) `strscan` is a part of the Ruby standard library in all versions of Ruby supported by REXML. So we don't need to declare it as a dependency explicitly. See also: https://github.com/ruby/rexml/issues/140#issuecomment-2327645303 --- rexml.gemspec | 2 -- 1 file changed, 2 deletions(-) diff --git a/rexml.gemspec b/rexml.gemspec index 0de3e845..e5cf8581 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -58,6 +58,4 @@ Gem::Specification.new do |spec| spec.extra_rdoc_files = rdoc_files spec.required_ruby_version = '>= 2.5.0' - - spec.add_runtime_dependency("strscan") end From 6246ba112140372ee3e40cb3bfb1fabef65130e6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 13:54:24 +0900 Subject: [PATCH 04/17] ci document: fix method forwarding with recent Ruby ArgumentError: wrong number of arguments (given 2, expected 1) (ArgumentError) /home/runner/work/rexml/rexml/Rakefile:18:in `warn' /home/runner/work/rexml/rexml/Rakefile:18:in `warn' /opt/hostedtoolcache/Ruby/3.3.5/x64/bin/bundle:25:in `load' /opt/hostedtoolcache/Ruby/3.3.5/x64/bin/bundle:25:in `
' --- Rakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rakefile b/Rakefile index 76a56296..4676930b 100644 --- a/Rakefile +++ b/Rakefile @@ -14,7 +14,7 @@ task :default => :test namespace :warning do desc "Treat warning as error" task :error do - def Warning.warn(*message) + def Warning.warn(*message, **) super raise "Treat warning as error:\n" + message.join("\n") end From 9294410f6eb90834a69a3fa363de61f5a3f6a927 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:02:34 +0900 Subject: [PATCH 05/17] ci document: suppress a ostruct warning --- Gemfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Gemfile b/Gemfile index 67f21dfb..1710ec99 100644 --- a/Gemfile +++ b/Gemfile @@ -7,6 +7,14 @@ gemspec group :development do gem "bundler" + # This is for suppressing the following warning: + # + # warning: ostruct was loaded from the standard library, but will + # no longer be part of the default gems starting from Ruby 3.5.0. + # + # This should be part of "json". We can remove this when "json" + # depends on "ostruct" explicitly. + gem "ostruct" gem "rake" end From 86a11c05f53dbb3dfbe504a365f1412f2e691c25 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:13:15 +0900 Subject: [PATCH 06/17] Add 3.3.7 entry --- NEWS.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/NEWS.md b/NEWS.md index 6c290678..844eeb94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,27 @@ # News +## 3.3.7 - 2024-09-04 {#version-3-3-7} + +### Improvements + + * Added local entity expansion limit methods + * GH-192 + * GH-202 + * Reported by takuya kodama. + * Patch by NAITOH Jun. + + * Removed explicit strscan dependency + * GH-204 + * Patch by Bo Anderson. + +### Thanks + + * takuya kodama + + * NAITOH Jun + + * Bo Anderson + ## 3.3.6 - 2024-08-22 {#version-3-3-6} ### Improvements From 35ee73e0cd125633cfcb53996c0bcb7897e97cd2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 4 Sep 2024 14:13:49 +0900 Subject: [PATCH 07/17] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 37331199..f27b4261 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.7" + VERSION = "3.3.8" REVISION = "" Copyright = COPYRIGHT From 2e1cd64f2f9c0667a840a0e31f9bb99f9e1c2b33 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 25 Sep 2024 06:29:02 +0900 Subject: [PATCH 08/17] Optimize SAX2Parser#get_namespace (#207) ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 18.085 17.677 33.086 32.778 i/s - 100.000 times in 5.529372s 5.657097s 3.022471s 3.050832s sax 25.450 26.182 44.797 47.916 i/s - 100.000 times in 3.929249s 3.819475s 2.232309s 2.086982s pull 29.160 29.089 55.407 53.531 i/s - 100.000 times in 3.429304s 3.437757s 1.804825s 1.868072s stream 29.137 29.055 52.780 51.368 i/s - 100.000 times in 3.432007s 3.441754s 1.894649s 1.946724s Comparison: dom before(YJIT): 33.1 i/s after(YJIT): 32.8 i/s - 1.01x slower before: 18.1 i/s - 1.83x slower after: 17.7 i/s - 1.87x slower sax after(YJIT): 47.9 i/s before(YJIT): 44.8 i/s - 1.07x slower after: 26.2 i/s - 1.83x slower before: 25.5 i/s - 1.88x slower pull before(YJIT): 55.4 i/s after(YJIT): 53.5 i/s - 1.04x slower before: 29.2 i/s - 1.90x slower after: 29.1 i/s - 1.90x slower stream before(YJIT): 52.8 i/s after(YJIT): 51.4 i/s - 1.03x slower before: 29.1 i/s - 1.81x slower after: 29.1 i/s - 1.82x slower ``` - sax - YJIT=ON : 1.07x faster - YJIT=OFF : 1.03x faster --- lib/rexml/parsers/sax2parser.rb | 2 ++ test/test_sax.rb | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 5452d4b8..a51477de 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -259,6 +259,8 @@ def add( pair ) end def get_namespace( prefix ) + return nil if @namespace_stack.empty? + uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || (@namespace_stack.find { |ns| not ns[nil].nil? }) uris[-1][prefix] unless uris.nil? or 0 == uris.size diff --git a/test/test_sax.rb b/test/test_sax.rb index 6aaeb618..caec983b 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -99,6 +99,52 @@ def test_sax2 end end + def test_without_namespace + xml = <<-XML + + + + + + XML + + parser = REXML::Parsers::SAX2Parser.new(xml) + elements = [] + parser.listen(:start_element) do |uri, localname, qname, attrs| + elements << [uri, localname, qname, attrs] + end + parser.parse + assert_equal([ + [nil, "root", "root", {}], + [nil, "a", "a", {"att1"=>"1", "att2"=>"2", "att3"=>"<"}], + [nil, "b", "b", {}] + ], elements) + end + + def test_with_namespace + xml = <<-XML + + + + + + XML + + parser = REXML::Parsers::SAX2Parser.new(xml) + elements = [] + parser.listen(:start_element) do |uri, localname, qname, attrs| + elements << [uri, localname, qname, attrs] + end + parser.parse + assert_equal([ + ["http://example.org/default", "root", "root", {"xmlns"=>"http://example.org/default", "xmlns:bar"=>"http://example.org/bar", "xmlns:foo"=>"http://example.org/foo"}], + ["http://example.org/default", "a", "a", {"att"=>"<", "bar:att"=>"2", "foo:att"=>"1"}], + ["http://example.org/bar", "b", "bar:b", {}] + ], elements) + end + class EntityExpansionLimitTest < Test::Unit::TestCase class GeneralEntityTest < self def test_have_value From 78f8712dccad773a51dc5eef31c02d523e994570 Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Sun, 29 Sep 2024 15:57:03 +0900 Subject: [PATCH 09/17] Fix handling with "xml:" prefixed namespace (#208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I found parsing XHTML documents like below fails since v3.3.3: ```xml XHTML Document

XHTML Document

この段落は日本語です。

``` [XML namespace spec][spec] is a little bit ambiguous but document above is valid according to an [article W3C serves][article]. I fixed the parsing algorithm. Can you review it? As an aside, `` style language declaration is often used in XHTML files included in EPUB files because [sample EPUB files][samples] provided by IDPF, former EPUB spec authority, use the style. [spec]: https://www.w3.org/TR/REC-xml-names/#defaulting [article]: https://www.w3.org/International/questions/qa-html-language-declarations#attributes [samples]: https://github.com/IDPF/epub3-samples --- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parser/test_base_parser.rb | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 89a9d0b6..a567e045 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -156,6 +156,7 @@ module Private default_entities.each do |term| DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ end + XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace" end private_constant :Private @@ -185,7 +186,7 @@ def stream=( source ) @tags = [] @stack = [] @entities = [] - @namespaces = {} + @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE} @namespaces_restore_stack = [] end @@ -790,7 +791,7 @@ def parse_attributes(prefixes) @source.match(/\s*/um, true) if prefix == "xmlns" if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" + if value != Private::XML_PREFIXED_NAMESPACE msg = "The 'xml' prefix must not be bound to any other namespace "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" raise REXML::ParseException.new( msg, @source, self ) diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb index 17d01979..da169a25 100644 --- a/test/parser/test_base_parser.rb +++ b/test/parser/test_base_parser.rb @@ -23,5 +23,40 @@ def test_large_xml parser.position < xml.bytesize end end + + def test_attribute_prefixed_by_xml + xml = <<-XML + + + + + XHTML Document + + +

XHTML Document

+

この段落は日本語です。

+ + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + 5.times {parser.pull} + + html = parser.pull + assert_equal([:start_element, + "html", + {"xmlns" => "http://www.w3.org/1999/xhtml", + "xml:lang" => "en", + "lang" => "en"}], + html) + + 15.times {parser.pull} + + p = parser.pull + assert_equal([:start_element, + "p", + {"xml:lang" => "ja", "lang" => "ja"}], + p) + end end end From 4197054a19e65511fb51983518a134a5c65aa840 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:03:58 +0900 Subject: [PATCH 10/17] Add 3.3.8 entry --- NEWS.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/NEWS.md b/NEWS.md index 844eeb94..5f0f2e01 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,26 @@ # News +## 3.3.8 - 2024-09-29 {#version-3-3-8} + +### Improvements + + * SAX2: Improve parse performance. + * GH-207 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that unexpected attribute namespace conflict error for + the predefined "xml" namespace is reported. + * GH-208 + * Patch by KITAITI Makoto + +### Thanks + + * NAITOH Jun + + * KITAITI Makoto + ## 3.3.7 - 2024-09-04 {#version-3-3-7} ### Improvements From 036d50851ce091c797db0b9ba3ed8e5a39c3918c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:04:43 +0900 Subject: [PATCH 11/17] test: avoid using needless non ASCII characters --- test/parser/test_base_parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb index da169a25..6f213978 100644 --- a/test/parser/test_base_parser.rb +++ b/test/parser/test_base_parser.rb @@ -34,7 +34,7 @@ def test_attribute_prefixed_by_xml

XHTML Document

-

この段落は日本語です。

+

For Japanese

XML From 622011f25ac1519fd553d6c56da52d7eba14a787 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Sep 2024 16:05:32 +0900 Subject: [PATCH 12/17] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index f27b4261..0fbd5eb2 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.3.8" + VERSION = "3.3.9" REVISION = "" Copyright = COPYRIGHT From 1d0c362526f6e25e2abcd13e2fcefcc718c20e78 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 9 Oct 2024 10:21:44 +0900 Subject: [PATCH 13/17] Optimize `IOSource#read_until` method (#210) ## Why? The result of `encode(term)` can be cached. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 17.546 18.512 32.282 32.306 i/s - 100.000 times in 5.699323s 5.402026s 3.097658s 3.095448s sax 25.435 28.294 47.526 50.074 i/s - 100.000 times in 3.931613s 3.534310s 2.104122s 1.997057s pull 29.471 31.870 54.400 57.554 i/s - 100.000 times in 3.393211s 3.137793s 1.838222s 1.737494s stream 29.169 31.153 51.613 52.898 i/s - 100.000 times in 3.428318s 3.209941s 1.937508s 1.890424s Comparison: dom after(YJIT): 32.3 i/s before(YJIT): 32.3 i/s - 1.00x slower after: 18.5 i/s - 1.75x slower before: 17.5 i/s - 1.84x slower sax after(YJIT): 50.1 i/s before(YJIT): 47.5 i/s - 1.05x slower after: 28.3 i/s - 1.77x slower before: 25.4 i/s - 1.97x slower pull after(YJIT): 57.6 i/s before(YJIT): 54.4 i/s - 1.06x slower after: 31.9 i/s - 1.81x slower before: 29.5 i/s - 1.95x slower stream after(YJIT): 52.9 i/s before(YJIT): 51.6 i/s - 1.02x slower after: 31.2 i/s - 1.70x slower before: 29.2 i/s - 1.81x slower ``` - YJIT=ON : 1.00x - 1.06x faster - YJIT=OFF : 1.05x - 1.11x faster --- lib/rexml/source.rb | 3 ++- test/test_document.rb | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index ff887fc0..e1a466e9 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -77,6 +77,7 @@ def initialize(arg, encoding=nil) detect_encoding end @line = 0 + @term_encord = {} end # The current buffer (what we're going to read next) @@ -227,7 +228,7 @@ def read(term = nil, min_bytes = 1) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ - term = encode(term) + term = @term_encord[term] ||= encode(term) until str = @scanner.scan_until(pattern) break if @source.nil? break if @source.eof? diff --git a/test/test_document.rb b/test/test_document.rb index cda4354f..609aeba2 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -403,6 +403,40 @@ def test_utf_16 assert_equal(expected_xml, actual_xml) end end + + class ReadUntilTest < Test::Unit::TestCase + def test_utf_8 + xml = <<-EOX.force_encoding("ASCII-8BIT") + +Hello world! +EOX + document = REXML::Document.new(xml) + assert_equal("UTF-8", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + + def test_utf_16le + xml = <<-EOX.encode("UTF-16LE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-16LE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-16", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + + def test_utf_16be + xml = <<-EOX.encode("UTF-16BE").force_encoding("ASCII-8BIT") + +Hello world! +EOX + bom = "\ufeff".encode("UTF-16BE").force_encoding("ASCII-8BIT") + document = REXML::Document.new(bom + xml) + assert_equal("UTF-16", document.encoding) + assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value) + end + end end end end From cf0fb9c9ca3dc0d725c8e4644aa0e728025f42ce Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 19 Oct 2024 15:27:25 +0900 Subject: [PATCH 14/17] Fix `IOSource#readline` for `@pending_buffer` (#215) ## Why? Fixed a problem that `@pending_buffer` is not processed when `IOError` occurs in `@source.readline` although `@pending_buffer` exists when reading XML file. --- lib/rexml/parsers/baseparser.rb | 1 + lib/rexml/source.rb | 7 ++++++- test/parse/test_text.rb | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index a567e045..7bd8adf8 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -167,6 +167,7 @@ def initialize( source ) @entity_expansion_count = 0 @entity_expansion_limit = Security.entity_expansion_limit @entity_expansion_text_limit = Security.entity_expansion_text_limit + @source.ensure_buffer end def add_listener( listener ) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index e1a466e9..dc0b5323 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -295,14 +295,19 @@ def current_line private def readline(term = nil) - str = @source.readline(term || @line_break) if @pending_buffer + begin + str = @source.readline(term || @line_break) + rescue IOError + end if str.nil? str = @pending_buffer else str = @pending_buffer + str end @pending_buffer = nil + else + str = @source.readline(term || @line_break) end return nil if str.nil? diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb index 04f553ae..bb208d47 100644 --- a/test/parse/test_text.rb +++ b/test/parse/test_text.rb @@ -4,6 +4,23 @@ module REXMLTests class TestParseText < Test::Unit::TestCase class TestInvalid < self + def test_text_only + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('a') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'a') + Line: 1 + Position: 1 + Last 80 unconsumed characters: + + DETAIL + end + def test_before_root exception = assert_raise(REXML::ParseException) do parser = REXML::Parsers::BaseParser.new('b') From a09646d395a07399cbf9bc3bc8d6d8bb1d13ecea Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:40:13 +0900 Subject: [PATCH 15/17] test: fix indent --- test/test_document.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_document.rb b/test/test_document.rb index 609aeba2..39b6c337 100644 --- a/test/test_document.rb +++ b/test/test_document.rb @@ -405,7 +405,7 @@ def test_utf_16 end class ReadUntilTest < Test::Unit::TestCase - def test_utf_8 + def test_utf_8 xml = <<-EOX.force_encoding("ASCII-8BIT") Hello world! From ce59f2eb1aeb371fe1643414f06618dbe031979f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:45:31 +0900 Subject: [PATCH 16/17] parser: fix a bug that �x...; is accepted as a character reference --- lib/rexml/parsers/baseparser.rb | 10 +++++++--- test/parse/test_character_reference.rb | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7bd8adf8..b4547ba3 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -150,7 +150,7 @@ module Private PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ - CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/ DEFAULT_ENTITIES_PATTERNS = {} default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] default_entities.each do |term| @@ -570,8 +570,12 @@ def unnormalize( string, entities=nil, filter=nil ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') + if m.start_with?("x") + code_point = Integer(m[1..-1], 16) + else + code_point = Integer(m, 10) + end + [code_point].pack('U*') } matches.collect!{|x|x[0]}.compact! if filter diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index bf8d2190..4bb5da5c 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -13,5 +13,11 @@ def test_linear_performance_many_preceding_zeros REXML::Document.new('') end end + + def test_hex_precedding_zero + parser = REXML::Parsers::PullParser.new("a�x61;") + parser.pull # :start_element + assert_equal("a�x61;", parser.pull[1]) # :text + end end end From 38eaa86ac7abe0d31cf49d8df57ad239fdeb80e9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 24 Oct 2024 14:47:38 +0900 Subject: [PATCH 17/17] Add 3.3.9 entry --- NEWS.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/NEWS.md b/NEWS.md index 5f0f2e01..3d17c287 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,26 @@ # News +## 3.3.9 - 2024-10-24 {#version-3-3-9} + +### Improvements + + * Improved performance. + * GH-210 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a parse bug for text only invalid XML. + * GH-215 + * Patch by NAITOH Jun. + + * Fixed a parse bug that `�x...;` is accepted as a character + reference. + +### Thanks + + * NAITOH Jun + ## 3.3.8 - 2024-09-29 {#version-3-3-8} ### Improvements