From 49ae17ceec9681b1578923a4297d32582f385d0f Mon Sep 17 00:00:00 2001 From: Kevin Menard Date: Tue, 17 Oct 2023 11:40:45 -0400 Subject: [PATCH 1/3] Fix an issue where `Regexp.union` was improperly negotiating the result encoding. As part of this fix we were able to remove the non-standard `Regexp.convert`. Co-authored-by: Simon LeVasseur --- CHANGELOG.md | 1 + spec/ruby/core/regexp/union_spec.rb | 18 +++++++ src/main/ruby/truffleruby/core/regexp.rb | 61 ++++++++++++------------ 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 103fcdb83821..f88284ceced1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Bug fixes: * Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon). * Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon). +* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur). Compatibility: diff --git a/spec/ruby/core/regexp/union_spec.rb b/spec/ruby/core/regexp/union_spec.rb index 807683647101..2580815f9da0 100644 --- a/spec/ruby/core/regexp/union_spec.rb +++ b/spec/ruby/core/regexp/union_spec.rb @@ -43,6 +43,24 @@ Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1 end + it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do + us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n + + Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY + Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY + Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY + Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY + Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY + Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY + end + + it "return US-ASCII if all patterns are ASCII-only" do + Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII + Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII + Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII + Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII + end + it "returns a Regexp with UTF-8 if one part is UTF-8" do Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8 end diff --git a/src/main/ruby/truffleruby/core/regexp.rb b/src/main/ruby/truffleruby/core/regexp.rb index 6cb3e6f02b53..06c370e12f20 100644 --- a/src/main/ruby/truffleruby/core/regexp.rb +++ b/src/main/ruby/truffleruby/core/regexp.rb @@ -55,22 +55,23 @@ def self.try_convert(obj) Truffle::Type.try_convert obj, Regexp, :to_regexp end - def self.convert(pattern) - return pattern if Primitive.is_a?(pattern, Regexp) - if Primitive.is_a?(pattern, Array) - union(*pattern) - else - Regexp.quote(pattern.to_s) - end - end + def self.negotiate_union_encoding(*patterns) + res = nil + + patterns.each do |pattern| + converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern) - def self.compatible?(*patterns) - encodings = patterns.map { |r| convert(r).encoding } - last_enc = encodings.pop - encodings.each do |encoding| - raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding) - last_enc = encoding + enc = converted.encoding + + if Primitive.nil?(res) + res = enc + else + res = Primitive.encoding_compatible?(enc, res) + raise ArgumentError, "incompatible encodings: #{enc} and #{res}" unless res + end end + + res end def self.last_match(index = nil) @@ -96,37 +97,35 @@ def self.last_match(index = nil) def self.union(*patterns) case patterns.size when 0 - return %r/(?!)/ + %r/(?!)/ when 1 pattern = patterns.first case pattern when Array - return union(*pattern) + union(*pattern) else converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp) if Primitive.nil? converted - return Regexp.new(Regexp.quote(pattern)) + Regexp.new(Regexp.quote(pattern)) else - return converted + converted end end else - compatible?(*patterns) - enc = convert(patterns.first).encoding - end + patterns = patterns.map do |pat| + if Primitive.is_a?(pat, Regexp) + pat + else + StringValue(pat) + end + end - sep = '|'.encode(enc) - str = ''.encode(enc) + enc = negotiate_union_encoding(*patterns) + sep = '|'.encode(enc) + str = ''.encode(enc) - patterns = patterns.map do |pat| - if Primitive.is_a?(pat, Regexp) - pat - else - StringValue(pat) - end + Truffle::RegexpOperations.union(str, sep, *patterns) end - - Truffle::RegexpOperations.union(str, sep, *patterns) end Truffle::Graal.always_split(method(:union)) From e80982a3d6ff77a6a08a8eff4d2803bf40ca86ff Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Wed, 18 Oct 2023 12:23:32 +0200 Subject: [PATCH 2/3] Spec exception messages for Regexp.union --- spec/ruby/core/regexp/union_spec.rb | 30 +++++++++++++----------- src/main/ruby/truffleruby/core/regexp.rb | 16 ++++++++----- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/spec/ruby/core/regexp/union_spec.rb b/spec/ruby/core/regexp/union_spec.rb index 2580815f9da0..ddbe869f810f 100644 --- a/spec/ruby/core/regexp/union_spec.rb +++ b/spec/ruby/core/regexp/union_spec.rb @@ -72,83 +72,83 @@ it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do -> { Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE') end it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do -> { Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-16BE"))) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE') end it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do -> { Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING), Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING)) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII') end it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do -> { Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING), "\u00A9".encode("ISO-8859-1")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1') end it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do -> { Regexp.union("\u00A9".encode("ISO-8859-1"), Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING)) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8') end it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do -> { Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/) end it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do -> { Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/) end it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do -> { Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8"))) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/) end it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do -> { Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8"))) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/) end it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do -> { Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1') end it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do -> { Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1")) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1') end it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do -> { Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1"))) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1') end it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do -> { Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1"))) - }.should raise_error(ArgumentError) + }.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1') end it "uses to_str to convert arguments (if not Regexp)" do @@ -172,6 +172,8 @@ not_supported_on :opal do Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/ end - ->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError) + -> { + Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i]) + }.should raise_error(TypeError, 'no implicit conversion of Array into String') end end diff --git a/src/main/ruby/truffleruby/core/regexp.rb b/src/main/ruby/truffleruby/core/regexp.rb index 06c370e12f20..0837f2530699 100644 --- a/src/main/ruby/truffleruby/core/regexp.rb +++ b/src/main/ruby/truffleruby/core/regexp.rb @@ -56,22 +56,26 @@ def self.try_convert(obj) end def self.negotiate_union_encoding(*patterns) - res = nil + compatible_enc = nil patterns.each do |pattern| converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern) enc = converted.encoding - if Primitive.nil?(res) - res = enc + if Primitive.nil?(compatible_enc) + compatible_enc = enc else - res = Primitive.encoding_compatible?(enc, res) - raise ArgumentError, "incompatible encodings: #{enc} and #{res}" unless res + if test = Primitive.encoding_compatible?(enc, compatible_enc) + compatible_enc = test + else + raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}" + end + end end - res + compatible_enc end def self.last_match(index = nil) From 264328ee66236db1ca8c16ed1820a6a3704d625d Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Wed, 18 Oct 2023 12:29:10 +0200 Subject: [PATCH 3/3] Clarify spec --- spec/ruby/core/regexp/union_spec.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec/ruby/core/regexp/union_spec.rb b/spec/ruby/core/regexp/union_spec.rb index ddbe869f810f..ea5a5053f725 100644 --- a/spec/ruby/core/regexp/union_spec.rb +++ b/spec/ruby/core/regexp/union_spec.rb @@ -45,6 +45,9 @@ it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n + us_ascii_implicit.encoding.should == Encoding::US_ASCII + us_ascii_explicit.encoding.should == Encoding::US_ASCII + binary.encoding.should == Encoding::BINARY Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY