Throw ParseException on truncated multi-byte input in UTF8toUTF16 instead of reading out of range; add more tests

apache · paulirwin · Dec 4, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 17, 2024
commit a04df3a197cd18fe7d4ef979f2d979346a629359
diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -328,8 +328,29 @@ public virtual void TestUTF8UTF16CharsRef()
             }
         }
 
+        [Test]
+        [LuceneNetSpecific] // this is a Lucene.NET specific test
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, input ends right after the lead byte of a 2-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, input ends right after the lead byte of a 3-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, input ends right after the lead byte of a 4-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon, well-formed input (c3 b1 is a complete 2-byte sequence)
+        public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow) // NOTE: despite its name, invalidUtf8 is well-formed when shouldThrow is false
+        {
+            var scratch = new CharsRef();
+
+            if (shouldThrow)
+            {
+                Assert.Throws<ParseException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); // truncated input must surface as ParseException
+            }
+            else
+            {
+                UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch); // no explicit assertion: the test fails if this call throws
+            }
+        }
+
         [Test]
         [LuceneNetSpecific] // this is a Lucene.NET specific method
+        [Repeat(100)]
         public void TestTryUTF8toUTF16()
         {
             string unicode = TestUtil.RandomRealisticUnicodeString(Random);
@@ -343,14 +364,17 @@ public void TestTryUTF8toUTF16()
 
         [Test]
         [LuceneNetSpecific] // this is a Lucene.NET specific method
-        public void TestUTF8toUTF16WithFallback()
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, input ends right after the lead byte of a 2-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, input ends right after the lead byte of a 3-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, input ends right after the lead byte of a 4-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")] // cañon, well-formed input is expected to decode unchanged
+        public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
         {
-            byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence
             var scratch = new CharsRef();
 
-            UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch);
+            UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch); // unlike the throwing test above, no exception is expected for truncated input
 
-            Assert.AreEqual("c\ufffd", scratch.ToString());
+            Assert.AreEqual(expected, scratch.ToString()); // truncated cases expect U+FFFD in place of the incomplete sequence
         }
     }
 }
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -925,15 +925,27 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
                 }
                 else if (b < 0xe0)
                 {
+                    if (utf8.Length <= i)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
                 }
                 else if (b < 0xf0)
                 {
+                    if (utf8.Length <= i + 1)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
                     i += 2;
                 }
                 else
                 {
+                    if (utf8.Length <= i + 2)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
                     int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
                     i += 3;