Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024 #1057

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
Throw ParseException on out of range in UTF8toUTF16, add more tests
  • Loading branch information
paulirwin committed Dec 18, 2024
commit a04df3a197cd18fe7d4ef979f2d979346a629359
32 changes: 28 additions & 4 deletions src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
Original file line number Diff line number Diff line change
@@ -328,8 +328,29 @@ public virtual void TestUTF8UTF16CharsRef()
}
}

/// <summary>
/// Checks how <c>UnicodeUtil.UTF8toUTF16</c> reacts to the supplied bytes:
/// input that stops right after the lead byte of a multi-byte sequence is
/// expected to raise a <c>ParseException</c>, while well-formed input is
/// expected to decode without throwing.
/// </summary>
/// <param name="invalidUtf8">The UTF-8 bytes to decode (well-formed when <paramref name="shouldThrow"/> is false).</param>
/// <param name="shouldThrow">Whether decoding is expected to throw a <c>ParseException</c>.</param>
[Test]
[LuceneNetSpecific]
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon
public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
{
    CharsRef target = new CharsRef();

    // Happy path first: a plain call suffices, since any exception fails the test.
    if (!shouldThrow)
    {
        UnicodeUtil.UTF8toUTF16(invalidUtf8, target);
        return;
    }

    // Truncated input must be reported as a ParseException.
    Assert.Throws<ParseException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, target));
}

[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
[Repeat(100)]
public void TestTryUTF8toUTF16()
{
string unicode = TestUtil.RandomRealisticUnicodeString(Random);
@@ -343,14 +364,17 @@ public void TestTryUTF8toUTF16()

// Parameterized test: decodes each byte sequence with UnicodeUtil.UTF8toUTF16WithFallback
// and asserts that the resulting CharsRef text equals the expected string. Per the test
// cases, input that ends right after a multi-byte lead byte is expected to decode to
// "ca\ufffd" (U+FFFD in place of the truncated sequence); well-formed input decodes as-is.
// NOTE(review): this is a diff hunk whose +/- markers were lost, so old and new lines are
// interleaved. Lines flagged "pre-commit" below look like the removed side (the file header
// reports 4 deletions) — confirm against the commit before treating this as compilable code.
[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
// pre-commit (presumably removed): parameterless signature, superseded by the one below
public void TestUTF8toUTF16WithFallback()
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")]
public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
{
// pre-commit (presumably removed): hard-coded input, replaced by the utf8 parameter
byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence
var scratch = new CharsRef();

// pre-commit (presumably removed): call using the hard-coded input
UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch);
// post-commit: decode the bytes supplied by the current test case
UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch);

// pre-commit (presumably removed): assertion against the single hard-coded expectation
Assert.AreEqual("c\ufffd", scratch.ToString());
// post-commit: compare against the expectation supplied by the current test case
Assert.AreEqual(expected, scratch.ToString());
}
}
}
12 changes: 12 additions & 0 deletions src/Lucene.Net/Util/UnicodeUtil.cs
Original file line number Diff line number Diff line change
@@ -925,15 +925,27 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
}
else if (b < 0xe0)
{
if (utf8.Length <= i)
{
throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
}
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
}
else if (b < 0xf0)
{
if (utf8.Length <= i + 1)
{
throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
}
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
i += 2;
}
else
{
if (utf8.Length <= i + 2)
{
throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
}
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
i += 3;