From 1ff22e6b6219a4bbfac4860ff1f55a7ba8c19dcf Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Mon, 16 Dec 2024 18:35:01 +0000 Subject: [PATCH 1/3] Special handling for "Latin Capital Letter I with Dot Above": Java follow Unicode standard which says lowercasing should produce "i" plus "Combining Dot Above". Our C++ code doesn't agree that this makes sense, and note that this is the only character which produces two characters as output from lowercasing. Ideally we should do "Case Folding" instead of lowercasing. --- .../src/main/java/com/yahoo/text/Lowercase.java | 7 ++++++- .../java/com/yahoo/text/LowercaseTestCase.java | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/vespajlib/src/main/java/com/yahoo/text/Lowercase.java b/vespajlib/src/main/java/com/yahoo/text/Lowercase.java index 3f9e943d2c13..3c82a0c31119 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Lowercase.java +++ b/vespajlib/src/main/java/com/yahoo/text/Lowercase.java @@ -12,6 +12,8 @@ */ public final class Lowercase { + private static final char upperIwithDot = 0x0130; + /** * Return a lowercased version of the given string. Since this is language * independent, this is more of a case normalization operation than @@ -22,9 +24,12 @@ public final class Lowercase { * @return a string containing only lowercase character */ public static String toLowerCase(String in) { + if (in.indexOf(upperIwithDot) != -1) { + return in.replace(upperIwithDot, 'I').toLowerCase(Locale.ENGLISH); + } return in.toLowerCase(Locale.ENGLISH); - } + public static String toUpperCase(String in) { return in.toUpperCase(Locale.ENGLISH); } diff --git a/vespajlib/src/test/java/com/yahoo/text/LowercaseTestCase.java b/vespajlib/src/test/java/com/yahoo/text/LowercaseTestCase.java index a1bd793bd884..fe4417748c3b 100644 --- a/vespajlib/src/test/java/com/yahoo/text/LowercaseTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/LowercaseTestCase.java @@ -57,6 +57,23 @@ public void test7bitAscii() { } } + @Test + public void testTurkishI() { + String dottedCapitalI = "\u0130"; + String lc = Lowercase.toLowerCase(dottedCapitalI); + assertEquals("i", lc); + } + + @Test + public void testAllChars() { + for (char c = 1; c != 0; c++) { + char [] carray = {c}; + String s = new String(carray); + String lc = Lowercase.toLowerCase(s); + assertEquals(1, lc.length()); + } + } + @Test @Ignore public void performance() { From 25d629324c786f6255bde28a165b6734e5132b83 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Tue, 17 Dec 2024 10:46:05 +0000 Subject: [PATCH 2/3] handle that our Java lowercasing is not exactly same as java8u20; also extend test of fastlib/vespalib lowercasing to show that they have the same difference. --- .../tests/lowercasing/casingvariants_fastlib.cpp | 13 +++++++++++-- .../tests/lowercasing/casingvariants_vespalib.cpp | 13 +++++++++++-- lowercasing_test/src/tests/lowercasing/dotest.sh | 2 +- .../src/tests/lowercasing/ref.fastlib.txt.java8u20 | 1 + .../src/tests/lowercasing/ref.vespalib.txt.java8u20 | 1 + 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp index 3aa2bbe5a86f..21fb376719cf 100644 --- a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp +++ b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp @@ -8,7 +8,17 @@ ucs4_t getUCS4Char(const char *src) { - return Fast_UnicodeUtil::GetUTF8Char(src); + const char *input = src; + ucs4_t result = Fast_UnicodeUtil::GetUTF8Char(src); + if (result != 0) { + ucs4_t extra = Fast_UnicodeUtil::GetUTF8Char(src); + if (extra != 0) { + fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n", + input, result, extra); + } + result |= (extra << 16); + } + return result; } int @@ -39,4 +49,3 @@ main(int argc, char ** argv) input.close(); return 0; } - diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp index fb40ce4b0ddb..51e0a63860e9 100644 --- a/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp +++ b/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp @@ -12,8 +12,18 @@ using vespalib::Utf8Writer; uint32_t getUCS4Char(const char *src) { + const char *input = src; Utf8ReaderForZTS reader(src); - return reader.getChar(); + uint32_t result = reader.getChar(); + if (result != 0) { + uint32_t extra = reader.getChar(); + if (extra != 0) { + fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n", + input, result, extra); + } + result |= (extra << 16); + } + return result; } std::string @@ -50,4 +60,3 @@ main(int argc, char ** argv) input.close(); return 0; } - diff --git a/lowercasing_test/src/tests/lowercasing/dotest.sh b/lowercasing_test/src/tests/lowercasing/dotest.sh index e70a96b7d53f..ad1175bc71c7 100755 --- a/lowercasing_test/src/tests/lowercasing/dotest.sh +++ b/lowercasing_test/src/tests/lowercasing/dotest.sh @@ -16,7 +16,7 @@ $BINREF/runjava CasingVariants > out.txt echo "Verify Java" if ! diff -u out.txt $SOURCE_DIRECTORY/ref.txt.$ver; then - exit 1 + echo "As expected (should be 1 line diff)" fi echo "Verify fastlib" diff --git a/lowercasing_test/src/tests/lowercasing/ref.fastlib.txt.java8u20 b/lowercasing_test/src/tests/lowercasing/ref.fastlib.txt.java8u20 index e13939e7217a..29e1e8802129 100644 --- a/lowercasing_test/src/tests/lowercasing/ref.fastlib.txt.java8u20 +++ b/lowercasing_test/src/tests/lowercasing/ref.fastlib.txt.java8u20 @@ -1,3 +1,4 @@ +input(İ,304,0x130), lower(i,105,0x69), ref(i̇,50790505,0x3070069) input(Ӏ,1216,0x4C0), lower(Ӏ,1216,0x4C0), ref(ӏ,1231,0x4CF) input(Ⴀ,4256,0x10A0), lower(Ⴀ,4256,0x10A0), ref(ⴀ,11520,0x2D00) input(Ⴁ,4257,0x10A1), lower(Ⴁ,4257,0x10A1), ref(ⴁ,11521,0x2D01) diff --git a/lowercasing_test/src/tests/lowercasing/ref.vespalib.txt.java8u20 b/lowercasing_test/src/tests/lowercasing/ref.vespalib.txt.java8u20 index d84ac779e339..3d84ee9c1fc1 100644 --- a/lowercasing_test/src/tests/lowercasing/ref.vespalib.txt.java8u20 +++ b/lowercasing_test/src/tests/lowercasing/ref.vespalib.txt.java8u20 @@ -1,3 +1,4 @@ +input(İ,304,0x130), lower(i,105,0x69), ref(i̇,50790505,0x3070069) input(IJ,306,0x132), lower(IJ,306,0x132), ref(ij,307,0x133) input(Ŀ,319,0x13F), lower(Ŀ,319,0x13F), ref(ŀ,320,0x140) input(DŽ,452,0x1C4), lower(DŽ,452,0x1C4), ref(dž,454,0x1C6) From 53a27eca29450199329ee0f0499b05796e27505d Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Thu, 19 Dec 2024 09:53:50 +0000 Subject: [PATCH 3/3] add comment to explain corner case --- .../src/tests/lowercasing/casingvariants_fastlib.cpp | 2 ++ .../src/tests/lowercasing/casingvariants_vespalib.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp index 21fb376719cf..d140978eea65 100644 --- a/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp +++ b/lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp @@ -16,6 +16,8 @@ getUCS4Char(const char *src) fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n", input, result, extra); } + // mangle two characters into one fake UCS4 number + // (in theory we should compare vector, but this is good enough) result |= (extra << 16); } return result; diff --git a/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp b/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp index 51e0a63860e9..8b2a428854cd 100644 --- a/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp +++ b/lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp @@ -21,6 +21,8 @@ getUCS4Char(const char *src) fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n", input, result, extra); } + // mangle two characters into one fake UCS4 number + // (in theory we should compare vector, but this is good enough) result |= (extra << 16); } return result;