Skip to content

Commit

Permalink
Merge pull request #33044 from vespa-engine/arnej/capital-i-with-dot
Browse files Browse the repository at this point in the history
Special handling for "Latin Capital Letter I with Dot Above":
  • Loading branch information
arnej27959 authored Dec 19, 2024
2 parents 0cd4daa + 53a27ec commit d0c54c8
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 6 deletions.
15 changes: 13 additions & 2 deletions lowercasing_test/src/tests/lowercasing/casingvariants_fastlib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,19 @@
/**
 * Decode the leading character(s) of a zero-terminated UTF-8 string into
 * a single comparable number.
 *
 * The first decoded character is the base result. If it is non-zero, one
 * more character is decoded; when that one is non-zero too, a warning is
 * printed to stderr and it is OR-ed into the result shifted left by 16 bits
 * (e.g. U+0069 followed by U+0307 yields 0x3070069).
 *
 * NOTE(review): GetUTF8Char is presumably taking 'src' by reference and
 * advancing it past the decoded character - confirm against its declaration.
 *
 * @param src zero-terminated UTF-8 input
 * @return the first character, possibly combined with a second one
 */
ucs4_t
getUCS4Char(const char *src)
{
    // keep the start of the string; 'src' is reused for the second decode
    const char *input = src;
    ucs4_t result = Fast_UnicodeUtil::GetUTF8Char(src);
    if (result != 0) {
        ucs4_t extra = Fast_UnicodeUtil::GetUTF8Char(src);
        if (extra != 0) {
            fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n",
                    input, result, extra);
        }
        // mangle two characters into one fake UCS4 number
        // (in theory we should compare vector<UCS4>, but this is good enough);
        // a zero 'extra' leaves the result untouched, anything after the
        // second character is ignored.
        result |= (extra << 16);
    }
    return result;
}

int
Expand Down Expand Up @@ -39,4 +51,3 @@ main(int argc, char ** argv)
input.close();
return 0;
}

15 changes: 13 additions & 2 deletions lowercasing_test/src/tests/lowercasing/casingvariants_vespalib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,20 @@ using vespalib::Utf8Writer;
/**
 * Decode the leading character(s) of a zero-terminated UTF-8 string into
 * a single comparable number.
 *
 * The first character read is the base result. If it is non-zero, one more
 * character is read from the same reader; when that one is non-zero too, a
 * warning is printed to stderr and it is OR-ed into the result shifted left
 * by 16 bits (e.g. U+0069 followed by U+0307 yields 0x3070069).
 *
 * @param src zero-terminated UTF-8 input
 * @return the first character, possibly combined with a second one
 */
uint32_t
getUCS4Char(const char *src)
{
    // keep the start of the string for the warning message
    const char *input = src;
    Utf8ReaderForZTS reader(src);
    uint32_t result = reader.getChar();
    if (result != 0) {
        uint32_t extra = reader.getChar();
        if (extra != 0) {
            fprintf(stderr, "Warning: extra character from '%s' -> U+%04x U+%04X\n",
                    input, result, extra);
        }
        // mangle two characters into one fake UCS4 number
        // (in theory we should compare vector<UCS4>, but this is good enough);
        // a zero 'extra' leaves the result untouched, anything after the
        // second character is ignored.
        result |= (extra << 16);
    }
    return result;
}

std::string
Expand Down Expand Up @@ -50,4 +62,3 @@ main(int argc, char ** argv)
input.close();
return 0;
}

2 changes: 1 addition & 1 deletion lowercasing_test/src/tests/lowercasing/dotest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ $BINREF/runjava CasingVariants > out.txt

echo "Verify Java"
# A mismatch against the reference is reported but is not fatal here: per the
# message below a one-line difference is the expected outcome.
# NOTE(review): the size of the diff is not actually checked - any difference
# is accepted; tighten this if a stricter verification is wanted.
if ! diff -u out.txt "$SOURCE_DIRECTORY/ref.txt.$ver"; then
    echo "As expected (should be 1 line diff)"
fi

echo "Verify fastlib"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
input(İ,304,0x130), lower(i,105,0x69), ref(i̇,50790505,0x3070069)
input(Ӏ,1216,0x4C0), lower(Ӏ,1216,0x4C0), ref(ӏ,1231,0x4CF)
input(Ⴀ,4256,0x10A0), lower(Ⴀ,4256,0x10A0), ref(ⴀ,11520,0x2D00)
input(Ⴁ,4257,0x10A1), lower(Ⴁ,4257,0x10A1), ref(ⴁ,11521,0x2D01)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
input(İ,304,0x130), lower(i,105,0x69), ref(i̇,50790505,0x3070069)
input(IJ,306,0x132), lower(IJ,306,0x132), ref(ij,307,0x133)
input(Ŀ,319,0x13F), lower(Ŀ,319,0x13F), ref(ŀ,320,0x140)
input(DŽ,452,0x1C4), lower(DŽ,452,0x1C4), ref(dž,454,0x1C6)
Expand Down
7 changes: 6 additions & 1 deletion vespajlib/src/main/java/com/yahoo/text/Lowercase.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
*/
public final class Lowercase {

// U+0130, "Latin Capital Letter I with Dot Above". toLowerCase() swaps it
// for a plain 'I' before lowercasing so that it ends up as a single 'i'.
private static final char upperIwithDot = 0x0130;

/**
* Return a lowercased version of the given string. Since this is language
* independent, this is more of a case normalization operation than
Expand All @@ -22,9 +24,12 @@ public final class Lowercase {
* @return a string containing only lowercase character
*/
public static String toLowerCase(String in) {
    // String.replace(char, char) hands back the receiver itself when the
    // character does not occur, so strings without U+0130 go straight to
    // the plain English-locale lowercasing below.
    String dottedIReplaced = in.replace(upperIwithDot, 'I');
    return dottedIReplaced.toLowerCase(Locale.ENGLISH);
}

public static String toUpperCase(String in) {
    // Always uppercase with the English locale rather than the JVM default.
    final String uppercased = in.toUpperCase(Locale.ENGLISH);
    return uppercased;
}
Expand Down
17 changes: 17 additions & 0 deletions vespajlib/src/test/java/com/yahoo/text/LowercaseTestCase.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ public void test7bitAscii() {
}
}

@Test
public void testTurkishI() {
    // U+0130 (capital I with dot above) must lowercase to a plain ASCII 'i'
    assertEquals("i", Lowercase.toLowerCase("\u0130"));
}

@Test
public void testAllChars() {
    // Visit every char value from 1 up to 0xFFFF; the counter wraps around
    // to 0 after the last one, which ends the loop.
    char current = 1;
    do {
        String single = String.valueOf(current);
        String lowered = Lowercase.toLowerCase(single);
        // lowercasing a one-char string must never change its length
        assertEquals(1, lowered.length());
        current++;
    } while (current != 0);
}

@Test
@Ignore
public void performance() {
Expand Down

0 comments on commit d0c54c8

Please sign in to comment.