Skip to content

Commit

Permalink
Check-language-consistency
Browse files Browse the repository at this point in the history
  • Loading branch information
macchiati committed Dec 19, 2024
1 parent c51182b commit 8c74a38
Showing 1 changed file with 126 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,19 +1,5 @@
package org.unicode.cldr.unittest;

import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
Expand All @@ -23,6 +9,7 @@
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.tool.LikelySubtags;
Expand All @@ -44,9 +31,26 @@
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.StandardCodes.LstrType;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData;
import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData.Type;
import org.unicode.cldr.util.Validity;
import org.unicode.cldr.util.Validity.Status;

import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;

public class LikelySubtagsTest extends TestFmwk {

private static final Validity VALIDITY = Validity.getInstance();
Expand Down Expand Up @@ -915,4 +919,112 @@ public void testConflicts() {
}
}
}

/**
 * Joins message parts with an empty separator, rendering {@code null} parts as the text
 * "null". Declared static because it is a shared constant and does not depend on instance
 * state.
 */
static final Joiner JOIN = Joiner.on("").useForNull("null");

/**
* The first primary script in scripts must be the likely script for the language with no region.
* <pre>
* &lt;languageData>
* &lt;likelySubtag from="sr" to="sr_Cyrl_RS"/>
* &lt;likelySubtag from="sr_ME" to="sr_Latn_ME"/>
* </pre>
* So because of the above, we should see Cyrl as the first in the scripts list in the following (which we do).
* <pre>
* &lt;language type="sr" scripts="Cyrl Latn" territories="BA ME RS XK"/>
* <pre>
*/
public void testBasicLanguageDataConsistency() {
Map<String, String> likelyData = SUPPLEMENTAL_DATA_INFO.getLikelySubtags();
Set<String> langOnlyLikelyFrom = new LinkedHashSet<>();

for (Entry<String, String> likelyEntry : likelyData.entrySet()) {
CLDRLocale from = CLDRLocale.getInstance(likelyEntry.getKey());
CLDRLocale to = CLDRLocale.getInstance(likelyEntry.getValue());
String fromLang = from.getLanguage();
if (fromLang.equals("und")) {
continue;
}
if (!from.getScript().isEmpty()) {
continue;
}
boolean noFromRegion = from.getRegion().isEmpty();
if (noFromRegion) {
langOnlyLikelyFrom.add(fromLang);
}
String toScript = to.getScript();

final Map<Type, BasicLanguageData> basicLanguageDataMap =
SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataMap(fromLang);
if (basicLanguageDataMap == null) {
continue;
}
for (Entry<Type, BasicLanguageData> entry : basicLanguageDataMap.entrySet()) {
if (entry.getKey() == Type.secondary) { // skip secondaries
continue;
}
BasicLanguageData data = entry.getValue();
Set<String> scripts = data.getScripts();
// NOTE: this should be an immutable linked hash set to preserve order

String fromAndTo =
JOIN.join(
from.getDisplayName(),
" (",
from,
") ⇒ ",
to.getDisplayName(),
" (",
to,
")");

if (noFromRegion) {
// if there is no fromRegion, then it must match the *first* script.
String first = scripts.isEmpty() ? "missing" : scripts.iterator().next();
assertEquals(

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : aii (aii) ⇒ aii (Syriac,Iraq) (aii_Syrc_IQ): first primary languageData script = likely : expected "Syrc", got "Cyrl"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Azerbaijani (az) ⇒ Azerbaijani (Latin,Azerbaijan) (az_Latn_AZ): first primary languageData script = likely : expected "Latn", got "Arab"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Bosnian (bs) ⇒ Bosnian (Latin,Bosnia & Herzegovina) (bs_Latn_BA): first primary languageData script = likely : expected "Latn", got "Cyrl"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Chakma (ccp) ⇒ Chakma (Chakma,Bangladesh) (ccp_Cakm_BD): first primary languageData script = likely : expected "Cakm", got "Beng"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : cjs (cjs) ⇒ cjs (Latin,Russia) (cjs_Latn_RU): first primary languageData script = likely : expected "Latn", got "Cyrl"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : ctd (ctd) ⇒ ctd (Pau Cin Hau,Myanmar (Burma)) (ctd_Pauc_MM): first primary languageData script = likely : expected "Pauc", got "Latn"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Hausa (ha) ⇒ Hausa (Latin,Nigeria) (ha_Latn_NG): first primary languageData script = likely : expected "Latn", got "Arab"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Hmong Njua (hnj) ⇒ Hmong Njua (Nyiakeng Puachue Hmong,United States) (hnj_Hmnp_US): first primary languageData script = likely : expected "Hmnp", got "Laoo"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Kazakh (kk) ⇒ Kazakh (Cyrillic,Kazakhstan) (kk_Cyrl_KZ): first primary languageData script = likely : expected "Cyrl", got "Arab"

Check failure on line 984 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:984) Error: : Kurdish (ku) ⇒ Kurdish (Latin,Türkiye) (ku_Latn_TR): first primary languageData script = likely : expected "Latn", got "Arab"
fromAndTo + ": first primary languageData script = likely ",
toScript,
first);
} else {
// otherwise, the likely script must be somewhere in the list,
// but doesn't need to be first
assertTrue(
JOIN.join(
fromAndTo,
": primary languageData scripts ",
scripts,
" must contain ",
toScript),
scripts.contains(toScript));
}
}
}

Set<String> basicDataLanguages = SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataLanguages();
if (basicDataLanguages.contains("und")) {
errln(
"NOTE: should not have 'und' in basic data, eg no:\n\t<language type='und' territories='AQ CP HM' alt='secondary'/>");
basicDataLanguages =
Sets.difference(
SUPPLEMENTAL_DATA_INFO.getBasicLanguageDataLanguages(), Set.of("und"));
}

Set<String> inBasicLanguageDataButNotLikely =
Sets.difference(basicDataLanguages, langOnlyLikelyFrom);
if (!inBasicLanguageDataButNotLikely.isEmpty()) {
errln(
JOIN.join(
"Basic data languages missing some from likely",
inBasicLanguageDataButNotLikely));
}

Set<String> inLikelyButNotBasicLanguageData =
Sets.difference(langOnlyLikelyFrom, basicDataLanguages);
if (!inLikelyButNotBasicLanguageData.isEmpty()) {
warnln(

Check warning on line 1024 in tools/cldr-code/src/test/java/org/unicode/cldr/unittest/LikelySubtagsTest.java

View workflow job for this annotation

GitHub Actions / build

(LikelySubtagsTest.java:1024) Warning: Basic data languages missing some from likely (not serious issue)[aaa, aab, aac, aad, aae, aaf, aag, aah, aai, aak, aal, aan, aao, aap, aaq, aas, aat, aau, aaw, aax, aaz, aba, abb, abc, abd, abe, abf, abg, abh, abi, abl, abm, abn, abo, abp, abs, abt, abu, abv, abw, abx, aby, abz, aca, acb, acd, acf, acm, acn, acp, acq, acr, acs, act, acu, acv, acw, acx, acy, acz, adb, add, ade, adf, adg, adh, adi, adj, adl, adn, ado, adq, adr, adt, adu, adw, adx, adz, aea, aec, aee, aek, ael, aem, aeq, aer, aeu, aew, aey, aez, afb, afd, afe, afh, afi, afk, afn, afo, afp, afs, afu, afz, aga, agb, agc, agd, age, agf, agg, agh, agi, agj, agk, agl, agm, agn, ago, agr, ags, agt, agu, agv, agw, agx, agy, agz, aha, ahb, ahg, ahh, ahi, ahk, ahl, ahm, ahn, aho, ahp, ahr, ahs, aht, aia, aib, aic, aid, aie, aif, aig, aij, aik, ail, aim, aio, aip, aiq, air, ait, aiw, aix, aiy, aja, ajg, aji, ajn, ajw, ajz, akb, akc, akd, ake, akf, akg, akh, aki, akl, ako, akp, akq, akr, aks, akt, aku, akv, akw, ala, alc, ald, alf, alh, ali, alj, alk, all, alm, alo, alp, alq, alr, alu, alw, alx, aly, alz, ama, amb, amc, ame, amf, amg, ami, amj, amk, amm, amn, amp, amq, amr, ams, amt, amu, amv, amw, amx, amy, amz, ana, anb, anc, and, ane, anf, anh, ani, anj, ank, anl, anm, ano, anq, anr, ans, ant, anu, anv, anw, anx, any, anz, aoa, aob, aoc, aod, aoe, aof, aog, aoi, aoj, aok, aol, aom, aon, aor, aos, aot, aox, apb, ape, apf, apg, aph, api, apj, apk, apl, apm, apn, apo, app, apr, aps, apt, apu, apv, apw, apx, apy, apz, aqc, aqd, aqg, aqk, aqm, aqn, aqr, aqt, aqz, ard, are, arh, ari, arj, ark, arl, arr, aru, arx, asb, asc, ase, asg, ash, asi, asj, ask, asl, asn, aso, asr, ass, asu, asv, asx, asy, asz, ata, atb, atc, atd, ate, atg, ati, atk, atl, atm, atn, ato, atp, atq, atr, ats, att, atu, atv, atw, atx, aty, atz, aua, auc, aud, aug, auh, aui, auj, auk, aul, aum, aun, auo, aup, auq, aur, aut, auu, auw, auy, auz, avb, avd, avi, avl, avm, avn, avo, avs, avt, avu, avv, awb, awc, 
awe, awg, awh, awi, awk, awm, awn, awo, awr, aws, awt, awu, awv, aww, awx, awy, axb, axe, axg, axk, axl, axm, axx, aya, ayb, ayc, ayd, aye, ayg, ayh, ayi, ayk, ayl, ayn, ayo, ayp, ayq, ays, ayt, ayu, ayz, azb, azd, azg, azm, azn, azo, azt, azz, baa, bab, bac, bae, baf, bag, bah, baj, bao, bau, bav, baw, bay, bba, bbb, bbd, bbe, bbf, bbg, bbi, bbk, bbl, bbm, bbn, bbo, bbp, bbq, bbr, bbs, bbt, bbu, bbv, bbw, bbx, bby, bca, bcb, bcd, bce, bcf, bcg, bch, bcj, bck, bcm, bcn, bco, bcp, bcq, bcr, bcs, bct, bcu, bcv, bcw, bcy, bcz, bda, bdb, bdc, bdd, bde, bdf, bdg, bdh, bdi, bdj, bdk, bdl, bdm, bdn, bdo, bdp, bdq, bdr, bds, bdt, bdu, bdv, bdw, bdx, bdy, bdz, bea, beb, bec, bed, bee, bef, beh, bei, bek, beo, bep, beq, bes, bet, beu, bev, bex, bey, bfa, bfb, bfc, bfe, bff, bfg, bfh, bfj, bfl, bfm, bfn, bfo, bfp, bfs, bfu, bfw, bfx, bfz, bga, bgb, bgd, bgf, bgg, bgi, bgj, bgo, bgp, bgq, bgr, bgs, bgt, bgu, bgv, bgw, bgy, bgz, bha, bhc, bhd, bhe, bhf, bhg, bhh, bhj, bhl, bhm, bhn, bhp, bhq, bhr, bhs, bht, bhu, bhv, bhw, bhy, bhz, bia, bib, bid, bie, bif, big, bil, bim, bio, bip, biq, bir, bit, biu, biv, biw, biy, biz, bja, bjb, bjc, bjf, bjg, bjh, bji, bjk, bjl, bjm, bjo, bjp, bjr, bjs, bju, bjv, bjw, bjx, bjy, bjz, bka, bkc, bkd, bkf, bkg, bkh, bki, bkj, bkk, bkl, bkn, bko, bkp, bkq, bkr, bks, bkt, bkv, bkw, bkx, bky, bkz, blb, blc, bld, ble, blf, blh, bli, blj, blk, blm, bln, blp, blq, blr, bls, blv, blw, blx, bly, blz, bma, bmb, bmc, bmd, bme, bmf, bmg, bmh, bmi, bmj, bmk, bml, bmm, bmn, bmo, bmp, bmr, bms, bmu, bmv, bmw, bmx, bmz, bna, bnb, bnc, bnd, bne, bnf, bng, bni, bnj, bnk, bnm, bnn, bno, bnp, bnq, bnr, bns, bnu, bnv, bnw, bnx, bny, bnz, boa, bob, boe, bof, boh, boj, bok, bol, bom, bon, boo, bop, boq, bor, bot, bou, bov, bow, box, boy, boz, bpa, bpc, bpd, bpe, bpg, bph, bpi, bpj, bpk, bpl, bpm, bpo, bpp, bpq, bpr, bps, bpt, bpu, bpv, bpw, bpx, bpz, bqa, bqb, bqc, bqd, bqf, bqg, bqj, bqk, bql, bqm, bqo, bqp, bqq, bqr, bqs, bqt, bqu, bqw, bqx, bqz, brb, brc, brd, brf, 
brg, bri, brj, brk, brl, brm, brn, bro, brp, brq, brr, brs, brt, bru, brv, brw, bry, brz, bsa,
JOIN.join(
"Basic data languages missing some from likely (not serious issue)",
inLikelyButNotBasicLanguageData));
}
}
}

0 comments on commit 8c74a38

Please sign in to comment.