From 683deee5a0efba36759a622d17cb2de9a7195855 Mon Sep 17 00:00:00 2001 From: Pablo Diaz Date: Fri, 17 May 2024 12:53:02 -0500 Subject: [PATCH] english support --- examples/alpha/en/main.go | 13 ++ examples/alpha/{ => es}/main.go | 3 +- itn/base.go | 218 ++++++++++------------------- itn/en_test.go | 173 +++++++++++++++++++++++ itn/es_test.go | 4 +- itn/i18n.go | 241 ++++++++++++++++++++++++++++++++ 6 files changed, 506 insertions(+), 146 deletions(-) create mode 100644 examples/alpha/en/main.go rename examples/alpha/{ => es}/main.go (80%) create mode 100644 itn/en_test.go create mode 100644 itn/i18n.go diff --git a/examples/alpha/en/main.go b/examples/alpha/en/main.go new file mode 100644 index 0000000..bf6017b --- /dev/null +++ b/examples/alpha/en/main.go @@ -0,0 +1,13 @@ +package main + +import ( + "github.com/pablodz/itn/itn" +) + +func main() { + itn.SetDebug(true) + + processor, _ := itn.NewLanguage(itn.English) + new_string := processor.Alpha2Digit("first, second, third, fourth, fifth, sixth, seventh, eighth, ninth, tenth.", false, true, 3) + println(new_string) +} diff --git a/examples/alpha/main.go b/examples/alpha/es/main.go similarity index 80% rename from examples/alpha/main.go rename to examples/alpha/es/main.go index 7b6fc72..dbc5a00 100644 --- a/examples/alpha/main.go +++ b/examples/alpha/es/main.go @@ -5,10 +5,9 @@ import ( ) func main() { - itn.SetDebug(true) - processor := itn.NewLanguageES() + processor, _ := itn.NewLanguage(itn.Spanish) new_string := processor.Alpha2Digit("uno dos quince", false, true, 3) println(new_string) } diff --git a/itn/base.go b/itn/base.go index 9d2ccbd..9fffd30 100644 --- a/itn/base.go +++ b/itn/base.go @@ -7,6 +7,7 @@ import ( ) type Language struct { + LangCode LanguageCode Multipliers map[string]int Units map[string]int STens map[string]int @@ -23,143 +24,9 @@ type Language struct { And string NeverIfAlone []string Relaxed map[string]RelaxTuple - Simplify_check_coef_appliable bool -} - -func NewLanguageES() *Language { - l := &Language{ - Multipliers: map[string]int{ - "mil": 1000, - "miles": 1000, - "millon": 1000000, - "millón": 1000000, - "millones": 1000000, - }, - Units: map[string]int{ - "uno": 1, - "dos": 2, - "tres": 3, - "cuatro": 4, - "cinco": 5, - "seis": 6, - "siete": 7, - "ocho": 8, - "nueve": 9, - "un": 1, // optional - "una": 1, // optional - }, - STens: map[string]int{ - "diez": 10, - "once": 11, - "doce": 12, - "trece": 13, - "catorce": 14, - "quince": 15, - "dieciseis": 16, - "diecisiete": 17, - "dieciocho": 18, - "diecinueve": 19, - "veinte": 20, - "veintiuno": 21, - "veintidos": 22, - "veintitres": 23, - "veinticuatro": 24, - "veinticinco": 25, - "veintiseis": 26, - "veintisiete": 27, - "veintiocho": 28, - "veintinueve": 29, - "veintitrés": 23, // with accent - "veintidós": 22, // with accent - "dieciséis": 16, // with typo - "veintiséis": 26, // with typo - }, - MTens: map[string]int{ - "treinta": 30, - "cuarenta": 40, - "cincuenta": 50, - "sesenta": 60, - "setenta": 70, - "ochenta": 80, - "noventa": 90, - }, - MTensWSTens: []string{}, - Hundred: map[string]int{ - "cien": 100, - "ciento": 100, - "cienta": 100, - "doscientos": 200, - "trescientos": 300, - "cuatrocientos": 400, - "quinientos": 500, - "seiscientos": 600, - "setecientos": 700, - "ochocientos": 800, - "novecientos": 900, - "doscientas": 200, // with feminine - "trescientas": 300, // with feminine - "cuatrocientas": 400, // with feminine - "quinientas": 500, // with feminine - "seiscientas": 600, // with feminine - "setecientas": 700, // with feminine - "ochocientas": 800, // with feminine - "novecientas": 900, // with feminine - }, - Sign: map[string]string{ - "mas": "+", - "menos": "-", - }, - Zero: []string{ - "cero", - }, - DecimalSep: "coma", - DecimalSYM: ".", - AndNums: []string{ - "un", - "uno", - "una", - "dos", - "tres", - "cuatro", - "cinco", - "seis", - "siete", - "ocho", - "nueve", - }, - - And: "y", - NeverIfAlone: []string{ - "un", - // "uno", // Telephony first - "una", - }, - Relaxed: map[string]RelaxTuple{}, - } - - // deep copy from l.multipliers - l.Numbers = map[string]int{ - "mil": 1000, - "miles": 1000, - "millon": 1000000, - "millón": 1000000, - "millones": 1000000, - } - - for k, v := range l.Units { - l.Numbers[k] = v - } - for k, v := range l.STens { - l.Numbers[k] = v - } - for k, v := range l.MTens { - l.Numbers[k] = v - } - for k, v := range l.Hundred { - l.Numbers[k] = v - } - - return l + Simplify_check_coef_appliable bool // Optional + RadMap map[string]string // Optional + Composites map[string]int // Optional } type RelaxTuple struct { @@ -168,18 +35,85 @@ type RelaxTuple struct { } func (lg *Language) Ord2Card(word string) string { - return "" + switch lg.LangCode { + case English: + logPrintf(">>>> Ord2Card.0 %s", word) + plurSuff := strings.HasSuffix(word, "ths") + singSuff := strings.HasSuffix(word, "th") + source := "" + if !(plurSuff || singSuff) { + if strings.HasSuffix(word, "first") { + source = strings.ReplaceAll(word, "first", "one") + } else if strings.HasSuffix(word, "second") { + source = strings.ReplaceAll(word, "second", "two") + } else if strings.HasSuffix(word, "third") { + source = strings.ReplaceAll(word, "third", "three") + } else { + logPrintf(">>>> Ord2Card.1 %s", word) + return "" + } + } else { + if plurSuff { + source = word[:len(word)-3] + } else { + source = word[:len(word)-2] + } + } + + if containsKey(lg.RadMap, source) { + source = lg.RadMap[source] + } else if strings.HasSuffix(source, "ie") { + source = source[:len(source)-2] + "y" + } else if strings.HasSuffix(source, "fif") { + source = source[:len(source)-1] + "ve" + } else if strings.HasSuffix(source, "eigh") { + source = source + "t" + } else if strings.HasSuffix(source, "nin") { + source = source + "e" + } + + if !containsKey(lg.Numbers, source) { + logPrintf(">>>> Ord2Card.2 %s", source) + return "" + } + + logPrintf(">>>> Ord2Card.3 %s", source) + return source + case Spanish: + return "" + default: + return "" + } } func (lg *Language) NumOrd(digits string, originalWord string) string { - if strings.HasSuffix(originalWord, "o") { - return fmt.Sprintf("%sº", digits) + switch lg.LangCode { + case English: + sf := "" + if strings.HasSuffix(originalWord, "s") { + sf = originalWord[len(originalWord)-3:] + } else { + sf = originalWord[len(originalWord)-2:] + } + + return fmt.Sprintf("%s%s", digits, sf) + + case Spanish: + + if strings.HasSuffix(originalWord, "o") { + return fmt.Sprintf("%sº", digits) + } + return fmt.Sprintf("%sª", digits) } - return fmt.Sprintf("%sª", digits) + + return "ERROR" } func (lg *Language) Normalize(word string) string { - return word + switch lg.LangCode { + default: + return word + } } func (lg *Language) NotNumericWord(word string) bool { diff --git a/itn/en_test.go b/itn/en_test.go new file mode 100644 index 0000000..d5e4107 --- /dev/null +++ b/itn/en_test.go @@ -0,0 +1,173 @@ +package itn + +import ( + "testing" +) + +func TestAlpha2DigitEN(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ + { + input: "twenty-five cows, twelve chickens and one hundred twenty five kg of potatoes.", + output: "25 cows, 12 chickens and 125 kg of potatoes.", + }, + { + input: "one thousand two hundred sixty-six dollars.", + output: "1266 dollars.", + }, + { + input: "one two three four twenty fifteen", + output: "1 2 3 4 20 15", + }, + { + input: "twenty-one, thirty-one.", + output: "21, 31.", + }, + { + input: "one two three four twenty five.", + output: "1 2 3 4 25.", + }, + { + input: "one two three four twenty, five.", + output: "1 2 3 4 20, 5.", + }, + { + input: "thirty-four = thirty four", + output: "34 = 34", + }, + { + input: "forty five hundred thirty eight dollars and eighteen cents", + output: "4538 dollars and 18 cents", + }, + { + input: "plus thirty-three nine sixty zero six twelve twenty-one", + output: "+33 9 60 06 12 21", + }, + { + input: "plus thirty-three nine sixty o six twelve twenty-one", + output: "+33 9 60 06 12 21", + }, + { + input: "zero nine sixty zero six twelve twenty-one", + output: "09 60 06 12 21", + }, + { + input: "o nine sixty o six twelve twenty-one", + output: "09 60 06 12 21", + }, + { + input: "My name is o s c a r.", + output: "My name is o s c a r.", + }, + { + input: "fifty sixty thirty and eleven", + output: "50 60 30 and 11", + }, + { + input: "thirteen thousand zero ninety", + output: "13000 090", + }, + { + input: "thirteen thousand o ninety", + output: "13000 090", + }, + { + input: "zero", + output: "0", + }, + { + input: "zero love", + output: "0 love", + }, + { + input: "Fifth third second twenty-first hundredth one thousand two hundred thirtieth twenty-fifth thirty-eighth forty-ninth.", + output: "5th third second 21st 100th 1230th 25th 38th 49th.", + }, + { + input: "first, second, third, fourth, fifth, sixth, seventh, eighth, ninth, tenth.", + output: "first, second, third, 4th, 5th, 6th, 7th, 8th, 9th, 10th.", + }, + { + input: "twenty second position at the twenty first event lost after the first second", + output: "22nd position at the 21st event lost after the first second", + }, + { + input: "twelve point ninety-nine, one hundred twenty point zero five, one hundred twenty point o five, one point two hundred thirty-six.", + output: "12.99, 120.05, 120.05, 1.236.", + }, + { + input: "point fifteen", + output: "0.15", + }, + { + input: "The average density is zero point five", + output: "The average density is 0.5", + }, + { + input: "This is the one I'm looking for. One moment please! Twenty one cats. One two three four!", + output: "This is the one I'm looking for. One moment please! 21 cats. 1 2 3 4!", + }, + { + input: "No one is innocent. Another one bites the dust.", + output: "No one is innocent. Another one bites the dust.", + }, + { + input: "one cannot know", + output: "one cannot know", + }, + { + input: "the sixth one", + output: "the 6th one", + }, + { + input: "No one. Another one. One one. Twenty one", + output: "No one. Another one. 1 1. 21", + }, + { + input: "One second please! twenty second is parsed as twenty-second and is different from twenty seconds.", + output: "One second please! 22nd is parsed as 22nd and is different from 20 seconds.", + }, + { + input: "FIFTEEN ONE TEN ONE", + output: "15 1 10 1", + }, + } + + for _, tt := range tests { + processor, _ := NewLanguage(English) + new_string := processor.Alpha2Digit(tt.input, false, true, 3) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} + +func TestAlpha2DigitENSpecialConfig(t *testing.T) { + type test struct { + input string + output string + } + + tests := []test{ + { + input: "first, second, third, fourth, fifth, sixth, seventh, eighth, ninth, tenth.", + output: "1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th.", + }, + } + + for _, tt := range tests { + processor, _ := NewLanguage(English) + new_string := processor.Alpha2Digit(tt.input, false, true, 0) + if new_string != tt.output { + t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) + } else { + t.Logf("✅ Expected <%s>, got <%s>", tt.output, new_string) + } + } +} diff --git a/itn/es_test.go b/itn/es_test.go index e3313b3..c6128d1 100644 --- a/itn/es_test.go +++ b/itn/es_test.go @@ -4,7 +4,7 @@ import ( "testing" ) -func TestAlpha2Digit(t *testing.T) { +func TestAlpha2DigitES(t *testing.T) { type test struct { input string output string @@ -110,7 +110,7 @@ func TestAlpha2Digit(t *testing.T) { } for _, tt := range tests { - processor := NewLanguageES() + processor, _ := NewLanguage(Spanish) new_string := processor.Alpha2Digit(tt.input, false, true, 3) if new_string != tt.output { t.Errorf("❌ Expected <%s>, got <%s>", tt.output, new_string) diff --git a/itn/i18n.go b/itn/i18n.go new file mode 100644 index 0000000..37b45a2 --- /dev/null +++ b/itn/i18n.go @@ -0,0 +1,241 @@ +package itn + +import ( + "fmt" + "maps" +) + +type LanguageCode int + +const ( + Spanish LanguageCode = iota + English + French + Portuguese +) + +func NewLanguage(LangCode LanguageCode) (*Language, error) { + switch LangCode { + case Spanish: + l := &Language{ + LangCode: LangCode, + Multipliers: map[string]int{ + "mil": 1000, + "miles": 1000, + "millon": 1000000, + "millón": 1000000, + "millones": 1000000, + }, + Units: map[string]int{ + "uno": 1, + "dos": 2, + "tres": 3, + "cuatro": 4, + "cinco": 5, + "seis": 6, + "siete": 7, + "ocho": 8, + "nueve": 9, + "un": 1, // optional + "una": 1, // optional + }, + STens: map[string]int{ + "diez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dieciseis": 16, + "diecisiete": 17, + "dieciocho": 18, + "diecinueve": 19, + "veinte": 20, + "veintiuno": 21, + "veintidos": 22, + "veintitres": 23, + "veinticuatro": 24, + "veinticinco": 25, + "veintiseis": 26, + "veintisiete": 27, + "veintiocho": 28, + "veintinueve": 29, + "veintitrés": 23, // with accent + "veintidós": 22, // with accent + "dieciséis": 16, // with typo + "veintiséis": 26, // with typo + }, + MTens: map[string]int{ + "treinta": 30, + "cuarenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "ochenta": 80, + "noventa": 90, + }, + MTensWSTens: []string{}, + Hundred: map[string]int{ + "cien": 100, + "ciento": 100, + "cienta": 100, + "doscientos": 200, + "trescientos": 300, + "cuatrocientos": 400, + "quinientos": 500, + "seiscientos": 600, + "setecientos": 700, + "ochocientos": 800, + "novecientos": 900, + "doscientas": 200, // with feminine + "trescientas": 300, // with feminine + "cuatrocientas": 400, // with feminine + "quinientas": 500, // with feminine + "seiscientas": 600, // with feminine + "setecientas": 700, // with feminine + "ochocientas": 800, // with feminine + "novecientas": 900, // with feminine + }, + Sign: map[string]string{ + "mas": "+", + "menos": "-", + }, + Zero: []string{ + "cero", + }, + DecimalSep: "coma", + DecimalSYM: ".", + AndNums: []string{ + "un", + "uno", + "una", + "dos", + "tres", + "cuatro", + "cinco", + "seis", + "siete", + "ocho", + "nueve", + }, + + And: "y", + NeverIfAlone: []string{ + "un", + // "uno", // Telephony first + "una", + }, + Relaxed: map[string]RelaxTuple{}, + Composites: map[string]int{}, + } + + // deep copy from l.multipliers + l.Numbers = maps.Clone(l.Multipliers) + maps.Copy(l.Numbers, l.Units) + maps.Copy(l.Numbers, l.STens) + maps.Copy(l.Numbers, l.MTens) + maps.Copy(l.Numbers, l.Hundred) + maps.Copy(l.Numbers, l.MTens) + + return l, nil + + case English: + + l := &Language{ + LangCode: LangCode, + Multipliers: map[string]int{ + "hundred": 100, + "hundreds": 100, + "thousand": 1000, + "thousands": 1000, + "million": 1000000, + "millions": 1000000, + "billion": 1000000000, + "billions": 1000000000, + "trillion": 1000000000000, + "trillions": 1000000000000, + }, + Units: map[string]int{ + "one": 1, + "two": 2, + "three": 3, + "four": 4, + "five": 5, + "six": 6, + "seven": 7, + "eight": 8, + "nine": 9, + }, + STens: map[string]int{ + "ten": 10, + "eleven": 11, + "twelve": 12, + "thirteen": 13, + "fourteen": 14, + "fifteen": 15, + "sixteen": 16, + "seventeen": 17, + "eighteen": 18, + "nineteen": 19, + }, + MTens: map[string]int{ + "twenty": 20, + "thirty": 30, + "forty": 40, + "fifty": 50, + "sixty": 60, + "seventy": 70, + "eighty": 80, + "ninety": 90, + }, + MTensWSTens: []string{}, + Hundred: map[string]int{ + "hundred": 100, + "hundreds": 100, + }, + Sign: map[string]string{ + "plus": "+", + "minus": "-", + }, + Zero: []string{ + "zero", + "o", + }, + DecimalSep: "point", + DecimalSYM: ".", + AndNums: []string{}, + + And: "and", + NeverIfAlone: []string{ + "one", + "o", + }, + Relaxed: map[string]RelaxTuple{}, + RadMap: map[string]string{ + "fif": "five", + "eigh": "eight", + "nin": "nine", + "twelf": "twelve", + }, + Composites: map[string]int{}, + } + + for k1, v1 := range l.MTens { + for k2, v2 := range l.Units { + l.Composites[fmt.Sprintf("%s-%s", k1, k2)] = v1 + v2 + } + } + + l.Numbers = maps.Clone(l.Multipliers) + maps.Copy(l.Numbers, l.Units) + maps.Copy(l.Numbers, l.STens) + maps.Copy(l.Numbers, l.MTens) + maps.Copy(l.Numbers, l.Hundred) + maps.Copy(l.Numbers, l.Composites) + + return l, nil + + default: + return nil, fmt.Errorf("Language not implemented") + } +}