From 3aca18a90e77380f8f08b683aa5edab5f3c64dc8 Mon Sep 17 00:00:00 2001 From: xingchensong Date: Thu, 4 Jul 2024 17:13:48 +0800 Subject: [PATCH 1/2] [itn] add whitelist --- itn/chinese/data/default/whitelist.tsv | 15 +++++++++++++++ itn/chinese/test/data/normalizer.txt | 1 + ...r_disable_standalone_number_disable_0_to_9.txt | 3 ++- ...er_disable_standalone_number_enable_0_to_9.txt | 3 ++- ...er_enable_standalone_number_disable_0_to_9.txt | 3 ++- tn/processor.py | 4 ++-- 6 files changed, 24 insertions(+), 5 deletions(-) diff --git a/itn/chinese/data/default/whitelist.tsv b/itn/chinese/data/default/whitelist.tsv index 896f41e..5739236 100644 --- a/itn/chinese/data/default/whitelist.tsv +++ b/itn/chinese/data/default/whitelist.tsv @@ -61,3 +61,18 @@ 五台山 五台山 六盘水 六盘水 八宿 八宿 +十二五 十二五 +十三五 十三五 +十四五 十四五 +几十万 几十万 +几百万 几百万 +几千万 几千万 +十几万 十几万 +二十几万 二十几万 +三十几万 三十几万 +四十几万 四十几万 +五十几万 五十几万 +六十几万 六十几万 +七十几万 七十几万 +八十几万 八十几万 +九十几万 九十几万 diff --git a/itn/chinese/test/data/normalizer.txt b/itn/chinese/test/data/normalizer.txt index 986a52e..665db8e 100644 --- a/itn/chinese/test/data/normalizer.txt +++ b/itn/chinese/test/data/normalizer.txt @@ -30,3 +30,4 @@ 可以拨打幺二三零六来咨询 => 可以拨打12306来咨询 二点五平方电线,五,五十五,疑是银河落九天,十二块五 => 2.5平方电线,5,55,疑是银河落9天,12块5 三百九十九三盒 => 3993盒 +十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 diff --git a/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt b/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt index 53b4bc6..6ebf578 100644 --- a/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt @@ -38,4 +38,5 @@ 这是零百 => 这是零百 这是零千 => 这是零千 这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天 -这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 \ No newline at end of file +这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 +十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 diff --git a/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt b/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt index af35c1d..85caa69 100644 --- a/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt @@ -7,4 +7,5 @@ 这是零百 => 这是零百 这是零千 => 这是零千 这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天 -这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年 \ No newline at end of file +这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年 +十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 diff --git a/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt b/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt index 1850adf..b645bc0 100644 --- a/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt +++ b/itn/chinese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt @@ -38,4 +38,5 @@ 这是零百 => 这是零百 这是零千 => 这是零千 这是一百一个,一千两位,一万三天 => 这是100一个,1000两位,10000三天 -这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 \ No newline at end of file +这是九百九周,九千九月,九万九年 => 这是900九周,9000九月,90000九年 +十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 diff --git a/tn/processor.py b/tn/processor.py index 429a129..53cf5b4 100644 --- a/tn/processor.py +++ b/tn/processor.py @@ -47,8 +47,8 @@ def __init__(self, name, ordertype="tn"): self.DELETE_ZERO_OR_ONE_SPACE = delete(closure(self.SPACE, 0, 1)) self.MIN_NEG_WEIGHT = -0.0001 self.TO_LOWER = union(*[ - cross(x, y) - for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) + cross(x, y) for x, y in zip( + string.ascii_uppercase, string.ascii_lowercase, strict=True) ]) self.TO_UPPER = invert(self.TO_LOWER) From 4f94a405e660e1e59a6dc3e4d4abc7a3f701bf8e Mon Sep 17 00:00:00 2001 From: xingchensong Date: Thu, 4 Jul 2024 17:16:47 +0800 Subject: [PATCH 2/2] [itn] add whitelist --- tn/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tn/processor.py b/tn/processor.py index 53cf5b4..429a129 100644 --- a/tn/processor.py +++ b/tn/processor.py @@ -47,8 +47,8 @@ def __init__(self, name, ordertype="tn"): self.DELETE_ZERO_OR_ONE_SPACE = delete(closure(self.SPACE, 0, 1)) self.MIN_NEG_WEIGHT = -0.0001 self.TO_LOWER = union(*[ - cross(x, y) for x, y in zip( - string.ascii_uppercase, string.ascii_lowercase, strict=True) + cross(x, y) + for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) self.TO_UPPER = invert(self.TO_LOWER)