diff --git a/minlp-tokenizer/minlptokenizer/tokenizer.py b/minlp-tokenizer/minlptokenizer/tokenizer.py index fb401a1b..e4b884a1 100644 --- a/minlp-tokenizer/minlptokenizer/tokenizer.py +++ b/minlp-tokenizer/minlptokenizer/tokenizer.py @@ -55,7 +55,7 @@ def format_string(ustring): inside_code = ord(uchar) if inside_code == 12288: # 全角空格直接转换 inside_code = 32 - elif 65281 <= inside_code <= 65374: # 全角字符(除空格)转化 + elif 65296 <= inside_code <= 65305 or 65313 <= inside_code <= 65339: # 全角字符(除空格和英文标点)转化 inside_code -= 65248 half_wide_string += chr(inside_code) diff --git a/minlp-tokenizer/setup.py b/minlp-tokenizer/setup.py index 48caf0f0..0d0d2c35 100644 --- a/minlp-tokenizer/setup.py +++ b/minlp-tokenizer/setup.py @@ -33,7 +33,7 @@ setup( name='minlp-tokenizer', - version='3.3.0', + version='3.3.1', description='MiNLP-Tokenizer中文分词工具', author='Yuankai Guo, Liang Shi, Yupeng Chen', author_email='guoyuankai@xiaomi.com, shiliang1@xiaomi.com',