diff --git a/opuscleaner/filters/alpha_ratio.py b/opuscleaner/filters/alpha_ratio.py index f140537..a901842 100755 --- a/opuscleaner/filters/alpha_ratio.py +++ b/opuscleaner/filters/alpha_ratio.py @@ -3,6 +3,7 @@ from typing import Optional import argparse import re +import unicodedata from clean_common import CHARS def parse_user_args(): @@ -30,6 +31,12 @@ def clean_parallel(src_lang: str, ratio_words_src: float, ratio_alpha_src: float src = fields[-2].strip() trg = fields[-1].strip() + # Ensure the text is normalized, so that combining diacritical marks are + # represented as a single codepoint. This will better match the alphabet regexes. + src = unicodedata.normalize("NFC", src) + if trg: + trg = unicodedata.normalize("NFC", trg) + if src_lang in CHARS: src_toks = src.split() src_len = len(src_toks)