Normalize text before running alphabet regexes

hplt-project · Jun 18, 2024 · acda70c · acda70c
1 parent c13aed6
commit acda70c
Showing 1 changed file with 7 additions and 0 deletions.
diff --git a/opuscleaner/filters/alpha_ratio.py b/opuscleaner/filters/alpha_ratio.py
@@ -3,6 +3,7 @@
 from typing import Optional
 import argparse
 import re
+import unicodedata
 from clean_common import CHARS
 
 def parse_user_args():
@@ -30,6 +31,12 @@ def clean_parallel(src_lang: str, ratio_words_src: float, ratio_alpha_src: float
             src = fields[-2].strip()
             trg = fields[-1].strip()
 
+        # Normalize the text to NFC, so that a letter and its combining diacritical marks
+        # are composed into a single codepoint where possible. This will better match the alphabet regexes.
+        src = unicodedata.normalize("NFC", src)
+        if trg:
+            trg = unicodedata.normalize("NFC", trg)
+
         if src_lang in CHARS:
             src_toks = src.split()
             src_len = len(src_toks)