Skip to content

Commit

Permalink
Normalize text before running alphabet regexes
Browse files Browse the repository at this point in the history
  • Loading branch information
gregtatum committed Jun 18, 2024
1 parent c13aed6 commit acda70c
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions opuscleaner/filters/alpha_ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional
import argparse
import re
import unicodedata
from clean_common import CHARS

def parse_user_args():
Expand Down Expand Up @@ -30,6 +31,12 @@ def clean_parallel(src_lang: str, ratio_words_src: float, ratio_alpha_src: float
src = fields[-2].strip()
trg = fields[-1].strip()

# Normalize the text to NFC so that a base character followed by combining
# diacritical marks is composed into a single precomposed codepoint where one
# exists. This lets the text match the alphabet regexes more reliably.
src = unicodedata.normalize("NFC", src)
if trg:
trg = unicodedata.normalize("NFC", trg)

if src_lang in CHARS:
src_toks = src.split()
src_len = len(src_toks)
Expand Down

0 comments on commit acda70c

Please sign in to comment.