From 5474935ffbf0a329cb892a183cd97cc8be9cebbb Mon Sep 17 00:00:00 2001 From: pprobst Date: Tue, 17 Oct 2023 10:57:03 -0300 Subject: [PATCH] fix: ignore single punctuation sentences and correctly strip extra whitespace --- utils/text.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/utils/text.py b/utils/text.py index 99d7db1..ab5057c 100644 --- a/utils/text.py +++ b/utils/text.py @@ -134,7 +134,20 @@ def post_process_sentences(sentences: List[str], modify=True) -> List[str]: else: if random.random() < 0.25: sentence += random.choice([" parágrafo", " nova linha"]) + if sentence in [ + "", + "ponto", + "parágrafo", + "ponto parágrafo", + "nova linha", + "ponto nova linha", + ]: + continue post_processed_sentences.append(sentence.strip()) + + post_processed_sentences = [ + " ".join(sentence.split()) for sentence in post_processed_sentences + ] return post_processed_sentences