Skip to content

Commit

Permalink
fix: ignore single punctuation sentences and correctly strip extra whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
pprobst committed Oct 17, 2023
1 parent 51c7f2f commit 5474935
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,20 @@ def post_process_sentences(sentences: List[str], modify=True) -> List[str]:
else:
if random.random() < 0.25:
sentence += random.choice([" parágrafo", " nova linha"])
if sentence in [
"",
"ponto",
"parágrafo",
"ponto parágrafo",
"nova linha",
"ponto nova linha",
]:
continue
post_processed_sentences.append(sentence.strip())

post_processed_sentences = [
" ".join(sentence.split()) for sentence in post_processed_sentences
]
return post_processed_sentences


Expand Down

0 comments on commit 5474935

Please sign in to comment.