From aa60c0e783c1db3b2af3873ed00c36d19f8c74d7 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 14:53:09 +0000 Subject: [PATCH 1/7] Add a filter that fixes sentences that have punctuation mismatch at the end --- opuscleaner/filters/fix_sent_final_punct.json | 6 +++++ opuscleaner/filters/fix_sent_final_punct.py | 24 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 opuscleaner/filters/fix_sent_final_punct.json create mode 100755 opuscleaner/filters/fix_sent_final_punct.py diff --git a/opuscleaner/filters/fix_sent_final_punct.json b/opuscleaner/filters/fix_sent_final_punct.json new file mode 100644 index 0000000..e8ad783 --- /dev/null +++ b/opuscleaner/filters/fix_sent_final_punct.json @@ -0,0 +1,6 @@ +{ + "type": "bilingual", + "description": "Fixes mismatched punctuation at the end of the sentences. Works for latin/cyrillic based languages, WILL BREAK CJK AND ANY LANGUAGE THAT USES NON ENGLISH LIKE SENTENCE ENDING TOKENS.", + "parameters": {}, + "command": "./fix_sent_final_punct.py" +} diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py new file mode 100755 index 0000000..a8f7ee3 --- /dev/null +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import sys + +my_punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '»', '«', '“', '”'] + +for line in sys.stdin: + src, trg = line.rstrip("\r\n").split("\t") + # Sometimes we have a space between the final letter and the punctuation + if src[-1] in my_punct and src[-2] == " ": + src = src[:-2] + src[-1] + if trg[-1] in my_punct and trg[-2] == " ": + trg = trg[:-2] + trg[-1] + # check for the french quotes special case + if src[-1] == '»' or src[-1] == '«' and trg[-1] not in my_punct: + trg = trg + '"' + elif trg[-1] == '»' or trg[-1] == '«' and src[-1] not in my_punct: + src = 
src + '"' + elif src[-1] in my_punct and trg[-1] not in my_punct: + trg = trg + src[-1] + elif trg[-1] in my_punct and src[-1] not in my_punct: + src = src + trg[-1] + elif trg[-1] in my_punct and src[-1] in my_punct and src[-1] != trg[-1]: + trg = trg[:-1] + src[-1] + print(src + '\t' + trg) From 701ead2b2833627f1af06db58c8aec46a69ca16e Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 15:06:09 +0000 Subject: [PATCH 2/7] Handle empty lines --- opuscleaner/filters/fix_sent_final_punct.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index a8f7ee3..aa1d7d8 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -5,11 +5,16 @@ for line in sys.stdin: src, trg = line.rstrip("\r\n").split("\t") + if len(src) == 0 or len(trg) == 0: + print(src + '\t' + trg) + continue + # Sometimes we have a space between the final letter and the punctuation - if src[-1] in my_punct and src[-2] == " ": + if len(src) >= 2 and src[-1] in my_punct and src[-2] == " ": src = src[:-2] + src[-1] - if trg[-1] in my_punct and trg[-2] == " ": + if len(trg) >= 2 and trg[-1] in my_punct and trg[-2] == " ": trg = trg[:-2] + trg[-1] + # check for the french quotes special case if src[-1] == '»' or src[-1] == '«' and trg[-1] not in my_punct: trg = trg + '"' From a9d4c11043d86e48570984488f181b607d6e2d2d Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 15:15:44 +0000 Subject: [PATCH 3/7] use sets and proper condition around french quotes --- opuscleaner/filters/fix_sent_final_punct.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index aa1d7d8..2aaaf2d 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -1,7 +1,7 @@ 
#!/usr/bin/env python3 import sys -my_punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '»', '«', '“', '”'] +my_punct = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '»', '«', '“', '”'} for line in sys.stdin: src, trg = line.rstrip("\r\n").split("\t") @@ -16,9 +16,9 @@ trg = trg[:-2] + trg[-1] # check for the french quotes special case - if src[-1] == '»' or src[-1] == '«' and trg[-1] not in my_punct: + if (src[-1] == '»' or src[-1] == '«') and trg[-1] not in my_punct: trg = trg + '"' - elif trg[-1] == '»' or trg[-1] == '«' and src[-1] not in my_punct: + elif (trg[-1] == '»' or trg[-1] == '«') and src[-1] not in my_punct: src = src + '"' elif src[-1] in my_punct and trg[-1] not in my_punct: trg = trg + src[-1] From 1fd0c5dce16010562241801c3612311bfb3c15dc Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 15:22:52 +0000 Subject: [PATCH 4/7] Extra special cases for French, as it is an extra special language. 
--- opuscleaner/filters/fix_sent_final_punct.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index 2aaaf2d..dd6cb56 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -9,10 +9,10 @@ print(src + '\t' + trg) continue - # Sometimes we have a space between the final letter and the punctuation - if len(src) >= 2 and src[-1] in my_punct and src[-2] == " ": + # Sometimes we have a space between the final letter and the punctuation, which is wrong except if using french quotes + if len(src) >= 2 and src[-1] in my_punct and src[-2] == " " and src[-1] != '»' and src[-1] != '«': src = src[:-2] + src[-1] - if len(trg) >= 2 and trg[-1] in my_punct and trg[-2] == " ": + if len(trg) >= 2 and trg[-1] in my_punct and trg[-2] == " " and trg[-1] != '»' and trg[-1] != '«': trg = trg[:-2] + trg[-1] # check for the french quotes special case @@ -24,6 +24,6 @@ trg = trg + src[-1] elif trg[-1] in my_punct and src[-1] not in my_punct: src = src + trg[-1] - elif trg[-1] in my_punct and src[-1] in my_punct and src[-1] != trg[-1]: + elif trg[-1] in my_punct and src[-1] in my_punct and src[-1] != trg[-1] and src[-1] != '»' and src[-1] != '«' and trg[-1] != '»' and trg[-1] != '«': trg = trg[:-1] + src[-1] print(src + '\t' + trg) From 8d15e8e131a491c579cee006ceac89d682f5be82 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 15:34:23 +0000 Subject: [PATCH 5/7] Leave emdash and dash alone --- opuscleaner/filters/fix_sent_final_punct.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index dd6cb56..794c7e8 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -24,6 +24,8 @@ trg = trg + src[-1] elif trg[-1] in my_punct and src[-1] not in 
my_punct: src = src + trg[-1] - elif trg[-1] in my_punct and src[-1] in my_punct and src[-1] != trg[-1] and src[-1] != '»' and src[-1] != '«' and trg[-1] != '»' and trg[-1] != '«': + # Final case. Fix mismatched punctuation on the src and trg. EXCEPT in cases like french quotes. And in cases where we have emdash at the front, as it means speech + elif trg[-1] in my_punct and src[-1] in my_punct and src[-1] != trg[-1] and src[-1] != '»' \ +and src[-1] != '«' and trg[-1] != '»' and trg[-1] != '«' and src[0] != '–' and trg[0] != '–' and src[0] != '—' and trg[0] != '—': trg = trg[:-1] + src[-1] print(src + '\t' + trg) From dcaaff01f7b26444d26e97f78f1fac8cdd894f62 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 17:01:51 +0000 Subject: [PATCH 6/7] Deal with yet another special case --- opuscleaner/filters/fix_sent_final_punct.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index 794c7e8..d5cb62a 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -14,6 +14,9 @@ src = src[:-2] + src[-1] if len(trg) >= 2 and trg[-1] in my_punct and trg[-2] == " " and trg[-1] != '»' and trg[-1] != '«': trg = trg[:-2] + trg[-1] + # Sometimes two punctuation marks are swapped... 
+ if len(src) >=2 and len(trg) >= 2 and src[-2] == trg[-1] and src[-1] == trg[-2]: + trg = trg[:-2] + src[-2] + src[-1] # check for the french quotes special case if (src[-1] == '»' or src[-1] == '«') and trg[-1] not in my_punct: From 61f4671eee9c529a705b3c766dbbf9f2dc7b4451 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 8 Aug 2023 17:11:05 +0000 Subject: [PATCH 7/7] More fixes towards swapped punctuation --- opuscleaner/filters/fix_sent_final_punct.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opuscleaner/filters/fix_sent_final_punct.py b/opuscleaner/filters/fix_sent_final_punct.py index d5cb62a..6c3abc9 100755 --- a/opuscleaner/filters/fix_sent_final_punct.py +++ b/opuscleaner/filters/fix_sent_final_punct.py @@ -17,6 +17,12 @@ # Sometimes two punctuation marks are swapped... if len(src) >=2 and len(trg) >= 2 and src[-2] == trg[-1] and src[-1] == trg[-2]: trg = trg[:-2] + src[-2] + src[-1] + # Sometimes they are swapped with space around eg SPACE». -> .SPACE» + if len(src) >=3 and src[-1] in my_punct and src[-2] == '»' and src[-3] == ' ': + src = src[:-3] + src[-1] + ' ' + src[-2] + if len(trg) >=3 and trg[-1] in my_punct and trg[-2] == '»' and trg[-3] == ' ': + trg = trg[:-3] + trg[-1] + ' ' + trg[-2] + # check for the french quotes special case if (src[-1] == '»' or src[-1] == '«') and trg[-1] not in my_punct: