-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_processing.py
99 lines (79 loc) · 3.29 KB
/
text_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
from html import escape
# Number of characters of surrounding text taken on each side of a match when
# building the context shown for a replacement.
CONTEXT_SHIFT = 60
# Fallback number of characters taken on each side of a match when no word
# span is found around it.
WORD_SHIFT = 5
# Prompt handed to the confirmation dialog. The text is Belarusian, roughly:
# "Confirm the replacement? [[[ If you see something abnormal, it only seems
# so to you ;) ]]]".
QUESTION_PROMPT = "Пацвердзіць замену?\n[[[ Калі вы бачыце штосьці ненармальнае, то вам здаецца ;) ]]]"
def replace_char(match, replace):
    """
    Return the replacement text, upper-cased when the matched text is upper case.

    Args:
        match (re.Match): Match whose full text (group 0) decides the case.
        replace (str): Replacement text to emit.

    Returns:
        str: ``replace.upper()`` when ``match.group(0).isupper()`` is true,
        otherwise ``replace`` unchanged.
    """
    matched_text = match.group(0)
    if matched_text.isupper():
        return replace.upper()
    return replace
def find_word_boundaries(text, position):
    """
    Locate the word span that covers a character offset in ``text``.

    A span is a run of word characters together with any non-word characters
    immediately preceding it. Spans are scanned left to right and the first
    one for which ``start <= position <= end`` holds is returned, so an offset
    sitting exactly on the seam between two spans resolves to the earlier one.

    Args:
        text (str): Text to scan.
        position (int): Character offset to locate.

    Returns:
        tuple: ``(start, end)`` of the covering span, or ``(None, None)`` when
        no span covers ``position``.
    """
    # ``\b`` has to stay outside a character class: ``[\b]`` means a literal
    # backspace character, which made the pattern unmatchable on normal text.
    pattern = r'\W*\b\w+\b'
    for match in re.finditer(pattern, text):
        start, end = match.span()
        if start > position:
            # Spans arrive in increasing order; none further on can cover it.
            break
        if position <= end:
            return start, end
    return None, None
def update_text(text, rule):
    """
    Apply one search/replace rule to ``text``, ignoring case when searching.

    Args:
        text (str): Text to transform.
        rule (tuple): ``(search_str, replace_str)`` pair; ``search_str`` is
            used as a regular expression.

    Returns:
        str: ``text`` with every match replaced. When ``replace_str`` is the
        empty string or ``’`` it is inserted as-is; otherwise each match is
        replaced through ``replace_char`` so the case of the match carries
        over to the replacement.
    """
    pattern, replacement = rule
    if replacement in ('', '’'):
        # These replacements have no case to preserve.
        substitute = replacement
    else:
        def substitute(found):
            return replace_char(found, replacement)
    return re.sub(pattern, substitute, text, flags=re.IGNORECASE)
def apply_rule(text, rule, match, main_window):
    """
    Ask the user to confirm one match of a rule and apply it when accepted.

    The word around the match is looked up with ``find_word_boundaries``; if
    none is found, ``WORD_SHIFT`` characters on each side of the match are
    used instead. When the matched text does not occur inside that word
    region, nothing is shown and ``text`` is returned unchanged. Otherwise two
    HTML previews (before and after) are passed to the confirmation dialog and
    the rule is applied to the whole word region if the dialog returns a
    truthy value.

    Args:
        text (str): Text that the offsets of ``match`` refer to.
        rule (tuple): ``(search_str, replace_str)`` pair handed to
            ``update_text``.
        match (re.Match): Match of the rule's pattern; its span and matched
            text are used.
        main_window: Object providing
            ``show_confirmation_dialog(prompt, before_html, after_html)``.

    Returns:
        str: ``text`` with the word region rewritten when the user accepted,
        otherwise ``text`` unchanged.
    """
    start, end = match.span()
    segment = match.group()

    # Find the word boundaries, falling back to a fixed window around the match.
    word_start, word_end = find_word_boundaries(text, start)
    if word_start is None:
        word_start = max(0, start - WORD_SHIFT)
    if word_end is None:
        word_end = end + WORD_SHIFT

    word = text[word_start:word_end]
    if segment not in word:
        # Nothing to confirm; skip before building any preview.
        return text

    ctx_start = max(0, start - CONTEXT_SHIFT)
    ctx_end = end + CONTEXT_SHIFT
    leading = text[ctx_start:word_start]
    trailing = text[word_end:ctx_end]
    # Computed once and reused for both the preview and the final result.
    updated_word = update_text(word, rule)

    ctx_hl = (
        leading
        + f'<span style="color:yellow">{text[word_start:start]}</span>'
        + f'<span style="color:red">{segment}</span>'
        + f'<span style="color:yellow">{text[end:word_end]}</span>'
        + trailing
    )
    replace_hl = (
        leading
        + f'<span style="color:green">{updated_word}</span>'
        + trailing
    )

    if main_window.show_confirmation_dialog(QUESTION_PROMPT, ctx_hl, replace_hl):
        return text[:word_start] + updated_word + text[word_end:]

    ctx = text[ctx_start:start] + segment + text[end:ctx_end]
    print(f"User declined. {ctx}")
    return text
def apply_all_rules(text, rules, main_window):
    """
    Prepare ``text`` as HTML and run every rule over it in order.

    The text is HTML-escaped, newlines become ``<br>`` and runs of tabs are
    collapsed through ``replace_tab``. Each rule is then applied: rules whose
    ``ask_flag`` is ``None`` are applied to the whole text at once; for all
    other rules every match is passed to ``apply_rule`` one at a time, from
    left to right.

    Args:
        text (str): Raw input text.
        rules: Iterable of objects exposing ``search_str``, ``replace_str``
            and ``ask_flag`` attributes.
        main_window: Passed through to ``apply_rule`` for rules that are
            applied match by match.

    Returns:
        str: The processed text.
    """
    # Escape HTML-like tags to prevent truncation
    text = escape(text)
    # Convert newline characters to HTML line breaks
    text = text.replace('\n', '<br>')

    # Convert tabs to HTML entities only if not within HTML tags
    # NOTE(review): the check below only sees the matched run of tabs, which
    # never contains '<', so the first branch is always taken — confirm the
    # intended replacement text and whether the tag check is still wanted.
    def replace_tab(match):
        return '	' if not re.search(r'<[^>]*$', match.group(0)) else '\t'

    text = re.sub(r'\t+', replace_tab, text)

    for rule in rules:
        rule_tr = (rule.search_str, rule.replace_str)
        if rule.ask_flag is None:
            text = update_text(text, rule_tr)
            continue

        # Search the *current* text again after every step. Matches collected
        # up front would carry stale offsets as soon as a confirmed
        # replacement changed the length of the text.
        pattern = re.compile(rule.search_str, flags=re.IGNORECASE)
        pos = 0
        while pos <= len(text):
            match = pattern.search(text, pos)
            if match is None:
                break
            updated = apply_rule(text, rule_tr, match, main_window)
            # Resume after the match, shifted by however much the text grew
            # or shrank, but never before the match start.
            growth = len(updated) - len(text)
            pos = max(match.start(), match.end() + growth)
            if match.end() == match.start():
                # Step over an empty match so the same spot is not found again.
                pos += 1
            text = updated
    return text