-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscript_compare.py
109 lines (97 loc) · 5.29 KB
/
transcript_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
from transcript_numbers import ennumberize
import jiwer
def compare_transcripts(base: str, comp: str, edit_width=75, differences=True, gen_viz=True):
"""Compare two pieces of text and return the details of the comparison"""
c = jiwer.process_words(normalize_transcript_text(base),
normalize_transcript_text(comp))
results = vars(c)
for n in ('alignments', 'references', 'hypotheses'):
del(results[n])
if gen_viz:
vis, _ = generate_visualization(c, edit_width, differences=differences)
results['visualization'] = vis
return results
def normalize_transcript_text(text: str) -> str:
"""Remove punctuation, case, extraneous whtiespace, etc"""
text = text.strip().lower()
# remove commas from numbers
text = re.sub(r"(\d),(\d)", r'\1\2', text)
# get rid of internal newlines, tabs, etc
text = re.sub(r"[\r\n\t]", ' ', text)
# spaceless punctuation
text = re.sub(r"[_]+", '', text)
# spaceful punctuation
text = re.sub(r"[\-!@#$%^&*()+=\[\]{}\\|;:\",./<>?]+", ' ', text)
# get rid of all extraneous whitespace...
text = " ".join(ennumberize(text.split()))
return text
def generate_visualization(output: jiwer.WordOutput, length=75, differences=False):
"""Create a visualization of the differences in the text, optionally
only showing the differences"""
results = [{'ref': '', 'hyp': '', 'chg': '', 'dif': 0}]
stats = {'hit': 0, 'sub': 0, 'del': 0, 'ins': 0}
for idx, (gt, hp, chunks) in enumerate(zip(output.references, output.hypotheses, output.alignments)):
#print(idx, gt, hp, chunks)
for chunk in chunks:
if chunk.type == 'equal':
# copy ref, and hyp words until either we
# end up too long or we come to the end.
for i in range(chunk.ref_end_idx - chunk.ref_start_idx):
stats['hit'] += 1
word_len = len(gt[i + chunk.ref_start_idx])
if word_len + len(results[-1]['ref']) + 1> length:
# too long. create a new result
results.append({'ref': '', 'hyp': '', 'chg': '', 'dif': 0})
results[-1]['ref'] += gt[i + chunk.ref_start_idx] + " "
results[-1]['hyp'] += hp[i + chunk.hyp_start_idx] + " "
results[-1]['chg'] += ' ' * (word_len + 1)
elif chunk.type == 'insert':
# hyp has an additional word that's not in ref.
for i in range(chunk.hyp_end_idx - chunk.hyp_start_idx):
stats['ins'] += 1
word_len = len(hp[i + chunk.hyp_start_idx])
if word_len + len(results[-1]['ref']) + 1> length:
# too long. create a new result
results.append({'ref': '', 'hyp': '', 'chg': '', 'dif': 0})
results[-1]['ref'] += ('*' * word_len) + " "
results[-1]['hyp'] += hp[i + chunk.hyp_start_idx] + " "
results[-1]['chg'] += ('I' * word_len) + " "
results[-1]['dif'] += 1
elif chunk.type == 'delete':
# ref has an additional word that's not in hyp.
for i in range(chunk.ref_end_idx - chunk.ref_start_idx):
stats['del'] += 1
word_len = len(gt[i + chunk.ref_start_idx])
if word_len + len(results[-1]['ref']) + 1> length:
# too long. create a new result
results.append({'ref': '', 'hyp': '', 'chg': '', 'dif': 0})
results[-1]['ref'] += gt[i + chunk.ref_start_idx] + " "
results[-1]['hyp'] += ('*' * word_len) + " "
results[-1]['chg'] += ('D' * word_len) + " "
results[-1]['dif'] += 1
elif chunk.type == 'substitute':
# ref and hyp have different words (but the same number)
for i in range(chunk.ref_end_idx - chunk.ref_start_idx):
stats['sub'] += 1
word_len = max([len(gt[i + chunk.ref_start_idx]),
len(hp[i + chunk.hyp_start_idx])])
if word_len + len(results[-1]['ref']) + 1> length:
# too long. create a new result
results.append({'ref': '', 'hyp': '', 'chg': '', 'dif': 0})
results[-1]['ref'] += gt[i + chunk.ref_start_idx].ljust(word_len) + ' '
results[-1]['hyp'] += hp[i + chunk.hyp_start_idx].ljust(word_len) + ' '
results[-1]['chg'] += 'S' * (word_len) + ' '
results[-1]['dif'] += 1
else:
print(chunk)
if differences:
results = [x for x in results if x['dif'] > 0]
# render the differences as text.
report = []
for s in results:
report.append(f"BASE: {s['ref']}")
report.append(f"COMP: {s['hyp']}")
report.append(f"EDIT: {s['chg']}")
report.append("")
return report, stats