-
Notifications
You must be signed in to change notification settings - Fork 4
/
process_cmudict.py
150 lines (125 loc) · 5.34 KB
/
process_cmudict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import re
CMUDICT = 'cmudict.0.6-syl'
ALIGN = 'm2m.input.m-mAlign.2-2.delX.1-best.conYX.align'
# ------------------
# Helper functions
# ------------------
DIGITS = re.compile('\d')
def contains_digits(d):
"""Returns True if d contains any numeric characters"""
return bool(DIGITS.search(d))
def align(orthographic_alignment, phonetic_alignment):
"""Returns a list of tuples of corresponding indices between
an aligned orthographic string and an aligned phonetic string.
Phonetic index FIRST, to the index of the *last* character of the
corresponding orthographic substring
Example: A|B|O|L|I|T:I|O|N| AE|B|AH|L|IH|SH|AH|N|
-> [(0,0),(1,1),(2,2),(3,3),(4,4),(5,5),(6,7),(7,8)]
"""
index_pairs = list()
phonetic_index = 0
orthographic_index = 0
for phon, ortho in zip(phonetic_alignment.split('|'),
orthographic_alignment.split('|')):
index_pairs.append((phonetic_index, orthographic_index))
if phon == '_':
pass
else:
for i in range(len(phon.split(':'))):
phonetic_index += 1
index_pairs.append((phonetic_index, orthographic_index))
if ortho == '_':
pass
else:
orthographic_index += len(ortho.split(':'))
index_pairs.append((len(phonetic_alignment), len(orthographic_alignment)))
return index_pairs
def stressed_representation(orthographic_string, phonetic_string,
alignment_dict):
"""Given three ingredients: the orthographic representation of a word,
its pronunciation (CMU dict representation), and a dictionary of
index alignments, return a stressed representation of the *orthography*.
<1></1> around the primary-stressed syllable,
<2></2> around the secondary-stressed syllables,
<0></0> around the unstressed syllables
"""
# zeroth, let's convert the orthographic string into sentence case
# for easier reading
orthographic_string = orthographic_string.capitalize()
# first, process the phonetic string to determine the ENDING indices
# of each syllable and the stress value
phon_indices = list()
end_index = 0
for syllable in phonetic_string.split('.'):
syllable = syllable.strip()
end_index += len(syllable.split(' '))
syllable = syllable.strip()
phon_indices.append((end_index, DIGITS.findall(syllable)[0]))
# convert the phonetic indices to orthographic indices using
# the alignment dictionary
ortho_indices = [(alignment_dict[phon_index], syllable_stress)
for phon_index, syllable_stress in phon_indices]
# now insert <n></n> around each syllable for appropriate value of n
stressed_string = ""
prev_index = 0
for ortho_index, syllable_stress in ortho_indices:
stressed_string += "<%s>" % syllable_stress + \
orthographic_string[prev_index:ortho_index] + \
"</%s>" % syllable_stress
prev_index = ortho_index
return stressed_string
# ------------------
# File processing
# ------------------
def process_aligned_data():
"""Read in the aligned data and return a dictionary of words to a dictionary
of alignments (the result of align(ortho, pron))"""
dictionary = dict()
with open(ALIGN, 'r') as f:
for line in f:
ortho, pron = line.split('\t')
word = ''.join(ortho.split('|')).replace(':','').replace('_','')
dictionary[word] = dict(align(ortho, pron))
return dictionary
ALIGNMENTS = process_aligned_data()
def process_dictionary():
"""Read CMUDICT and return dictionary of words (English orthography)
to a list of tuples representing
(start_index_of_syllable, stress_type_of_syllable) where graphemes are
the entities indexed over.
Example: ABOLITIONIST -> A2.BO0.LI1.TIO0.NIST0
-> [(0,2), (1,0), (3,1), (5,0), (8,0)]
Only use first pronunciation in case of multiple ones.
"""
dictionary = dict()
with open(CMUDICT, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('##'):
continue
if not line:
continue
word, pron = line.split(' ')
# filter out duplicates, punctuation marks and words with numbers
if word.endswith(')'):
continue
if not word[0].isalpha():
continue
if contains_digits(word):
continue
# eliminate abbreviations and things like "HMM",
# which have no stress repr anyway
if not contains_digits(pron):
continue
if word in ALIGNMENTS:
dictionary[word] = stressed_representation(word, pron,
ALIGNMENTS[word])
return dictionary
# --------------
# MAIN
# --------------
STRESS_DICT = process_dictionary()
for word, stressed_spelling in sorted(STRESS_DICT.items()):
print stressed_spelling.replace('<1>','<b>').replace('</1>','</b>')\
.replace('<2>','<em>').replace('</2>','</em>')\
.replace('<0>','').replace('</0>','')