-
Notifications
You must be signed in to change notification settings - Fork 2
/
romanizer.py
65 lines (47 loc) · 2.27 KB
/
romanizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import pickle
import re

import pycrfsuite
from keras.models import load_model

from tools.keras_predict import Translit
from tools.vectorizer_crf import create_sentence_features_crf
class Romanizer:
    """Word segmenter and romanizer backed by pre-trained models.

    Segmentation is done by a ``pycrfsuite`` tagger that labels every
    character of a whitespace-free sentence; romanization hands each
    segmented word to a ``Translit`` instance built around a Keras model.
    """

    def __init__(self, model_dir='models'):
        """Load the segmentation and transliteration models.

        Args:
            model_dir: Directory containing ``MCR-segmentation``,
                ``s2s.h5``, ``input_token_index.dat`` and
                ``target_token_index.dat``. The default, ``'models'``,
                is resolved against the current working directory.
        """
        self.__segmenter = pycrfsuite.Tagger()
        self.__segmenter.open(os.path.join(model_dir, 'MCR-segmentation'))

        romanizer_model = load_model(os.path.join(model_dir, 's2s.h5'))
        # NOTE: pickle.load can execute arbitrary code stored in the file,
        # so model_dir must only ever point at trusted model files.
        with open(os.path.join(model_dir, 'input_token_index.dat'), 'rb') as fp:
            input_token_index = pickle.load(fp)
        with open(os.path.join(model_dir, 'target_token_index.dat'), 'rb') as fp:
            target_token_index = pickle.load(fp)
        self.__romanizer = Translit(
            romanizer_model, input_token_index, target_token_index)

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences into one string, adding a period only if missing.

        Sentences are split on the space *after* a period, so a sentence
        may still carry its own terminating period. Joining blindly with
        ``'. '`` would then produce ``'..'``; here every sentence but the
        last gets a period appended only when it does not already end
        with one.
        """
        closed = [
            sentence if sentence.endswith('.') else sentence + '.'
            for sentence in sentences[:-1]
        ]
        closed.extend(sentences[-1:])
        return ' '.join(closed).strip()

    def Segment(self, text, return_array=False):
        """Insert word boundaries into ``text``.

        The text is split into sentences at every space that directly
        follows a period. All whitespace inside a sentence is discarded,
        the remaining characters are tagged, and a space is inserted
        after each character labelled ``'1'`` (except the last one).

        Args:
            text: The string to segment.
            return_array: When true, return the list of segmented
                sentences instead of a single joined string.

        Returns:
            A list of segmented sentences if ``return_array`` is true,
            otherwise the sentences joined into one string.
        """
        segmented_sentences = []
        for sentence in re.split(r'(?<=\.) ', text):
            chars = list(''.join(sentence.split()))
            if not chars:
                # Nothing to tag; keep the (empty) sentence slot without
                # sending an empty sequence to the tagger.
                segmented_sentences.append('')
                continue

            labels = self.__segmenter.tag(create_sentence_features_crf(chars))
            last = len(labels) - 1
            pieces = []
            for position, (char, label) in enumerate(zip(chars, labels)):
                pieces.append(char)
                # A '1' label closes a word; no trailing space at the end.
                if label == '1' and position != last:
                    pieces.append(' ')
            segmented_sentences.append(''.join(pieces))

        if return_array:
            return segmented_sentences
        return self._join_sentences(segmented_sentences)

    def Romanize(self, text, return_array=False):
        """Segment ``text`` and romanize it word by word.

        Args:
            text: The string to romanize.
            return_array: When true, return the list of romanized
                sentences instead of a single joined string.

        Returns:
            A list of romanized sentences if ``return_array`` is true,
            otherwise the sentences joined into one string. Words inside
            a sentence are separated by single spaces.
        """
        romanized_sentences = [
            ' '.join(
                self.__romanizer.Romanize(word) for word in sentence.split()
            ).strip(' ')
            for sentence in self.Segment(text, return_array=True)
        ]
        if return_array:
            return romanized_sentences
        return self._join_sentences(romanized_sentences)