-
Notifications
You must be signed in to change notification settings - Fork 6
/
decompoundIndexes.py
131 lines (108 loc) · 4.76 KB
/
decompoundIndexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
__author__ = 'lqrz'
import gensim
import logging
from nltk.corpus import PlaintextCorpusReader
import pickle
import numpy as np
import sys
def _registerSplit(inputCompound, prefix, tail, splits):
    """Record the split ``(prefix, tail)`` if ``tail`` has a word2vec entry.

    On success the tail's vocabulary index is stored in ``resultIndexes``, a
    copy of its vector is stored in ``resultVectors`` and the triple
    ``(prefix, tail, index)`` is added to ``splits``.  A tail that is missing
    from the model is logged and skipped.

    Returns True when the split was recorded, False otherwise.
    """
    log = logging.getLogger()
    try:
        tailRepresentationIndex = model.vocab[tail].index
        tailVector = np.array(model[tail])
    except KeyError:
        # Without a vector representation for the tail the split is useless.
        log.debug('Discarding split %s %s %s', inputCompound, prefix, tail)
        return False

    resultIndexes[tail] = tailRepresentationIndex
    resultVectors[tailRepresentationIndex] = tailVector
    splits.add((prefix, tail, tailRepresentationIndex))
    log.debug('Considering split %s %s %s', inputCompound, prefix, tail)
    return True


def decompound(inputCompound):
    """Collect the word2vec indexes/vectors needed to split ``inputCompound``.

    Every key of the module-level ``vectors`` mapping that is a proper prefix
    of ``inputCompound`` (and has at least one direction vector) is considered
    a candidate modifier.  For each candidate, and for each linking element
    ('', 'e', 'es') that the remainder starts with, the remaining tail is
    looked up in ``model`` both title-cased and exactly as written.  Tails
    found in the model are recorded in ``resultIndexes`` / ``resultVectors``.
    Finally, for every recorded split, the two ``model.syn0`` rows referenced
    by each direction vector of the prefix are copied into ``resultVectors``.

    Relies on the module globals ``vectors``, ``model``, ``resultIndexes`` and
    ``resultVectors`` set up by the script entry point; the result
    dictionaries are mutated in place.

    :param inputCompound: the compound word to analyse.
    :return: always True.
    """
    global vectors, model, resultIndexes, resultVectors

    # Use the root logger directly (the same object the script entry point
    # configures) so the function does not depend on a ``logger`` global that
    # only exists when the file is run as a script.
    log = logging.getLogger()

    # get all matching prefixes
    log.info('Getting all matching prefixes')
    prefixes = set()
    for prefix in vectors:
        isProperPrefix = (inputCompound.startswith(prefix)
                          and len(inputCompound) > len(prefix))
        if isProperPrefix and len(vectors[prefix]) > 0:
            prefixes.add(prefix)

    log.debug('Possible prefixes')
    log.debug(prefixes)

    # get all possible splits
    log.info('Getting possible splits')
    splits = set()
    fugenlaute = ['', 'e', 'es']  # linking elements allowed between the parts
    for prefix in prefixes:
        rest = inputCompound[len(prefix):]
        for fug in fugenlaute:
            # ''.startswith-style match: the empty linking element always fits.
            if not rest.startswith(fug):
                continue
            tail = rest[len(fug):]
            # NOTE(review): the original file's indentation was lost; both
            # lookups are assumed to be attempted independently - confirm.
            # Title-cased tail first, then the tail exactly as written.
            _registerSplit(inputCompound, prefix, tail.title(), splits)
            _registerSplit(inputCompound, prefix, tail, splits)

    for prefix, tail, tailRepresentationIndex in splits:
        log.debug('Applying %d direction vectors to split %s %s',
                  len(vectors[prefix]), prefix, tail)
        # Each direction vector is an (origin, evidence) pair; only the two
        # model row indexes held in ``origin`` are needed here.
        for origin, _ in vectors[prefix]:
            resultVectors[origin[0]] = np.array(model.syn0[origin[0]])
            resultVectors[origin[1]] = np.array(model.syn0[origin[1]])

    return True
if __name__ == '__main__':
    # Usage:
    #   decompoundIndexes.py <resultsPath> <corpusPath> <w2vPath> <resultsFolder>
    # e.g. results/dir_vecs_4_100.p ./prueba.txt models/mono_500_de.bin out/
    #
    # All four arguments are required: without them the paths below would be
    # undefined, so fail early with a non-zero exit status.
    if len(sys.argv) != 5:
        print('Error in params')
        sys.exit(1)

    resultsPath, corpusPath, w2vPath, resultsFolder = sys.argv[1:5]

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger()

    # resultsFolder is used as a plain string prefix, so it is expected to
    # end with a path separator.
    indexesPickleFilename = resultsFolder + 'decompoundIndexes.p'
    vectorsPickleFilename = resultsFolder + 'decompoundVectors.p'

    # Filled in by the loop below and by decompound():
    #   resultIndexes: word -> index in the word2vec vocabulary
    #   resultVectors: vocabulary index -> copy of the word vector
    resultIndexes = dict()
    resultVectors = dict()

    logger.info('Getting pickle results file')
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # resultsPath at files produced by a trusted run.
    with open(resultsPath, 'rb') as resultsFile:
        vectors = pickle.load(resultsFile)

    model = gensim.models.Word2Vec.load_word2vec_format(w2vPath, binary=True)

    # Split the corpus path into the folder part (including the trailing '/')
    # and the file name, as expected by PlaintextCorpusReader.
    idx = corpusPath.rfind('/') + 1
    folder = corpusPath[0:idx]
    filename = corpusPath[idx:]

    corpus = PlaintextCorpusReader(folder, filename, encoding='utf-8')
    inputCompounds = corpus.words()

    for inputCompound in inputCompounds:
        # Keep the try body limited to the model lookups so that only a
        # genuinely missing word is reported as such.
        try:
            compoundIndex = model.vocab[inputCompound].index
            compoundVector = np.array(model[inputCompound])
        except KeyError:
            logger.error('No word2vec representation for input compound %s',
                         inputCompound)
            continue

        resultIndexes[inputCompound] = compoundIndex
        resultVectors[compoundIndex] = compoundVector
        decompound(inputCompound)

    logger.info('Pickling files')
    # Context managers guarantee the pickles are flushed and closed.
    with open(indexesPickleFilename, 'wb') as indexesFile:
        pickle.dump(resultIndexes, indexesFile)
    with open(vectorsPickleFilename, 'wb') as vectorsFile:
        pickle.dump(resultVectors, vectorsFile)