"""Embedding utilities: GloVe word-vector lookup and document parsing, a
simple ROUGE-N score, and skip-thoughts sentence encoding."""
from __future__ import (division, absolute_import,
                        print_function, unicode_literals)
import sys

import nltk
from nltk.util import ngrams
from nltk.tokenize import wordpunct_tokenize
import numpy as np

# The SkipThoughts class (and the default test in __main__) needs the
# skip-thoughts package, expected as a checkout under ./skip-thoughts.
sys.path.insert(0, "./skip-thoughts")
import skipthoughts


def _get_least_common_word_vector(num_vecs=0, glove_filename="glove/glove.6B.50d.txt"):
    """Build a vector for unknown words from the tail of the GloVe file.

    GloVe files are ordered by corpus frequency, so the last lines hold the
    least common words. With num_vecs == 0 this returns the very last vector;
    otherwise it returns the mean of the last num_vecs vectors.
    """
    if num_vecs == 0:
        # Scan to the final line; `line` keeps its value after the loop.
        with open(glove_filename, 'r') as f:
            for line in f:
                pass
        vec = line.split(' ')[1:]
        return list(map(float, vec))
    else:
        # First pass: count the lines to locate the tail of the file.
        total = 0
        with open(glove_filename, 'r') as f:
            for line in f:
                total += 1
        # Second pass: collect exactly the last num_vecs vectors and
        # average them.
        count = 0
        uncommon_vectors = []
        with open(glove_filename, 'r') as f:
            for line in f:
                if total - count <= num_vecs:
                    uncommon_vectors.append(list(map(float, line.split(' ')[1:])))
                count += 1
        return np.mean(np.array(uncommon_vectors), axis=0)
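

# A small sanity check (hypothetical helper; assumes the GloVe file exists at
# the default path): the averaged UNK vector has the same dimensionality as
# every other GloVe vector, i.e. 50 for glove.6B.50d.
def _unk_vector_demo():
    unk = _get_least_common_word_vector(num_vecs=100)
    print(len(unk))  # expected: 50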


def map_sentence_to_glove(sentence, glove_filename="glove/glove.6B.50d.txt",
                          cache=None, default_vec=None):
    """Map a tokenized sentence to (vectors, cache, indices) by scanning the
    GloVe file for each uncached word. Pass the returned cache back in to
    avoid re-scanning the file on later calls."""
    # Avoid mutable default arguments; callers thread the cache through.
    if cache is None:
        cache = {}
    if default_vec is None:
        default_vec = []
    vectorized_sentence = []
    indexed_sentence = []
    for word in sentence:
        lcw = word.lower()
        index = None
        if lcw not in cache:
            with open(glove_filename, 'r') as f:
                v = None
                count = 1  # word ids start at 1 so that 0 can be UNK
                for line in f:
                    first_word = line.split(' ', 1)[0]
                    if first_word == lcw:
                        v = list(map(float, line.split(' ')[1:]))
                        cache[lcw] = v, count
                        index = count
                        break
                    count += 1
            if v is None:
                # Word not in GloVe: fall back to the default vector and
                # the reserved UNK id 0.
                v = default_vec
                index = 0
        else:
            v, index = cache[lcw]
        vectorized_sentence.append(v)
        indexed_sentence.append(index)
    return vectorized_sentence, cache, indexed_sentence
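

# A minimal usage sketch (hypothetical helper; assumes the default GloVe path
# and 50-dimensional vectors). Threading the returned cache back in avoids
# re-scanning the GloVe file for words that were already looked up.
def _map_sentence_demo():
    cache = {}
    tokens = wordpunct_tokenize("The book was very good")
    vecs, cache, ids = map_sentence_to_glove(tokens, cache=cache,
                                             default_vec=[0.0] * 50)
    print(ids)  # 1-based GloVe line numbers; 0 marks out-of-vocabulary words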


class gloveDocumentParser(object):
    """Maps documents to GloVe token ids. Id 0 is UNK (the mean of the
    unk_size least common GloVe vectors), id 1 is PAD (the zero vector),
    and vocabulary words start at id 2."""

    def __init__(self, glove_file_name, unk_size=100,
                 pad_token="--PAD--", unk_token="--UNK--"):
        self.word_to_vector, self.word_to_id, self.id_to_word, \
            self.vocab_size, vec_length = self.loadGloveFromFile(glove_file_name)
        padding_vector = [0] * vec_length
        self.unk = unk_token
        self.pad = pad_token
        self.word_to_vector[self.unk] = _get_least_common_word_vector(
            unk_size, glove_filename=glove_file_name)
        self.word_to_id[self.unk] = 0
        self.id_to_word[0] = self.unk
        self.word_to_vector[self.pad] = padding_vector
        self.word_to_id[self.pad] = 1
        self.id_to_word[1] = self.pad
        # Stack vectors in id order into an embedding matrix of shape
        # (vocab_size, vector_dim).
        self.word_to_vector_matrix = np.array(
            [self.word_to_vector[self.id_to_word[i]]
             for i in range(self.vocab_size)])
        self.embedding_n_tokens = self.word_to_vector_matrix.shape[0]
        self.token_dim = self.word_to_vector_matrix.shape[1]

    def loadGloveFromFile(self, glove_file_name):
        """Read a GloVe text file into (word_to_vec, word_to_id, id_to_word,
        vocab_size, vector_dim); ids 0 and 1 are left free for UNK and PAD."""
        word_to_vec = {}
        word_to_id = {}
        id_to_word = {}
        index = 2
        with open(glove_file_name, 'r') as f:
            for line in f:
                split_line = line.split(' ')
                word_key = split_line[0]
                vector = list(map(float, split_line[1:]))
                vector_dim = len(vector)
                word_to_vec[word_key] = vector
                word_to_id[word_key] = index
                id_to_word[index] = word_key
                index += 1
        return word_to_vec, word_to_id, id_to_word, index, vector_dim

    def parseDocument(self, document):
        tokens = wordpunct_tokenize(document)
        index_matrix = [self.word_to_id[token.lower()]
                        if token.lower() in self.word_to_id
                        else self.word_to_id[self.unk]
                        for token in tokens]
        return index_matrix

    def documentFromVector(self, id_vector):
        doc = [self.id_to_word[_id] for _id in id_vector]
        return doc
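

# A minimal sketch of how the parser's pieces fit together (hypothetical
# helper; assumes glove.6B.50d.txt sits in the working directory): the id
# sequence from parseDocument indexes rows of word_to_vector_matrix, which is
# the usual way to seed a trainable embedding layer.
def _embedding_lookup_demo():
    g = gloveDocumentParser("glove.6B.50d.txt")
    ids = g.parseDocument("This is a sentence.")
    embedded = g.word_to_vector_matrix[ids]  # shape: (len(ids), g.token_dim)
    print(embedded.shape)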


def rouge_score(reference, hypothesis, n):
    """Simple ROUGE-N recall: the fraction of reference n-grams that also
    appear in the hypothesis (duplicate matches are not clipped, unlike the
    official ROUGE definition)."""
    ref_ngrams = list(ngrams(wordpunct_tokenize(reference), n))
    hyp_ngrams = list(ngrams(wordpunct_tokenize(hypothesis), n))
    matching_ngrams = [x for x in hyp_ngrams if x in ref_ngrams]
    return 1.0 * len(matching_ngrams) / len(ref_ngrams)
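

# Worked example: with reference "The book was very good" and hypothesis
# "The book was very interesting", four of the five reference unigrams
# ("The", "book", "was", "very") also occur in the hypothesis, so
# rouge_score(reference, hypothesis, 1) == 4 / 5 == 0.8.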


class SkipThoughts(object):
    """Encodes sentences with a pre-trained skip-thoughts model
    (4800-dimensional vectors) and assigns each distinct sentence an id."""

    def __init__(self, num_sentences=10):
        self.model = skipthoughts.load_model()
        self.word_to_id = {}
        self.id_to_word = {}
        self.next_mapped_index = 1
        self.next_row = 0
        # Pre-allocated embedding buffer; holds at most num_sentences rows,
        # one per encoded sentence (including repeats).
        self.emb_matrix = np.zeros((num_sentences, 4800))

    def parseDocument(self, text):
        """Split text into sentences, encode each one, and return the
        embedding rows filled so far plus the id of each sentence."""
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sent_tokenizer.tokenize(text)
        vectors = self.vectorize(sentences)
        index_vector = []
        for sentence, vector in zip(sentences, vectors):
            if sentence not in self.word_to_id:
                self.word_to_id[sentence] = self.next_mapped_index
                self.id_to_word[self.next_mapped_index] = sentence
                _id = self.next_mapped_index
                self.next_mapped_index += 1
            else:
                _id = self.word_to_id[sentence]
            self.emb_matrix[self.next_row, :] = np.array(vector)
            self.next_row += 1
            index_vector.append(_id)
        return self.emb_matrix[:self.next_row, :], index_vector

    def reset(self):
        """Clear the sentence-id maps and rewind the embedding buffer."""
        self.word_to_id = {}
        self.id_to_word = {}
        self.next_mapped_index = 1
        self.next_row = 0

    def vectorize(self, list_of_sentences):
        return skipthoughts.encode(self.model, list_of_sentences)


def rouge_test():
    reference = "The book was very good"
    hyp = "The book was very interesting"
    print(rouge_score(reference, hyp, 1))


def doc_parser_test():
    g = gloveDocumentParser("glove.6B.50d.txt")
    print(g.parseDocument("This is a sentence. This is another sentence. What can you do about it, glove?"))
    print(g.documentFromVector([39, 16, 9, 2424, 4]))


def skip_thoughts_test():
    s = SkipThoughts()
    a, b = s.parseDocument("This is a sentence. This is another sentence. Hopefully, we get a good matrix representation of this!")
    print(a)
    print(b)
    print(s.emb_matrix)


def UNKtest():
    # Note: `word2vec` is not defined in this module; this test only runs
    # when the class it references is available from elsewhere.
    w = word2vec()
    w.generateUNK()


if __name__ == "__main__":
    # doc_parser_test()
    # rouge_test()
    skip_thoughts_test()