#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on Mar 01, 2017
.. codeauthor: svitlana vakulenko
Based on the babi_memnn Keras implementation:
https://github.com/fchollet/keras/blob/master/examples/babi_memnn.py
Trains a memory network on a single table.
References:
- Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus,
"End-To-End Memory Networks",
http://arxiv.org/abs/1503.08895
'''
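# Note: this script targets the Keras 1.x API (Sequential-style Merge layers,
# the nb_epoch argument); it will not run unchanged on Keras 2+, where Merge
# was removed in favour of the functional API.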
from __future__ import print_function
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Activation, Dense, Merge, Permute, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from functools import reduce
import tarfile
import numpy as np
import re
from parse_table import get_tables
BATCH_SIZE = 32
EPOCHS = 120
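
# parse_table.get_tables is assumed here to return a list of
# (story, query, answer) tuples, where story and query are lists of word
# tokens and answer is a single token -- this is inferred from how the tuples
# are consumed by vectorize_stories and train_memnn below. A hypothetical
# sample could look like:
#   (['name', 'alice', 'age', '30'], ['what', 'is', 'alice', 'age'], '30')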


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        y = np.zeros(len(word_idx) + 1)  # let's not forget that index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))


def train_memnn(train, test):
    vocab = sorted(reduce(lambda x, y: x | y, (set(story + q + [answer]) for story, q, answer in train + test)))
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _, _ in train + test)))
    query_maxlen = max(map(len, (x for _, x, _ in train + test)))

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Story max length:', story_maxlen, 'words')
    print('Query max length:', query_maxlen, 'words')
    print('Number of training samples:', len(train))
    print('Number of test samples:', len(test))
    print('-')
    print('Here\'s what a "story" tuple looks like (input, query, answer):')
    print(train[0])
    print('-')
    print('Vectorizing the word sequences...')

    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    inputs_train, queries_train, answers_train = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
    inputs_test, queries_test, answers_test = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

    print('-')
    print('inputs: integer tensor of shape (samples, max_length)')
    print('inputs_train shape:', inputs_train.shape)
    print('inputs_test shape:', inputs_test.shape)
    print('-')
    print('queries: integer tensor of shape (samples, max_length)')
    print('queries_train shape:', queries_train.shape)
    print('queries_test shape:', queries_test.shape)
    print('-')
    print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
    print('answers_train shape:', answers_train.shape)
    print('answers_test shape:', answers_test.shape)
    print('-')
    print('Compiling...')

    # embed the input sequence into a sequence of vectors
    input_encoder_m = Sequential()
    input_encoder_m.add(Embedding(input_dim=vocab_size,
                                  output_dim=64,
                                  input_length=story_maxlen))
    input_encoder_m.add(Dropout(0.3))
    # output: (samples, story_maxlen, embedding_dim)

    # embed the question into a sequence of vectors
    question_encoder = Sequential()
    question_encoder.add(Embedding(input_dim=vocab_size,
                                   output_dim=64,
                                   input_length=query_maxlen))
    question_encoder.add(Dropout(0.3))
    # output: (samples, query_maxlen, embedding_dim)

    # compute a 'match' between input sequence elements (which are vectors)
    # and the question vector sequence
    match = Sequential()
    match.add(Merge([input_encoder_m, question_encoder],
                    mode='dot',
                    dot_axes=[2, 2]))
    match.add(Activation('softmax'))
    # output: (samples, story_maxlen, query_maxlen)

    # embed the input into a single vector with size = story_maxlen:
    input_encoder_c = Sequential()
    input_encoder_c.add(Embedding(input_dim=vocab_size,
                                  output_dim=query_maxlen,
                                  input_length=story_maxlen))
    input_encoder_c.add(Dropout(0.3))
    # output: (samples, story_maxlen, query_maxlen)

    # sum the match vector with the input vector:
    response = Sequential()
    response.add(Merge([match, input_encoder_c], mode='sum'))
    # output: (samples, story_maxlen, query_maxlen)
    response.add(Permute((2, 1)))  # output: (samples, query_maxlen, story_maxlen)

    # concatenate the match vector with the question vector,
    # and do logistic regression on top
    model = Sequential()
    model.add(Merge([response, question_encoder], mode='concat', concat_axis=-1))
    # the original paper uses a matrix multiplication for this reduction step.
    # we choose to use a RNN instead.
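    # After the concat merge, the tensor feeding the LSTM has shape
    # (samples, query_maxlen, story_maxlen + 64); LSTM(32) reads this sequence
    # and reduces it to a single 32-dimensional vector.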
    model.add(LSTM(32))
    # one regularization layer -- more would probably be needed.
    model.add(Dropout(0.3))
    model.add(Dense(vocab_size))
    # we output a probability distribution over the vocabulary
    model.add(Activation('softmax'))

    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    earlyStopping = EarlyStopping(monitor='val_loss', patience=2, verbose=0, mode='auto')
    # Note: you could use a Graph model to avoid feeding the input twice
    model.fit([inputs_train, queries_train, inputs_train], answers_train,
              batch_size=BATCH_SIZE,
              nb_epoch=EPOCHS,
              # callbacks=[earlyStopping],
              validation_split=0.05)
              # validation_data=([inputs_test, queries_test, inputs_test], answers_test))

    loss, acc = model.evaluate([inputs_test, queries_test, inputs_test], answers_test, batch_size=BATCH_SIZE)
    print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))


if __name__ == "__main__":
    # data_path = './data/synth_data_{}.txt'
    # data_path = './data/pattern_1/synth_data_{}.txt'
    # data_path = './data/pattern_2/synth_data_{}.txt'
    # data_path = './data/table_data_{}.txt'
    data_path = './data/sim_data_{}.txt'
    train = get_tables(data_path.format('train'))
    test = get_tables(data_path.format('test'))
    # print(test)
    train_memnn(train, test)
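
# To run this script (assuming ./data/sim_data_train.txt and
# ./data/sim_data_test.txt exist alongside parse_table.py):
#   python table_memnn.py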