# sst2_data.py (forked from ultimate010/crnn)
import cPickle

import numpy as np

# SentimentPhrase is imported so that cPickle can reconstruct the pickled
# phrase/sentence objects stored in sst2.p.
from process_sst2_data import SentimentPhrase


def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5, pad_left=True):
    """
    Transforms a sentence into a list of word indices, zero-padded to the
    fixed length max_l + 2*(filter_h - 1). Words missing from word_idx_map
    are skipped.
    """
    x = []
    pad = filter_h - 1
    if pad_left:
        # left padding so the widest convolution filter can see the first word
        for i in xrange(pad):
            x.append(0)
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    # pad on the right (all padding goes here when pad_left is False)
    while len(x) < max_l + 2 * pad:
        x.append(0)
    return x
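# Illustration (hypothetical indices): with the defaults max_l=51, filter_h=5
# (so pad=4) and pad_left=True, a three-word sentence whose words map to
# 7, 12 and 3 comes back as [0, 0, 0, 0, 7, 12, 3, 0, ..., 0] with total
# length max_l + 2*pad = 59.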

def get_label(sentiment):
    # Binarize the [0, 1] sentiment score: > 0.6 is positive, <= 0.4 is
    # negative, and scores in between are treated as neutral (-1).
    if sentiment > 0.6:
        return 1
    elif sentiment <= 0.4:
        return 0
    else:
        return -1
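# For example: get_label(0.83) -> 1, get_label(0.25) -> 0, and get_label(0.5)
# -> -1 for an ambiguous score in the (0.4, 0.6] band.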

def make_idx_data_cv(phrases, sentences, word_idx_map, max_l=51, k=300, filter_h=5, pad_left=True):
    """
    Transforms phrases and sentences into 2-d index matrices; the last
    column of each row is the label produced by get_label.
    """
    debug = True
    if debug:
        train_file = open('sst2_train.txt', 'w')
        test_file = open('sst2_test.txt', 'w')
    train, test = [], []
    for p in phrases:  # all labelled phrases go into the training data
        sent = get_idx_from_sent(' '.join(p.words), word_idx_map, max_l, k, filter_h, pad_left=pad_left)
        sent.append(get_label(p.sentiment))
        if debug:
            train_file.write('%s\n' % ' '.join(p.words))
        train.append(sent)
    for s in sentences:
        sent = get_idx_from_sent(' '.join(s.words), word_idx_map, max_l, k, filter_h, pad_left=pad_left)
        sent.append(get_label(s.sentiment))
        if s.split == 'train':
            train.append(sent)
            if debug:
                train_file.write('%s\n' % ' '.join(s.words))
        elif s.split == 'dev':
            # note that dev sentences are folded into the training set as well
            train.append(sent)
            if debug:
                train_file.write('%s\n' % ' '.join(s.words))
        else:  # 'test' split
            test.append(sent)
            if debug:
                test_file.write('%s\n' % ' '.join(s.words))
    if debug:
        train_file.close()
        test_file.close()
    train = np.array(train, dtype="int")
    test = np.array(test, dtype="int")
    return [train, test]
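# With the arguments load_data passes below (max_l=56, filter_h=5), each row of
# train/test holds 56 + 2*4 = 64 word-index columns plus one label column.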

# module-level cache so the pickle is only loaded once per process
x = None


def load_data(pad_left=True):
    global x
    if x is None:
        x = cPickle.load(open("sst2.p", "rb"))
    phrases, sentences, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4], x[5]
    datasets = make_idx_data_cv(phrases, sentences, word_idx_map, max_l=56, k=300, filter_h=5, pad_left=pad_left)
    img_h = len(datasets[0][0]) - 1  # index columns per row; the last column is the label
    return datasets[0][:, :img_h], datasets[0][:, -1], datasets[1][:, :img_h], datasets[1][:, -1], W, W2


if __name__ == '__main__':
    load_data()
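
# Example usage (a minimal sketch; it assumes the pickle "sst2.p" written by
# process_sst2_data.py is present in the working directory):
#
#     X_train, y_train, X_test, y_test, W, W2 = load_data()
#     print X_train.shape, y_train.shape   # padded index rows and their labels
#     print X_test.shape, y_test.shape
#
# W and W2 are the two word-embedding matrices stored alongside the vocabulary
# in sst2.p (presumably the word2vec vectors and a random initialisation, as in
# Kim's CNN pipeline); their rows line up with the indices in X_train/X_test.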