forked from asampat3090/arctic-captions
-
Notifications
You must be signed in to change notification settings - Fork 5
/
homogeneous_data.py
72 lines (59 loc) · 2.43 KB
/
homogeneous_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy
import copy
class HomogeneousData():
    """Iterator that yields batches of captions sharing the same word count.

    Captions are bucketed by their number of whitespace-separated words
    (computed from ``caption[0]``).  Each call to ``next`` serves up to
    ``batch_size`` captions from a single bucket, cycling through the
    buckets in a shuffled order.  Once every bucket is exhausted the
    iterator reshuffles itself and raises ``StopIteration``, so the same
    object can be iterated again for another pass.

    Args:
        data: indexable container; ``data[0]`` is the sequence of caption
            entries (each entry's element ``0`` is the caption string) and
            ``data[1]`` is stored as ``self.feats`` (not otherwise used by
            this class).
        batch_size: maximum number of captions per batch.
        maxlen: if truthy, captions with more than ``maxlen`` words are
            never served.
    """

    def __init__(self, data, batch_size=128, maxlen=None):
        self.data = data
        self.batch_size = batch_size
        self.maxlen = maxlen

        self.prepare()
        self.reset()

    def prepare(self):
        """Bucket caption indices by caption length (in words)."""
        self.caps = self.data[0]
        self.feats = self.data[1]

        # find the unique lengths
        self.lengths = [len(cc[0].split()) for cc in self.caps]
        self.len_unique = numpy.unique(self.lengths)
        # remove any overly long captions
        if self.maxlen:
            self.len_unique = [ll for ll in self.len_unique if ll <= self.maxlen]

        # Compare against an ndarray rather than the plain list: ``list == x``
        # is only element-wise when ``x`` happens to be a numpy scalar.
        lengths_arr = numpy.asarray(self.lengths)

        # indices of unique lengths
        self.len_indices = dict()
        self.len_counts = dict()
        for ll in self.len_unique:
            self.len_indices[ll] = numpy.where(lengths_arr == ll)[0]
            self.len_counts[ll] = len(self.len_indices[ll])

        # current counter
        self.len_curr_counts = copy.copy(self.len_counts)

    def reset(self):
        """Restore all per-length counters and reshuffle for a new pass."""
        self.len_curr_counts = copy.copy(self.len_counts)
        self.len_unique = numpy.random.permutation(self.len_unique)
        self.len_indices_pos = dict()
        for ll in self.len_unique:
            self.len_indices_pos[ll] = 0
            self.len_indices[ll] = numpy.random.permutation(self.len_indices[ll])
        self.len_idx = -1

    def next(self):
        """Return the next batch: a list of caption entries of equal length.

        Raises:
            StopIteration: when no length bucket has captions left (this
                includes an empty dataset).  The iterator is reset before
                raising, so iteration can start over immediately.
        """
        num_lengths = len(self.len_unique)

        # Round-robin over the shuffled lengths, looking for the next one
        # that still has unserved captions.  At most one full cycle is tried;
        # with zero lengths the loop body never runs and the ``else`` fires.
        for _ in range(num_lengths):
            self.len_idx = (self.len_idx + 1) % num_lengths
            if self.len_curr_counts[self.len_unique[self.len_idx]] > 0:
                break
        else:
            self.reset()
            raise StopIteration()

        curr_len = self.len_unique[self.len_idx]

        # get the batch size (the last batch of a bucket may be smaller)
        curr_batch_size = min(self.batch_size, self.len_curr_counts[curr_len])
        curr_pos = self.len_indices_pos[curr_len]

        # get the indices for the current batch
        curr_indices = self.len_indices[curr_len][curr_pos:curr_pos + curr_batch_size]
        self.len_indices_pos[curr_len] += curr_batch_size
        self.len_curr_counts[curr_len] -= curr_batch_size

        caps = [self.caps[ii] for ii in curr_indices]
        return caps

    # Python 3 iterator protocol; ``next`` is kept for Python 2 callers.
    __next__ = next

    def __iter__(self):
        return self