forked from PatrykChrabaszcz/Imagenet32_Scripts
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathimage2numpy_imagenet_train.py
117 lines (95 loc) · 3.83 KB
/
image2numpy_imagenet_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# http://stackoverflow.com/questions/35032675/how-to-create-dataset-similar-to-cifar-10/35034287
from argparse import ArgumentParser
from utils import *
import os
import imageio
import numpy as np
# Number of classes to keep: process_folder takes every (1000 // num_classes)-th
# entry of the ordered class-folder list, so 1000 keeps all of them
num_classes = 1000
# Make sure height and width match the size of the input images
# (each image is stored as one row of height*width*3 uint8 values)
height = width = 128
# Number of pickle files to be created for the training set. Preferably, each file should not be too small and can fit into your memory
n = 100
def parse_arguments():
    """Parse the command line and return the input and output directories.

    Returns:
        A ``(in_dir, out_dir)`` tuple holding the values given to
        ``-i/--in_dir`` and ``-o/--out_dir``.

    Raises:
        SystemExit: raised by argparse when an option is missing or unknown.
    """
    parser = ArgumentParser()
    # Both paths are mandatory. Left optional, a missing one would come back
    # as None and only fail much later inside os.path.join().
    parser.add_argument('-i', '--in_dir', required=True,
                        help="Input directory with source images")
    parser.add_argument('-o', '--out_dir', required=True,
                        help="Output directory for pickle files")
    args = parser.parse_args()

    return args.in_dir, args.out_dir
# Strong assumption about in_dir and out_dir (They must contain proper data)
def process_folder(in_dir, out_dir):
    """Convert per-class image folders into shuffled, pickled training batches.

    Every readable image under ``in_dir/<class folder>/`` is loaded as RGB and
    stored as one uint8 row laid out channel by channel (all R values, then
    all G, then all B). The rows are shuffled and written to ``n`` pickle
    files named ``train_data_batch_1`` .. ``train_data_batch_<n>`` in
    ``out_dir``. Each file holds a dict with the keys ``'data'`` (2-D uint8
    array), ``'labels'`` (list of ints) and ``'mean'`` (per-pixel mean over
    all stored images).

    Images that cannot be read are skipped and reported in
    ``log_img2np.txt``; they contribute neither a data row nor a label.

    Args:
        in_dir: Directory containing one sub-directory per class.
        out_dir: Directory for the pickle files; created if it is missing.

    Raises:
        ValueError: if a decoded image does not have height*width pixels.
    """
    # pickle is not imported explicitly at the top of this file (at best it is
    # re-exported by ``from utils import *``), so bind it locally.
    import pickle

    label_dict = get_label_dict()
    folders = get_ordered_folders()
    folders = folders[0::1000//num_classes]
    print('Chosen classes: ')
    print([label_dict[folder] for folder in folders])
    print("Processing folder %s" % in_dir)

    # List every class folder once up front so the buffer can be sized from
    # the real number of files instead of a hard-coded ImageNet constant.
    image_names = {
        folder: os.listdir(os.path.join(in_dir, folder)) for folder in folders
    }
    total_files = sum(len(names) for names in image_names.values())
    x = np.zeros([total_files, height*width*3], dtype=np.uint8)

    row = 0
    labels_list_train = []
    for folder in folders:
        label = label_dict[folder]
        print("Processing images from folder %s as class %d" % (folder, label))
        samples_num = 0
        for image_name in image_names[folder]:
            image_path = os.path.join(in_dir, folder, image_name)
            try:
                img = imageio.imread(image_path, pilmode='RGB')
                # HxWx3 -> 3xHxW -> flat: R plane, then G plane, then B plane
                arr = img[:, :, :3].transpose(2, 0, 1).reshape(-1)
            except Exception:
                print('Cant process image %s' % image_name)
                with open("log_img2np.txt", "a") as f:
                    f.write("Couldn't read: %s \n" % image_path)
                continue
            x[row] = arr
            row += 1
            samples_num += 1
        # Count only the images that were actually stored; counting the whole
        # directory listing would shift every later label after a failed read.
        labels_list_train.extend([label] * samples_num)
        print('Label %d: %s has %d samples' % (label, folder, samples_num))

    num_images = row
    y = np.array(labels_list_train)

    os.makedirs(out_dir, exist_ok=True)

    # If you subsample folders [1*] this will not compute mean over all training images
    x_mean = np.mean(x[:num_images], axis=0)

    # Shuffled indices
    train_indices = np.arange(num_images)
    np.random.shuffle(train_indices)

    curr_index = 0
    size = num_images // n

    # The first n-1 files get `size` samples each; the last one takes the rest
    y_test = []
    for i in range(1, n + 1):
        stop = curr_index + size if i < n else num_images
        batch_indices = train_indices[curr_index:stop]
        d = {
            'data': x[batch_indices, :],
            'labels': y[batch_indices].tolist(),
            'mean': x_mean
        }
        with open(os.path.join(out_dir, 'train_data_batch_%d' % i), 'wb') as f:
            pickle.dump(d, f)
        curr_index = stop
        y_test.extend(d['labels'])

    # Per-class histogram of the labels that were written out.
    # NOTE(review): the i-1 offset presumably means labels are 1-based while
    # the printed index is 0-based -- confirm against get_label_dict().
    count = np.zeros([1000])
    for i in y_test:
        count[i-1] += 1
    for i in range(1000):
        print('%d : %d' % (i, count[i]))

    print('A total of %d images' % num_images)
    print('A total of %d labels' % len(y_test))
if __name__ == '__main__':
    # Script entry point: resolve the two directories from the command line
    # first, then run the conversion between the start and finish banners.
    in_dir, out_dir = parse_arguments()
    for banner, action in (
        ("Start program ...", lambda: process_folder(in_dir=in_dir, out_dir=out_dir)),
        ("Finished.", None),
    ):
        print(banner)
        if action is not None:
            action()