########################################################################
# YouTube BoundingBox VOC2007 Converter
########################################################################
#
# This script converts the downloaded YouTube BoundingBox detection
# dataset into the VOC2007 format. This includes decoding the source
# videos into frames. If you do not yet have the source videos,
# run the download script first.
#
# Original VOC 2007 Dataset devkit documentation:
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/devkit_doc_07-Jun-2007.pdf
#
# Author: Mark Buckler
#
########################################################################
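# Example invocation (a sketch; the videos/ and vocdata/ paths are
# placeholders, and the argument order follows the usage assert in
# __main__ below):
#
#   python voc_convert.py videos/ vocdata/ 4 100000 20000 2.0 0
#
# This would decode 100,000 training frames and 20,000 validation frames
# using 4 threads, discard any frame whose aspect ratio exceeds 2.0, and
# skip annotations whose labeled object is marked absent.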
from __future__ import unicode_literals
import youtube_bb
import sys
import random
import os
import csv
import subprocess
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
from PIL import Image
from concurrent import futures
from subprocess import check_call
## Decode a single frame for a given annotation
def decode_frame(clips,
                 annot,
                 max_ratio,
                 d_set,
                 src_dir,
                 dest_dir):
  yt_id    = annot[0]
  class_id = annot[2]
  obj_id   = annot[4]
  annot_clip_path = src_dir+'/'+d_set+'/'+class_id+'/'
  annot_clip_name = yt_id+'+'+class_id+'+'+obj_id+'.mp4'
  clip_name       = yt_id+'+'+class_id+'+'+obj_id

  # Find the clip that this annotation belongs to
  clip = next((x for x in clips if x.name == clip_name), None)
  assert(clip != None), \
    "Annotation doesn't have a corresponding clip"

  # Convert the annotation time stamp (in the original video) to a time in the clip
  annot_time  = float(annot[1])
  clip_start  = float(clip.start)
  decode_time = annot_time - clip_start

  # Extract a frame at that time stamp to the appropriate place within the
  # destination directory
  frame_dest = dest_dir+'/youtubebbdevkit2017/youtubebb2017/JPEGImages/'
  frame_name = yt_id+'+'+class_id+'+'+obj_id+'+'+str(int(annot_time))+'.jpg'
  with open(os.devnull, 'w') as FNULL:
    check_call(['ffmpeg',\
      '-ss', str(float(decode_time)/1000.0),\
      '-i', (annot_clip_path+annot_clip_name),\
      '-qscale:v','2',\
      '-vframes','1',\
      '-threads','1',\
      (frame_dest+frame_name)],\
      stdout=FNULL,stderr=subprocess.STDOUT)

  with Image.open(frame_dest+frame_name) as img:
    width, height = img.size

  # If this frame's aspect ratio exceeds the maximum aspect ratio, remove it
  # (the float casts keep the check correct under Python 2 integer division)
  if ( (max_ratio!=0) and \
       ( ((float(width)/height) > max_ratio) or
         ((float(height)/width) > max_ratio) ) ):
    os.remove(frame_dest+frame_name)
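# For reference, the check_call above amounts to running a command like the
# following (the paths and time stamp are illustrative, not taken from a real
# annotation):
#
#   ffmpeg -ss 12.0 \
#          -i <src_dir>/<d_set>/<class_id>/<yt_id>+<class_id>+<obj_id>.mp4 \
#          -qscale:v 2 -vframes 1 -threads 1 \
#          <dest_dir>/youtubebbdevkit2017/youtubebb2017/JPEGImages/<yt_id>+<class_id>+<obj_id>+<annot_time>.jpg
#
# i.e. seek to the annotation's offset within the clip (milliseconds converted
# to seconds) and write out a single high-quality JPEG frame.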
def decode_frames(d_set,
                  src_dir,
                  dest_dir,
                  num_threads,
                  num_annots,
                  max_ratio,
                  include_absent):
  # Download & extract the annotation list
  annotations,clips,vids = youtube_bb.parse_annotations(d_set,src_dir)

  # Filter out annotations with no matching video
  print(d_set + \
    ': Filtering out first/last, missing, and (if requested) absent frames...')
  present_annots = []
  for annot in annotations:
    yt_id    = annot[0]
    class_id = annot[2]
    obj_id   = annot[4]
    annot_clip_path = src_dir+'/'+d_set+'/'+class_id+'/'
    annot_clip_name = yt_id+'+'+class_id+'+'+obj_id+'.mp4'
    clip_name       = yt_id+'+'+class_id+'+'+obj_id
    # If the clip video exists
    if (os.path.exists(annot_clip_path+annot_clip_name)):
      # If we are including all frames, or if the labeled object is present
      if ( include_absent or (annot[5]=='present') ):
        # If this is not the first or last frame of the clip
        annot_clip = next((x for x in clips if x.name == clip_name), None)
        if ((int(annot_clip.stop ) != int(annot[1])) and \
            (int(annot_clip.start) != int(annot[1]))):
          present_annots.append(annot)

  # Gather a random subset of the remaining annotations
  print(d_set+': Gathering annotations/frames to decode...')
  random.shuffle(present_annots)
  if num_annots == 0: # Convert all present annotations
    annot_to_convert = present_annots
  else:
    assert(len(present_annots) >= num_annots), \
      "Number of frames requested exceeds number of present frames"
    annot_to_convert = present_annots[:num_annots]

  # Run frame decoding in parallel, extracting one frame per annotation
  #for annot in annot_to_convert:
  #  decode_frame(clips,annot,max_ratio,d_set,src_dir,dest_dir)
  with futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    fs = [executor.submit( \
            decode_frame,clips,annot,max_ratio,d_set,src_dir,dest_dir) \
          for annot in annot_to_convert]
    for i, f in enumerate(futures.as_completed(fs)):
      # Check for an exception in the workers
      try:
        f.result()
      except Exception as exc:
        print('decode failed', exc)
      else:
        # Write progress to stderr so that it can be seen immediately
        sys.stderr.write( \
          "Decoded frame: {} / {} \r".format(i, len(annot_to_convert)))

  print(d_set+': Finished decoding frames!')
  return annot_to_convert
def write_xml_annot(dest_dir,xml_params):
  # Build the xml annotation tree
  xml_annot = Element('annotation')

  folder = SubElement(xml_annot, 'folder')
  folder.text = xml_params.folder
  filename = SubElement(xml_annot, 'filename')
  filename.text = xml_params.filename

  source = SubElement(xml_annot, 'source')
  database = SubElement(source, 'database')
  database.text = xml_params.database
  annotation = SubElement(source, 'annotation')
  annotation.text = xml_params.annotation
  image_source = SubElement(source, 'image')
  image_source.text = xml_params.image_source
  image_flickrid = SubElement(source, 'flickrid')
  image_flickrid.text = xml_params.image_flickrid

  owner = SubElement(xml_annot, 'owner')
  owner_flickrid = SubElement(owner, 'flickrid')
  owner_flickrid.text = xml_params.owner_flickrid
  owner_name = SubElement(owner, 'name')
  owner_name.text = xml_params.owner_name

  size = SubElement(xml_annot, 'size')
  width = SubElement(size, 'width')
  width.text = xml_params.image_width
  height = SubElement(size, 'height')
  height.text = xml_params.image_height
  depth = SubElement(size, 'depth')
  depth.text = xml_params.image_depth

  segmented = SubElement(xml_annot, 'segmented')
  segmented.text = xml_params.segmented

  if ('present' in xml_params.annotation):
    object_ = SubElement(xml_annot, 'object')
    class_name = SubElement(object_, 'name')
    class_name.text = xml_params.class_name
    pose = SubElement(object_, 'pose')
    pose.text = xml_params.pose
    truncated = SubElement(object_, 'truncated')
    truncated.text = xml_params.truncated
    difficult = SubElement(object_, 'difficult')
    difficult.text = xml_params.difficult
    bndbox = SubElement(object_, 'bndbox')
    xmin = SubElement(bndbox, 'xmin')
    xmin.text = xml_params.xmin
    ymin = SubElement(bndbox, 'ymin')
    ymin.text = xml_params.ymin
    xmax = SubElement(bndbox, 'xmax')
    xmax.text = xml_params.xmax
    ymax = SubElement(bndbox, 'ymax')
    ymax.text = xml_params.ymax

  # Pretty-print and write the XML file
  xml_str = minidom.parseString(tostring(xml_annot)).toprettyxml(indent=" ")
  with open(dest_dir + \
            'youtubebbdevkit2017/youtubebb2017/Annotations/' + \
            xml_params.annot_name + \
            '.xml', 'w') as f:
    f.write(xml_str)
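# A frame annotation written by write_xml_annot is laid out roughly as below
# (values are illustrative; the <object> block is only emitted when the
# annotation is marked 'present'):
#
#   <annotation>
#     <folder>...</folder>
#     <filename>yt_id+class_id+obj_id+annot_time.jpg</filename>
#     <source> <database>...</database> <annotation>...</annotation>
#              <image>...</image> <flickrid>...</flickrid> </source>
#     <owner> <flickrid>...</flickrid> <name>...</name> </owner>
#     <size> <width>640</width> <height>480</height> <depth>3</depth> </size>
#     <segmented>0</segmented>
#     <object>
#       <name>class name</name> <pose>...</pose>
#       <truncated>0</truncated> <difficult>0</difficult>
#       <bndbox> <xmin>1</xmin> <ymin>120</ymin>
#                <xmax>320</xmax> <ymax>480</ymax> </bndbox>
#     </object>
#   </annotation>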
def write_xml_annots(dest_dir,annots):
  xml_annots = []
  # For each annotation
  for annot in annots:
    # Get file details
    yt_id      = annot[0]
    annot_time = annot[1]
    class_id   = annot[2]
    obj_id     = annot[4]
    annot_name = yt_id+'+'+class_id+'+'+obj_id+'+'+annot_time
    filename   = annot_name+'.jpg'
    frame_path = dest_dir + 'youtubebbdevkit2017/youtubebb2017/JPEGImages/'
    # Check to verify the frame was extracted
    if (os.path.exists(frame_path+filename)):
      # Get the image dimensions
      with Image.open(frame_path+filename) as img:
        image_width, image_height = img.size
      # Check to see if this annotation is on the border
      # (likely a truncated annotation)
      xmin_frac = float(annot[6])
      xmax_frac = float(annot[7])
      ymin_frac = float(annot[8])
      ymax_frac = float(annot[9])
      if ( (xmin_frac == 0.0) or (xmax_frac == 1.0) or \
           (ymin_frac == 0.0) or (ymax_frac == 1.0) ):
        truncated = 1
      else:
        truncated = 0
      # Convert bounding boxes to pixel dimensions, set minimum as 1
      xmin_pix = int(float(image_width)*xmin_frac)
      if xmin_pix == 0: xmin_pix = 1
      ymin_pix = int(float(image_height)*ymin_frac)
      if ymin_pix == 0: ymin_pix = 1
      xmax_pix = int(float(image_width)*xmax_frac)
      ymax_pix = int(float(image_height)*ymax_frac)
      xml_params = youtube_bb.xml_annot( \
        annot_name,
        filename,
        annot,
        image_width,
        image_height,
        truncated,
        xmin_pix,
        ymin_pix,
        xmax_pix,
        ymax_pix)
      write_xml_annot(dest_dir,xml_params)
      xml_annots.append(xml_params)
  return xml_annots
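# As a quick worked example of the pixel conversion above (numbers are
# illustrative): for a 640x480 frame with xmin_frac = 0.25 and xmax_frac = 1.0,
# xmin_pix = int(640*0.25) = 160 and xmax_pix = int(640*1.0) = 640, and the
# xmax_frac == 1.0 border touch marks the object as truncated. A coordinate
# that lands on 0 is bumped to 1 to match VOC's 1-based pixel convention.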
def write_class_det_files(dest_dir, filename, xml_annots):
  out_file = open((dest_dir + \
                   'youtubebbdevkit2017/youtubebb2017/ImageSets/Main/' + \
                   filename),
                  "w")
  for Layout_annot in xml_annots:
    out_file.write(Layout_annot.annot_name+'\n')
  out_file.close()
def write_class_files(dest_dir, filename, xml_annots, class_):
  out_file = open((dest_dir + \
                   'youtubebbdevkit2017/youtubebb2017/ImageSets/Main/' + \
                   filename),
                  "w")
  # Write a presence flag for each annotation in this set
  for Main_annot in xml_annots:
    if ((Main_annot.class_name == class_[1]) and \
        ('present' in Main_annot.annotation)):
      # Class of interest is present
      present_flag = '1'
    else:
      present_flag = '-1'
    out_file.write(Main_annot.annot_name+' '+present_flag+'\n')
  out_file.close()
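# Each per-class ImageSets file written above pairs a frame identifier with a
# presence flag, one annotation per line, e.g. (identifiers are illustrative):
#
#   AAAAAAAAAAA+5+0+12000 1
#   BBBBBBBBBBB+7+1+48000 -1
#
# where 1 means the class of interest is present in that frame and -1 means it
# is not (or the labeled object is marked absent).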
def write_txt_files(dest_dir, train_xml_annots, val_xml_annots):
  # Get the list of classes
  class_list = youtube_bb.class_list

  # NOTE:
  # VOC converted test:       YouTube BoundingBox validation
  # VOC converted train:      YouTube BoundingBox training
  # VOC converted validation: Empty
  d_set_sections = ['test',
                    'train',
                    'trainval',
                    'val',
                    ]
  section_annots = [val_xml_annots,
                    train_xml_annots,
                    train_xml_annots,
                    [],
                    ]

  # Print Classification/Detection task files (test, train, trainval, val)
  for idx in range(len(d_set_sections)):
    write_class_det_files(dest_dir,
                          (d_set_sections[idx]+'.txt'),
                          section_annots[idx])

  # Print Classification task files (all classes for each dataset)
  for idx in range(len(d_set_sections)):
    for class_ in class_list:
      # Skip the none class (no examples for detection)
      if class_[1] != 'none':
        write_class_files(dest_dir,
                          (class_[1]+'_'+d_set_sections[idx]+'.txt'),
                          section_annots[idx],
                          class_)
if __name__ == '__main__':
  assert(len(sys.argv) == 8), \
    "Usage: python voc_convert.py [VID_SOURCE] [DSET_DEST] [NUM_THREADS] " \
    "[NUM_TRAIN] [NUM_VAL] [MAX_RATIO] [INCL_ABS]"
  src_dir          = sys.argv[1]+'/'
  dest_dir         = sys.argv[2]+'/'
  num_threads      = int(sys.argv[3])
  num_train_frames = int(sys.argv[4])
  num_val_frames   = int(sys.argv[5])
  max_ratio        = float(sys.argv[6])
  assert((sys.argv[7]=='0') or (sys.argv[7]=='1')), \
    "Please pass 1 to include frames with absent objects, or 0 to exclude them"
  if sys.argv[7] == '1':
    include_absent = True
  else:
    include_absent = False

  # Download the VOC 2007 devkit
  devkit_link = \
    "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar"
  check_call(['wget','-P',dest_dir,devkit_link])

  # Extract, rename, and add the missing directories
  check_call(['tar','-xvf',
              dest_dir+'VOCdevkit_08-Jun-2007.tar',
              '-C',dest_dir])
  check_call(['rm',dest_dir+'VOCdevkit_08-Jun-2007.tar'])
  check_call(['mv',dest_dir+'VOCdevkit',dest_dir+'youtubebbdevkit2017'])
  check_call(['mkdir','-p',
              dest_dir+'youtubebbdevkit2017/youtubebb2017/ImageSets/Main'])
  check_call(['mkdir','-p',
              dest_dir+'youtubebbdevkit2017/youtubebb2017/JPEGImages'])
  check_call(['mkdir','-p',
              dest_dir+'youtubebbdevkit2017/youtubebb2017/Annotations'])
  check_call(['mkdir','-p',
              dest_dir+'youtubebbdevkit2017/results/youtubebb2017/Main'])
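  # The directories created above give dest_dir the VOC-style layout sketched
  # below (roles inferred from the write paths used elsewhere in this script):
  #
  #   youtubebbdevkit2017/
  #     youtubebb2017/
  #       Annotations/      per-frame VOC-style .xml annotations
  #       ImageSets/Main/   test/train/trainval/val and per-class .txt lists
  #       JPEGImages/       decoded .jpg frames
  #     results/youtubebb2017/Main/   created for detection results
  #                                   (nothing is written there by this script)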
  # Decode frames for training detection
  train_frame_annots = decode_frames('yt_bb_detection_train',
                                     src_dir,
                                     dest_dir,
                                     num_threads,
                                     num_train_frames,
                                     max_ratio,
                                     include_absent)

  # Write the xml annotations for training detection
  train_xml_annots = write_xml_annots(dest_dir,train_frame_annots)

  # Decode frames for validation detection
  val_frame_annots = decode_frames('yt_bb_detection_validation',
                                   src_dir,
                                   dest_dir,
                                   num_threads,
                                   num_val_frames,
                                   max_ratio,
                                   include_absent)

  # Write the xml annotations for validation detection
  val_xml_annots = write_xml_annots(dest_dir,val_frame_annots)

  # Write the ImageSets txt files
  write_txt_files(dest_dir, train_xml_annots, val_xml_annots)