-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset2md5
57 lines (44 loc) · 1.46 KB
/
dataset2md5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash
## Synopsis:
## Convert a set of files downloaded by img2dataset,
## into md5 based naming and hash buckets.
## Usage:
## Pass in, on stdin, a list of image files (one path per line) that have
## a .jpg or .png extension.
## Each must also have a matching .json file that contains its md5 sum;
## otherwise we would have to calculate it from the file (not implemented yet)!
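# You will probably want to pass in the FULL path to existing
# images, because with "ln -s" a relative path would produce links that
# do not resolve from inside the destination directory. So,
# find /some/path -name '*jpg'
#
# NOT "find ./img -name '*jpg'"
#
# You will also need to modify the DESTDIR var, possibly the LINK var,
# and you will need to download the printmd5fromjson.py util
# (the EXTRACT var must point at it).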
# Configuration. Each value may be overridden from the environment;
# the defaults below are used when the variable is unset or empty.
#
# DESTDIR: existing directory that receives the md5-named hash buckets.
DESTDIR=${DESTDIR:-/data2/cached/CC8M}
# EXTRACT: helper that is fed a .json path on stdin; its output is used
# as the md5 of the corresponding image.
EXTRACT=${EXTRACT:-/data/SCRIPTS/printmd5fromjson.py}
# LINK can be "cp", "ln -s", or "ln"
LINK="${LINK:-ln -s}"
#LINK="echo FAKE ln -s"

# The destination must already exist as a directory; it is never created here.
if [[ ! -d "$DESTDIR" ]] ; then
    echo "ERROR: destination directory $DESTDIR does not exist" >&2
    exit 1
fi

# Ensure all 256 two-hex-digit hash directories (00 .. ff) exist first.
# The names come from bash brace expansion. The subshell keeps the script's
# own working directory unchanged, and "&&" stops mkdir from running in the
# wrong directory should the cd fail.
(cd "$DESTDIR" && mkdir -p {0..9}{0..9} {a..f}{0..9} {0..9}{a..f} {a..f}{a..f}) || exit 1
# Split the configured commands into words once, so that multi-word values
# such as "ln -s" run as command + arguments without globbing the paths.
read -r -a link_cmd <<<"$LINK"
read -r -a extract_cmd <<<"$EXTRACT"

# Main loop: read one image path per line from stdin and place the image,
# its .json metadata and (if present) its .txt file into
#   $DESTDIR/<first two chars of md5>/<md5>.<ext>
# The script stops with exit status 1 at the first image whose md5 cannot
# be determined.
while IFS= read -r imgfile ; do
    basename=${imgfile##*/}      # file name without directories
    ext=${basename##*.}          # text after the last dot of the file name
    longbase=${imgfile%.*}       # full path with the last ".suffix" removed

    # Reset on every iteration: otherwise an image without a .json file
    # would silently inherit the md5 of the previously processed image
    # and be stored under the wrong name.
    md5=""
    if [[ -f "$longbase.json" ]] ; then
        # The helper receives the json path on stdin and prints the md5.
        md5=$(echo "$longbase.json" | "${extract_cmd[@]}")
    else
        echo ERROR: havent made the md5sum call yet >&2
        echo "$longbase.json" >&2
    fi

    if [[ -z "$md5" ]] ; then
        echo "ERROR: cant find md5 for $imgfile" >&2
        exit 1
    fi

    # The bucket directory is named after the first two characters of the md5.
    hashcode=${md5:0:2}

    "${link_cmd[@]}" -- "$imgfile" "$DESTDIR/$hashcode/$md5.$ext"
    "${link_cmd[@]}" -- "$longbase.json" "$DESTDIR/$hashcode/$md5.json"
    # The .txt companion file is optional.
    if [[ -f "$longbase.txt" ]] ; then
        "${link_cmd[@]}" -- "$longbase.txt" "$DESTDIR/$hashcode/$md5.txt"
    fi
done