
update to new version
zhreshold committed Jun 26, 2017
1 parent 5c59ddc commit b550a8f
Showing 20 changed files with 903 additions and 68 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -16,6 +16,8 @@ remarkable traits of MXNet.
* The results are almost identical to the original version, although they may differ slightly due to implementation details.

### What's new
* Added multiple trained models.
* Added a much simpler way to compose SSD networks from mainstream classification networks (resnet, inception, ...); see the [Guide](symbol/README.md).
* Updated to the latest version, matching the Caffe implementation, with a 5% mAP increase.
* Use the C++ record iterator, backed by the multi-threaded back-end engine, for a large speed-up in multi-GPU environments.
* Monitor validation mAP during training.
@@ -103,7 +105,7 @@ tar -xvf VOCtrainval_11-May-2012.tar
tar -xvf VOCtrainval_06-Nov-2007.tar
tar -xvf VOCtest_06-Nov-2007.tar
```
* We are goint to use `trainval` set in VOC2007/2012 as a common strategy.
* We are going to use `trainval` set in VOC2007/2012 as a common strategy.
The suggested directory structure is to store `VOC2007` and `VOC2012` directories
in the same `VOCdevkit` folder.
* Then link `VOCdevkit` folder to `data/VOCdevkit` by default:
@@ -160,3 +162,12 @@ python convert_model.py deploy.prototxt name_of_pretrained_caffe_model.caffemode
python demo.py --prefix ssd_converted --epoch 1 --deploy
```
There is no guarantee that conversion will always work, but at least it's good for now.

### Legacy models
Since the new interface for composing networks was introduced, the old models have inconsistent weight names.
You can still load a previous model by renaming its symbol file to `legacy_xxx.py`
and calling `python train/demo.py --network legacy_xxx`.
For example:
```
python demo.py --network 'legacy_vgg16_ssd_300.py' --prefix model/ssd_300 --epoch 0
```
3 changes: 2 additions & 1 deletion config/config.py
@@ -53,7 +53,7 @@
cfg.train.rand_mirror_prob = 0.5
cfg.train.shuffle = True
cfg.train.seed = 233
cfg.train.preprocess_threads = 6
cfg.train.preprocess_threads = 48
cfg.train = config_as_dict(cfg.train) # convert to normal dict

# validation
@@ -64,4 +64,5 @@
cfg.valid.rand_mirror_prob = 0
cfg.valid.shuffle = False
cfg.valid.seed = 0
cfg.valid.preprocess_threads = 32
cfg.valid = config_as_dict(cfg.valid) # convert to normal dict
17 changes: 17 additions & 0 deletions dataset/imdb.py
@@ -91,3 +91,20 @@ def progress_bar(count, total, suffix=''):
f.write(line)
else:
raise RuntimeError("No image in imdb")

def _load_class_names(self, filename, dirname):
"""
load class names from text file
Parameters:
----------
filename: str
file stores class names
dirname: str
file directory
"""
full_path = osp.join(dirname, filename)
classes = []
with open(full_path, 'r') as f:
classes = [l.strip() for l in f.readlines()]
return classes
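
The new `_load_class_names` helper is what the dataset classes below now call instead of hard-coding class lists. A minimal usage sketch, assuming a names file with one class name per line (the `ToyDataset` class and its arguments are purely illustrative):
```
# Hypothetical Imdb subclass showing how _load_class_names is intended to be
# used; the dataset name 'toy' and the default names file are illustrative.
import os.path as osp
from dataset.imdb import Imdb

class ToyDataset(Imdb):
    def __init__(self, names='pascal_voc.names'):
        super(ToyDataset, self).__init__('toy')
        # resolve the names directory the same way Coco and PascalVoc do
        names_dir = osp.join(osp.dirname(__file__), 'names')
        self.classes = self._load_class_names(names, names_dir)
        self.num_classes = len(self.classes)

# With dataset/names/pascal_voc.names in place, ToyDataset().classes would be
# ['aeroplane', 'bicycle', ..., 'tvmonitor'].
```
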
20 changes: 3 additions & 17 deletions dataset/mscoco.py
@@ -18,13 +18,14 @@ class Coco(Imdb):
whether initially shuffle image list
"""
def __init__(self, anno_file, image_dir, shuffle=True, names='mscoco.txt'):
def __init__(self, anno_file, image_dir, shuffle=True, names='mscoco.names'):
assert os.path.isfile(anno_file), "Invalid annotation file: " + anno_file
basename = os.path.splitext(os.path.basename(anno_file))[0]
super(Coco, self).__init__('coco_' + basename)
self.image_dir = image_dir

self._load_class_names(names, os.path.join(os.path.dirname(__file__), 'names'))
self.classes = self._load_class_names(names,
os.path.join(os.path.dirname(__file__), 'names'))

self.num_classes = len(self.classes)
self._load_all(anno_file, shuffle)
@@ -112,18 +113,3 @@ def _load_all(self, anno_file, shuffle):
# store the results
self.image_set_index = image_set_index
self.labels = labels

def _load_class_names(self, filename, dirname):
"""
load class names from text file
Parameters:
----------
filename: str
file stores class names
dirname: str
file directory
"""
full_path = os.path.join(dirname, filename)
with open(full_path, 'r') as f:
self.classes = [l.strip() for l in f.readlines()]
20 changes: 20 additions & 0 deletions dataset/names/pascal_voc.names
@@ -0,0 +1,20 @@
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
10 changes: 4 additions & 6 deletions dataset/pascal_voc.py
@@ -24,7 +24,8 @@ class PascalVoc(Imdb):
is_train : boolean
if true, will load annotations
"""
def __init__(self, image_set, year, devkit_path, shuffle=False, is_train=False):
def __init__(self, image_set, year, devkit_path, shuffle=False, is_train=False,
names='pascal_voc.names'):
super(PascalVoc, self).__init__('voc_' + year + '_' + image_set)
self.image_set = image_set
self.year = year
@@ -33,11 +34,8 @@ def __init__(self, image_set, year, devkit_path, shuffle=False, is_train=False):
self.extension = '.jpg'
self.is_train = is_train

self.classes = ['aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor']
self.classes = self._load_class_names(names,
os.path.join(os.path.dirname(__file__), 'names'))

self.config = {'use_difficult': True,
'comp_id': 'comp4',}
14 changes: 2 additions & 12 deletions dataset/yolo_format.py
@@ -102,7 +102,7 @@ def label_from_index(self, index):
ground-truths of this image
"""
assert self.labels is not None, "Labels not processed"
return self.labels[index, :, :]
return self.labels[index]

def _label_path_from_index(self, index):
"""
@@ -130,7 +130,6 @@ def _load_image_labels(self):
labels packed in [num_images x max_num_objects x 5] tensor
"""
temp = []
max_objects = 0

# load ground-truths
for idx in self.image_set_index:
@@ -151,13 +150,4 @@ def _load_image_labels(self):
ymax = y + half_height
label.append([cls_id, xmin, ymin, xmax, ymax])
temp.append(np.array(label))
max_objects = max(max_objects, len(label))
# add padding to labels so that the dimensions match in each batch
assert max_objects > 0, "No objects found for any of the images"
self.padding = max_objects
labels = []
for label in temp:
label = np.lib.pad(label, ((0, max_objects-label.shape[0]), (0,0)), \
'constant', constant_values=(-1, -1))
labels.append(label)
return np.array(labels)
return temp
8 changes: 4 additions & 4 deletions evaluate/eval_voc.py
@@ -51,8 +51,8 @@ def voc_ap(rec, prec, use_07_metric=False):
ap += p / 11.
else:
# append sentinel values at both ends
mrec = np.concatenate([0.], rec, [1.])
mpre = np.concatenate([0.], prec, [0.])
mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))

# compute precision integration ladder
for i in range(mpre.size - 1, 0, -1):
@@ -93,10 +93,10 @@ def voc_eval(detpath, annopath, imageset_file, classname, cache_dir, ovthresh=0.
if ind % 100 == 0:
print('reading annotations for {:d}/{:d}'.format(ind + 1, len(image_filenames)))
print('saving annotations cache to {:s}'.format(cache_file))
with open(cache_file, 'w') as f:
with open(cache_file, 'wb') as f:
pickle.dump(recs, f)
else:
with open(cache_file, 'r') as f:
with open(cache_file, 'rb') as f:
recs = pickle.load(f)

# extract objects in :param classname:
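
Two small but important fixes here: `np.concatenate` expects a single sequence of arrays, so the sentinel values are now passed as one tuple, and the annotation cache is written and read in binary mode, which pickle requires under Python 3. A self-contained sketch of the integrated-AP branch that those sentinels feed into (the recall/precision values below are made up):
```
# Sketch of the non-11-point VOC AP computation; rec/prec are dummy values.
import numpy as np

def voc_ap_sketch(rec, prec):
    mrec = np.concatenate(([0.], rec, [1.]))   # sentinels at both ends
    mpre = np.concatenate(([0.], prec, [0.]))
    # make precision monotonically non-increasing (the precision "ladder")
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    # integrate the area under the envelope where recall changes
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])

print(voc_ap_sketch(np.array([0.2, 0.5, 0.9]), np.array([1.0, 0.8, 0.6])))  # 0.68
```
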
6 changes: 3 additions & 3 deletions evaluate/evaluate_net.py
@@ -7,6 +7,7 @@
from config.config import cfg
from evaluate.eval_metric import MApMetric, VOC07MApMetric
import logging
from symbol.symbol_factory import get_symbol

def evaluate_net(net, path_imgrec, num_classes, mean_pixels, data_shape,
model_prefix, epoch, ctx=mx.cpu(), batch_size=1,
@@ -71,9 +72,8 @@ class names in string, must correspond to num_classes if set
if net is None:
net = load_net
else:
sys.path.append(os.path.join(cfg.ROOT_DIR, 'symbol'))
net = importlib.import_module("symbol_" + net) \
.get_symbol(num_classes, nms_thresh, force_nms)
net = get_symbol(net, data_shape[1], num_classes=num_classes,
nms_thresh=nms_thresh, force_suppress=force_nms)
if not 'label' in net.list_arguments():
label = mx.sym.Variable(name='label')
net = mx.sym.Group([net, label])
49 changes: 49 additions & 0 deletions symbol/README.md
@@ -0,0 +1,49 @@
## How to compose SSD network on top of mainstream classification networks

1. Have the base network ready in this directory as `name.py`, such as `inceptionv3.py`.
2. Add the configuration to `symbol_factory.py`; for example:
```
if network == 'vgg16_reduced':
if data_shape >= 448:
from_layers = ['relu4_3', 'relu7', '', '', '', '', '']
num_filters = [512, -1, 512, 256, 256, 256, 256]
strides = [-1, -1, 2, 2, 2, 2, 1]
pads = [-1, -1, 1, 1, 1, 1, 1]
sizes = [[.07, .1025], [.15,.2121], [.3, .3674], [.45, .5196], [.6, .6708], \
[.75, .8216], [.9, .9721]]
ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \
[1,2,.5,3,1./3], [1,2,.5], [1,2,.5]]
normalizations = [20, -1, -1, -1, -1, -1, -1]
steps = [] if data_shape != 512 else [x / 512.0 for x in
[8, 16, 32, 64, 128, 256, 512]]
else:
from_layers = ['relu4_3', 'relu7', '', '', '', '']
num_filters = [512, -1, 512, 256, 256, 256]
strides = [-1, -1, 2, 2, 1, 1]
pads = [-1, -1, 1, 1, 0, 0]
sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]]
ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \
[1,2,.5], [1,2,.5]]
normalizations = [20, -1, -1, -1, -1, -1]
steps = [] if data_shape != 300 else [x / 300.0 for x in [8, 16, 32, 64, 100, 300]]
return locals()
elif network == 'inceptionv3':
from_layers = ['ch_concat_mixed_7_chconcat', 'ch_concat_mixed_10_chconcat', '', '', '', '']
num_filters = [-1, -1, 512, 256, 256, 128]
strides = [-1, -1, 2, 2, 2, 2]
pads = [-1, -1, 1, 1, 1, 1]
sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]]
ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \
[1,2,.5], [1,2,.5]]
normalizations = -1
steps = []
return locals()
```
Here `from_layers` indicates the feature layers you would like to extract from the base network.
`''` indicates that we want to add an extra new layer on top of the last feature layer,
and the number of filters for it must be specified in `num_filters`. Similarly, `strides` and `pads`
are required to compose these new layers. `sizes` and `ratios` are the parameters controlling
the anchor generation algorithm. `normalizations` is used to normalize and rescale the feature map if
not `-1`. `steps` is optional and is used to calculate the anchor sliding steps.

3. Train or test with arguments `--network name --data-shape xxx --pretrained pretrained_model`
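
As an illustration of step 2, a hypothetical configuration for a `resnet50`-style base network at `data_shape` 300 could look like the sketch below. The tapped layer names (`'_plus12'`, `'_plus15'`), filter counts and anchor parameters are placeholders and would need to be checked against the real symbol's internal output names:
```
# Hypothetical entry for symbol_factory.py: a resnet50-style base network at
# data_shape 300. Layer names, filter counts and anchor settings are
# placeholders, not values verified against the actual resnet symbol.
def get_resnet50_config(data_shape=300):
    from_layers = ['_plus12', '_plus15', '', '', '', '']  # two taps + four extra layers
    num_filters = [-1, -1, 512, 256, 256, 128]            # -1 keeps a tapped layer unchanged
    strides = [-1, -1, 2, 2, 2, 2]
    pads = [-1, -1, 1, 1, 1, 1]
    sizes = [[.1, .141], [.2, .272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]]
    ratios = [[1, 2, .5], [1, 2, .5, 3, 1. / 3], [1, 2, .5, 3, 1. / 3],
              [1, 2, .5, 3, 1. / 3], [1, 2, .5], [1, 2, .5]]
    normalizations = -1  # no L2 normalization on any tapped layer
    steps = []           # leave empty to let the anchor layers infer the steps
    return locals()
```
With such an entry in place, training would follow step 3, e.g. something like `python train.py --network resnet50 --data-shape 300 --pretrained <pretrained_model>`.
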
79 changes: 67 additions & 12 deletions symbol/common.py
@@ -29,20 +29,74 @@ def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
----------
(conv, relu) mx.Symbols
"""
assert not use_batchnorm, "batchnorm not yet supported"
bias = mx.symbol.Variable(name="conv{}_bias".format(name),
init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'})
conv = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=kernel, pad=pad, \
stride=stride, num_filter=num_filter, name="conv{}".format(name))
relu = mx.symbol.Activation(data=conv, act_type=act_type, \
name="{}{}".format(act_type, name))
conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \
stride=stride, num_filter=num_filter, name="{}_conv".format(name))
if use_batchnorm:
relu = mx.symbol.BatchNorm(data=relu, name="bn{}".format(name))
return conv, relu
conv = mx.symbol.BatchNorm(data=conv, name="{}_bn".format(name))
relu = mx.symbol.Activation(data=conv, act_type=act_type, \
name="{}_{}".format(name, act_type))
return relu

def multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=128):
"""Wrapper function to extract features from base network, attaching extra
layers and SSD specific layers
Parameters
----------
from_layers : list of str
feature extraction layers; use '' to add extra layers
For example:
from_layers = ['relu4_3', 'fc7', '', '', '', '']
which means extract feature from relu4_3 and fc7, adding 4 extra layers
on top of fc7
num_filters : list of int
number of filters for extra layers; you can use -1 for extracted features.
However, if normalization and scaling are applied, the number of filters for
that layer must be provided.
For example:
num_filters = [512, -1, 512, 256, 256, 256]
strides : list of int
strides for the 3x3 convolution appended, -1 can be used for extracted
feature layers
pads : list of int
paddings for the 3x3 convolution, -1 can be used for extracted layers
min_filter : int
minimum number of filters used in 1x1 convolution
Returns
-------
list of mx.Symbols
"""
# arguments check
assert len(from_layers) > 0
assert isinstance(from_layers[0], str) and len(from_layers[0].strip()) > 0
assert len(from_layers) == len(num_filters) == len(strides) == len(pads)

internals = body.get_internals()
layers = []
for k, params in enumerate(zip(from_layers, num_filters, strides, pads)):
from_layer, num_filter, s, p = params
if from_layer.strip():
# extract from base network
layer = internals[from_layer.strip() + '_output']
layers.append(layer)
else:
# attach from last feature layer
assert len(layers) > 0
assert num_filter > 0
layer = layers[-1]
num_1x1 = max(min_filter, num_filter // 2)
conv_1x1 = conv_act_layer(layer, 'multi_feat_%d_conv_1x1' % (k),
num_1x1, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
conv_3x3 = conv_act_layer(conv_1x1, 'multi_feat_%d_conv_3x3' % (k),
num_filter, kernel=(3, 3), pad=(p, p), stride=(s, s), act_type='relu')
layers.append(conv_3x3)
return layers
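
For orientation, the sketch below shows how `multi_layer_feature` could be called with the `vgg16_reduced` settings listed in `symbol/README.md`; the base-network import path and `get_symbol` signature are assumptions made for the example:
```
# Illustrative call of multi_layer_feature using the vgg16_reduced settings
# from symbol/README.md; the base-network import and get_symbol signature
# below are assumed, not verified against the repository.
from symbol.common import multi_layer_feature
from symbol.vgg16_reduced import get_symbol as get_base

body = get_base(num_classes=1000)                    # base classification network
from_layers = ['relu4_3', 'relu7', '', '', '', '']   # two taps + four extra layers
num_filters = [512, -1, 512, 256, 256, 256]
strides = [-1, -1, 2, 2, 1, 1]
pads = [-1, -1, 1, 1, 0, 0]
layers = multi_layer_feature(body, from_layers, num_filters, strides, pads)
# layers -> six feature maps (relu4_3, relu7 and four extra conv blocks),
# ready to be passed to multibox_layer below.
```
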

def multibox_layer(from_layers, num_classes, sizes=[.2, .95],
ratios=[1], normalization=-1, num_channels=[],
clip=True, interm_layer=0, steps=[]):
clip=False, interm_layer=0, steps=[]):
"""
the basic aggregation module for SSD detection. Takes in multiple layers,
generate multiple object detection targets by customized layers
@@ -106,7 +160,7 @@ def multibox_layer(from_layers, num_classes, sizes=[.2, .95],
normalization = [normalization] * len(from_layers)
assert len(normalization) == len(from_layers)

assert sum(x > 0 for x in normalization) == len(num_channels), \
assert sum(x > 0 for x in normalization) <= len(num_channels), \
"must provide number of channels for each normalized layer"

if steps:
@@ -125,7 +179,8 @@
mode="channel", name="{}_norm".format(from_name))
scale = mx.symbol.Variable(name="{}_scale".format(from_name),
shape=(1, num_channels.pop(0), 1, 1),
init=mx.init.Constant(normalization[k]))
init=mx.init.Constant(normalization[k]),
attr={'__wd_mult__': '0.1'})
from_layer = mx.symbol.broadcast_mul(lhs=scale, rhs=from_layer)
if interm_layer > 0:
from_layer = mx.symbol.Convolution(data=from_layer, kernel=(3,3), \
(Diffs for the remaining 9 changed files are not shown.)