From 272ae1939d6984c5760bbf6cf5dee44dbfd51cfc Mon Sep 17 00:00:00 2001
From: Emanuele Plebani
Date: Mon, 17 Apr 2017 23:46:28 +0200
Subject: [PATCH] support detector anchors, spin out NMS, add YOLO v2 models

Detector anchors for YOLO v2 models are now supported, although only
through the coco / voc distinction. Non-maxima suppression has been
separated into its own function, and it now only suppresses boxes of
the same class.

Added converted tiny_yolo and tiny_yolo_voc models.
---
 README.rst                                   |  26 +-
 prototxt/tiny_yolo_deploy.prototxt           | 411 +++++++++++++++++++
 prototxt/tiny_yolo_voc_deploy.prototxt       | 411 +++++++++++++++++++
 prototxt/{ => v1}/coco_tiny_deploy.prototxt  |   0
 prototxt/{ => v1}/yolo_small_deploy.prototxt |   0
 prototxt/{ => v1}/yolo_tiny_deploy.prototxt  |   0
 yolo_detect.py                               |  60 ++-
 7 files changed, 877 insertions(+), 31 deletions(-)
 create mode 100644 prototxt/tiny_yolo_deploy.prototxt
 create mode 100644 prototxt/tiny_yolo_voc_deploy.prototxt
 rename prototxt/{ => v1}/coco_tiny_deploy.prototxt (100%)
 rename prototxt/{ => v1}/yolo_small_deploy.prototxt (100%)
 rename prototxt/{ => v1}/yolo_tiny_deploy.prototxt (100%)

diff --git a/README.rst b/README.rst
index 2618baa..0d4f2c9 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-|Python27| 
+|Python27|
 
 .. |Python27| image:: https://img.shields.io/badge/python-2.7-blue.svg
     :target: https://www.python.org/
@@ -81,21 +81,29 @@ the Darknet v2 YOLO models are also not supported.
 Model files
 ===========
 
-Three converted models are available in the `prototxt` directory:
+Two models converted from YOLO v2 are available in the ``prototxt`` directory:
 
-* **YOLO tiny**: converted from `tiny-yolo.cfg`,
-  `caffemodel `__.
-
-* **YOLO small**: converted from `yolo-small.cfg`,
-  `caffemodel `__.
+* **YOLO tiny** (CoCo): converted from ``tiny-yolo.cfg``,
+  `caffemodel `__.
 
-* **YOLO CoCo tiny**: converted from `tiny-coco.cfg`,
-  `caffemodel `__.
+* **YOLO tiny VOC**: converted from ``tiny-yolo-voc.cfg``,
+  `caffemodel `__.
 
 Legacy models
 ^^^^^^^^^^^^^
 
+Three models converted from YOLO v1 are available in the ``prototxt/v1`` directory:
+
+* **YOLO tiny**: converted from ``yolov1/tiny-yolo.cfg``,
+  `caffemodel `__.
+
+* **YOLO small**: converted from ``yolov1/yolo-small.cfg``,
+  `caffemodel `__.
+
+* **YOLO CoCo tiny**: converted from ``yolov1/tiny-coco.cfg``,
+  `caffemodel `__.
+
 The models originally converted by *xingwangsfu*
 (https://github.com/xingwangsfu/caffe-yolo) are available in the
 directory ``prototxt/legacy``.
The converted weights can be downloaded here: diff --git a/prototxt/tiny_yolo_deploy.prototxt b/prototxt/tiny_yolo_deploy.prototxt new file mode 100644 index 0000000..8dc221f --- /dev/null +++ b/prototxt/tiny_yolo_deploy.prototxt @@ -0,0 +1,411 @@ +layer { + name: "data" + type: "Input" + top: "data" + input_param { + shape { + dim: 1 + dim: 3 + dim: 416 + dim: 416 + } + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + convolution_param { + num_output: 16 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv1_bn" + type: "BatchNorm" + bottom: "conv1" + top: "conv1_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv1_scale" + type: "Scale" + bottom: "conv1_bn" + top: "conv1_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1_scale" + top: "conv1_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_scale" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv2_bn" + type: "BatchNorm" + bottom: "conv2" + top: "conv2_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv2_scale" + type: "Scale" + bottom: "conv2_bn" + top: "conv2_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2_scale" + top: "conv2_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_scale" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + convolution_param { + num_output: 64 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv3_bn" + type: "BatchNorm" + bottom: "conv3" + top: "conv3_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv3_scale" + type: "Scale" + bottom: "conv3_bn" + top: "conv3_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3_scale" + top: "conv3_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_scale" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "pool3" + top: "conv4" + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv4_bn" + type: "BatchNorm" + bottom: "conv4" + top: "conv4_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv4_scale" + type: "Scale" + bottom: "conv4_bn" + top: "conv4_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4_scale" + top: "conv4_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_scale" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "pool4" + top: "conv5" + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv5_bn" + type: "BatchNorm" + bottom: "conv5" + top: "conv5_bn" 
+ batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv5_scale" + type: "Scale" + bottom: "conv5_bn" + top: "conv5_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5_scale" + top: "conv5_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_scale" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv6" + type: "Convolution" + bottom: "pool5" + top: "conv6" + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv6_bn" + type: "BatchNorm" + bottom: "conv6" + top: "conv6_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv6_scale" + type: "Scale" + bottom: "conv6_bn" + top: "conv6_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "conv6_scale" + top: "conv6_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool6" + type: "Pooling" + bottom: "conv6_scale" + top: "pool6" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 1 + } +} +layer { + name: "conv7" + type: "Convolution" + bottom: "pool6" + top: "conv7" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv7_bn" + type: "BatchNorm" + bottom: "conv7" + top: "conv7_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv7_scale" + type: "Scale" + bottom: "conv7_bn" + top: "conv7_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "conv7_scale" + top: "conv7_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv8" + type: "Convolution" + bottom: "conv7_scale" + top: "conv8" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv8_bn" + type: "BatchNorm" + bottom: "conv8" + top: "conv8_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv8_scale" + type: "Scale" + bottom: "conv8_bn" + top: "conv8_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu8" + type: "ReLU" + bottom: "conv8_scale" + top: "conv8_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv9" + type: "Convolution" + bottom: "conv8_scale" + top: "result" + convolution_param { + num_output: 425 + pad: 0 + kernel_size: 1 + stride: 1 + } +} diff --git a/prototxt/tiny_yolo_voc_deploy.prototxt b/prototxt/tiny_yolo_voc_deploy.prototxt new file mode 100644 index 0000000..9d8d1fe --- /dev/null +++ b/prototxt/tiny_yolo_voc_deploy.prototxt @@ -0,0 +1,411 @@ +layer { + name: "data" + type: "Input" + top: "data" + input_param { + shape { + dim: 1 + dim: 3 + dim: 416 + dim: 416 + } + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + convolution_param { + num_output: 16 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv1_bn" + type: "BatchNorm" + bottom: "conv1" + top: "conv1_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv1_scale" + type: "Scale" + bottom: "conv1_bn" + top: "conv1_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1_scale" + top: "conv1_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_scale" + top: "pool1" + pooling_param { + 
pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv2_bn" + type: "BatchNorm" + bottom: "conv2" + top: "conv2_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv2_scale" + type: "Scale" + bottom: "conv2_bn" + top: "conv2_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2_scale" + top: "conv2_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_scale" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + convolution_param { + num_output: 64 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv3_bn" + type: "BatchNorm" + bottom: "conv3" + top: "conv3_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv3_scale" + type: "Scale" + bottom: "conv3_bn" + top: "conv3_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3_scale" + top: "conv3_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_scale" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "pool3" + top: "conv4" + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv4_bn" + type: "BatchNorm" + bottom: "conv4" + top: "conv4_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv4_scale" + type: "Scale" + bottom: "conv4_bn" + top: "conv4_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4_scale" + top: "conv4_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_scale" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "pool4" + top: "conv5" + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv5_bn" + type: "BatchNorm" + bottom: "conv5" + top: "conv5_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv5_scale" + type: "Scale" + bottom: "conv5_bn" + top: "conv5_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5_scale" + top: "conv5_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_scale" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv6" + type: "Convolution" + bottom: "pool5" + top: "conv6" + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv6_bn" + type: "BatchNorm" + bottom: "conv6" + top: "conv6_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv6_scale" + type: "Scale" + bottom: "conv6_bn" + top: "conv6_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "conv6_scale" + top: "conv6_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: 
"pool6" + type: "Pooling" + bottom: "conv6_scale" + top: "pool6" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 1 + } +} +layer { + name: "conv7" + type: "Convolution" + bottom: "pool6" + top: "conv7" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv7_bn" + type: "BatchNorm" + bottom: "conv7" + top: "conv7_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv7_scale" + type: "Scale" + bottom: "conv7_bn" + top: "conv7_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "conv7_scale" + top: "conv7_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv8" + type: "Convolution" + bottom: "conv7_scale" + top: "conv8" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv8_bn" + type: "BatchNorm" + bottom: "conv8" + top: "conv8_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv8_scale" + type: "Scale" + bottom: "conv8_bn" + top: "conv8_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu8" + type: "ReLU" + bottom: "conv8_scale" + top: "conv8_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv9" + type: "Convolution" + bottom: "conv8_scale" + top: "result" + convolution_param { + num_output: 125 + pad: 0 + kernel_size: 1 + stride: 1 + } +} diff --git a/prototxt/coco_tiny_deploy.prototxt b/prototxt/v1/coco_tiny_deploy.prototxt similarity index 100% rename from prototxt/coco_tiny_deploy.prototxt rename to prototxt/v1/coco_tiny_deploy.prototxt diff --git a/prototxt/yolo_small_deploy.prototxt b/prototxt/v1/yolo_small_deploy.prototxt similarity index 100% rename from prototxt/yolo_small_deploy.prototxt rename to prototxt/v1/yolo_small_deploy.prototxt diff --git a/prototxt/yolo_tiny_deploy.prototxt b/prototxt/v1/yolo_tiny_deploy.prototxt similarity index 100% rename from prototxt/yolo_tiny_deploy.prototxt rename to prototxt/v1/yolo_tiny_deploy.prototxt diff --git a/yolo_detect.py b/yolo_detect.py index b9d0db2..b3051ea 100644 --- a/yolo_detect.py +++ b/yolo_detect.py @@ -26,7 +26,8 @@ def get_boxes(output, img_size, grid_size, num_boxes): w_img, h_img = img_size[1], img_size[0] boxes = np.reshape(output, (grid_size, grid_size, num_boxes, 4)) - offset = np.tile(np.arange(grid_size)[:, np.newaxis], (grid_size, 1, num_boxes)) + offset = np.tile(np.arange(grid_size)[:, np.newaxis], + (grid_size, 1, num_boxes)) boxes[:, :, :, 0] += offset boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2)) @@ -77,12 +78,9 @@ def softmax(val, axis=-1): return exp / np.sum(exp, axis=axis, keepdims=True) -def get_boxes_v2(output, img_size): +def get_boxes_v2(output, img_size, anchors): """ extract bounding boxes from the last layer (Darknet v2) """ - # bias_w = [1.08, 3.42, 6.63, 9.42, 16.62] - # bias_h = [1.19, 4.41, 11.38, 5.11, 10.52] - bias_w = [0.738768, 2.42204, 4.30971, 10.246, 12.6868] - bias_h = [0.874946, 2.65704, 7.04493, 4.59428, 11.8741] + bias_w, bias_h = anchors w_img, h_img = img_size[1], img_size[0] grid_w, grid_h, num_boxes = output.shape[:3] @@ -104,7 +102,7 @@ def get_boxes_v2(output, img_size): return boxes -def parse_yolo_output_v2(output, img_size, num_classes): +def parse_yolo_output_v2(output, img_size, num_classes, anchors): """ convert the output of the last convolutional layer (Darknet v2) """ n_coord_box = 4 @@ -114,18 +112,18 @@ def parse_yolo_output_v2(output, img_size, 
num_classes):
         .transpose((2, 3, 0, 1))
 
     probs = logistic(output[:, :, :, 4:5]) * softmax(output[:, :, :, 5:], axis=3)
-    boxes = get_boxes_v2(output[:, :, :, :4], img_size)
+    boxes = get_boxes_v2(output[:, :, :, :4], img_size, anchors)
 
     return boxes, probs
 
 
-def parse_yolo_output(output, img_size, num_classes):
+def parse_yolo_output(output, img_size, num_classes, anchors=None):
     """ convert the output of YOLO's last layer to boxes and confidence
         in each class """
 
     if len(output.shape) == 1:
         return parse_yolo_output_v1(output, img_size, num_classes)
-    elif len(output.shape) == 3:
-        return parse_yolo_output_v2(output, img_size, num_classes)
+    elif len(output.shape) == 3 and anchors is not None:
+        return parse_yolo_output_v2(output, img_size, num_classes, anchors)
     else:
         raise ValueError(" output format not recognized")
 
@@ -152,12 +150,19 @@ def get_candidate_objects(output, img_size, coco=False):
         "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
         "scissors", "teddy bear", "hair drier", "toothbrush"
     ]
-    classes = classes_coco if coco else classes_voc
+    if coco:
+        classes = classes_coco
+        anchors = [[0.738768, 2.42204, 4.30971, 10.246, 12.6868],
+                   [0.874946, 2.65704, 7.04493, 4.59428, 11.8741]]
+    else:
+        classes = classes_voc
+        anchors = [[1.08, 3.42, 6.63, 9.42, 16.62],
+                   [1.19, 4.41, 11.38, 5.11, 10.52]]
 
     threshold = 0.2
     iou_threshold = 0.4
 
-    boxes, probs = parse_yolo_output(output, img_size, len(classes))
+    boxes, probs = parse_yolo_output(output, img_size, len(classes), anchors)
 
     filter_mat_probs = (probs >= threshold)
     filter_mat_boxes = np.nonzero(filter_mat_probs)[0:3]
@@ -176,13 +181,8 @@ def get_candidate_objects(output, img_size, coco=False):
                   len(boxes_filtered)))
         return []
 
-    # Non-Maxima Suppression: greedily suppress low-score overlapped boxes
-    for i, box_filtered in enumerate(boxes_filtered):
-        if probs_filtered[i] == 0:
-            continue
-        for j in range(i+1, len(boxes_filtered)):
-            if iou(box_filtered, boxes_filtered[j]) > iou_threshold:
-                probs_filtered[j] = 0.0
+    probs_filtered = non_maxima_suppression(boxes_filtered, probs_filtered,
+                                            classes_num_filtered, iou_threshold)
 
     filter_iou = (probs_filtered > 0.0)
     boxes_filtered = boxes_filtered[filter_iou]
@@ -196,15 +196,31 @@ def get_candidate_objects(output, img_size, coco=False):
     return result
 
 
-def iou(box1, box2):
-    """ compute intersection over union score """
+def non_maxima_suppression(boxes, probs, classes_num, thr=0.2):
+    """ greedily suppress low-scoring overlapped boxes of the same class """
+    for i, box in enumerate(boxes):
+        if probs[i] == 0:
+            continue
+        for j in range(i+1, len(boxes)):
+            if classes_num[i] == classes_num[j] and iou(box, boxes[j]) > thr:
+                probs[j] = 0.0
+
+    return probs
+
+
+def iou(box1, box2, denom="min"):
+    """ compute overlap score: intersection over min area (default) or union """
    int_tb = min(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - \
             max(box1[0]-0.5*box1[2], box2[0]-0.5*box2[2])
    int_lr = min(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - \
             max(box1[1]-0.5*box1[3], box2[1]-0.5*box2[3])
+    intersection = max(0.0, int_tb) * max(0.0, int_lr)
 
+    area1, area2 = box1[2]*box1[3], box2[2]*box2[3]
+    control_area = min(area1, area2) if denom == "min" \
+                   else area1 + area2 - intersection
 
-    return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection)
+    return intersection / control_area
 
 
 def draw_box(img, name, box, score):
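
For reference, the decoding that ``get_boxes_v2`` applies with the new
``anchors`` argument follows the Darknet v2 scheme: the box center is the
logistic-squashed offset inside each grid cell, and the width and height are
the anchor pair scaled by an exponential of the raw prediction. A minimal
standalone sketch of that scheme (``decode_box`` is an illustrative helper,
not a function from this patch)::

    import numpy as np

    def logistic(x):
        """ sigmoid activation, as used by the Darknet v2 decoder """
        return 1.0 / (1.0 + np.exp(-x))

    def decode_box(t_box, row, col, anchor, grid_shape):
        """ decode one raw (tx, ty, tw, th) prediction into an
            (x, y, w, h) box, relative to the image size """
        grid_w, grid_h = grid_shape
        bias_w, bias_h = anchor
        x = (col + logistic(t_box[0])) / grid_w   # cell offset -> image coords
        y = (row + logistic(t_box[1])) / grid_h
        w = bias_w * np.exp(t_box[2]) / grid_w    # anchor scaled exponentially
        h = bias_h * np.exp(t_box[3]) / grid_h
        return x, y, w, h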
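
The factored-out NMS helper can also be exercised on its own. A small usage
sketch, assuming ``yolo_detect`` (and its dependencies) can be imported, with
made-up boxes in ``(x_center, y_center, w, h)`` format::

    import numpy as np
    from yolo_detect import iou, non_maxima_suppression

    # two heavily overlapping detections of class 0, one separate of class 1
    boxes = np.array([[50.0, 50.0, 20.0, 20.0],
                      [51.0, 50.0, 22.0, 20.0],
                      [120.0, 80.0, 30.0, 30.0]])
    probs = np.array([0.9, 0.6, 0.8])
    classes_num = np.array([0, 0, 1])

    probs = non_maxima_suppression(boxes, probs, classes_num, thr=0.4)
    print(probs)   # the weaker class-0 box is zeroed: [ 0.9  0.   0.8]

Note that ``iou`` now defaults to ``denom="min"``: it scores the intersection
against the smaller box's area instead of the union, which suppresses boxes
nested inside larger ones more aggressively. Passing any other value, e.g.
``iou(box1, box2, denom="union")``, recovers the standard IoU score.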