From 272ae1939d6984c5760bbf6cf5dee44dbfd51cfc Mon Sep 17 00:00:00 2001
From: Emanuele Plebani
Date: Mon, 17 Apr 2017 23:46:28 +0200
Subject: [PATCH] support detector anchors, spin out NMS, add YOLO v2 models

Detector anchors for YOLO v2 models are now supported, although only
through the coco / voc distinction. Non-maxima suppression has been
separated into its own function, and it now only suppresses boxes of
the same class.

Added converted tiny_yolo and tiny_yolo_voc models.
---
 README.rst                                   |  26 +-
 prototxt/tiny_yolo_deploy.prototxt           | 411 +++++++++++++++++++
 prototxt/tiny_yolo_voc_deploy.prototxt       | 411 +++++++++++++++++++
 prototxt/{ => v1}/coco_tiny_deploy.prototxt  |   0
 prototxt/{ => v1}/yolo_small_deploy.prototxt |   0
 prototxt/{ => v1}/yolo_tiny_deploy.prototxt  |   0
 yolo_detect.py                               |  60 ++-
 7 files changed, 877 insertions(+), 31 deletions(-)
 create mode 100644 prototxt/tiny_yolo_deploy.prototxt
 create mode 100644 prototxt/tiny_yolo_voc_deploy.prototxt
 rename prototxt/{ => v1}/coco_tiny_deploy.prototxt (100%)
 rename prototxt/{ => v1}/yolo_small_deploy.prototxt (100%)
 rename prototxt/{ => v1}/yolo_tiny_deploy.prototxt (100%)

diff --git a/README.rst b/README.rst
index 2618baa..0d4f2c9 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-|Python27| 
+|Python27|
 
 .. |Python27| image:: https://img.shields.io/badge/python-2.7-blue.svg
     :target: https://www.python.org/
@@ -81,21 +81,29 @@ the Darknet v2 YOLO models are also not supported.
 Model files
 ===========
 
-Three converted models are available in the `prototxt` directory:
+Two models converted from YOLO v2 are available in the ``prototxt`` directory:
 
-* **YOLO tiny**: converted from `tiny-yolo.cfg`,
-  `caffemodel `__.
-
-* **YOLO small**: converted from `yolo-small.cfg`,
-  `caffemodel `__.
+* **YOLO tiny** (CoCo): converted from ``tiny-yolo.cfg``,
+  `caffemodel `__.
 
-* **YOLO CoCo tiny**: converted from `tiny-coco.cfg`,
-  `caffemodel `__.
+* **YOLO tiny VOC**: converted from ``tiny-yolo-voc.cfg``,
+  `caffemodel `__.
 
 Legacy models
 ^^^^^^^^^^^^^
 
+Three models converted from YOLO v1 are available in the ``prototxt/v1`` directory:
+
+* **YOLO tiny**: converted from ``yolov1/tiny-yolo.cfg``,
+  `caffemodel `__.
+
+* **YOLO small**: converted from ``yolov1/yolo-small.cfg``,
+  `caffemodel `__.
+
+* **YOLO CoCo tiny**: converted from ``yolov1/tiny-coco.cfg``,
+  `caffemodel `__.
+
 The models originally converted by *xingwangsfu*
 (https://github.com/xingwangsfu/caffe-yolo) are available in the
 directory ``prototxt/legacy``.
The converted weights can be downloaded here: diff --git a/prototxt/tiny_yolo_deploy.prototxt b/prototxt/tiny_yolo_deploy.prototxt new file mode 100644 index 0000000..8dc221f --- /dev/null +++ b/prototxt/tiny_yolo_deploy.prototxt @@ -0,0 +1,411 @@ +layer { + name: "data" + type: "Input" + top: "data" + input_param { + shape { + dim: 1 + dim: 3 + dim: 416 + dim: 416 + } + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + convolution_param { + num_output: 16 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv1_bn" + type: "BatchNorm" + bottom: "conv1" + top: "conv1_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv1_scale" + type: "Scale" + bottom: "conv1_bn" + top: "conv1_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1_scale" + top: "conv1_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_scale" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv2_bn" + type: "BatchNorm" + bottom: "conv2" + top: "conv2_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv2_scale" + type: "Scale" + bottom: "conv2_bn" + top: "conv2_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2_scale" + top: "conv2_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_scale" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + convolution_param { + num_output: 64 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv3_bn" + type: "BatchNorm" + bottom: "conv3" + top: "conv3_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv3_scale" + type: "Scale" + bottom: "conv3_bn" + top: "conv3_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3_scale" + top: "conv3_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_scale" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "pool3" + top: "conv4" + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv4_bn" + type: "BatchNorm" + bottom: "conv4" + top: "conv4_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv4_scale" + type: "Scale" + bottom: "conv4_bn" + top: "conv4_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4_scale" + top: "conv4_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_scale" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "pool4" + top: "conv5" + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv5_bn" + type: "BatchNorm" + bottom: "conv5" + top: "conv5_bn" 
+ batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv5_scale" + type: "Scale" + bottom: "conv5_bn" + top: "conv5_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5_scale" + top: "conv5_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_scale" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv6" + type: "Convolution" + bottom: "pool5" + top: "conv6" + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv6_bn" + type: "BatchNorm" + bottom: "conv6" + top: "conv6_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv6_scale" + type: "Scale" + bottom: "conv6_bn" + top: "conv6_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "conv6_scale" + top: "conv6_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool6" + type: "Pooling" + bottom: "conv6_scale" + top: "pool6" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 1 + } +} +layer { + name: "conv7" + type: "Convolution" + bottom: "pool6" + top: "conv7" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv7_bn" + type: "BatchNorm" + bottom: "conv7" + top: "conv7_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv7_scale" + type: "Scale" + bottom: "conv7_bn" + top: "conv7_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "conv7_scale" + top: "conv7_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv8" + type: "Convolution" + bottom: "conv7_scale" + top: "conv8" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv8_bn" + type: "BatchNorm" + bottom: "conv8" + top: "conv8_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv8_scale" + type: "Scale" + bottom: "conv8_bn" + top: "conv8_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu8" + type: "ReLU" + bottom: "conv8_scale" + top: "conv8_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv9" + type: "Convolution" + bottom: "conv8_scale" + top: "result" + convolution_param { + num_output: 425 + pad: 0 + kernel_size: 1 + stride: 1 + } +} diff --git a/prototxt/tiny_yolo_voc_deploy.prototxt b/prototxt/tiny_yolo_voc_deploy.prototxt new file mode 100644 index 0000000..9d8d1fe --- /dev/null +++ b/prototxt/tiny_yolo_voc_deploy.prototxt @@ -0,0 +1,411 @@ +layer { + name: "data" + type: "Input" + top: "data" + input_param { + shape { + dim: 1 + dim: 3 + dim: 416 + dim: 416 + } + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + convolution_param { + num_output: 16 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv1_bn" + type: "BatchNorm" + bottom: "conv1" + top: "conv1_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv1_scale" + type: "Scale" + bottom: "conv1_bn" + top: "conv1_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1_scale" + top: "conv1_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_scale" + top: "pool1" + pooling_param { + 
pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + convolution_param { + num_output: 32 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv2_bn" + type: "BatchNorm" + bottom: "conv2" + top: "conv2_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv2_scale" + type: "Scale" + bottom: "conv2_bn" + top: "conv2_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2_scale" + top: "conv2_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_scale" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + convolution_param { + num_output: 64 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv3_bn" + type: "BatchNorm" + bottom: "conv3" + top: "conv3_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv3_scale" + type: "Scale" + bottom: "conv3_bn" + top: "conv3_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3_scale" + top: "conv3_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_scale" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "pool3" + top: "conv4" + convolution_param { + num_output: 128 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv4_bn" + type: "BatchNorm" + bottom: "conv4" + top: "conv4_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv4_scale" + type: "Scale" + bottom: "conv4_bn" + top: "conv4_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4_scale" + top: "conv4_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_scale" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "pool4" + top: "conv5" + convolution_param { + num_output: 256 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv5_bn" + type: "BatchNorm" + bottom: "conv5" + top: "conv5_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv5_scale" + type: "Scale" + bottom: "conv5_bn" + top: "conv5_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5_scale" + top: "conv5_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_scale" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv6" + type: "Convolution" + bottom: "pool5" + top: "conv6" + convolution_param { + num_output: 512 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv6_bn" + type: "BatchNorm" + bottom: "conv6" + top: "conv6_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv6_scale" + type: "Scale" + bottom: "conv6_bn" + top: "conv6_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "conv6_scale" + top: "conv6_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: 
"pool6" + type: "Pooling" + bottom: "conv6_scale" + top: "pool6" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 1 + } +} +layer { + name: "conv7" + type: "Convolution" + bottom: "pool6" + top: "conv7" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv7_bn" + type: "BatchNorm" + bottom: "conv7" + top: "conv7_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv7_scale" + type: "Scale" + bottom: "conv7_bn" + top: "conv7_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "conv7_scale" + top: "conv7_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv8" + type: "Convolution" + bottom: "conv7_scale" + top: "conv8" + convolution_param { + num_output: 1024 + bias_term: false + pad: 1 + kernel_size: 3 + stride: 1 + } +} +layer { + name: "conv8_bn" + type: "BatchNorm" + bottom: "conv8" + top: "conv8_bn" + batch_norm_param { + use_global_stats: true + } +} +layer { + name: "conv8_scale" + type: "Scale" + bottom: "conv8_bn" + top: "conv8_scale" + scale_param { + bias_term: true + } +} +layer { + name: "relu8" + type: "ReLU" + bottom: "conv8_scale" + top: "conv8_scale" + relu_param { + negative_slope: 0.1 + } +} +layer { + name: "conv9" + type: "Convolution" + bottom: "conv8_scale" + top: "result" + convolution_param { + num_output: 125 + pad: 0 + kernel_size: 1 + stride: 1 + } +} diff --git a/prototxt/coco_tiny_deploy.prototxt b/prototxt/v1/coco_tiny_deploy.prototxt similarity index 100% rename from prototxt/coco_tiny_deploy.prototxt rename to prototxt/v1/coco_tiny_deploy.prototxt diff --git a/prototxt/yolo_small_deploy.prototxt b/prototxt/v1/yolo_small_deploy.prototxt similarity index 100% rename from prototxt/yolo_small_deploy.prototxt rename to prototxt/v1/yolo_small_deploy.prototxt diff --git a/prototxt/yolo_tiny_deploy.prototxt b/prototxt/v1/yolo_tiny_deploy.prototxt similarity index 100% rename from prototxt/yolo_tiny_deploy.prototxt rename to prototxt/v1/yolo_tiny_deploy.prototxt diff --git a/yolo_detect.py b/yolo_detect.py index b9d0db2..b3051ea 100644 --- a/yolo_detect.py +++ b/yolo_detect.py @@ -26,7 +26,8 @@ def get_boxes(output, img_size, grid_size, num_boxes): w_img, h_img = img_size[1], img_size[0] boxes = np.reshape(output, (grid_size, grid_size, num_boxes, 4)) - offset = np.tile(np.arange(grid_size)[:, np.newaxis], (grid_size, 1, num_boxes)) + offset = np.tile(np.arange(grid_size)[:, np.newaxis], + (grid_size, 1, num_boxes)) boxes[:, :, :, 0] += offset boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2)) @@ -77,12 +78,9 @@ def softmax(val, axis=-1): return exp / np.sum(exp, axis=axis, keepdims=True) -def get_boxes_v2(output, img_size): +def get_boxes_v2(output, img_size, anchors): """ extract bounding boxes from the last layer (Darknet v2) """ - # bias_w = [1.08, 3.42, 6.63, 9.42, 16.62] - # bias_h = [1.19, 4.41, 11.38, 5.11, 10.52] - bias_w = [0.738768, 2.42204, 4.30971, 10.246, 12.6868] - bias_h = [0.874946, 2.65704, 7.04493, 4.59428, 11.8741] + bias_w, bias_h = anchors w_img, h_img = img_size[1], img_size[0] grid_w, grid_h, num_boxes = output.shape[:3] @@ -104,7 +102,7 @@ def get_boxes_v2(output, img_size): return boxes -def parse_yolo_output_v2(output, img_size, num_classes): +def parse_yolo_output_v2(output, img_size, num_classes, anchors): """ convert the output of the last convolutional layer (Darknet v2) """ n_coord_box = 4 @@ -114,18 +112,18 @@ def parse_yolo_output_v2(output, img_size, 
num_classes):
         .transpose((2, 3, 0, 1))
 
     probs = logistic(output[:, :, :, 4:5]) * softmax(output[:, :, :, 5:], axis=3)
-    boxes = get_boxes_v2(output[:, :, :, :4], img_size)
+    boxes = get_boxes_v2(output[:, :, :, :4], img_size, anchors)
 
     return boxes, probs
 
 
-def parse_yolo_output(output, img_size, num_classes):
+def parse_yolo_output(output, img_size, num_classes, anchors=None):
     """ convert the output of YOLO's last layer to boxes and confidence
         in each class """
 
     if len(output.shape) == 1:
         return parse_yolo_output_v1(output, img_size, num_classes)
-    elif len(output.shape) == 3:
-        return parse_yolo_output_v2(output, img_size, num_classes)
+    elif len(output.shape) == 3 and anchors is not None:
+        return parse_yolo_output_v2(output, img_size, num_classes, anchors)
     else:
         raise ValueError(" output format not recognized")
 
@@ -152,12 +150,19 @@ def get_candidate_objects(output, img_size, coco=False):
         "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
         "scissors", "teddy bear", "hair drier", "toothbrush"
     ]
-    classes = classes_coco if coco else classes_voc
+    if coco:
+        classes = classes_coco
+        anchors = [[0.738768, 2.42204, 4.30971, 10.246, 12.6868],
+                   [0.874946, 2.65704, 7.04493, 4.59428, 11.8741]]
+    else:
+        classes = classes_voc
+        anchors = [[1.08, 3.42, 6.63, 9.42, 16.62],
+                   [1.19, 4.41, 11.38, 5.11, 10.52]]
 
     threshold = 0.2
     iou_threshold = 0.4
 
-    boxes, probs = parse_yolo_output(output, img_size, len(classes))
+    boxes, probs = parse_yolo_output(output, img_size, len(classes), anchors)
 
     filter_mat_probs = (probs >= threshold)
     filter_mat_boxes = np.nonzero(filter_mat_probs)[0:3]
@@ -176,13 +181,8 @@ def get_candidate_objects(output, img_size, coco=False):
                   len(boxes_filtered)))
         return []
 
-    # Non-Maxima Suppression: greedily suppress low-score overlapped boxes
-    for i, box_filtered in enumerate(boxes_filtered):
-        if probs_filtered[i] == 0:
-            continue
-        for j in range(i+1, len(boxes_filtered)):
-            if iou(box_filtered, boxes_filtered[j]) > iou_threshold:
-                probs_filtered[j] = 0.0
+    probs_filtered = non_maxima_suppression(boxes_filtered, probs_filtered,
+                                            classes_num_filtered, iou_threshold)
 
     filter_iou = (probs_filtered > 0.0)
     boxes_filtered = boxes_filtered[filter_iou]
@@ -196,15 +196,31 @@ def get_candidate_objects(output, img_size, coco=False):
     return result
 
 
-def iou(box1, box2):
-    """ compute intersection over union score """
+def non_maxima_suppression(boxes, probs, classes_num, thr=0.2):
+    """ greedily suppress low-scoring overlapped boxes of the same class """
+    for i, box in enumerate(boxes):
+        if probs[i] == 0:
+            continue
+        for j in range(i+1, len(boxes)):
+            if classes_num[i] == classes_num[j] and iou(box, boxes[j]) > thr:
+                probs[j] = 0.0
+
+    return probs
+
+
+def iou(box1, box2, denom="min"):
+    """ compute overlap score: intersection over min area (default) or union """
    int_tb = min(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - \
             max(box1[0]-0.5*box1[2], box2[0]-0.5*box2[2])
    int_lr = min(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - \
             max(box1[1]-0.5*box1[3], box2[1]-0.5*box2[3])
+    intersection = max(0.0, int_tb) * max(0.0, int_lr)
 
+    area1, area2 = box1[2]*box1[3], box2[2]*box2[3]
+    control_area = min(area1, area2) if denom == "min" \
+                   else area1 + area2 - intersection
 
-    return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection)
+    return intersection / control_area
 
 
 def draw_box(img, name, box, score):
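
For reference, the decoding that ``get_boxes_v2`` applies with the new
``anchors`` argument follows the Darknet v2 scheme: the box center is the
logistic-squashed offset inside each grid cell, and the width and height are
the anchor pair scaled by an exponential of the raw prediction. A minimal
standalone sketch of that scheme (``decode_box`` is an illustrative helper,
not a function from this patch)::

    import numpy as np

    def logistic(x):
        """ sigmoid activation, as used by the Darknet v2 decoder """
        return 1.0 / (1.0 + np.exp(-x))

    def decode_box(t_box, row, col, anchor, grid_shape):
        """ decode one raw (tx, ty, tw, th) prediction into an
            (x, y, w, h) box, relative to the image size """
        grid_w, grid_h = grid_shape
        bias_w, bias_h = anchor
        x = (col + logistic(t_box[0])) / grid_w   # cell offset -> image coords
        y = (row + logistic(t_box[1])) / grid_h
        w = bias_w * np.exp(t_box[2]) / grid_w    # anchor scaled exponentially
        h = bias_h * np.exp(t_box[3]) / grid_h
        return x, y, w, h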
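
The factored-out NMS helper can also be exercised on its own. A small usage
sketch, assuming ``yolo_detect`` (and its dependencies) can be imported, with
made-up boxes in ``(x_center, y_center, w, h)`` format::

    import numpy as np
    from yolo_detect import iou, non_maxima_suppression

    # two heavily overlapping detections of class 0, one separate of class 1
    boxes = np.array([[50.0, 50.0, 20.0, 20.0],
                      [51.0, 50.0, 22.0, 20.0],
                      [120.0, 80.0, 30.0, 30.0]])
    probs = np.array([0.9, 0.6, 0.8])
    classes_num = np.array([0, 0, 1])

    probs = non_maxima_suppression(boxes, probs, classes_num, thr=0.4)
    print(probs)   # the weaker class-0 box is zeroed: [ 0.9  0.   0.8]

Note that ``iou`` now defaults to ``denom="min"``: it scores the intersection
against the smaller box's area instead of the union, which suppresses boxes
nested inside larger ones more aggressively. Passing any other value, e.g.
``iou(box1, box2, denom="union")``, recovers the standard IoU score.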