"""SSD 300 utils adapted from MLCommons.
Based on MLCommons Reference Implementation `here`_
.. _here: https://github.com/mlcommons/training/tree/master/single_stage_detector/ssd
"""
import itertools
import random
from math import sqrt
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
__all__ = ["Encoder", "SSDTransformer", "dboxes300_coco"]
# This function is from https://github.com/kuangliu/pytorch-ssd.
def calc_iou_tensor(box1, box2):
    """Compute pairwise IoU between two sets of boxes in ltrb format.

    Takes box1 of shape (N, 4) and box2 of shape (M, 4) and returns an
    (N, M) tensor of IoU values.
    Reference: https://github.com/kuangliu/pytorch-ssd
    """
N = box1.size(0)
M = box2.size(0)
be1 = box1.unsqueeze(1).expand(-1, M, -1)
be2 = box2.unsqueeze(0).expand(N, -1, -1)
# Left Top & Right Bottom
lt = torch.max(be1[:, :, :2], be2[:, :, :2])
rb = torch.min(be1[:, :, 2:], be2[:, :, 2:])
delta = rb - lt
delta[delta < 0] = 0
intersect = delta[:, :, 0] * delta[:, :, 1]
delta1 = be1[:, :, 2:] - be1[:, :, :2]
area1 = delta1[:, :, 0] * delta1[:, :, 1]
delta2 = be2[:, :, 2:] - be2[:, :, :2]
area2 = delta2[:, :, 0] * delta2[:, :, 1]
iou = intersect / (area1 + area2 - intersect)
return iou
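# Example (illustrative sketch, not part of the original module): IoU of two
# overlapping boxes in normalized ltrb coordinates.
# >>> a = torch.tensor([[0.00, 0.00, 0.50, 0.50]])
# >>> b = torch.tensor([[0.25, 0.25, 0.75, 0.75]])
# >>> calc_iou_tensor(a, b)  # intersection 0.0625 / union 0.4375 -> tensor([[0.1429]])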
# This class is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
    """Transform between (bboxes, labels) <-> SSD output.

    Inspired by https://github.com/kuangliu/pytorch-ssd

    dboxes: default boxes in size 8732 x 4,
        encoder: input ltrb format, output xywh format
        decoder: input xywh format, output ltrb format

    encode:
        input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
        output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
        criteria : IoU threshold of bboxes

    decode:
        input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
        output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
        criteria : IoU threshold of bboxes
        max_output : maximum number of output bboxes
    """
def __init__(self, dboxes):
self.dboxes = dboxes(order="ltrb")
self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0)
self.nboxes = self.dboxes.size(0)
self.scale_xy = dboxes.scale_xy
self.scale_wh = dboxes.scale_wh
def encode(self, bboxes_in, labels_in, criteria=0.5):
ious = calc_iou_tensor(bboxes_in, self.dboxes)
best_dbox_ious, best_dbox_idx = ious.max(dim=0)
_, best_bbox_idx = ious.max(dim=1)
        # Set the IoU of each ground truth's best default box to 2.0 so that
        # every ground-truth box keeps a match even below the threshold.
        best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)
        idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
        best_dbox_idx[best_bbox_idx[idx]] = idx
        # keep only matches whose IoU exceeds criteria
        masks = best_dbox_ious > criteria
labels_out = torch.zeros(self.nboxes, dtype=torch.long)
labels_out[masks] = labels_in[best_dbox_idx[masks]]
bboxes_out = self.dboxes.clone()
bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
        # Transform from ltrb to xywh format
x, y, w, h = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2]), \
0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3]), \
-bboxes_out[:, 0] + bboxes_out[:, 2], \
-bboxes_out[:, 1] + bboxes_out[:, 3]
bboxes_out[:, 0] = x
bboxes_out[:, 1] = y
bboxes_out[:, 2] = w
bboxes_out[:, 3] = h
return bboxes_out, labels_out
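    # Example (illustrative sketch, not part of the original module): encode one
    # ground-truth box against the 8732 default boxes; unmatched positions keep
    # the background label 0.
    # >>> encoder = Encoder(dboxes300_coco())
    # >>> gt_boxes = torch.tensor([[0.1, 0.1, 0.4, 0.4]])  # normalized ltrb
    # >>> gt_labels = torch.tensor([5])
    # >>> boxes, labels = encoder.encode(gt_boxes, gt_labels, criteria=0.5)
    # >>> boxes.shape, labels.shape  # (torch.Size([8732, 4]), torch.Size([8732]))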
    def scale_back_batch(self, bboxes_in, scores_in):
        """Scale predictions back and transform from xywh to ltrb.

        Expects bboxes_in of shape (N, 4, num_bbox) and scores_in of shape
        (N, label_num, num_bbox).
        """
if bboxes_in.device == torch.device("cpu"):
self.dboxes = self.dboxes.cpu()
self.dboxes_xywh = self.dboxes_xywh.cpu()
else:
self.dboxes = self.dboxes.cuda()
self.dboxes_xywh = self.dboxes_xywh.cuda()
bboxes_in = bboxes_in.permute(0, 2, 1)
scores_in = scores_in.permute(0, 2, 1)
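        # Undo the variance scaling (scale_xy, scale_wh), then decode offsets
        # relative to the default boxes: centers are shifted and scaled by the
        # default box sizes, widths/heights are exponentiated.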
bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]
bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]
bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
        # Transform from xywh back to ltrb format
l, t, r, b = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2], \
bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3], \
bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2], \
bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]
bboxes_in[:, :, 0] = l
bboxes_in[:, :, 1] = t
bboxes_in[:, :, 2] = r
bboxes_in[:, :, 3] = b
return bboxes_in, F.softmax(scores_in, dim=-1)
def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200, nms_valid_thresh=0.05):
bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
output = []
for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
bbox = bbox.squeeze(0)
prob = prob.squeeze(0)
output.append(self.decode_single(bbox, prob, criteria, max_output, nms_valid_thresh=nms_valid_thresh))
return output
# perform non-maximum suppression
def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200, nms_valid_thresh=0.05):
# Reference to https://github.com/amdegroot/ssd.pytorch
bboxes_out = []
scores_out = []
labels_out = []
for i, score in enumerate(scores_in.split(1, 1)):
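            # skip the background class (label 0)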
if i == 0:
continue
score = score.squeeze(1)
mask = score > nms_valid_thresh
bboxes, score = bboxes_in[mask, :], score[mask]
if score.size(0) == 0:
continue
_, score_idx_sorted = score.sort(dim=0)
# select max_output indices
score_idx_sorted = score_idx_sorted[-max_num:]
candidates = []
while score_idx_sorted.numel() > 0:
idx = score_idx_sorted[-1].item()
bboxes_sorted = bboxes[score_idx_sorted, :]
bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()
# we only need iou < criteria
score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
candidates.append(idx)
bboxes_out.append(bboxes[candidates, :])
scores_out.append(score[candidates])
labels_out.extend([i] * len(candidates))
        bboxes_out = torch.cat(bboxes_out, dim=0)
        labels_out = torch.tensor(labels_out, dtype=torch.long)
        scores_out = torch.cat(scores_out, dim=0)
_, max_ids = scores_out.sort(dim=0)
max_ids = max_ids[-max_output:]
return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
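    # Example (illustrative sketch, not part of the original module): decode raw
    # network outputs for one image. Shapes follow scale_back_batch:
    # (N, 4, 8732) box offsets and (N, num_classes, 8732) class scores.
    # >>> encoder = Encoder(dboxes300_coco())
    # >>> locs = torch.randn(1, 4, 8732)
    # >>> scores = torch.randn(1, 81, 8732)  # 81 = COCO classes incl. background (assumed)
    # >>> boxes, labels, confs = encoder.decode_batch(locs, scores, criteria=0.45, max_output=200)[0]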
class DefaultBoxes(object):
def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \
scale_xy=0.1, scale_wh=0.2):
self.feat_size = feat_size
self.fig_size = fig_size
self.scale_xy_ = scale_xy
self.scale_wh_ = scale_wh
        # Following https://github.com/weiliu89/caffe; the calculation
        # differs slightly from the paper.
self.steps = steps
self.scales = scales
fk = fig_size / np.array(steps)
self.aspect_ratios = aspect_ratios
self.default_boxes = []
        # for each feature map, generate default boxes at every cell
for idx, sfeat in enumerate(self.feat_size):
sk1 = scales[idx] / fig_size
sk2 = scales[idx + 1] / fig_size
sk3 = sqrt(sk1 * sk2)
all_sizes = [(sk1, sk1), (sk3, sk3)]
for alpha in aspect_ratios[idx]:
w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
all_sizes.append((w, h))
all_sizes.append((h, w))
for w, h in all_sizes:
for i, j in itertools.product(range(sfeat), repeat=2):
cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
self.default_boxes.append((cx, cy, w, h))
self.dboxes = torch.tensor(self.default_boxes, dtype=torch.float)
self.dboxes.clamp_(min=0, max=1)
# For IoU calculation
self.dboxes_ltrb = self.dboxes.clone()
self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]
self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]
self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]
@property
def scale_xy(self):
return self.scale_xy_
@property
def scale_wh(self):
return self.scale_wh_
def __call__(self, order="ltrb"):
if order == "ltrb":
return self.dboxes_ltrb
if order == "xywh":
return self.dboxes
# This class is from https://github.com/chauhan-utk/ssd.DomainAdaptation.
class SSDCropping(object):
    """Random cropping for SSD, following the original paper.

    Chooses among three strategies:

    1. Preserve the original image
    2. Random crop with a minimum IoU among 0.1, 0.3, 0.5, 0.7, 0.9
    3. Random crop with no IoU requirement

    Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation
    """
def __init__(self, num_cropping_iterations=1):
self.sample_options = (
# Do nothing
None,
# min IoU, max IoU
(0.1, None),
(0.3, None),
(0.5, None),
(0.7, None),
(0.9, None),
# no IoU requirements
(None, None),
)
        # The implementation uses one iteration to find a candidate; this was
        # shown to produce the same mAP as using more iterations.
        self.num_cropping_iterations = num_cropping_iterations
    def __call__(self, img, img_size, bboxes, labels):
        # keep sampling until a crop (or the unchanged image) is returned
while True:
mode = random.choice(self.sample_options)
if mode is None:
return img, img_size, bboxes, labels
htot, wtot = img_size
min_iou, max_iou = mode
min_iou = float("-inf") if min_iou is None else min_iou
max_iou = float("+inf") if max_iou is None else max_iou
for _ in range(self.num_cropping_iterations):
                # size of each sampled patch is in [0.1, 1]: 0.3 * 0.3 approx. 0.1
w = random.uniform(0.3, 1.0)
h = random.uniform(0.3, 1.0)
if w / h < 0.5 or w / h > 2:
continue
                # left in [0, 1 - w], top in [0, 1 - h] (normalized coordinates)
left = random.uniform(0, 1.0 - w)
top = random.uniform(0, 1.0 - h)
right = left + w
bottom = top + h
ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]]))
                # every ground-truth box must satisfy the IoU constraint
                if not ((ious > min_iou) & (ious < max_iou)).all():
continue
                # discard any bboxes whose center is not in the cropped image
xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2])
yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3])
masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)
                # if no box centers remain, resample
if not masks.any():
continue
bboxes[bboxes[:, 0] < left, 0] = left
bboxes[bboxes[:, 1] < top, 1] = top
bboxes[bboxes[:, 2] > right, 2] = right
bboxes[bboxes[:, 3] > bottom, 3] = bottom
bboxes = bboxes[masks, :]
labels = labels[masks]
left_idx = int(left * wtot)
top_idx = int(top * htot)
right_idx = int(right * wtot)
bottom_idx = int(bottom * htot)
img = img.crop((left_idx, top_idx, right_idx, bottom_idx))
bboxes[:, 0] = (bboxes[:, 0] - left) / w
bboxes[:, 1] = (bboxes[:, 1] - top) / h
bboxes[:, 2] = (bboxes[:, 2] - left) / w
bboxes[:, 3] = (bboxes[:, 3] - top) / h
htot = bottom_idx - top_idx
wtot = right_idx - left_idx
return img, (htot, wtot), bboxes, labels
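# Example (illustrative sketch, not part of the original module): apply the SSD
# crop to a blank PIL image with one normalized ltrb box; all inputs below are
# made-up placeholders.
# >>> cropper = SSDCropping()
# >>> img = Image.new("RGB", (640, 480))
# >>> boxes, labels = torch.tensor([[0.2, 0.2, 0.6, 0.7]]), torch.tensor([1])
# >>> img, (h, w), boxes, labels = cropper(img, (480, 640), boxes, labels)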
class RandomHorizontalFlip(object):
def __init__(self, p=0.5):
self.p = p
def __call__(self, image, bboxes):
if random.random() < self.p:
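            # mirror normalized ltrb boxes: new_left = 1 - right, new_right = 1 - left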
bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0]
return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes
return image, bboxes
def dboxes300_coco():
figsize = 300
feat_size = [38, 19, 10, 5, 3, 1]
steps = [8, 16, 32, 64, 100, 300]
# use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
scales = [21, 45, 99, 153, 207, 261, 315]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
return dboxes
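# Sanity check (illustrative, not part of the original module): the feature-map
# sizes and aspect ratios above produce the canonical 8732 default boxes:
# 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732
# >>> dboxes300_coco()(order="ltrb").shape  # torch.Size([8732, 4])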