# Source code for composer.models.ssd.utils

"""SSD 300 utils adapted from MLCommons.

Based on the MLCommons reference implementation. (The ``here`` hyperlink target
was lost when this listing was extracted.)
"""

import itertools
import random
from math import sqrt

import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image

# Names exported by ``from <this module> import *``.
__all__ = ["Encoder", "SSDTransformer", "dboxes300_coco"]

# The function below is adapted from an external source (URL lost in extraction).
def calc_iou_tensor(box1, box2):
    """Compute the pairwise IoU between two sets of boxes.

    Each box is read as ``[..., :2]`` = left-top corner and ``[..., 2:]`` =
    right-bottom corner, i.e. ltrb order.

    Args:
        box1: Tensor of shape ``(N, 4)``.
        box2: Tensor of shape ``(M, 4)``.

    Returns:
        Tensor of shape ``(N, M)`` where entry ``[i, j]`` is the IoU of
        ``box1[i]`` and ``box2[j]``. A pair whose union area is zero produces
        NaN (0 / 0); no special handling is applied.
    """
    N = box1.size(0)
    M = box2.size(0)

    # Broadcast both sets to (N, M, 4) so every pair is compared at once.
    be1 = box1.unsqueeze(1).expand(-1, M, -1)
    be2 = box2.unsqueeze(0).expand(N, -1, -1)

    # Left-top and right-bottom corners of the intersection rectangle.
    lt = torch.max(be1[:, :, :2], be2[:, :, :2])
    rb = torch.min(be1[:, :, 2:], be2[:, :, 2:])

    # Negative extents mean the boxes do not overlap along that axis.
    delta = (rb - lt).clamp(min=0)
    intersect = delta[:, :, 0] * delta[:, :, 1]

    delta1 = be1[:, :, 2:] - be1[:, :, :2]
    area1 = delta1[:, :, 0] * delta1[:, :, 1]
    delta2 = be2[:, :, 2:] - be2[:, :, :2]
    area2 = delta2[:, :, 0] * delta2[:, :, 1]

    iou = intersect / (area1 + area2 - intersect)
    return iou
# The class below is adapted from an external source (URL lost in extraction).
class Encoder(object):
    """Transform between (bboxes, labels) and SSD output.

    The original docstring credited an external source ("Inspired by ..."),
    but the URL is missing from this listing.

    dboxes: default boxes in size 8732 x 4,
        encoder: input ltrb format, output xywh format
        decoder: input xywh format, output ltrb format

    encode:
        input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
        output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
        criteria : IoU threshold of bboxes

    decode:
        input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
        output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
        criteria : IoU threshold of bboxes
        max_output : maximum number of output bboxes

    Args:
        dboxes: Callable returning the default boxes for ``order="ltrb"`` and
            ``order="xywh"``, and exposing ``scale_xy`` / ``scale_wh``.
    """

    def __init__(self, dboxes):
        self.dboxes = dboxes(order="ltrb")
        # Leading batch axis so it broadcasts against batched predictions.
        self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0)
        self.nboxes = self.dboxes.size(0)
        self.scale_xy = dboxes.scale_xy
        self.scale_wh = dboxes.scale_wh

    def encode(self, bboxes_in, labels_in, criteria=0.5):
        """Match ground-truth boxes to default boxes and build training targets.

        Args:
            bboxes_in: Ground-truth boxes in ltrb order, one row per box.
            labels_in: Label of each ground-truth box.
            criteria: A default box is assigned to a ground-truth box when
                their IoU exceeds this threshold.

        Returns:
            ``(bboxes_out, labels_out)``: one row per default box in xywh
            order, and the matching labels. Unmatched default boxes keep their
            own coordinates and label 0.
        """
        labels_out = torch.zeros(self.nboxes, dtype=torch.long)
        bboxes_out = self.dboxes.clone()

        # With no ground-truth boxes there is nothing to match; taking ``max``
        # over an empty dimension would raise, so leave all targets unmatched.
        if bboxes_in.numel() > 0:
            ious = calc_iou_tensor(bboxes_in, self.dboxes)
            best_dbox_ious, best_dbox_idx = ious.max(dim=0)
            _, best_bbox_idx = ious.max(dim=1)

            # Force the best default box of every ground-truth box to match by
            # giving it an IoU (2.0) above any possible threshold.
            best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)

            idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
            best_dbox_idx[best_bbox_idx[idx]] = idx

            # Keep only the default boxes whose IoU exceeds the threshold.
            masks = best_dbox_ious > criteria
            labels_out[masks] = labels_in[best_dbox_idx[masks]]
            bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]

        # Transform ltrb to xywh. All four values are computed before any
        # column is overwritten.
        x = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2])
        y = 0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3])
        w = -bboxes_out[:, 0] + bboxes_out[:, 2]
        h = -bboxes_out[:, 1] + bboxes_out[:, 3]
        bboxes_out[:, 0] = x
        bboxes_out[:, 1] = y
        bboxes_out[:, 2] = w
        bboxes_out[:, 3] = h
        return bboxes_out, labels_out

    def scale_back_batch(self, bboxes_in, scores_in):
        """Scale predictions and transform them from xywh to ltrb.

        Input is expected as ``N x 4 x num_bbox`` boxes and
        ``N x label_num x num_bbox`` scores.

        Note:
            ``permute`` returns a view, so the slice assignments below write
            into the caller's ``bboxes_in`` storage.

        Returns:
            ``(bboxes, probs)``: boxes as ``N x num_bbox x 4`` in ltrb order
            and the softmax of the scores over the last axis.
        """
        # Follow the input's device. ``.cuda()`` would always pick the default
        # CUDA device, which is wrong when the input lives on another device.
        device = bboxes_in.device
        self.dboxes = self.dboxes.to(device)
        self.dboxes_xywh = self.dboxes_xywh.to(device)

        bboxes_in = bboxes_in.permute(0, 2, 1)
        scores_in = scores_in.permute(0, 2, 1)

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]

        # Offsets are relative to the default boxes; sizes are log-scaled.
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # Transform xywh to ltrb. All four values are computed before any
        # column is overwritten.
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l
        bboxes_in[:, :, 1] = t
        bboxes_in[:, :, 2] = r
        bboxes_in[:, :, 3] = b

        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200, nms_valid_thresh=0.05):
        """Decode a batch of predictions, one ``decode_single`` call per sample.

        Returns:
            A list with one ``(bboxes, labels, scores)`` tuple per sample.
        """
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        output = []
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            output.append(self.decode_single(bbox, prob, criteria, max_output, nms_valid_thresh=nms_valid_thresh))
        return output

    def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200, nms_valid_thresh=0.05):
        """Perform per-class non-maximum suppression on one sample.

        Args:
            bboxes_in: Boxes in ltrb order, one row per box.
            scores_in: Per-class scores, one row per box and one column per
                class. Column 0 is skipped (label 0 is what ``encode`` assigns
                to unmatched default boxes).
            criteria: Boxes whose IoU with an already kept box is not below
                this value are suppressed.
            max_output: Maximum number of detections returned overall.
            max_num: Maximum number of top-scoring candidates considered per
                class before suppression.
            nms_valid_thresh: Scores at or below this value are discarded.

        Returns:
            ``(bboxes, labels, scores)`` sorted by ascending score. All three
            are empty when no score clears ``nms_valid_thresh``.
        """
        bboxes_out = []
        scores_out = []
        labels_out = []

        for i, score in enumerate(scores_in.split(1, 1)):
            if i == 0:
                continue

            score = score.squeeze(1)
            mask = score > nms_valid_thresh

            bboxes, score = bboxes_in[mask, :], score[mask]
            if score.size(0) == 0:
                continue

            _, score_idx_sorted = score.sort(dim=0)

            # Keep only the ``max_num`` highest-scoring candidates.
            score_idx_sorted = score_idx_sorted[-max_num:]
            candidates = []

            while score_idx_sorted.numel() > 0:
                # The last index is the highest-scoring remaining box.
                idx = score_idx_sorted[-1].item()
                bboxes_sorted = bboxes[score_idx_sorted, :]
                bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
                # squeeze(1) keeps the mask 1-D even when one box remains.
                iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze(1)
                # Keep boxes that overlap the selected one by less than
                # ``criteria``; this also drops the selected box itself.
                score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
                candidates.append(idx)

            bboxes_out.append(bboxes[candidates, :])
            scores_out.append(score[candidates])
            labels_out.extend([i] * len(candidates))

        # ``torch.cat`` rejects an empty list, so return empty results
        # explicitly when nothing survived the score threshold.
        if not bboxes_out:
            return (
                bboxes_in.new_zeros((0, bboxes_in.size(1))),
                torch.zeros(0, dtype=torch.long),
                scores_in.new_zeros((0,)),
            )

        bboxes_out = torch.cat(bboxes_out, dim=0)
        labels_out = torch.tensor(labels_out, dtype=torch.long)
        scores_out = torch.cat(scores_out, dim=0)

        _, max_ids = scores_out.sort(dim=0)
        max_ids = max_ids[-max_output:]
        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
class DefaultBoxes(object):
    """Generate the SSD default (anchor) boxes for a set of feature maps.

    Args:
        fig_size: Input image size; divided by ``steps`` and used to normalize
            ``scales``.
        feat_size: Side length of each feature map.
        steps: One step value per feature map; ``fig_size / steps`` gives the
            number of grid cells used to place the box centers.
        scales: Box scales in the same units as ``fig_size``. Entry ``idx + 1``
            is read for feature map ``idx``, so it needs one more entry than
            ``feat_size``.
        aspect_ratios: For each feature map, the extra aspect ratios to
            generate; each ratio yields two boxes (w x h and h x w).
        scale_xy: Stored and exposed through the ``scale_xy`` property.
        scale_wh: Stored and exposed through the ``scale_wh`` property.
    """

    def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios,
                 scale_xy=0.1, scale_wh=0.2):
        self.feat_size = feat_size
        self.fig_size = fig_size

        self.scale_xy_ = scale_xy
        self.scale_wh_ = scale_wh

        # Calculation method slightly different from paper
        # (the reference URL is missing from this listing).
        self.steps = steps
        self.scales = scales

        fk = fig_size / np.array(steps)
        self.aspect_ratios = aspect_ratios

        self.default_boxes = []
        # size of feature and number of feature
        for idx, sfeat in enumerate(self.feat_size):
            sk1 = scales[idx] / fig_size
            sk2 = scales[idx + 1] / fig_size
            # Geometric mean of this scale and the next one.
            sk3 = sqrt(sk1 * sk2)
            all_sizes = [(sk1, sk1), (sk3, sk3)]

            for alpha in aspect_ratios[idx]:
                w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
                all_sizes.append((w, h))
                all_sizes.append((h, w))

            for w, h in all_sizes:
                for i, j in itertools.product(range(sfeat), repeat=2):
                    cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
                    self.default_boxes.append((cx, cy, w, h))

        self.dboxes = torch.tensor(self.default_boxes, dtype=torch.float)
        self.dboxes.clamp_(min=0, max=1)

        # For IoU calculation
        self.dboxes_ltrb = self.dboxes.clone()
        self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]
        self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]
        self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]
        self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]

    @property
    def scale_xy(self):
        return self.scale_xy_

    @property
    def scale_wh(self):
        return self.scale_wh_

    def __call__(self, order="ltrb"):
        """Return the default boxes in the requested coordinate order.

        Args:
            order: ``"ltrb"`` (corner format) or ``"xywh"`` (center format).

        Raises:
            ValueError: If ``order`` is neither ``"ltrb"`` nor ``"xywh"``.
        """
        if order == "ltrb":
            return self.dboxes_ltrb
        if order == "xywh":
            return self.dboxes
        # Previously fell through and returned None, which only failed later
        # at the first use of the result.
        raise ValueError(f"order must be 'ltrb' or 'xywh', got {order!r}")


# This function is from
class SSDCropping(object):
    """Cropping for SSD, according to original paper.

    Choose between following 3 conditions:
    1. Preserve the original image
    2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9
    3. Random crop

    (The reference URL is missing from this listing.)

    Args:
        num_cropping_iterations: Number of random crops tried for a sampled
            mode before a new mode is drawn.
    """

    def __init__(self, num_cropping_iterations=1):
        self.sample_options = (
            # Do nothing
            None,
            # min IoU, max IoU
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            # no IoU requirements
            (None, None),
        )
        # Implementation uses 1 iteration to find a possible candidate, this
        # was shown to produce the same mAP as using more iterations.
        self.num_cropping_iterations = num_cropping_iterations

    def __call__(self, img, img_size, bboxes, labels):
        """Randomly crop ``img`` and adjust ``bboxes`` / ``labels`` to the crop.

        Args:
            img: Image exposing ``crop((left, top, right, bottom))``.
            img_size: ``(height, width)`` of ``img``.
            bboxes: Boxes in ltrb order. They are compared against crop
                fractions in [0, 1], so they are assumed to be normalized to
                the image size (confirm against the caller).
            labels: Label of each box.

        Returns:
            ``(img, (height, width), bboxes, labels)``. When the "do nothing"
            mode is drawn the inputs are returned unchanged. The caller's
            ``bboxes`` tensor is never modified.
        """
        # Ensure always return cropped image
        while True:
            mode = random.choice(self.sample_options)

            if mode is None:
                return img, img_size, bboxes, labels

            htot, wtot = img_size

            min_iou, max_iou = mode
            min_iou = float("-inf") if min_iou is None else min_iou
            max_iou = float("+inf") if max_iou is None else max_iou

            for _ in range(self.num_cropping_iterations):
                # size of each sampled patch in [0.1, 1]; 0.3 * 0.3 approx. 0.1
                w = random.uniform(0.3, 1.0)
                h = random.uniform(0.3, 1.0)

                # Reject crops whose aspect ratio is outside [0.5, 2].
                if w / h < 0.5 or w / h > 2:
                    continue

                # left 0 ~ wtot - w, top 0 ~ htot - h
                left = random.uniform(0, 1.0 - w)
                top = random.uniform(0, 1.0 - h)

                right = left + w
                bottom = top + h

                ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]]))

                # Every box must satisfy the IoU bounds of the sampled mode.
                if not ((ious > min_iou) & (ious < max_iou)).all():
                    continue

                # discard any bboxes whose center not in the cropped image
                xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2])
                yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3])

                masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)

                # if no such boxes, continue searching again
                if not masks.any():
                    continue

                # Filter first: boolean indexing copies, so the clamping below
                # works on the copy and leaves the caller's tensor intact.
                bboxes = bboxes[masks, :]
                labels = labels[masks]

                # Clip the surviving boxes to the crop window.
                bboxes[bboxes[:, 0] < left, 0] = left
                bboxes[bboxes[:, 1] < top, 1] = top
                bboxes[bboxes[:, 2] > right, 2] = right
                bboxes[bboxes[:, 3] > bottom, 3] = bottom

                left_idx = int(left * wtot)
                top_idx = int(top * htot)
                right_idx = int(right * wtot)
                bottom_idx = int(bottom * htot)
                img = img.crop((left_idx, top_idx, right_idx, bottom_idx))

                # Re-express the boxes relative to the crop window.
                bboxes[:, 0] = (bboxes[:, 0] - left) / w
                bboxes[:, 1] = (bboxes[:, 1] - top) / h
                bboxes[:, 2] = (bboxes[:, 2] - left) / w
                bboxes[:, 3] = (bboxes[:, 3] - top) / h

                htot = bottom_idx - top_idx
                wtot = right_idx - left_idx
                return img, (htot, wtot), bboxes, labels
class RandomHorizontalFlip(object):
    """Flip an image and its boxes left-to-right with probability ``p``.

    Args:
        p: Probability of applying the flip.
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, image, bboxes):
        """Return ``(image, bboxes)``, flipped when the random draw is below ``p``.

        Boxes are mirrored with ``1.0 - x``, so x coordinates are assumed to be
        normalized to [0, 1] (confirm against the caller). When flipped,
        ``bboxes`` is updated in place and the same tensor is returned.
        """
        if random.random() < self.p:
            # The right-hand side is evaluated into new tensors before either
            # column is written, so the swap is safe.
            bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0]
            return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes
        return image, bboxes


# Do data augmentation
class SSDTransformer(object):
    """SSD Data Augmentation, according to original paper.

    Composed by several steps:
        Cropping
        Resize
        Flipping
        Jittering

    Args:
        dboxes: Default boxes object handed to ``Encoder``.
        size: Target size passed to ``transforms.Resize``.
        val: If ``True``, apply only the validation transforms (resize,
            tensor conversion, normalization) and pad the targets.
        num_cropping_iterations: Forwarded to ``SSDCropping``.
    """

    def __init__(self, dboxes, size=(300, 300), val=False, num_cropping_iterations=1):
        # define vgg16 mean
        self.size = size
        self.val = val

        self.dboxes_ = dboxes  # DefaultBoxes300()
        self.encoder = Encoder(self.dboxes_)

        self.crop = SSDCropping(num_cropping_iterations=num_cropping_iterations)
        self.img_trans = transforms.Compose([
            transforms.Resize(self.size),
            transforms.ColorJitter(brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05),
            transforms.ToTensor()
        ])
        self.hflip = RandomHorizontalFlip()

        # All PyTorch tensors will be normalized
        # (the reference URL is missing from this listing).
        normalization_mean = [0.485, 0.456, 0.406]
        normalization_std = [0.229, 0.224, 0.225]
        self.normalize = transforms.Normalize(mean=normalization_mean, std=normalization_std)

        self.trans_val = transforms.Compose([
            transforms.Resize(self.size),
            transforms.ToTensor(),
            self.normalize,
        ])

    @property
    def dboxes(self):
        return self.dboxes_

    def __call__(self, img, img_size, bbox=None, label=None, max_num=200):
        """Transform one sample.

        Args:
            img: Input image.
            img_size: Size of ``img``; passed to the cropping step.
            bbox: Boxes of the sample. Despite the ``None`` default, both
                branches dereference it.
            label: Labels of the sample; same remark as ``bbox``.
            max_num: In validation mode, number of rows the box and label
                tensors are zero-padded to.

        Returns:
            ``(img, img_size, bbox, label)``. In validation mode the boxes and
            labels are the inputs zero-padded to ``max_num`` rows; otherwise
            they are the encoded targets from ``Encoder.encode``.
        """
        if self.val:
            # Pad to a fixed number of rows; unused rows stay zero.
            bbox_out = torch.zeros(max_num, 4)
            label_out = torch.zeros(max_num, dtype=torch.long)
            bbox_out[:bbox.size(0), :] = bbox  # type: ignore
            label_out[:label.size(0)] = label  # type: ignore
            return self.trans_val(img), img_size, bbox_out, label_out

        img, img_size, bbox, label = self.crop(img, img_size, bbox, label)
        img, bbox = self.hflip(img, bbox)

        img = self.img_trans(img).contiguous()
        img = self.normalize(img)

        bbox, label = self.encoder.encode(bbox, label)

        return img, img_size, bbox, label
def dboxes300_coco():
    """Build the ``DefaultBoxes`` configuration for 300x300 input.

    Returns:
        A ``DefaultBoxes`` instance built from six feature maps
        (38, 19, 10, 5, 3, 1) with the steps, scales and aspect ratios below.
    """
    figsize = 300
    feat_size = [38, 19, 10, 5, 3, 1]
    steps = [8, 16, 32, 64, 100, 300]
    # The original comment cited a source for these scales; the URL is missing
    # from this listing.
    scales = [21, 45, 99, 153, 207, 261, 315]
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
    return dboxes