Paper-info
FaceBoxes: A CPU Real-time Face Detector with High Accuracy [2017-IJCB]
GitHub: faceboxes-pytorch
Backbone
Prior boxes
CoreCode
class MultiBoxLoss(nn.Module):
    """SSD-style multibox loss used by FaceBoxes.

    Computes two terms per batch:
      * ``loss_l`` -- Smooth-L1 localization loss over *positive* priors
        (priors matched to a ground-truth face).
      * ``loss_c`` -- cross-entropy confidence loss over positives plus
        hard-mined negatives (ratio ``args.np_ratio`` negatives per positive).

    Both losses are summed and divided by N, the number of positive priors:
    L(x, c, l, g) = (Lconf(x, c) + alpha * Lloc(x, l, g)) / N.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        # NOTE(review): the original snippet elides the remainder of
        # __init__ ("......" is not valid Python). self.priors -- the
        # prior/anchor boxes, presumably shape (n_priors, 4) -- must be
        # populated before forward() is called; confirm against the
        # full faceboxes-pytorch source.

    def forward(self, predictions, targets):
        """Compute (localization_loss, confidence_loss) for one batch.

        Args:
            predictions: tuple ``(loc_data, conf_data)`` where
                ``loc_data`` is (n_images, n_priors, 4) box regressions and
                ``conf_data`` is (n_images, n_priors, num_classes) logits.
            targets: per-image tensors of shape (n_gt, 5); the first four
                columns are ground-truth boxes, the last column the label.

        Returns:
            ``(loss_l, loss_c)``: normalized localization and confidence
            losses (each divided by the number of positive priors).
        """
        loc_data, conf_data = predictions
        num_images = loc_data.size(0)
        num_priors = self.priors.size(0)

        # Match every prior to a ground-truth box for each image, producing
        # encoded regression targets (loc_t) and class targets (conf_t).
        loc_t = torch.Tensor(num_images, num_priors, 4)
        conf_t = torch.LongTensor(num_images, num_priors)
        priors = self.priors.data
        for idx in range(num_images):
            gt_boxes = targets[idx][:, :-1].data
            labels = targets[idx][:, -1].data
            loc, conf = boxlib.match(gt_boxes, priors, labels,
                                     self.args.overlap_th, self.args.variance)
            loc_t[idx], conf_t[idx] = loc, conf
        if self.args.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()

        # --- Localization loss: Smooth-L1 over positive priors only ---
        pos = conf_t > 0
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)  # (n_images, n_priors, 4)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')  # (cx, cy, w, h)

        # --- Hard negative mining ---
        # Per-prior confidence loss: -log(softmax(x)_y) = log_sum_exp(x) - x_y.
        batch_conf = conf_data.view(-1, self.args.num_classes)
        loss_c = boxlib.log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
        loss_c[pos.view(-1, 1)] = 0  # exclude positives from the mining ranking
        loss_c = loss_c.view(num_images, -1)
        # Double-sort trick: idx_rank[i, j] is the rank of prior j's loss
        # within image i (0 = hardest). Keep the top num_neg negatives.
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.args.np_ratio * num_pos, max=pos.size(1) - 1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # --- Confidence loss over positives + mined negatives ---
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        # FIX: the original used bool-tensor addition, (a + b).gt(0), which
        # raises in modern PyTorch; logical OR is the equivalent mask union.
        scores = conf_data[pos_idx | neg_idx].view(-1, self.args.num_classes)
        np_gty = conf_t[pos | neg]
        loss_c = F.cross_entropy(scores, np_gty, reduction='sum')

        # Normalize both terms by N = number of positives (at least 1 so an
        # all-negative batch does not divide by zero).
        N = max(num_pos.data.sum().float(), 1)
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
