下面的3个feature_map是仿照v5的head随机产生的输出。为了方便后面代码讲解,这里我设置的num_classes为1 。
# Three random tensors standing in for the outputs of the YOLOv5 detection head,
# one per feature level (80x80, 40x40 and 20x20 grids).
# Layout of each tensor: [batch_size, 3, grid_h, grid_w, 5 + num_classes].
# The 3 is presumably the number of anchors per level, and the 5 the box
# xywh plus one objectness value -- TODO confirm against the Detect() head.
feature_map1 = torch.rand([batch_size, 3, 80, 80, 5 + num_classes])
feature_map2 = torch.rand([batch_size, 3, 40, 40, 5 + num_classes])
feature_map3 = torch.rand([batch_size, 3, 20, 20, 5 + num_classes])
# Predictions are listed from the finest grid to the coarsest one.
pred = [feature_map1, feature_map2, feature_map3]
我这里的target也是我随便举例的,可以看到它的shape为[3,6],也就是[num_obj, batch_idx+class+xywh]。这里的num_obj表示当前图像中出现了几个目标;batch_idx是该目标所属图像在batch中的索引,比如我这里batch_size是2,但这3个目标都属于第一张图像(batch_idx=0);class表示当前目标属于哪个类别【注意和num_classes区分】;后面的xywh就是box信息。
# Example ground-truth labels, shape [3, 6]: one row per object.
# Columns: [batch index, class, x, y, w, h]. The box values lie in the 0~1 range
# and are scaled to grid units later by multiplying with the per-layer gain.
targets = torch.tensor([[0.00000, 0.00000, 0.04204, 0.21125, 0.08408, 0.36503],
                        [0.00000, 0.00000, 0.14960, 0.24400, 0.23867, 0.36503],
                        [0.00000, 0.00000, 0.36253, 0.24517, 0.21995, 0.39545]])
yolo系列的损失函数通常由三个部分组成:cls_loss(分类损失)、obj_loss(置信度损失)、loc_loss(box定位损失)。
class ComputeLoss: sort_obj_iou = False # Compute losses def __init__(self, model, autobalance=False): device = next(model.parameters()).device # get model device h = model.hyp # 获取超参数 # 损失函数定义,cls:二分类交叉熵, obj:二分类交叉熵 BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['cls_pw']], device=device)) BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([h['obj_pw']], device=device)) # Class label smoothing https://arxiv.org/pdf/1902.04103.pdf eqn 3 self.cp, self.cn = smooth_BCE(eps=h.get('label_smoothing', 0.0)) # positive, negative BCE targets # Focal loss g = h['fl_gamma'] # focal loss gamma if g > 0: BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g) m = de_parallel(model).model[-1] # Detect() module # balance用于判断是否输出为3层,如果是返回[4.0, 1.0, 0.4], 否则返回[4, 1, 0.25, 0.06, 0.02] 这些value值是给小中大目标Head给的权重 self.balance = {3: [4.0, 1.0, 0.4]}.get(m.nl, [4.0, 1.0, 0.25, 0.06, 0.02]) # P3-P7 self.ssi = list(m.stride).index(16) if autobalance else 0 # stride 16 index 返回步长为16的索引 self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance self.na = m.na # number of anchors self.nc = m.nc # number of classes self.nl = m.nl # number of layers self.anchors = m.anchors self.device = device
def build_targets(self, p, targets): # Build targets for compute_loss(), input targets(image,class,x,y,w,h) na, nt = self.na, targets.shape[0] # number of anchors, targets. anchor数量, target数量 tcls, tbox, indices, anch = [], [], [], [] gain = torch.ones(7, device=self.device) # normalized to gridspace gain
ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt)
ai = tensor([[0., 0., 0.],
[1., 1., 1.],
[2., 2., 2.]])
行数等于anchor数量(na),列数等于target数量(nt),即shape为[na, nt];第a行的值全部为anchor索引a
targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None]), 2) # append anchor indices
tensor([[[0.00000, 0.00000, 0.04204, 0.21125, 0.08408, 0.36503, 0.00000],
[0.00000, 0.00000, 0.14960, 0.24400, 0.23867, 0.36503, 0.00000],
[0.00000, 0.00000, 0.36253, 0.24517, 0.21995, 0.39545, 0.00000]],
[ [0.00000, 0.00000, 0.04204, 0.21125, 0.08408, 0.36503, 1.00000],
[0.00000, 0.00000, 0.14960, 0.24400, 0.23867, 0.36503, 1.00000],
[0.00000, 0.00000, 0.36253, 0.24517, 0.21995, 0.39545, 1.00000]],
[ [0.00000, 0.00000, 0.04204, 0.21125, 0.08408, 0.36503, 2.00000],
[0.00000, 0.00000, 0.14960, 0.24400, 0.23867, 0.36503, 2.00000],
[0.00000, 0.00000, 0.36253, 0.24517, 0.21995, 0.39545, 2.00000]]])
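targets:[num_obj, 6],repeat(na, 1, 1)(这里na=3)表示在最前面新增的维度上复制3份、其余两个维度保持不变,则repeat后 targets shape变为[3,num_obj,6]
ai[..., None] = [[[0.], [0.], [0.]], [[1.], [1.], [1.]], [[2.], [2.], [2.]]]
ai[...,None] shape [3, num_obj, 1]
cat后targets shape [3, num_obj,6+1]=[3,num_obj,7],也就是targets[...,:6]保存的是target信息,targets[...,6:]保存的是对应的anchor索引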
for i in range(self.nl): anchors, shape = self.anchors[i], p[i].shape # torch.tensor(shape)=[batch,3,80,80,85] gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]] # xyxy gain 获取特征层的w和h[80,80,80,80]
大家可以回看target中的box信息,可以看到这些最初的box信息值范围是0~1(归一化坐标),但此时特征层head的w和h是80*80,两者尺度不匹配,所以通过targets * gain将这些box缩放到特征层上,得到它们在特征层(网格)尺度下的坐标和宽高。
# Match targets to anchors 先验框和target框的匹配 t = targets * gain # shape(3,n,7) 将targets中的box缩放到特征层上
tensor([[[ 0.00000, 0.00000, 3.36320, 16.90000, 6.72640, 29.20240, 0.00000],
[ 0.00000, 0.00000, 11.96800, 19.52000, 19.09360, 29.20240, 0.00000],
[ 0.00000, 0.00000, 29.00240, 19.61360, 17.59600, 31.63600, 0.00000]],
[[ 0.00000, 0.00000, 3.36320, 16.90000, 6.72640, 29.20240, 1.00000],
[ 0.00000, 0.00000, 11.96800, 19.52000, 19.09360, 29.20240, 1.00000],
[ 0.00000, 0.00000, 29.00240, 19.61360, 17.59600, 31.63600, 1.00000]],
[[ 0.00000, 0.00000, 3.36320, 16.90000, 6.72640, 29.20240, 2.00000],
[ 0.00000, 0.00000, 11.96800, 19.52000, 19.09360, 29.20240, 2.00000],
[ 0.00000, 0.00000, 29.00240, 19.61360, 17.59600, 31.63600, 2.00000]]])
r = t[..., 4:6] / anchors[:, None] # wh ratio j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t'] # compare
在第一个特征层(80×80)上得到的匹配掩码j(shape为[na, nt])全为False,说明该层没有anchor与target匹配:
tensor([[False, False, False],
[False, False, False],
[False, False, False]])
在第二个特征层(40×40)上,t = targets * gain 的结果为:
tensor([[[ 0.00000, 0.00000, 1.68160, 8.45000, 3.36320, 14.60120, 0.00000],
[ 0.00000, 0.00000, 5.98400, 9.76000, 9.54680, 14.60120, 0.00000],
[ 0.00000, 0.00000, 14.50120, 9.80680, 8.79800, 15.81800, 0.00000]],
[ [ 0.00000, 0.00000, 1.68160, 8.45000, 3.36320, 14.60120, 1.00000],
[ 0.00000, 0.00000, 5.98400, 9.76000, 9.54680, 14.60120, 1.00000],
[ 0.00000, 0.00000, 14.50120, 9.80680, 8.79800, 15.81800, 1.00000]],
[ [ 0.00000, 0.00000, 1.68160, 8.45000, 3.36320, 14.60120, 2.00000],
[ 0.00000, 0.00000, 5.98400, 9.76000, 9.54680, 14.60120, 2.00000],
[ 0.00000, 0.00000, 14.50120, 9.80680, 8.79800, 15.81800, 2.00000]]])
该层对应的匹配掩码j(行对应anchor,列对应target):
tensor([[ True, False, False],
[False, False, False],
[ True, True, True]])
用 t = t[j] 过滤后,只保留匹配成功的4个(target, anchor)组合:
tensor([[ 0.00000, 0.00000, 1.68160, 8.45000, 3.36320, 14.60120, 0.00000],
[ 0.00000, 0.00000, 1.68160, 8.45000, 3.36320, 14.60120, 2.00000],
[ 0.00000, 0.00000, 5.98400, 9.76000, 9.54680, 14.60120, 2.00000],
[ 0.00000, 0.00000, 14.50120, 9.80680, 8.79800, 15.81800, 2.00000]])
gxy = t[:, 2:4] # grid xy
gxi = gain[[2, 3]] - gxy
j, k = ((gxy % 1 < g) & (gxy > 1)).T l, m = ((gxi % 1 < g) & (gxi > 1)).T j = torch.stack((torch.ones_like(j), j, k, l, m))
tensor([[ True, True, True, True],
[False, False, False, False],
[ True, True, False, False],
[ True, True, True, True],
[False, False, True, True]])
t = t.repeat((5, 1, 1))[j] offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
def build_targets(self, p, targets):
    """Assign ground-truth targets to anchors and grid cells for every detection layer.

    Args:
        p: Sequence of per-layer prediction tensors; only ``p[i].shape`` is read
            (e.g. [batch, 3, 80, 80, 85]).
        targets: Tensor with one row per object, columns
            (image, class, x, y, w, h). Box values are presumably normalized
            to 0~1, since they are multiplied by the grid size below.

    Returns:
        Tuple ``(tcls, tbox, indices, anch)`` of lists with one entry per layer:
        class ids, boxes (xy offset inside the assigned cell plus grid-space
        wh), ``(image, anchor, grid_y, grid_x)`` index tuples, and the matched
        anchors.
    """
    # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
    na, nt = self.na, targets.shape[0]  # number of anchors, number of targets
    tcls, tbox, indices, anch = [], [], [], []
    gain = torch.ones(7, device=self.device)  # normalized to gridspace gain
    # ai has shape [na, nt]; row a holds the anchor index a repeated nt times,
    # e.g. for na = 3:
    #   tensor([[0., 0., 0., ...],
    #           [1., 1., 1., ...],
    #           [2., 2., 2., ...]])
    ai = torch.arange(na, device=self.device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt)
    # targets is [num_obj, 6]; repeat(na, 1, 1) copies it na times along a new
    # leading dimension, giving [na, num_obj, 6].
    # ai[..., None] has shape [na, num_obj, 1]:
    #   [[[0.], [0.], [0.], ...],
    #    [[1.], [1.], [1.], ...],
    #    [[2.], [2.], [2.], ...]]
    # After the cat, targets is [na, num_obj, 7]: targets[..., :6] keeps the
    # original target fields and targets[..., 6:] holds the anchor index.
    targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None]), 2)  # append anchor indices

    g = 0.5  # bias
    off = torch.tensor(
        [
            [0, 0],
            [1, 0],
            [0, 1],
            [-1, 0],
            [0, -1],  # j,k,l,m
            # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
        ],
        device=self.device).float() * g  # offsets

    for i in range(self.nl):
        anchors, shape = self.anchors[i], p[i].shape  # e.g. torch.tensor(shape) = [batch, 3, 80, 80, 85]
        # Grid width/height of this layer, laid out as [w, h, w, h] (e.g.
        # [80, 80, 80, 80]) so that it scales the x, y, w, h columns.
        gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]]  # xyxy gain

        # Match targets to anchors
        t = targets * gain  # shape(3,n,7); boxes are now in grid units of this layer
        if nt:
            # Matches
            # YOLOv5 matches by width/height ratio instead of IoU.
            # r is target wh divided by anchor wh and 1 / r is the inverse.
            # The larger of the two, maximized over w and h, is compared with
            # hyp['anchor_t'] (the original note gives 4 as the default);
            # anchors below the threshold count as matched.
            r = t[..., 4:6] / anchors[:, None]  # wh ratio
            j = torch.max(r, 1 / r).max(2)[0] < self.hyp['anchor_t']  # compare
            # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n)=wh_iou(anchors(3,2), gwh(n,2))
            t = t[j]  # filter

            # Offsets
            gxy = t[:, 2:4]  # grid xy
            gxi = gain[[2, 3]] - gxy  # inverse (distance from the far grid edge)
            # j / k: the center's fractional x / y is below g and the center is
            # past the first cell; l / m: the same test on the inverse coordinates.
            j, k = ((gxy % 1 < g) & (gxy > 1)).T
            l, m = ((gxi % 1 < g) & (gxi > 1)).T
            # Row 0 (all ones) keeps every target in its own cell; rows 1-4
            # select the targets that additionally get a neighbouring cell.
            j = torch.stack((torch.ones_like(j), j, k, l, m))
            t = t.repeat((5, 1, 1))[j]
            offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
        else:
            t = targets[0]
            offsets = 0

        # Define
        bc, gxy, gwh, a = t.chunk(4, 1)  # (image, class), grid xy, grid wh, anchors
        a, (b, c) = a.long().view(-1), bc.long().T  # anchors, image, class
        gij = (gxy - offsets).long()
        gi, gj = gij.T  # grid indices

        # Append
        # NOTE: clamp_ is in-place and gi / gj appear to be views of gij, so gij
        # is already clamped when it is used for the box offsets on the next
        # line; keep this statement order.
        indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1)))  # image, anchor, grid
        tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
        anch.append(anchors[a])  # anchors
        tcls.append(c)  # class

    return tcls, tbox, indices, anch
获得 b:图像索引;a:anchor索引;gj、gi:网格cell的纵坐标与横坐标
b, a, gj, gi = indices[i]
tobj用来在后面存储gt中的目标信息,shape为[batch_size, 3, 80, 80](与pi的前4个维度一致)
tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=self.device)
pxy, pwh, _, pcls = pi[b, a, gj, gi].split((2, 2, 1, self.nc), 1) # target-subset of predictions
# Regression pxy = pxy.sigmoid() * 2 - 0.5 pwh = (pwh.sigmoid() * 2) ** 2 * anchors[i] pbox = torch.cat((pxy, pwh), 1) # predicted box iou = bbox_iou(pbox, tbox[i], CIoU=True).squeeze() # iou(prediction, target) lbox += (1.0 - iou).mean() # iou loss
if self.nc > 1: # cls loss (only if multiple classes) t = torch.full_like(pcls, self.cn, device=self.device) # targets t[range(n), tcls[i]] = self.cp lcls += self.BCEcls(pcls, t) # BCE
obji = self.BCEobj(pi[..., 4], tobj)
lobj += obji * self.balance[i]
lbox *= self.hyp['box'] lobj *= self.hyp['obj'] lcls *= self.hyp['cls'] bs = tobj.shape[0] # batch size return (lbox + lobj + lcls) * bs, torch.cat((lbox, lobj, lcls)).detach()
loss: tensor([10.73431]) loss_item: tensor([0.06105, 5.30610, 0.00000])