diff --git a/detect.py b/detect.py
index 2a3295b2..cfd91665 100755
--- a/detect.py
+++ b/detect.py
@@ -66,7 +66,9 @@ def detect(
 
         # Get detections
         with torch.no_grad():
-            pred = model(torch.from_numpy(img).unsqueeze(0).to(device))
+            img = torch.from_numpy(img).unsqueeze(0).to(device)
+            # pred = torch.onnx._export(model, img, 'weights/model.onnx', verbose=True,); return  # ONNX export
+            pred = model(img)
             pred = pred[pred[:, :, 4] > conf_thres]
 
             if len(pred) > 0:
diff --git a/models.py b/models.py
index 31e2b7fd..ed7f2ded 100755
--- a/models.py
+++ b/models.py
@@ -133,6 +133,8 @@ class YOLOLayer(nn.Module):
         # Get outputs
         x = torch.sigmoid(p[..., 0])  # Center x
         y = torch.sigmoid(p[..., 1])  # Center y
+        p_conf = p[..., 4]  # Conf
+        p_cls = p[..., 5:]  # Class
 
         # Width and height (yolo method)
         w = p[..., 2]  # Width
@@ -146,28 +148,25 @@ class YOLOLayer(nn.Module):
         # width = ((w.data * 2) ** 2) * self.anchor_w
         # height = ((h.data * 2) ** 2) * self.anchor_h
 
-        # Add offset and scale with anchors (in grid space, i.e. 0-13)
-        pred_boxes = FT(bs, self.nA, nG, nG, 4)
-        pred_conf = p[..., 4]  # Conf
-        pred_cls = p[..., 5:]  # Class
-
         # Training
         if targets is not None:
             MSELoss = nn.MSELoss()
             BCEWithLogitsLoss = nn.BCEWithLogitsLoss()
             CrossEntropyLoss = nn.CrossEntropyLoss()
 
+            p_boxes = None
             if batch_report:
+                # Predicted boxes: add offset and scale with anchors (in grid space, i.e. 0-13)
                 gx = self.grid_x[:, :, :nG, :nG]
                 gy = self.grid_y[:, :, :nG, :nG]
-                pred_boxes[..., 0] = x.data + gx - width / 2
-                pred_boxes[..., 1] = y.data + gy - height / 2
-                pred_boxes[..., 2] = x.data + gx + width / 2
-                pred_boxes[..., 3] = y.data + gy + height / 2
+                p_boxes = torch.stack((x.data + gx - width / 2,
+                                       y.data + gy - height / 2,
+                                       x.data + gx + width / 2,
+                                       y.data + gy + height / 2), 4)  # x1y1x2y2
 
             tx, ty, tw, th, mask, tcls, TP, FP, FN, TC = \
-                build_targets(pred_boxes, pred_conf, pred_cls, targets, self.scaled_anchors, self.nA, self.nC, nG,
-                              batch_report)
+                build_targets(p_boxes, p_conf, p_cls, targets, self.scaled_anchors, self.nA, self.nC, nG, batch_report)
+
             tcls = tcls[mask]
             if x.is_cuda:
                 tx, ty, tw, th, mask, tcls = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda(), mask.cuda(), tcls.cuda()
@@ -194,15 +193,15 @@ class YOLOLayer(nn.Module):
                 # import matplotlib.pyplot as plt
                 # plt.hist(self.x)
 
-                # lconf = k * BCEWithLogitsLoss(pred_conf[mask], mask[mask].float())
+                # lconf = k * BCEWithLogitsLoss(p_conf[mask], mask[mask].float())
 
-                lcls = (k / 4) * CrossEntropyLoss(pred_cls[mask], torch.argmax(tcls, 1))
-                # lcls = (k * 10) * BCEWithLogitsLoss(pred_cls[mask], tcls.float())
+                lcls = (k / 4) * CrossEntropyLoss(p_cls[mask], torch.argmax(tcls, 1))
+                # lcls = (k * 10) * BCEWithLogitsLoss(p_cls[mask], tcls.float())
             else:
                 lx, ly, lw, lh, lcls, lconf = FT([0]), FT([0]), FT([0]), FT([0]), FT([0]), FT([0])
 
-            # lconf += k * BCEWithLogitsLoss(pred_conf[~mask], mask[~mask].float())
-            lconf = (k * 64) * BCEWithLogitsLoss(pred_conf, mask.float())
+            # lconf += k * BCEWithLogitsLoss(p_conf[~mask], mask[~mask].float())
+            lconf = (k * 64) * BCEWithLogitsLoss(p_conf, mask.float())
 
             # Sum loss components
             balance_losses_flag = False
@@ -218,24 +217,23 @@ class YOLOLayer(nn.Module):
             # Sum False Positives from unassigned anchors
             FPe = torch.zeros(self.nC)
             if batch_report:
-                i = torch.sigmoid(pred_conf[~mask]) > 0.5
+                i = torch.sigmoid(p_conf[~mask]) > 0.5
                 if i.sum() > 0:
-                    FP_classes = torch.argmax(pred_cls[~mask][i], 1)
+                    FP_classes = torch.argmax(p_cls[~mask][i], 1)
                     FPe = torch.bincount(FP_classes, minlength=self.nC).float().cpu()  # extra FPs
 
             return loss, loss.item(), lx.item(), ly.item(), lw.item(), lh.item(), lconf.item(), lcls.item(), \
                    nT, TP, FP, FPe, FN, TC
 
         else:
-            pred_boxes[..., 0] = x.data + self.grid_x
-            pred_boxes[..., 1] = y.data + self.grid_y
-            pred_boxes[..., 2] = width
-            pred_boxes[..., 3] = height
-
             # If not in training phase return predictions
-            output = torch.cat((pred_boxes.view(bs, -1, 4) * stride,
-                                torch.sigmoid(pred_conf.view(bs, -1, 1)), pred_cls.view(bs, -1, self.nC)), -1)
-            return output.data
+            p_boxes = torch.stack((x + self.grid_x, y + self.grid_y, width, height), 4)  # xywh
+
+            # output.shape = [bs, nA, nG, nG, 5 + nC], e.g. [1, 3, 13, 13, 85]
+            output = torch.cat((p_boxes * stride, torch.sigmoid(p_conf).unsqueeze(4), p_cls), 4)
+
+            # returns shape = [bs, nA * nG * nG, 5 + nC], e.g. [1, 507, 85]
+            return output.data.view(bs, -1, 5 + self.nC)
 
 
 class Darknet(nn.Module):