diff --git a/models.py b/models.py
index bc9c7d0e..87b02cc5 100755
--- a/models.py
+++ b/models.py
@@ -64,7 +64,7 @@ def create_modules(module_defs):
             anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
             anchors = [anchors[i] for i in anchor_idxs]
             nc = int(module_def['classes'])  # number of classes
-            img_size = int(hyperparams['height'])
+            img_size = hyperparams['height']
             # Define detection layer
             yolo_layer = YOLOLayer(anchors, nc, img_size, yolo_layer_count, cfg=hyperparams['cfg'])
             modules.add_module('yolo_%d' % i, yolo_layer)
@@ -103,38 +103,37 @@ class YOLOLayer(nn.Module):
     def __init__(self, anchors, nc, img_size, yolo_layer, cfg):
         super(YOLOLayer, self).__init__()
 
-        self.anchors = torch.FloatTensor(anchors)
+        self.anchors = torch.Tensor(anchors)  # tensor of torch's default tensor type (previously an explicit FloatTensor)
         self.na = len(anchors)  # number of anchors (3)
         self.nc = nc  # number of classes (80)
         self.img_size = 0
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        create_grids(self, 32, 1, device=device)
 
         if ONNX_EXPORT:  # grids must be computed in __init__
             stride = [32, 16, 8][yolo_layer]  # stride of this layer
             if cfg.endswith('yolov3-tiny.cfg'):
                 stride *= 2
 
-            ng = int(img_size / stride)  # number grid points
-            create_grids(self, img_size, ng)
+            ng = (int(img_size[0] / stride), int(img_size[1] / stride))  # grid points along img_size[0] and img_size[1]; img_size must be indexable here
+            create_grids(self, max(img_size), ng)  # the larger side is stored as self.img_size by create_grids; ng is passed as a pair
 
     def forward(self, p, img_size, var=None):
         if ONNX_EXPORT:
-            bs, ng = 1, self.ng  # batch size, grid size
+            bs = 1  # batch size
         else:
-            bs, ng = p.shape[0], p.shape[-1]
+            bs, nx, ny = p.shape[0], p.shape[-2], p.shape[-1]
             if self.img_size != img_size:
-                create_grids(self, img_size, ng, p.device)
+                create_grids(self, img_size, (nx, ny), p.device)
 
         # p.view(bs, 255, 13, 13) -- > (bs, 3, 13, 13, 85)  # (bs, anchors, grid, grid, classes + xywh)
-        p = p.view(bs, self.na, self.nc + 5, ng, ng).permute(0, 1, 3, 4, 2).contiguous()  # prediction
+        p = p.view(bs, self.na, self.nc + 5, self.nx, self.ny).permute(0, 1, 3, 4, 2).contiguous()  # prediction
 
         if self.training:
             return p
 
         elif ONNX_EXPORT:
+            ngu = self.ng.view((1, 1, 2))
             grid_xy = self.grid_xy.repeat((1, self.na, 1, 1, 1)).view((1, -1, 2))
-            anchor_wh = self.anchor_wh.repeat((1, 1, ng, ng, 1)).view((1, -1, 2)) / ng
+            anchor_wh = self.anchor_wh.repeat((1, 1, self.nx, self.ny, 1)).view((1, -1, 2)) / self.nx
 
             # p = p.view(-1, 5 + self.nc)
             # xy = torch.sigmoid(p[..., 0:2]) + grid_xy[0]  # x, y
@@ -153,7 +152,7 @@ class YOLOLayer(nn.Module):
             p_cls = torch.exp(p_cls).permute((2, 1, 0))
             p_cls = p_cls / p_cls.sum(0).unsqueeze(0) * p_conf.permute((2, 1, 0))  # F.softmax() equivalent
             p_cls = p_cls.permute(2, 1, 0)
-            return torch.cat((xy / ng, wh, p_conf, p_cls), 2).squeeze().t()
+            return torch.cat((xy / self.nx, wh, p_conf, p_cls), 2).squeeze().t()
 
         else:  # inference
             io = p.clone()  # inference output
@@ -234,9 +233,9 @@ def get_yolo_layers(model):
 
 
 def create_grids(self, img_size, ng, device='cpu'):
-    nx, ny = ng, ng  # x and y grid size
+    nx, ny = ng  # x and y grid size
     self.img_size = img_size
-    self.stride = img_size / nx
+    self.stride = img_size / max(ng)
 
     # build xy offsets
     yv, xv = torch.meshgrid([torch.arange(nx), torch.arange(ny)])
@@ -245,7 +244,7 @@ def create_grids(self, img_size, ng, device='cpu'):
     # build wh gains
     self.anchor_vec = self.anchors.to(device) / self.stride
     self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2).to(device)
-    self.ng = torch.Tensor([ng]).to(device)
+    self.ng = torch.Tensor(ng).to(device)
     self.nx = nx
     self.ny = ny
 
diff --git a/test.py b/test.py
index f218e7e9..e78d6be1 100644
--- a/test.py
+++ b/test.py
@@ -93,7 +93,7 @@ def test(
                 # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
                 image_id = int(Path(paths[si]).stem.split('_')[-1])
                 box = pred[:, :4].clone()  # xyxy
-                scale_coords(imgs[si].shape, box, shapes[si])  # to original shape
+                scale_coords(img_size, box, shapes[si])  # to original shape
                 box = xyxy2xywh(box)  # xywh
                 box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
                 for di, d in enumerate(pred):
diff --git a/utils/utils.py b/utils/utils.py
index bd348b9a..e25f1983 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -102,13 +102,13 @@ def xywh2xyxy(x):
 
 def scale_coords(img1_shape, coords, img0_shape):
     # Rescale coords1 (xyxy) from img1_shape to img0_shape
-    gain = max(img1_shape[1:3]) / max(img0_shape[:2])  # gain  = old / new
-    pad_x = (img1_shape[2] - img0_shape[1] * gain) / 2  # width padding
-    pad_y = (img1_shape[1] - img0_shape[0] * gain) / 2  # height padding
+    gain = img1_shape / max(img0_shape[:2])  # gain = old / new; img1_shape is used as a scalar here and is no longer indexed
+    pad_x = np.mod(img1_shape - img0_shape[1] * gain, 32) / 2  # width padding: half of the remainder modulo 32
+    pad_y = np.mod(img1_shape - img0_shape[0] * gain, 32) / 2  # height padding: half of the remainder modulo 32
     coords[:, [0, 2]] -= pad_x
     coords[:, [1, 3]] -= pad_y
     coords[:, :4] /= gain
-    coords[:, :4] = torch.clamp(coords[:, :4], min=0)
+    coords[:, :4] = coords[:, :4].clamp(min=0)  # floor at 0; coords is modified in place and also returned
     return coords