Merge remote-tracking branch 'origin/master'

2019-07-29 00:45:37 +02:00 · 2019-07-29 00:45:37 +02:00 · 3a7711856e
parent 9cf7f2215a 9bf31f8100
commit 3a7711856e
10 changed files with 792 additions and 199 deletions
--- a/README.md
+++ b/README.md
@ -5,7 +5,7 @@
    </td>
    <td align="center">
    <a href="https://www.ultralytics.com" target="_blank">
-    <img src="https://storage.googleapis.com/ultralytics/logo/logoname1000.png" width="200"></a>
+    <img src="https://storage.googleapis.com/ultralytics/logo/logoname1000.png" width="160"></a>
      <img src="https://user-images.githubusercontent.com/26833433/61591093-2b4d4480-abc2-11e9-8b46-d88eb1dabba1.jpg">
          <a href="https://itunes.apple.com/app/id1452689527" target="_blank">
    <img src="https://user-images.githubusercontent.com/26833433/50044365-9b22ac00-0082-11e9-862f-e77aee7aa7b0.png" width="180"></a>
@ -43,8 +43,7 @@ Python 3.7 or later with the following `pip3 install -U -r requirements.txt` pac
 # Jupyter Notebook
-A jupyter notebook with training, inference and testing examples is available at: 
+Our Jupyter [notebook](https://colab.research.google.com/github/ultralytics/yolov3/blob/master/examples.ipynb) provides quick training, inference and testing examples.
 https://colab.research.google.com/drive/1G8T-VFxQkjDe4idzN8F-hbIBqkkkQnxw
 # Training
@ -87,10 +86,11 @@ https://cloud.google.com/deep-learning-vm/
 GPUs | `batch_size` | batch time | epoch time | epoch cost
 --- |---| --- | --- | --- 
 1 K80 | 64 (32x2) | 2.9s  | 175min  | $0.58
-1 T4 | 64 (32x2) | 0.8s  | 49min  | $0.29
+1 T4 | 64 (32x2) | 0.80s  | 49min  | $0.29
 2 T4 | 64 (64x1) | 0.52s  | 32min  | $0.36
 1 2080ti | 64 (32x2) | -  | -  | -
 1 V100 | 64 (32x2) | 0.38s | 23min | $0.31
-2 V100 | 64 (64x1) | 0.38s | 23min | $0.62
+2 V100 | 64 (64x1) | 0.30s | 18min | $0.46
 # Inference
--- a/examples.ipynb
+++ b/examples.ipynb
--- a/models.py
+++ b/models.py
@ -1,7 +1,3 @@
 import os
 import torch.nn.functional as F
 from utils.parse_config import *
 from utils.utils import *
@ -145,6 +141,7 @@ class YOLOLayer(nn.Module):
            return torch.cat((xy / ngu, wh, p_conf, p_cls), 2).squeeze().t()
        else:  # inference
            # s = 1.5  # scale_xy  (pxy = pxy * s - (s - 1) / 2)
            io = p.clone()  # inference output
            io[..., 0:2] = torch.sigmoid(io[..., 0:2]) + self.grid_xy  # xy
            io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh  # wh yolo method
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 # pip3 install -U -r requirements.txt
 # conda install numpy opencv matplotlib tqdm pillow
 # conda install pytorch torchvision -c pytorch
-# conda install -c conda-forge scikit-image
+# conda install scikit-image -c conda-forge
 numpy
 opencv-python
 torch >= 1.1.0
--- a/train.py
+++ b/train.py
@ -4,19 +4,29 @@ import time
 import torch.distributed as dist
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
 from torch.utils.data import DataLoader
 import test  # import test.py to get mAP after each epoch
 from models import *
 from utils.adabound import *
 from utils.datasets import *
 from utils.utils import *
 mixed_precision = True
 try:  # Mixed precision training https://github.com/NVIDIA/apex
    from apex import amp
 except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
 # 320 --epochs 1
-#      0.109      0.297       0.15      0.126       7.04      1.666      4.062     0.1845       42.6       3.34      12.61      8.338     0.2705      0.001         -4        0.9     0.0005 a  320 giou + best_anchor False
+#      0.109      0.297       0.150       0.126       7.04      1.666      4.062     0.1845       42.6       3.34      12.61      8.338     0.2705      0.001         -4        0.9     0.0005 a  320 giou + best_anchor False
-#      0.223      0.218      0.138      0.189       9.28      1.153      4.376    0.08263      24.28       3.05      20.93      2.842     0.2759   0.001357     -5.036     0.9158  0.0005722 b  mAP/F1 - 50/50 weighting
+#      0.223      0.218       0.138       0.189       9.28      1.153      4.376    0.08263      24.28       3.05      20.93      2.842     0.2759   0.001357     -5.036     0.9158  0.0005722 b  mAP/F1 - 50/50 weighting
-#      0.231      0.215      0.135      0.191       9.51      1.432      3.007    0.06082      24.87      3.477      24.13      2.802     0.3436   0.001127     -5.036     0.9232  0.0005874 c
+#      0.231      0.215       0.135       0.191       9.51      1.432      3.007    0.06082      24.87      3.477      24.13      2.802     0.3436   0.001127     -5.036     0.9232  0.0005874 c
-#      0.246      0.194      0.128      0.192       8.12      1.101      3.954     0.0817      22.83      3.967      19.83      1.779     0.3352   0.000895     -5.036     0.9238  0.0007973 d
+#      0.246      0.194       0.128       0.192       8.12      1.101      3.954     0.0817      22.83      3.967      19.83      1.779     0.3352   0.000895     -5.036     0.9238  0.0007973 d
-#      0.187      0.237      0.144      0.186       14.6      1.607      4.202    0.09439      39.27      3.726      31.26      2.634      0.273   0.001542     -5.036     0.8364  0.0008393 e
+#      0.187      0.237       0.144       0.186       14.6      1.607      4.202    0.09439      39.27      3.726      31.26      2.634      0.273   0.001542     -5.036     0.8364  0.0008393 e
 #      0.250      0.217       0.136       0.195         3.3         1.2           2       0.604        15.7        3.67          20        1.36       0.194     0.00128          -4        0.95    0.000201         0.8       0.388         1.2       0.119      0.0589       0.401 f
 #      0.269      0.225       0.149       0.218        6.71        1.13        5.25       0.246        22.4        3.64        17.8        1.31       0.256     0.00146          -4       0.936     0.00042       0.123        0.18        1.81      0.0987      0.0788       0.441 g
 #      0.179      0.274       0.165       0.187        7.95        1.22        7.62       0.224          17        5.71        17.7        3.28       0.295     0.00136          -4       0.875    0.000319       0.131       0.208        2.14        0.14      0.0773       0.228 h
 #      0.296      0.228       0.152       0.220        5.18        1.43        4.27       0.265        11.7        4.81        11.5        1.56       0.281      0.0013          -4       0.944    0.000427      0.0599       0.142        1.03      0.0552      0.0555       0.434 i
 # 320 --epochs 2
 # 0.242	0.296	0.196	0.231	5.67	0.8541	4.286	0.1539	21.61	1.957	22.9	2.894	0.3689	0.001844	-4	0.913	0.000467  # ha 0.417 mAP @ epoch 100
@ -25,40 +35,45 @@ from utils.utils import *
 # 0.161	0.327	0.190	0.193	7.82	1.153	4.062	0.1845	24.28	3.05	20.93	2.842	0.2759	0.001357	-4	0.916	0.000572  # hd 0.438 mAP @ epoch 100
-# Training hyperparameters d
+# Training hyperparameters g
-hyp = {'giou': 1.153,  # giou loss gain
+# hyp = {'giou': 1.13,  # giou loss gain
-       'xy': 4.062,  # xy loss gain
+#        'xy': 5.25,  # xy loss gain
-       'wh': 0.1845,  # wh loss gain
+#        'wh': 0.246,  # wh loss gain
-       'cls': 24.28,  # cls loss gain
+#        'cls': 22.4,  # cls loss gain
-       'cls_pw': 3.05,  # cls BCELoss positive_weight
+#        'cls_pw': 3.64,  # cls BCELoss positive_weight
-       'obj': 20.93,  # obj loss gain
+#        'obj': 17.8,  # obj loss gain
-       'obj_pw': 2.842,  # obj BCELoss positive_weight
+#        'obj_pw': 1.31,  # obj BCELoss positive_weight
-       'iou_t': 0.2759,  # iou training threshold
+#        'iou_t': 0.256,  # iou training threshold
-       'lr0': 0.001357,  # initial learning rate
+#        'lr0': 0.00146,  # initial learning rate
       'lrf': -4.,  # final LambdaLR learning rate = lr0 * (10 ** lrf)
       'momentum': 0.916,  # SGD momentum
       'weight_decay': 0.0000572,  # optimizer weight decay
       'hsv_s': 0.5,  # image HSV-Saturation augmentation (fraction)
       'hsv_v': 0.5,  # image HSV-Value augmentation (fraction)
       'degrees': 5,  # image rotation (+/- deg)
       'translate': 0.1,  # image translation (+/- fraction)
       'scale': 0.1,  # image scale (+/- gain)
       'shear': 2}  # image shear (+/- deg)
 # # Training hyperparameters e
 # hyp = {'giou': 1.607,  # giou loss gain
 #        'xy': 4.062,  # xy loss gain
 #        'wh': 0.1845,  # wh loss gain
 #        'cls': 39.27,  # cls loss gain
 #        'cls_pw': 3.726,  # cls BCELoss positive_weight
 #        'obj': 31.26,  # obj loss gain
 #        'obj_pw': 2.634,  # obj BCELoss positive_weight
 #        'iou_t': 0.273,  # iou target-anchor training threshold
 #        'lr0': 0.001542,  # initial learning rate
 #        'lrf': -4.,  # final LambdaLR learning rate = lr0 * (10 ** lrf)
-#        'momentum': 0.8364,  # SGD momentum
+#        'momentum': 0.936,  # SGD momentum
-#        'weight_decay': 0.0008393}  # optimizer weight decay
+#        'weight_decay': 0.00042,  # optimizer weight decay
 #        'hsv_s': 0.123,  # image HSV-Saturation augmentation (fraction)
 #        'hsv_v': 0.18,  # image HSV-Value augmentation (fraction)
 #        'degrees': 1.81,  # image rotation (+/- deg)
 #        'translate': 0.0987,  # image translation (+/- fraction)
 #        'scale': 0.0788,  # image scale (+/- gain)
 #        'shear': 0.441}  # image shear (+/- deg)
 # Training hyperparameters i
 hyp = {'giou': 1.43,  # giou loss gain
       'xy': 4.27,  # xy loss gain
       'wh': 0.265,  # wh loss gain
       'cls': 11.7,  # cls loss gain
       'cls_pw': 4.81,  # cls BCELoss positive_weight
       'obj': 11.5,  # obj loss gain
       'obj_pw': 1.56,  # obj BCELoss positive_weight
       'iou_t': 0.281,  # iou training threshold
       'lr0': 0.0013,  # initial learning rate
       'lrf': -4.,  # final LambdaLR learning rate = lr0 * (10 ** lrf)
       'momentum': 0.944,  # SGD momentum
       'weight_decay': 0.000427,  # optimizer weight decay
       'hsv_s': 0.0599,  # image HSV-Saturation augmentation (fraction)
       'hsv_v': 0.142,  # image HSV-Value augmentation (fraction)
       'degrees': 1.03,  # image rotation (+/- deg)
       'translate': 0.0552,  # image translation (+/- fraction)
       'scale': 0.0555,  # image scale (+/- gain)
       'shear': 0.434}  # image shear (+/- deg)
 def train(cfg,
@ -66,13 +81,13 @@ def train(cfg,
          img_size=416,
          epochs=100,  # 500200 batches at bs 16, 117263 images = 273 epochs
          batch_size=16,
-          accumulate=4):  # effective bs = batch_size * accumulate = 8 * 8 = 64
+          accumulate=4):  # effective bs = batch_size * accumulate = 16 * 4 = 64
    # Initialize
    init_seeds()
    weights = 'weights' + os.sep
    last = weights + 'last.pt'
    best = weights + 'best.pt'
-    device = torch_utils.select_device()
+    device = torch_utils.select_device(apex=mixed_precision)
    multi_scale = opt.multi_scale
    if multi_scale:
@ -89,11 +104,13 @@ def train(cfg,
    model = Darknet(cfg).to(device)
    # Optimizer
-    optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'])
+    optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay'],
                          nesterov=True)
    # optimizer = AdaBound(model.parameters(), lr=hyp['lr0'], final_lr=0.1)
    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
-    best_fitness = 0.0
+    best_fitness = 0.
    if opt.resume or opt.transfer:  # Load previously saved model
        if opt.transfer:  # Transfer learning
            nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255)
@ -136,7 +153,7 @@ def train(cfg,
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
-    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in (0.8, 0.9)], gamma=0.1)
+    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in [0.8]], gamma=0.1)
    scheduler.last_epoch = start_epoch - 1
    # # Plot lr schedule
@ -150,6 +167,18 @@ def train(cfg,
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)
    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
    # Dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
@ -158,32 +187,13 @@ def train(cfg,
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=opt.rect)  # rectangular training
    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend='nccl',  # 'distributed backend'
                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
                                world_size=1,  # number of nodes for distributed training
                                rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    # Dataloader
-    dataloader = DataLoader(dataset,
+    dataloader = torch.utils.data.DataLoader(dataset,
-                            batch_size=batch_size,
+                                             batch_size=batch_size,
-                            num_workers=opt.num_workers,
+                                             num_workers=opt.num_workers,
-                            shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
+                                             shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
-                            pin_memory=True,
+                                             pin_memory=True,
-                            collate_fn=dataset.collate_fn)
+                                             collate_fn=dataset.collate_fn)
    # Mixed precision training https://github.com/NVIDIA/apex
    mixed_precision = True
    if mixed_precision:
        try:
            from apex import amp
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
        except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
            mixed_precision = False
    # Start training
    model.hyp = hyp  # attach hyperparameters to model
@ -192,7 +202,7 @@ def train(cfg,
    nb = len(dataloader)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
-    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
+    # n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    t0 = time.time()
    for epoch in range(start_epoch, epochs):
        model.train()
@ -234,11 +244,11 @@ def train(cfg,
                plot_images(imgs=imgs, targets=targets, paths=paths, fname='train_batch%g.jpg' % i)
            # SGD burn-in
-            if epoch == 0 and i <= n_burnin:
+            # if epoch == 0 and i <= n_burnin:
-                g = (i / n_burnin) ** 4  # gain
+            #     g = (i / n_burnin) ** 4  # gain
-                for x in optimizer.param_groups:
+            #     for x in optimizer.param_groups:
-                    x['lr'] = hyp['lr0'] * g
+            #         x['lr'] = hyp['lr0'] * g
-                    x['weight_decay'] = hyp['weight_decay'] * g
+            #         x['weight_decay'] = hyp['weight_decay'] * g
            # Run model
            pred = model(imgs)
@ -313,33 +323,11 @@ def train(cfg,
    # Report time
    print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
-    del model, optimizer
+    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
 def print_mutation(hyp, results):
    """Print one evolved hyperparameter set with its results and log it to evolve.txt.
    Arguments:
        hyp (dict): hyperparameter names mapped to their numeric values
        results (tuple): results (P, R, mAP, F1, test_loss); must be a tuple because it is
            used directly as the right-hand side of a %-format
    The line appended to evolve.txt is the formatted results followed by the formatted
    hyperparameter values. When the module-level ``opt.bucket`` is set, evolve.txt is first
    downloaded from that bucket with gsutil and, after appending, de-duplicated, sorted by
    descending fitness and uploaded back.
    """
    n_hyp = len(hyp)
    header = ('%11s' * n_hyp) % tuple(hyp.keys())  # hyperparam keys
    values = ('%11.3g' * n_hyp) % tuple(hyp.values())  # hyperparam values
    scores = ('%11.3g' * len(results)) % results  # results (P, R, mAP, F1, test_loss)
    print('\n%s\n%s\nEvolved fitness: %s\n' % (header, values, scores))
    bucket = opt.bucket
    if bucket:
        os.system('gsutil cp gs://%s/evolve.txt .' % bucket)  # download evolve.txt
    with open('evolve.txt', 'a') as f:  # append result
        f.write(scores + values + '\n')
    if not bucket:
        return  # local-only run: nothing to sort or upload
    rows = np.loadtxt('evolve.txt', ndmin=2)
    rows = np.unique(rows, axis=0)  # keep unique rows only
    order = np.argsort(-fitness(rows))  # best fitness first
    np.savetxt('evolve.txt', rows[order], '%11.3g')
    os.system('gsutil cp evolve.txt gs://%s' % bucket)  # upload evolve.txt
 def fitness(x):  # returns fitness of hyp evolution vectors
    """Return the fitness of each row of ``x``.
    Arguments:
        x: 2-D array indexed as ``x[:, j]``; column 2 is read as mAP and column 3 as F1
    Returns:
        One value per row: an equal (50/50) weighting of mAP and F1.
    """
    map_col, f1_col = x[:, 2], x[:, 3]
    return 0.5 * map_col + 0.5 * f1_col  # fitness = weighted combination of mAP and F1
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=100, help='number of epochs')
@ -362,38 +350,38 @@ if __name__ == '__main__':
    opt = parser.parse_args()
    print(opt)
-    if opt.evolve:
+    if not opt.evolve:  # Train normally
        results = train(opt.cfg,
                        opt.data,
                        img_size=opt.img_size,
                        epochs=opt.epochs,
                        batch_size=opt.batch_size,
                        accumulate=opt.accumulate)
    else:  # Evolve hyperparameters (optional)
        opt.notest = True  # only test final epoch
        opt.nosave = True  # only save final checkpoint
        if opt.bucket:
            os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket)  # download evolve.txt if exists
-    # Train
+        for _ in range(1):  # generations to evolve
-    results = train(opt.cfg,
+            if os.path.exists('evolve.txt'):  # if evolve.txt exists: select best hyps and mutate
-                    opt.data,
+                # Get best hyperparameters
-                    img_size=opt.img_size,
+                x = np.loadtxt('evolve.txt', ndmin=2)
-                    epochs=opt.epochs,
+                x = x[fitness(x).argmax()]  # select best fitness hyps
-                    batch_size=opt.batch_size,
+                for i, k in enumerate(hyp.keys()):
-                    accumulate=opt.accumulate)
+                    hyp[k] = x[i + 5]
-    # Evolve hyperparameters (optional)
+                # Mutate
-    if opt.evolve:
+                init_seeds(seed=int(time.time()))
-        print_mutation(hyp, results)  # Write mutation results
+                s = [.15, .15, .15, .15, .15, .15, .15, .15, .15, .00, .05, .20, .20, .20, .20, .20, .20, .20]  # sigmas
-        for _ in range(1000):  # generations to evolve
+                for i, k in enumerate(hyp.keys()):
-            # Get best hyperparameters
+                    x = (np.random.randn(1) * s[i] + 1) ** 2.0  # plt.hist(x.ravel(), 300)
-            x = np.loadtxt('evolve.txt', ndmin=2)
+                    hyp[k] *= float(x)  # vary by sigmas
            x = x[fitness(x).argmax()]  # select best fitness hyps
            for i, k in enumerate(hyp.keys()):
                hyp[k] = x[i + 5]
            # Mutate
            init_seeds(seed=int(time.time()))
            s = [.15, .15, .15, .15, .15, .15, .15, .15, .15, .00, .05, .20, .20, .20, .20, .20, .20, .20]  # sigmas
            for i, k in enumerate(hyp.keys()):
                x = (np.random.randn(1) * s[i] + 1) ** 2.0  # plt.hist(x.ravel(), 300)
                hyp[k] *= float(x)  # vary by sigmas
            # Clip to limits
            keys = ['lr0', 'iou_t', 'momentum', 'weight_decay', 'hsv_s', 'hsv_v', 'translate', 'scale']
-            limits = [(1e-4, 1e-2), (0.00, 0.70), (0.60, 0.95), (0, 0.001), (0, .8), (0, .8), (0, .8), (0, .8)]
+            limits = [(1e-4, 1e-2), (0.00, 0.70), (0.60, 0.97), (0, 0.001), (0, .9), (0, .9), (0, .9), (0, .9)]
            for k, v in zip(keys, limits):
                hyp[k] = np.clip(hyp[k], v[0], v[1])
@ -406,19 +394,7 @@ if __name__ == '__main__':
                            accumulate=opt.accumulate)
            # Write mutation results
-            print_mutation(hyp, results)
+            print_mutation(hyp, results, opt.bucket)
-            # # Plot results
+            # Plot results
-            # import numpy as np
+            # plot_evolution_results(hyp)
            # import matplotlib.pyplot as plt
            # a = np.loadtxt('evolve.txt')
            # x = fitness(a)
            # weights = (x - x.min()) ** 2
            # fig = plt.figure(figsize=(10, 10))
            # for i in range(len(hyp)):
            #     y = a[:, i + 5]
            #     mu = (y * weights).sum() / weights.sum()
            #     plt.subplot(4, 5, i + 1)
            #     plt.plot(x.max(), mu, 'o')
            #     plt.plot(x, y, '.')
            #     print(list(hyp.keys())[i], '%.4g' % mu)
--- a/utils/adabound.py
+++ b/utils/adabound.py
@ -0,0 +1,236 @@
 import math
 import torch
 from torch.optim import Optimizer
 class AdaBound(Optimizer):
    """Implements the AdaBound algorithm.
    It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
    Each update is an Adam-style step whose per-element step size is clamped to a band
    around ``final_lr``; the band is wide for small step counts and tightens towards
    ``final_lr`` as the step count grows.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): Adam learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        final_lr (float, optional): final (SGD) learning rate (default: 0.1)
        gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty, added to the gradient)
            (default: 0)
        amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
            (default: False)
    Raises:
        ValueError: if lr, eps, final_lr is negative, or a beta / gamma is outside [0, 1).
    .. _Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
        https://openreview.net/forum?id=Bkg3g2R9FX
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
                 eps=1e-8, weight_decay=0, amsbound=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= final_lr:
            raise ValueError("Invalid final learning rate: {}".format(final_lr))
        if not 0.0 <= gamma < 1.0:
            raise ValueError("Invalid gamma parameter: {}".format(gamma))
        defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
                        weight_decay=weight_decay, amsbound=amsbound)
        super(AdaBound, self).__init__(params, defaults)
        # Initial lr of every group; step() uses it to rescale final_lr when group['lr'] changes
        self.base_lrs = [group['lr'] for group in self.param_groups]
    def __setstate__(self, state):
        """Restore optimizer state, defaulting 'amsbound' to False for groups that lack it."""
        super(AdaBound, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsbound', False)
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or None when no closure is given.
        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group, base_lr in zip(self.param_groups, self.base_lrs):
            beta1, beta2 = group['betas']
            amsbound = group['amsbound']
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('AdaBound does not support sparse gradients')
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsbound:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    # L2 penalty: out-of-place add so p.grad itself is left untouched
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsbound:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # Applies bounds on actual learning rate
                # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
                final_lr = group['final_lr'] * group['lr'] / base_lr
                lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
                upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
                p.data.add_(-step_size)
        return loss
 class AdaBoundW(Optimizer):
    """Implements the AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
    It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
    Each update is an Adam-style step whose per-element step size is clamped to a band
    around ``final_lr``; the band is wide for small step counts and tightens towards
    ``final_lr`` as the step count grows. Weight decay is not added to the gradient:
    ``weight_decay * p`` (computed from the pre-update weights) is subtracted from the
    parameter directly.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): Adam learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        final_lr (float, optional): final (SGD) learning rate (default: 0.1)
        gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay factor (default: 0)
        amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
            (default: False)
    Raises:
        ValueError: if lr, eps, final_lr is negative, or a beta / gamma is outside [0, 1).
    .. _Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
        https://openreview.net/forum?id=Bkg3g2R9FX
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
                 eps=1e-8, weight_decay=0, amsbound=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= final_lr:
            raise ValueError("Invalid final learning rate: {}".format(final_lr))
        if not 0.0 <= gamma < 1.0:
            raise ValueError("Invalid gamma parameter: {}".format(gamma))
        defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
                        weight_decay=weight_decay, amsbound=amsbound)
        super(AdaBoundW, self).__init__(params, defaults)
        # Initial lr of every group; step() uses it to rescale final_lr when group['lr'] changes
        self.base_lrs = [group['lr'] for group in self.param_groups]
    def __setstate__(self, state):
        """Restore optimizer state, defaulting 'amsbound' to False for groups that lack it."""
        super(AdaBoundW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsbound', False)
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or None when no closure is given.
        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group, base_lr in zip(self.param_groups, self.base_lrs):
            beta1, beta2 = group['betas']
            amsbound = group['amsbound']
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('AdaBoundW does not support sparse gradients')
                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsbound:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1
                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsbound:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                # Applies bounds on actual learning rate
                # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
                final_lr = group['final_lr'] * group['lr'] / base_lr
                lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
                upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
                # Decoupled decay term is taken from the weights as they were before this update
                decayed_weights = None
                if group['weight_decay'] != 0:
                    decayed_weights = torch.mul(p.data, group['weight_decay'])
                p.data.add_(-step_size)
                if decayed_weights is not None:
                    p.data.sub_(decayed_weights)
        return loss
--- a/utils/datasets.py
+++ b/utils/datasets.py
@ -8,9 +8,9 @@ from pathlib import Path
 import cv2
 import numpy as np
 import torch
 from PIL import Image, ExifTags
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from PIL import Image, ExifTags
 from utils.utils import xyxy2xywh, xywh2xyxy
@ -40,8 +40,6 @@ def exif_size(img):
 class LoadImages:  # for inference
    def __init__(self, path, img_size=416):
        self.height = img_size
        files = []
        if os.path.isdir(path):
            files = sorted(glob.glob('%s/*.*' % path))
@ -52,6 +50,7 @@ class LoadImages:  # for inference
        videos = [x for x in files if os.path.splitext(x)[-1].lower() in vid_formats]
        nI, nV = len(images), len(videos)
        self.img_size = img_size
        self.files = images + videos
        self.nF = nI + nV  # number of files
        self.video_flag = [False] * nI + [True] * nV
@ -96,7 +95,7 @@ class LoadImages:  # for inference
            print('image %g/%g %s: ' % (self.count, self.nF, path), end='')
        # Padded resize
-        img, *_ = letterbox(img0, new_shape=self.height)
+        img, *_ = letterbox(img0, new_shape=self.img_size)
        # Normalize RGB
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB
@ -117,8 +116,10 @@ class LoadImages:  # for inference
 class LoadWebcam:  # for inference
    def __init__(self, img_size=416):
-        self.cam = cv2.VideoCapture(0)
+        self.img_size = img_size
-        self.height = img_size
+        self.cam = cv2.VideoCapture(0)  # local camera
        # self.cam = cv2.VideoCapture('rtsp://192.168.1.64/1')  # IP camera
        # self.cam = cv2.VideoCapture('rtsp://username:password@192.168.1.64/1')  # IP camera with login
    def __iter__(self):
        self.count = -1
@ -138,7 +139,7 @@ class LoadWebcam:  # for inference
        print('webcam %g: ' % self.count, end='')
        # Padded resize
-        img, *_ = letterbox(img0, new_shape=self.height)
+        img, *_ = letterbox(img0, new_shape=self.img_size)
        # Normalize RGB
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB
@ -154,8 +155,7 @@ class LoadWebcam:  # for inference
 class LoadImagesAndLabels(Dataset):  # for training/testing
    def __init__(self, path, img_size=416, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False):
        with open(path, 'r') as f:
-            img_files = f.read().splitlines()
+            self.img_files = [x for x in f.read().splitlines() if os.path.splitext(x)[-1].lower() in img_formats]
            self.img_files = [x for x in img_files if os.path.splitext(x)[-1].lower() in img_formats]
        n = len(self.img_files)
        bi = np.floor(np.arange(n) / batch_size).astype(np.int)  # batch index
@ -405,10 +405,11 @@ def letterbox(img, new_shape=416, color=(128, 128, 128), mode='auto'):
        new_unpad = (new_shape, new_shape)
        ratiow, ratioh = new_shape / shape[1], new_shape / shape[0]
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_AREA)  # resize
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-    img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_AREA)  # resized, no border
+    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # padded square
    return img, ratiow, ratioh, dw, dh
--- a/utils/gcp.sh
+++ b/utils/gcp.sh
@ -28,6 +28,12 @@ python3 detect.py
 # Test
 python3 test.py --save-json
 # Evolve
 for i in {0..500}
 do
  python3 train.py --data data/coco.data --img-size 320 --epochs 1 --batch-size 64 --accumulate 1 --evolve --bucket yolov4
 done
 # Git pull
 git pull https://github.com/ultralytics/yolov3  # master
 git pull https://github.com/ultralytics/yolov3 test  # branch
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@ -9,24 +9,19 @@ def init_seeds(seed=0):
    # torch.backends.cudnn.deterministic = True  # https://pytorch.org/docs/stable/notes/randomness.html
-def select_device(force_cpu=False):
+def select_device(force_cpu=False, apex=False):
    # apex if mixed precision training https://github.com/NVIDIA/apex
    cuda = False if force_cpu else torch.cuda.is_available()
    device = torch.device('cuda:0' if cuda else 'cpu')
    if not cuda:
        print('Using CPU')
    if cuda:
        try:  # Mixed precision training https://github.com/NVIDIA/apex
            from apex import amp
            apex_str = 'with Apex '
        except:
            apex_str = ''
        torch.backends.cudnn.benchmark = True  # set False for reproducible results
        c = 1024 ** 2  # bytes to MB
        ng = torch.cuda.device_count()
        x = [torch.cuda.get_device_properties(i) for i in range(ng)]
-        cuda_str = 'Using CUDA ' + apex_str
+        cuda_str = 'Using CUDA ' + ('Apex ' if apex else '')
        for i in range(0, ng):
            if i == 1:
                # torch.cuda.set_device(0)  # OPTIONAL: Set GPU ID
@ -42,14 +37,12 @@ def fuse_conv_and_bn(conv, bn):
    # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    with torch.no_grad():
        # init
-        fusedconv = torch.nn.Conv2d(
+        fusedconv = torch.nn.Conv2d(conv.in_channels,
-            conv.in_channels,
+                                    conv.out_channels,
-            conv.out_channels,
+                                    kernel_size=conv.kernel_size,
-            kernel_size=conv.kernel_size,
+                                    stride=conv.stride,
-            stride=conv.stride,
+                                    padding=conv.padding,
-            padding=conv.padding,
+                                    bias=True)
            bias=True
        )
        # prepare filters
        w_conv = conv.weight.clone().view(conv.out_channels, -1)
--- a/utils/utils.py
+++ b/utils/utils.py
@ -1,5 +1,7 @@
 import glob
 import os
 import random
 from pathlib import Path
 import cv2
 import matplotlib
@ -9,7 +11,6 @@ import torch
 import torch.nn as nn
 from PIL import Image
 from tqdm import tqdm
 from pathlib import Path
 from . import torch_utils  # , google_utils
@ -303,12 +304,14 @@ def compute_loss(p, targets, model, giou_loss=True):  # predictions, targets, mo
            tobj[b, a, gj, gi] = 1.0  # obj
            # pi[..., 2:4] = torch.sigmoid(pi[..., 2:4])  # wh power loss (uncomment)
            # s = 1.5  # scale_xy
            pxy = torch.sigmoid(pi[..., 0:2])  # * s - (s - 1) / 2
            if giou_loss:
-                pbox = torch.cat((torch.sigmoid(pi[..., 0:2]), torch.exp(pi[..., 2:4]) * anchor_vec[i]), 1)  # predicted
+                pbox = torch.cat((pxy, torch.exp(pi[..., 2:4]) * anchor_vec[i]), 1)  # predicted
                giou = bbox_iou(pbox.t(), tbox[i], x1y1x2y2=False, GIoU=True)  # giou computation
                lxy += (k * h['giou']) * (1.0 - giou).mean()  # giou loss
            else:
-                lxy += (k * h['xy']) * MSE(torch.sigmoid(pi[..., 0:2]), txy[i])  # xy loss
+                lxy += (k * h['xy']) * MSE(pxy, txy[i])  # xy loss
                lwh += (k * h['wh']) * MSE(pi[..., 2:4], twh[i])  # wh yolo loss
            tclsm = torch.zeros_like(pi[..., 5:])
@ -542,23 +545,20 @@ def select_best_evolve(path='evolve*.txt'):  # from utils.utils import *; select
        print(file, x[fitness.argmax()])
-def kmeans_targets(path='./data/coco_64img.txt'):  # from utils.utils import *; kmeans_targets()
+def kmeans_targets(path='./data/coco_64img.txt', n=9, img_size=320):  # from utils.utils import *; kmeans_targets()
    # Produces a list of target kmeans suitable for use in *.cfg files
    img_formats = ['.bmp', '.jpg', '.jpeg', '.png', '.tif']
    with open(path, 'r') as f:
-        img_files = f.read().splitlines()
+        img_files = [x for x in f.read().splitlines() if os.path.splitext(x)[-1].lower() in img_formats]
        img_files = list(filter(lambda x: len(x) > 0, img_files))
    # Read shapes
-    n = len(img_files)
+    nf = len(img_files)
-    assert n > 0, 'No images found in %s' % path
+    assert nf > 0, 'No images found in %s' % path
-    label_files = [x.replace('images', 'labels').
+    label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt') for x in img_files]
                       replace('.jpeg', '.txt').
                       replace('.jpg', '.txt').
                       replace('.bmp', '.txt').
                       replace('.png', '.txt') for x in img_files]
    s = np.array([Image.open(f).size for f in tqdm(img_files, desc='Reading image shapes')])  # (width, height)
    # Read targets
-    labels = [np.zeros((0, 5))] * n
+    labels = [np.zeros((0, 5))] * nf
    iter = tqdm(label_files, desc='Reading labels')
    for i, file in enumerate(iter):
        try:
@ -570,19 +570,43 @@ def kmeans_targets(path='./data/coco_64img.txt'):  # from utils.utils import *;
                    assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file
                    l[:, [1, 3]] *= s[i][0]
                    l[:, [2, 4]] *= s[i][1]
-                    l[:, 1:] *= 320 / max(s[i])
+                    l[:, 1:] *= img_size / max(s[i])  # nominal img_size for training here
                    labels[i] = l
        except:
            pass  # print('Warning: missing labels for %s' % self.img_files[i])  # missing label file
    assert len(np.concatenate(labels, 0)) > 0, 'No labels found. Incorrect label paths provided.'
-    # kmeans
+    # kmeans calculation
    from scipy import cluster
    wh = np.concatenate(labels, 0)[:, 3:5]
-    k = cluster.vq.kmeans(wh, 9)[0]
+    k = cluster.vq.kmeans(wh, n)[0]
    k = k[np.argsort(k.prod(1))]
    for x in k.ravel():
-        print('%.1f, ' % x, end='')
+        print('%.1f, ' % x, end='')  # drop-in replacement for *.cfg anchors
 def print_mutation(hyp, results, bucket=''):
    """Print one mutation and append it to evolve.txt (for use with train.py --evolve).

    hyp: mapping of hyperparameter name -> numeric value.
    results: tuple of result values (P, R, mAP, F1, test_loss); must be a tuple
        because it is fed directly to the %-format operator.
    bucket: optional bucket name. When non-empty, evolve.txt is first downloaded
        with gsutil, then after the append it is de-duplicated, sorted by
        descending fitness and uploaded again.
    """
    n_hyp = len(hyp)
    header = '%11s' * n_hyp % tuple(hyp.keys())  # hyperparam keys
    values = '%11.3g' * n_hyp % tuple(hyp.values())  # hyperparam values
    scores = '%11.3g' * len(results) % results  # results (P, R, mAP, F1, test_loss)
    print('\n%s\n%s\nEvolved fitness: %s\n' % (header, values, scores))

    sync = bool(bucket)
    if sync:
        os.system('gsutil cp gs://%s/evolve.txt .' % bucket)  # download evolve.txt

    # Each row is the results followed by the hyperparameter values
    with open('evolve.txt', 'a') as f:
        f.write(scores + values + '\n')

    if not sync:
        return

    rows = np.unique(np.loadtxt('evolve.txt', ndmin=2), axis=0)  # load unique rows
    order = np.argsort(-fitness(rows))  # best fitness first
    np.savetxt('evolve.txt', rows[order], '%11.3g')
    os.system('gsutil cp evolve.txt gs://%s' % bucket)  # upload evolve.txt
 def fitness(x):
    """Return one fitness value per row of x (for use with results.txt or evolve.txt).

    x: 2-D array; only columns 2 and 3 are read. In the evolve.txt row layout
        (P, R, mAP, F1, test_loss, ...) these are mAP and F1.
        NOTE(review): confirm results.txt uses the same column order.
    Returns a 1-D array holding the equally weighted sum of the two columns.
    """
    return 0.50 * x[:, 2] + 0.50 * x[:, 3]  # fitness = 0.5 * mAP + 0.5 * F1
 # Plotting functions ---------------------------------------------------------------------------------------------------
@ -617,7 +641,7 @@ def plot_wh_methods():  # from utils.utils import *; plot_wh_methods()
    plt.ylabel('output')
    plt.legend()
    fig.tight_layout()
-    fig.savefig('comparison.png', dpi=300)
+    fig.savefig('comparison.png', dpi=200)
 def plot_images(imgs, targets, paths=None, fname='images.jpg'):
@ -642,7 +666,7 @@ def plot_images(imgs, targets, paths=None, fname='images.jpg'):
            s = Path(paths[i]).name
            plt.title(s[:min(len(s), 40)], fontdict={'size': 8})  # limit to 40 characters
    fig.tight_layout()
-    fig.savefig(fname, dpi=300)
+    fig.savefig(fname, dpi=200)
    plt.close()
@ -662,7 +686,7 @@ def plot_test_txt():  # from utils.utils import *; plot_test()
    ax[0].hist(cx, bins=600)
    ax[1].hist(cy, bins=600)
    fig.tight_layout()
-    plt.savefig('hist1d.jpg', dpi=300)
+    plt.savefig('hist1d.jpg', dpi=200)
 def plot_targets_txt():  # from utils.utils import *; plot_targets_txt()
@ -678,7 +702,27 @@ def plot_targets_txt():  # from utils.utils import *; plot_targets_txt()
        ax[i].legend()
        ax[i].set_title(s[i])
    fig.tight_layout()
-    plt.savefig('targets.jpg', dpi=300)
+    plt.savefig('targets.jpg', dpi=200)
 def plot_evolution_results(hyp):  # from utils.utils import *; plot_evolution_results(hyp)
    """Plot hyperparameter evolution results from evolve.txt and save them to evolve.png.

    One subplot is drawn per hyperparameter: every evolve.txt row as a dot
    (hyperparameter value vs. fitness) plus a large marker at the row with the
    highest fitness. The best value of each hyperparameter is also printed.

    hyp: mapping of hyperparameter name -> value. Only the keys and their order
        are used; they are assumed to match the column order written by
        print_mutation.
    """
    # ndmin=2 keeps a single-row evolve.txt two-dimensional so fitness() can index columns
    x = np.loadtxt('evolve.txt', ndmin=2)
    f = fitness(x)
    best = f.argmax()  # row index of the best single result
    ncols = 5
    nrows = max(4, -(-len(hyp) // ncols))  # 4x5 grid; grows only for more than 20 hyperparameters
    fig = plt.figure(figsize=(12, 10))
    matplotlib.rc('font', **{'size': 8})
    for i, k in enumerate(hyp):
        y = x[:, i + 5]  # offset 5 skips the leading result columns (P, R, mAP, F1, test_loss)
        mu = y[best]  # best single result
        plt.subplot(nrows, ncols, i + 1)
        plt.plot(mu, f.max(), 'o', markersize=10)
        plt.plot(y, f, '.')
        plt.title('%s = %.3g' % (k, mu), fontdict={'size': 9})
        print('%15s: %.3g' % (k, mu))
    fig.tight_layout()
    plt.savefig('evolve.png', dpi=200)
 def plot_results(start=0, stop=0):  # from utils.utils import *; plot_results()
@ -698,4 +742,4 @@ def plot_results(start=0, stop=0):  # from utils.utils import *; plot_results()
            ax[i].set_title(s[i])
    fig.tight_layout()
    ax[4].legend()
-    fig.savefig('results.png', dpi=300)
+    fig.savefig('results.png', dpi=200)