diff --git a/README.md b/README.md
index 35eb930d..44e7efd1 100755
--- a/README.md
+++ b/README.md
@@ -86,10 +86,11 @@ https://cloud.google.com/deep-learning-vm/
 GPUs | `batch_size` | batch time | epoch time | epoch cost
 --- |---| --- | --- | ---
 1 K80 | 64 (32x2) | 2.9s | 175min | $0.58
-1 T4 | 64 (32x2) | 0.8s | 49min | $0.29
+1 T4 | 64 (32x2) | 0.80s | 49min | $0.29
+2 T4 | 64 (64x1) | 0.52s | 32min | $0.36
 1 2080ti | 64 (32x2) | - | - | -
 1 V100 | 64 (32x2) | 0.38s | 23min | $0.31
-2 V100 | 64 (64x1) | 0.38s | 23min | $0.62
+2 V100 | 64 (64x1) | 0.30s | 18min | $0.46
 
 # Inference
 
diff --git a/train.py b/train.py
index 9cb36fa7..e21f0728 100644
--- a/train.py
+++ b/train.py
@@ -3,6 +3,7 @@ import time
 
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
+import torch.distributed as dist
 
 import test  # import test.py to get mAP after each epoch
 from models import *
@@ -10,6 +11,12 @@ from utils.datasets import *
 from utils.utils import *
 from utils.adabound import *
 
+mixed_precision = True
+try:  # Mixed precision training https://github.com/NVIDIA/apex
+    from apex import amp
+except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
+    mixed_precision = False
+
 # 320 --epochs 1
 # 0.109 0.297 0.15 0.126 7.04 1.666 4.062 0.1845 42.6 3.34 12.61 8.338 0.2705 0.001 -4 0.9 0.0005 a 320 giou + best_anchor False
 # 0.223 0.218 0.138 0.189 9.28 1.153 4.376 0.08263 24.28 3.05 20.93 2.842 0.2759 0.001357 -5.036 0.9158 0.0005722 b mAP/F1 - 50/50 weighting
@@ -152,6 +159,18 @@ def train(cfg,
     #     plt.tight_layout()
     #     plt.savefig('LR.png', dpi=300)
 
+    # Mixed precision training https://github.com/NVIDIA/apex
+    if mixed_precision:
+        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
+
+    # Initialize distributed training
+    if torch.cuda.device_count() > 1:
+        dist.init_process_group(backend='nccl',  # 'distributed backend'
+                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
+                                world_size=1,  # number of nodes for distributed training
+                                rank=0)  # distributed training node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+
     # Dataset
     dataset = LoadImagesAndLabels(train_path,
                                   img_size,
@@ -160,16 +179,6 @@ def train(cfg,
                                   hyp=hyp,  # augmentation hyperparameters
                                   rect=opt.rect)  # rectangular training
 
-    # Initialize distributed training
-    if torch.cuda.device_count() > 1:
-        torch.distributed.init_process_group(backend='nccl',  # 'distributed backend'
-                                             init_method='tcp://127.0.0.1:9999',  # distributed training init method
-                                             world_size=1,  # number of nodes for distributed training
-                                             rank=0)  # distributed training node rank
-
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-
     # Dataloader
     dataloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
@@ -178,15 +187,6 @@ def train(cfg,
                                              pin_memory=True,
                                              collate_fn=dataset.collate_fn)
 
-    # Mixed precision training https://github.com/NVIDIA/apex
-    mixed_precision = True
-    if mixed_precision:
-        try:
-            from apex import amp
-            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
-        except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
-            mixed_precision = False
-
     # Start training
     model.hyp = hyp  # attach hyperparameters to model
     # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
@@ -315,7 +315,7 @@ def train(cfg,
 
     # Report time
     print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
 
-    del model, optimizer, loss, dataset, dataloader, scheduler
+    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
     torch.cuda.empty_cache()
     return results
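Note on the ordering in the new train.py hunks: amp.initialize() now patches the bare model and optimizer before the model is wrapped in DistributedDataParallel (previously the order was reversed), both steps happen before the dataset and dataloader are built, the apex import is hoisted to module level so a missing install disables mixed_precision once, and the process group created for multi-GPU training is destroyed when training ends. Below is a minimal standalone sketch of that same pattern, not the repository's code: the nn.Linear toy model, SGD optimizer, random tensors, and MSE loss are placeholders invented for illustration, while the opt_level, backend, and rendezvous values are taken from the patch. It assumes a CUDA machine with apex installed; the DDP branch only runs when more than one GPU is visible.

import torch
import torch.nn as nn
import torch.distributed as dist

mixed_precision = True
try:  # Mixed precision training https://github.com/NVIDIA/apex
    from apex import amp
except ImportError:
    mixed_precision = False

# Placeholder model/optimizer standing in for the real Darknet model and optimizer
model = nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# 1. amp patches the model and optimizer for O1 mixed precision first ...
if mixed_precision:
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

# 2. ... and only then is the model wrapped for single-process multi-GPU DDP,
#    using the same localhost rendezvous values as the patch
if torch.cuda.device_count() > 1:
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:9999',
                            world_size=1,
                            rank=0)
    model = torch.nn.parallel.DistributedDataParallel(model)

# 3. One toy training step: with apex, the loss is scaled through amp so the
#    FP32 master weights receive correctly scaled gradients
x, y = torch.randn(4, 10).cuda(), torch.randn(4, 2).cuda()
loss = nn.functional.mse_loss(model(x), y)
if mixed_precision:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
else:
    loss.backward()
optimizer.step()
optimizer.zero_grad()

# 4. Mirror the patch's cleanup once training is done
if torch.cuda.device_count() > 1:
    dist.destroy_process_group()
torch.cuda.empty_cache()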