Glenn Jocher 2019-07-24 18:02:26 +02:00
parent 1cde55f7c9
commit 3cfc84a183
2 changed files with 23 additions and 22 deletions

README.md

@@ -86,10 +86,11 @@ https://cloud.google.com/deep-learning-vm/
GPUs | `batch_size` | batch time | epoch time | epoch cost
--- | --- | --- | --- | ---
1 K80 | 64 (32x2) | 2.9s | 175min | $0.58
-1 T4 | 64 (32x2) | 0.8s | 49min | $0.29
+1 T4 | 64 (32x2) | 0.80s | 49min | $0.29
+2 T4 | 64 (64x1) | 0.52s | 32min | $0.36
1 2080ti | 64 (32x2) | - | - | -
1 V100 | 64 (32x2) | 0.38s | 23min | $0.31
-2 V100 | 64 (64x1) | 0.38s | 23min | $0.62
+2 V100 | 64 (64x1) | 0.30s | 18min | $0.46

# Inference
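The epoch-cost column is consistent with epoch time multiplied by an hourly GPU rate. A minimal sketch of that arithmetic, assuming illustrative preemptible per-GPU rates (actual GCP pricing is not stated in this commit and varies by region and over time):

```python
# Hedged sketch: reconstruct the epoch-cost column from epoch time.
# The hourly rates below are assumptions, not values from this commit.
RATE_USD_PER_HR = {'K80': 0.20, 'T4': 0.35, 'V100': 0.80}

def epoch_cost(gpu, n_gpus, epoch_min):
    """Cost of one epoch = number of GPUs x hourly rate x hours per epoch."""
    return n_gpus * RATE_USD_PER_HR[gpu] * epoch_min / 60

print(round(epoch_cost('V100', 1, 23), 2))  # 0.31, matches the table
print(round(epoch_cost('V100', 2, 18), 2))  # 0.48, near the table's $0.46
```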

train.py

@@ -3,6 +3,7 @@ import time
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
+import torch.distributed as dist

import test  # import test.py to get mAP after each epoch
from models import *
@@ -10,6 +11,12 @@ from utils.datasets import *
from utils.utils import *
from utils.adabound import *

+mixed_precision = True
+try:  # Mixed precision training https://github.com/NVIDIA/apex
+    from apex import amp
+except ImportError:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
+    mixed_precision = False
+
# 320 --epochs 1
# 0.109 0.297 0.15 0.126 7.04 1.666 4.062 0.1845 42.6 3.34 12.61 8.338 0.2705 0.001 -4 0.9 0.0005 a 320 giou + best_anchor False
# 0.223 0.218 0.138 0.189 9.28 1.153 4.376 0.08263 24.28 3.05 20.93 2.842 0.2759 0.001357 -5.036 0.9158 0.0005722 b mAP/F1 - 50/50 weighting
@@ -152,6 +159,18 @@ def train(cfg,
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

+    # Mixed precision training https://github.com/NVIDIA/apex
+    if mixed_precision:
+        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
+
+    # Initialize distributed training
+    if torch.cuda.device_count() > 1:
+        dist.init_process_group(backend='nccl',  # distributed backend
+                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
+                                world_size=1,  # number of nodes for distributed training
+                                rank=0)  # distributed training node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+
    # Dataset
    dataset = LoadImagesAndLabels(train_path,
                                  img_size,
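The substance of this hunk is ordering: `amp.initialize()` now patches the model and optimizer before `DistributedDataParallel` wraps the model, whereas the old code (removed below) wrapped with DDP first and only called `amp.initialize()` after building the dataloader. The backward pass elsewhere in train.py then has to route the loss through amp's scaler; a minimal sketch of the standard apex pattern, with an illustrative loop body (`compute_loss` is a placeholder name, not necessarily this file's API):

```python
# Minimal sketch of the apex backward pass; the loop body is illustrative.
for imgs, targets in dataloader:
    pred = model(imgs)
    loss = compute_loss(pred, targets)  # placeholder loss call
    if mixed_precision:
        # amp applies dynamic loss scaling so fp16 gradients don't underflow
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```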
@@ -160,16 +179,6 @@ def train(cfg,
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=opt.rect)  # rectangular training

-    # Initialize distributed training
-    if torch.cuda.device_count() > 1:
-        torch.distributed.init_process_group(backend='nccl',  # 'distributed backend'
-                                             init_method='tcp://127.0.0.1:9999',  # distributed training init method
-                                             world_size=1,  # number of nodes for distributed training
-                                             rank=0)  # distributed training node rank
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-
    # Dataloader
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
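The block removed above also drops the commented-out `DistributedSampler` line. With a single process (`world_size=1`, `rank=0`) no sampler is needed, but a one-process-per-GPU launch would shard the dataset so each rank sees a distinct slice. A minimal sketch of that pairing, under the assumption of a multi-process launch (which is not what this commit does):

```python
# Hedged sketch: per-rank data sharding for a multi-process DDP launch.
# This commit runs one process, so it skips the sampler entirely.
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         num_workers=4,  # illustrative worker count
                                         sampler=sampler,  # replaces shuffle=True
                                         pin_memory=True,
                                         collate_fn=dataset.collate_fn)
# Each epoch: call sampler.set_epoch(epoch) so shuffling differs across epochs.
```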
@@ -178,15 +187,6 @@ def train(cfg,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

-    # Mixed precision training https://github.com/NVIDIA/apex
-    mixed_precision = True
-    if mixed_precision:
-        try:
-            from apex import amp
-            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
-        except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
-            mixed_precision = False
-
    # Start training
    model.hyp = hyp  # attach hyperparameters to model
    # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
@@ -315,7 +315,7 @@ def train(cfg,
    # Report time
    print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

-    del model, optimizer, loss, dataset, dataloader, scheduler
+    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
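For context, the apex amp interface used in this commit was later upstreamed into PyTorch core as `torch.cuda.amp` (PyTorch 1.6+). A minimal sketch of the equivalent pattern, not part of this commit (`compute_loss` is again a placeholder):

```python
# Hedged sketch: torch.cuda.amp equivalent of the apex 'O1' pattern above.
scaler = torch.cuda.amp.GradScaler()
for imgs, targets in dataloader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():  # run the forward pass in mixed precision
        loss = compute_loss(model(imgs), targets)  # placeholder loss call
    scaler.scale(loss).backward()  # scaled backward to avoid fp16 underflow
    scaler.step(optimizer)  # unscales gradients; skips the step on inf/nan
    scaler.update()
```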