updates
parent 1cde55f7c9 · commit 3cfc84a183
```diff
@@ -86,10 +86,11 @@ https://cloud.google.com/deep-learning-vm/
 GPUs | `batch_size` | batch time | epoch time | epoch cost
 --- |---| --- | --- | ---
 1 K80 | 64 (32x2) | 2.9s | 175min | $0.58
-1 T4 | 64 (32x2) | 0.8s | 49min | $0.29
+1 T4 | 64 (32x2) | 0.80s | 49min | $0.29
+2 T4 | 64 (64x1) | 0.52s | 32min | $0.36
 1 2080ti | 64 (32x2) | - | - | -
 1 V100 | 64 (32x2) | 0.38s | 23min | $0.31
-2 V100 | 64 (64x1) | 0.38s | 23min | $0.62
+2 V100 | 64 (64x1) | 0.30s | 18min | $0.46

 # Inference
```
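The epoch cost column is the instance's hourly price multiplied by the wall-clock epoch time, which in turn follows from the per-batch time. A minimal sketch of that arithmetic, using made-up batch counts and hourly rates rather than the actual GCP pricing behind the table:

```python
# Back-of-the-envelope arithmetic for the benchmark table above.
# batches_per_epoch and hourly_rate_usd are illustrative placeholders,
# not the values used to produce the table.
def epoch_estimate(batch_time_s, batches_per_epoch, hourly_rate_usd):
    epoch_hours = batch_time_s * batches_per_epoch / 3600
    return epoch_hours * 60, epoch_hours * hourly_rate_usd  # (minutes, USD)

minutes, cost = epoch_estimate(batch_time_s=0.38, batches_per_epoch=1800, hourly_rate_usd=0.75)
print('~%.0f min, ~$%.2f per epoch' % (minutes, cost))
```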
train.py (40 changed lines)
```diff
@@ -3,6 +3,7 @@ import time
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
+import torch.distributed as dist

 import test  # import test.py to get mAP after each epoch
 from models import *
```
```diff
@@ -10,6 +11,12 @@ from utils.datasets import *
 from utils.utils import *
 from utils.adabound import *

+mixed_precision = True
+try:  # Mixed precision training https://github.com/NVIDIA/apex
+    from apex import amp
+except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
+    mixed_precision = False
+
 # 320 --epochs 1
 # 0.109 0.297 0.15 0.126 7.04 1.666 4.062 0.1845 42.6 3.34 12.61 8.338 0.2705 0.001 -4 0.9 0.0005 a 320 giou + best_anchor False
 # 0.223 0.218 0.138 0.189 9.28 1.153 4.376 0.08263 24.28 3.05 20.93 2.842 0.2759 0.001357 -5.036 0.9158 0.0005722 b mAP/F1 - 50/50 weighting
```
```diff
@@ -152,6 +159,18 @@ def train(cfg,
     # plt.tight_layout()
     # plt.savefig('LR.png', dpi=300)

+    # Mixed precision training https://github.com/NVIDIA/apex
+    if mixed_precision:
+        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
+
+    # Initialize distributed training
+    if torch.cuda.device_count() > 1:
+        dist.init_process_group(backend='nccl',  # 'distributed backend'
+                                init_method='tcp://127.0.0.1:9999',  # distributed training init method
+                                world_size=1,  # number of nodes for distributed training
+                                rank=0)  # distributed training node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+
     # Dataset
     dataset = LoadImagesAndLabels(train_path,
                                   img_size,
```
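amp.initialize() only wraps the model and optimizer; with Apex the backward pass also needs to go through amp.scale_loss so gradients are scaled and later unscaled correctly. That call is not part of this hunk, so the following is only a sketch of the usual pattern, with `loss` and `optimizer` assumed from the surrounding training loop:

```python
# Typical Apex AMP backward step (sketch, not the exact code in this commit).
# `loss` and `optimizer` come from the surrounding training loop.
if mixed_precision:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # backward on the scaled loss
else:
    loss.backward()
```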
```diff
@@ -160,16 +179,6 @@ def train(cfg,
                                   hyp=hyp,  # augmentation hyperparameters
                                   rect=opt.rect)  # rectangular training

-    # Initialize distributed training
-    if torch.cuda.device_count() > 1:
-        torch.distributed.init_process_group(backend='nccl',  # 'distributed backend'
-                                             init_method='tcp://127.0.0.1:9999',  # distributed training init method
-                                             world_size=1,  # number of nodes for distributed training
-                                             rank=0)  # distributed training node rank
-
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-
     # Dataloader
     dataloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
```
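The block removed here also carried a commented-out DistributedSampler line. If each process should see only its shard of the dataset rather than every image, the sampler would be built from the dataset and handed to the DataLoader; a minimal sketch under that assumption (not something this commit enables):

```python
# Sketch only: shard the dataset across distributed processes.
# Assumes dist.init_process_group() has already been called.
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         sampler=sampler,  # replaces shuffle=True
                                         pin_memory=True,
                                         collate_fn=dataset.collate_fn)
```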
```diff
@@ -178,15 +187,6 @@ def train(cfg,
                                              pin_memory=True,
                                              collate_fn=dataset.collate_fn)

-    # Mixed precision training https://github.com/NVIDIA/apex
-    mixed_precision = True
-    if mixed_precision:
-        try:
-            from apex import amp
-            model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
-        except:  # not installed: install help: https://github.com/NVIDIA/apex/issues/259
-            mixed_precision = False
-
     # Start training
     model.hyp = hyp  # attach hyperparameters to model
     # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
```
```diff
@@ -315,7 +315,7 @@ def train(cfg,

     # Report time
     print('%g epochs completed in %.3f hours.' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
-    del model, optimizer, loss, dataset, dataloader, scheduler
+    dist.destroy_process_group() if torch.cuda.is_available() else None
     torch.cuda.empty_cache()
     return results
```
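One caveat on the new cleanup line: it keys off torch.cuda.is_available() rather than whether a process group was ever created, so a single-GPU run (where init_process_group is skipped) may raise at teardown. A more defensive variant, sketched here rather than taken from this commit, checks the process-group state directly:

```python
# Sketch: only tear down the process group if one was actually initialized.
if dist.is_available() and dist.is_initialized():
    dist.destroy_process_group()
```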