From 386835d7ca3e40e1bff9320c59eeb1dc1ec8bca8 Mon Sep 17 00:00:00 2001
From: perry0418 <34980036+perry0418@users.noreply.github.com>
Date: Mon, 25 Mar 2019 14:56:38 +0800
Subject: [PATCH 1/3] Update train.py

Solve the multi-GPU training problem by replacing nn.DataParallel with
torch.nn.parallel.DistributedDataParallel and a DistributedSampler.
---
 train.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/train.py b/train.py
index da1a07fe..6e461431 100644
--- a/train.py
+++ b/train.py
@@ -7,6 +7,7 @@ import test  # Import test.py to get mAP after each epoch
 from models import *
 from utils.datasets import *
 from utils.utils import *
+import torch.distributed as dist
 
 
 def train(
@@ -39,11 +40,7 @@ def train(
 
     # Optimizer
     lr0 = 0.001  # initial learning rate
-    optimizer = torch.optim.SGD(model.parameters(), lr=lr0, momentum=.9)
-
-    # Dataloader
-    dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True)
-    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
+    optimizer = torch.optim.SGD(model.parameters(), lr=lr0, momentum=.9, weight_decay=0.0005)
 
     cutoff = -1  # backbone reaches to cutoff layer
     start_epoch = 0
@@ -62,10 +59,19 @@ def train(
             cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')
         elif cfg.endswith('yolov3-tiny.cfg'):
             cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
-
+
+    # Initialize for distributed training
     if torch.cuda.device_count() > 1:
-        print('WARNING: MultiGPU Issue: https://github.com/ultralytics/yolov3/issues/146')
-        model = nn.DataParallel(model)
+        dist.init_process_group(backend=opt.dist_backend, init_method=opt.dist_url, world_size=opt.world_size, rank=args.rank)
+        model = torch.nn.parallel.DistributedDataParallel(model)
+
+    # Dataloader
+    dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True)
+    if torch.cuda.device_count() > 1:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+    else:
+        train_sampler = None
+    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, sampler=train_sampler)
 
     # Transfer learning (train only YOLO layers)
     # for i, (name, p) in enumerate(model.named_parameters()):
@@ -172,7 +178,7 @@ def train(
         # Save latest checkpoint
         checkpoint = {'epoch': epoch,
                       'best_loss': best_loss,
-                      'model': model.module.state_dict() if type(model) is nn.DataParallel else model.state_dict(),
+                      'model': model.module.state_dict() if type(model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                       'optimizer': optimizer.state_dict()}
         torch.save(checkpoint, latest)
 
@@ -185,6 +191,8 @@ def train(
             os.system('cp ' + latest + ' ' + weights + 'backup{}.pt'.format(epoch))
 
         # Calculate mAP
+        if type(model) is nn.parallel.DistributedDataParallel:
+            model = model.module
         with torch.no_grad():
             P, R, mAP = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, model=model)
 
@@ -204,6 +212,9 @@ if __name__ == '__main__':
     parser.add_argument('--img-size', type=int, default=32 * 13, help='pixels')
     parser.add_argument('--resume', action='store_true', help='resume training flag')
    parser.add_argument('--num-workers', type=int, default=4, help='number of Pytorch DataLoader workers')
+    parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training')
+    parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
+    parser.add_argument('--world-size', default=-1, type=int, help='number of nodes for distributed training')
     opt = parser.parse_args()
     print(opt, end='\n\n')
 
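Note (not part of the patch): the following Python sketch summarizes the DDP setup pattern that patch 1 adds to train(). The setup_distributed helper name and signature are illustrative only; the opt fields mirror the --dist-backend, --dist-url, --world-size and --rank arguments introduced by this series (patch 1 still reads args.rank and omits --dist-backend; both are corrected in patch 3).

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader

def setup_distributed(model, dataset, opt, batch_size, num_workers):
    # Every participating process must call init_process_group with the same
    # init_method and world_size, and a unique rank, before wrapping the model.
    dist.init_process_group(backend=opt.dist_backend,
                            init_method=opt.dist_url,
                            world_size=opt.world_size,
                            rank=opt.rank)

    # DistributedDataParallel all-reduces gradients across processes
    # after each backward pass.
    model = torch.nn.parallel.DistributedDataParallel(model)

    # DistributedSampler gives each process a disjoint shard of the dataset;
    # the sampler, not shuffle, decides which samples each rank sees.
    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    dataloader = DataLoader(dataset, batch_size=batch_size,
                            num_workers=num_workers, sampler=sampler)
    return model, dataloader, sampler

With these flags, each process would be launched explicitly (for example one with --rank 0 and another with --rank 1, both pointing --dist-url at the same rendezvous address and using the same --world-size).
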
From 48845081100e834e6e3f89df1873d7a5b6362c91 Mon Sep 17 00:00:00 2001
From: perry0418 <34980036+perry0418@users.noreply.github.com>
Date: Mon, 25 Mar 2019 14:59:02 +0800
Subject: [PATCH 2/3] Update utils.py

Solve the multi-GPU training problem: build_targets() now unwraps a
DistributedDataParallel model instead of nn.DataParallel.
---
 utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/utils.py b/utils/utils.py
index 1f4e03b0..76b25df2 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -285,7 +285,7 @@ def compute_loss(p, targets):  # predictions, targets
 
 def build_targets(model, targets, pred):
     # targets = [image, class, x, y, w, h]
-    if isinstance(model, nn.DataParallel):
+    if isinstance(model, nn.parallel.DistributedDataParallel):
         model = model.module
     yolo_layers = get_yolo_layers(model)
 

From 5daea5882fc6ecaa78ef6cb7cea078493a78b5ce Mon Sep 17 00:00:00 2001
From: perry0418 <34980036+perry0418@users.noreply.github.com>
Date: Mon, 25 Mar 2019 16:13:21 +0800
Subject: [PATCH 3/3] Update train.py

Fix the multi-GPU training setup: use opt.rank instead of args.rank in
init_process_group and add the missing --dist-backend argument.
---
 train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 6e461431..8bb87ce4 100644
--- a/train.py
+++ b/train.py
@@ -62,7 +62,7 @@ def train(
 
     # Initialize for distributed training
     if torch.cuda.device_count() > 1:
-        dist.init_process_group(backend=opt.dist_backend, init_method=opt.dist_url, world_size=opt.world_size, rank=args.rank)
+        dist.init_process_group(backend=opt.dist_backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank)
         model = torch.nn.parallel.DistributedDataParallel(model)
 
     # Dataloader
@@ -215,6 +215,7 @@ if __name__ == '__main__':
     parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training')
     parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
     parser.add_argument('--world-size', default=-1, type=int, help='number of nodes for distributed training')
+    parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
     opt = parser.parse_args()
     print(opt, end='\n\n')
 
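Note (not part of the patches): because the wrapped model hides the original network behind .module, the checkpoint code, the mAP test call, and build_targets() all repeat the same unwrap check. A small hypothetical helper such as the one below could centralize that pattern; the unwrap_model name is illustrative and does not appear in this series.

import torch.nn as nn

def unwrap_model(model):
    # Both DataParallel and DistributedDataParallel expose the underlying
    # network as .module; saving state_dicts or inspecting layers needs it.
    if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
        return model.module
    return model

# e.g. torch.save({'model': unwrap_model(model).state_dict()}, latest)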