This commit is contained in:
Glenn Jocher 2020-01-10 16:09:36 -08:00
parent b7a25e60ce
commit ba265d91b2
4 changed files with 49 additions and 55 deletions

View File

@ -53,7 +53,7 @@ def train():
cfg = opt.cfg cfg = opt.cfg
data = opt.data data = opt.data
img_size = opt.img_size img_size = opt.img_size
epochs = 1 if opt.prebias else opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs epochs = opt.epochs # 500200 batches at bs 64, 117263 images = 273 epochs
batch_size = opt.batch_size batch_size = opt.batch_size
accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64 accumulate = opt.accumulate # effective bs = batch_size * accumulate = 16 * 4 = 64
weights = opt.weights # initial training weights weights = opt.weights # initial training weights
@ -65,8 +65,8 @@ def train():
# Initialize # Initialize
init_seeds() init_seeds()
if opt.multi_scale: if opt.multi_scale:
img_sz_min = 9 # round(img_size / 32 / 1.5) img_sz_min = round(img_size / 32 / 1.5)
img_sz_max = 21 # round(img_size / 32 * 1.5) img_sz_max = round(img_size / 32 * 1.5)
img_size = img_sz_max * 32 # initiate with maximum multi_scale size img_size = img_sz_max * 32 # initiate with maximum multi_scale size
print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size)) print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))
@ -136,16 +136,6 @@ def train():
# possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc. # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
cutoff = load_darknet_weights(model, weights) cutoff = load_darknet_weights(model, weights)
if opt.prebias:
# Update params (bias-only training allows more aggressive settings: i.e. SGD ~0.1 lr0, ~0.9 momentum)
for p in optimizer.param_groups:
p['lr'] = 0.1 # learning rate
if p.get('momentum') is not None: # for SGD but not Adam
p['momentum'] = 0.9
for name, p in model.named_parameters():
p.requires_grad = True if name.endswith('.bias') else False
# Scheduler https://github.com/ultralytics/yolov3/issues/238 # Scheduler https://github.com/ultralytics/yolov3/issues/238
# lf = lambda x: 1 - x / epochs # linear ramp to zero # lf = lambda x: 1 - x / epochs # linear ramp to zero
# lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp
@ -186,7 +176,7 @@ def train():
rect=opt.rect, # rectangular training rect=opt.rect, # rectangular training
image_weights=False, image_weights=False,
cache_labels=epochs > 10, cache_labels=epochs > 10,
cache_images=opt.cache_images and not opt.prebias) cache_images=opt.cache_images)
# Dataloader # Dataloader
batch_size = min(batch_size, len(dataset)) batch_size = min(batch_size, len(dataset))
@ -198,17 +188,16 @@ def train():
pin_memory=True, pin_memory=True,
collate_fn=dataset.collate_fn) collate_fn=dataset.collate_fn)
# Test Dataloader # Testloader
if not opt.prebias: testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, opt.img_size, batch_size * 2,
testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(test_path, opt.img_size, batch_size * 2, hyp=hyp,
hyp=hyp, rect=True,
rect=True, cache_labels=True,
cache_labels=True, cache_images=opt.cache_images),
cache_images=opt.cache_images), batch_size=batch_size * 2,
batch_size=batch_size * 2, num_workers=nw,
num_workers=nw, pin_memory=True,
pin_memory=True, collate_fn=dataset.collate_fn)
collate_fn=dataset.collate_fn)
# Start training # Start training
nb = len(dataloader) nb = len(dataloader)
@ -222,11 +211,26 @@ def train():
t0 = time.time() t0 = time.time()
torch_utils.model_info(model, report='summary') # 'full' or 'summary' torch_utils.model_info(model, report='summary') # 'full' or 'summary'
print('Using %g dataloader workers' % nw) print('Using %g dataloader workers' % nw)
print('Starting %s for %g epochs...' % ('prebias' if opt.prebias else 'training', epochs)) print('Starting training for %g epochs...' % epochs)
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ for epoch in range(start_epoch - 1 if opt.prebias else start_epoch, epochs): # epoch ------------------------------
model.train() model.train()
print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
# Prebias
if opt.prebias:
if epoch < 0: # prebias
ps = 0.1, 0.9, False # prebias settings (lr=0.1, momentum=0.9, requires_grad=False)
else: # normal training
ps = hyp['lr0'], hyp['momentum'], True # normal training settings
opt.prebias = False
for p in optimizer.param_groups:
p['lr'] = ps[0] # learning rate
if p.get('momentum') is not None: # for SGD but not Adam
p['momentum'] = ps[1]
for name, p in model.named_parameters():
p.requires_grad = True if name.endswith('.bias') else ps[2]
# Update image weights (optional) # Update image weights (optional)
if dataset.image_weights: if dataset.image_weights:
w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights
@ -300,13 +304,11 @@ def train():
# end batch ------------------------------------------------------------------------------------------------ # end batch ------------------------------------------------------------------------------------------------
# Update scheduler
scheduler.step()
# Process epoch results # Process epoch results
final_epoch = epoch + 1 == epochs final_epoch = epoch + 1 == epochs
if opt.prebias: if opt.prebias:
print_model_biases(model) print_model_biases(model)
continue
elif not opt.notest or final_epoch: # Calculate mAP elif not opt.notest or final_epoch: # Calculate mAP
is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80 is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80
results, maps = test.test(cfg, results, maps = test.test(cfg,
@ -319,10 +321,13 @@ def train():
save_json=final_epoch and is_coco, save_json=final_epoch and is_coco,
dataloader=testloader) dataloader=testloader)
# Update scheduler
scheduler.step()
# Write epoch results # Write epoch results
with open(results_file, 'a') as f: with open(results_file, 'a') as f:
f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) f.write(s + '%10.3g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
if len(opt.name) and opt.bucket and not opt.prebias: if len(opt.name) and opt.bucket:
os.system('gsutil cp results.txt gs://%s/results%s.txt' % (opt.bucket, opt.name)) os.system('gsutil cp results.txt gs://%s/results%s.txt' % (opt.bucket, opt.name))
# Write Tensorboard results # Write Tensorboard results
@ -339,7 +344,7 @@ def train():
best_fitness = fitness best_fitness = fitness
# Save training results # Save training results
save = (not opt.nosave) or (final_epoch and not opt.evolve) or opt.prebias save = (not opt.nosave) or (final_epoch and not opt.evolve)
if save: if save:
with open(results_file, 'r') as f: with open(results_file, 'r') as f:
# Create checkpoint # Create checkpoint
@ -368,7 +373,7 @@ def train():
# end training # end training
n = opt.name n = opt.name
if len(n) and not opt.prebias: if len(n):
n = '_' + n if not n.isnumeric() else n n = '_' + n if not n.isnumeric() else n
fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n
os.rename('results.txt', fresults) os.rename('results.txt', fresults)
@ -387,20 +392,6 @@ def train():
return results return results
def prebias():
    # Optional warm-up stage: when opt.prebias is set, run one train() pass
    # (which trains the output bias layers for 1 epoch), build a new backbone
    # from the resulting checkpoint, and point the following normal training
    # run at that backbone. Does nothing when opt.prebias is not set.
    if not opt.prebias:
        return

    # Bias-only training pass; train() reads opt.prebias itself.
    train()

    # Convert the last checkpoint into a backbone (per the original note,
    # the result is saved as backbone.pt).
    create_backbone(last)

    # Use the new backbone as the initial weights for the next train() call,
    # then clear the flag so that call trains normally.
    opt.weights = wdir + 'backbone.pt'
    opt.prebias = False
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=273) # 500200 batches at bs 16, 117263 COCO images = 273 epochs parser.add_argument('--epochs', type=int, default=273) # 500200 batches at bs 16, 117263 COCO images = 273 epochs
@ -444,7 +435,6 @@ if __name__ == '__main__':
except: except:
pass pass
prebias() # optional
train() # train normally train() # train normally
else: # Evolve hyperparameters (optional) else: # Evolve hyperparameters (optional)
@ -483,7 +473,6 @@ if __name__ == '__main__':
hyp[k] = np.clip(hyp[k], v[0], v[1]) hyp[k] = np.clip(hyp[k], v[0], v[1])
# Train mutation # Train mutation
prebias()
results = train() results = train()
# Write mutation results # Write mutation results

View File

@ -8,7 +8,9 @@
#t=ultralytics/yolov3:v199 && sudo docker pull $t && sudo nvidia-docker run -it --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 672 --epochs 10 --batch 16 --accum 4 --weights '' --arc defaultpw --device 0 --multi #t=ultralytics/yolov3:v199 && sudo docker pull $t && sudo nvidia-docker run -it --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 672 --epochs 10 --batch 16 --accum 4 --weights '' --arc defaultpw --device 0 --multi
while true; do while true; do
python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --evolve --device $1 --cfg yolov3-tiny-3cls.cfg python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --evolve --device $1 --cfg yolov3-tiny-3cls.cfg --cache
# python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ult/athena --evolve --device $1
# python3 train.py --data coco2014.data --img-size 640 --epochs 10 --batch 22 --accum 3 --evolve --weights '' --arc defaultpw --pre --bucket yolov4/640ms_coco2014_10e --device $1 --multi # python3 train.py --data coco2014.data --img-size 640 --epochs 10 --batch 22 --accum 3 --evolve --weights '' --arc defaultpw --pre --bucket yolov4/640ms_coco2014_10e --device $1 --multi
# python3 train.py --data coco2014.data --img-size 320 --epochs 27 --batch 64 --accum 1 --evolve --weights '' --arc defaultpw --pre --bucket yolov4/320_coco2014_27e --device $1 # python3 train.py --data coco2014.data --img-size 320 --epochs 27 --batch 64 --accum 1 --evolve --weights '' --arc defaultpw --pre --bucket yolov4/320_coco2014_27e --device $1
done done

View File

@ -40,12 +40,13 @@ python3 test.py --save-json
# Evolve # Evolve
t=ultralytics/yolov3:v206 t=ultralytics/yolov3:v206
sudo docker kill $(sudo docker ps -a -q --filter ancestor=$t) sudo docker kill $(sudo docker ps -a -q --filter ancestor=$t)
for i in 0 1 2 3 4 5 6 7 for i in 0 1 2 3
do do
sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t bash utils/evolve.sh $i sudo docker pull $t && sudo docker run --gpus all -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t bash utils/evolve.sh $i
# sudo docker pull $t && sudo docker run --gpus all -d --ipc=host -v "$(pwd)"/out:/usr/src/out $t bash utils/evolve.sh $i
# sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash utils/evolve.sh $i # sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash utils/evolve.sh $i
# sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v /mnt/disks/nvme0n1/coco:/usr/src/coco $t bash utils/evolve.sh $i # sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v /mnt/disks/nvme0n1/coco:/usr/src/coco $t bash utils/evolve.sh $i
sleep 1 sleep 120
done done
@ -257,6 +258,7 @@ n=198 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker r
n=199 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 0 n=199 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 0
n=200 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 6 n=200 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 6
n=207 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 7 n=207 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo nvidia-docker run --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 100 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --bucket ultralytics/athena --name $n --device 7
n=208 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo docker run --gpus all --ipc=host -v "$(pwd)"/out:/usr/src/out $t python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --arc defaultpw --pre --multi --device 0
# sm4 # sm4
n=201 && t=ultralytics/yolov3:v201 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 0 --cfg yolov3-tiny-3cls.cfg n=201 && t=ultralytics/yolov3:v201 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 0 --cfg yolov3-tiny-3cls.cfg
@ -265,3 +267,4 @@ n=203 && t=ultralytics/yolov3:v201 && sudo docker pull $t && sudo nvidia-docker
n=204 && t=ultralytics/yolov3:v202 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 3 --cfg yolov3-tiny-3cls-sm4.cfg n=204 && t=ultralytics/yolov3:v202 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 3 --cfg yolov3-tiny-3cls-sm4.cfg
n=205 && t=ultralytics/yolov3:v202 && sudo docker pull $t && sudo nvidia-docker run -it --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights '' --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 4 --cfg yolov3-tiny-3cls-sm4.cfg n=205 && t=ultralytics/yolov3:v202 && sudo docker pull $t && sudo nvidia-docker run -it --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 1000 --batch 64 --accum 1 --weights '' --arc defaultpw --pre --multi --bucket ult/wer --name $n --device 4 --cfg yolov3-tiny-3cls-sm4.cfg
n=206 && t=ultralytics/yolov3:v$n && sudo docker pull $t && sudo docker run --gpus all -it --ipc=host -v "$(pwd)"/data:/usr/src/data $t python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.pt --arc defaultpw --pre --multi --notest --nosave --cache --device 0 --cfg yolov3-tiny-3cls.cfg

View File

@ -633,7 +633,7 @@ def get_yolo_layers(model):
def print_model_biases(model): def print_model_biases(model):
# prints the bias neurons preceding each yolo layer # prints the bias neurons preceding each yolo layer
print('\nModel Bias Summary (per output layer):') print('\nModel Bias Summary:')
multi_gpu = type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) multi_gpu = type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
for l in model.yolo_layers: # print pretrained biases for l in model.yolo_layers: # print pretrained biases
if multi_gpu: if multi_gpu:
@ -642,7 +642,7 @@ def print_model_biases(model):
else: else:
na = model.module_list[l].na na = model.module_list[l].na
b = model.module_list[l - 1][0].bias.view(na, -1) # bias 3x85 b = model.module_list[l - 1][0].bias.view(na, -1) # bias 3x85
print('regression: %5.2f+/-%-5.2f ' % (b[:, :4].mean(), b[:, :4].std()), print('layer %3g regression: %5.2f+/-%-5.2f ' % (l, b[:, :4].mean(), b[:, :4].std()),
'objectness: %5.2f+/-%-5.2f ' % (b[:, 4].mean(), b[:, 4].std()), 'objectness: %5.2f+/-%-5.2f ' % (b[:, 4].mean(), b[:, 4].std()),
'classification: %5.2f+/-%-5.2f' % (b[:, 5:].mean(), b[:, 5:].std())) 'classification: %5.2f+/-%-5.2f' % (b[:, 5:].mean(), b[:, 5:].std()))