diff --git a/configs/simrec_refcoco_scratch.py b/configs/simrec_refcoco_scratch.py
index 3c38f87..3500eb3 100644
--- a/configs/simrec_refcoco_scratch.py
+++ b/configs/simrec_refcoco_scratch.py
@@ -11,12 +11,13 @@
 dataset.mask_path["refcoco"] = "/home/rentianhe/dataset/rec/masks/refcoco"
 
 # Refine training cfg
-train.output_dir = "./output/test_no_syncbn_one_gpu"
+train.output_dir = "./output/test_amp"
 train.batch_size = 32
 train.save_period = 1
 train.log_period = 10
 train.evaluation.eval_batch_size = 32
-train.sync_bn.enabled = False
+train.sync_bn.enabled = True
+train.amp.enabled = False
 
 # Refine optim
 optim.lr = train.base_lr
diff --git a/simrec/layers/sa_layer.py b/simrec/layers/sa_layer.py
index 492f077..a921605 100644
--- a/simrec/layers/sa_layer.py
+++ b/simrec/layers/sa_layer.py
@@ -173,7 +173,7 @@ def att(self, value, key, query, mask):
 
         # print(scores.size(),mask.size())
         if mask is not None:
-            scores = scores.masked_fill(mask, -1e9)
+            scores = scores.masked_fill(mask, -1e4)
 
         att_map = F.softmax(scores, dim=-1)
         att_map = self.dropout(att_map)
diff --git a/tools/eval_engine.py b/tools/eval_engine.py
index 3a186c7..9e268ec 100644
--- a/tools/eval_engine.py
+++ b/tools/eval_engine.py
@@ -34,7 +34,7 @@ def validate(cfg, model, data_loader, writer, epoch, ix_to_token, logger, rank,
     mask_aps={}
     for item in np.arange(0.5, 1, 0.05):
         mask_aps[item]=[]
-    meters = [batch_time, data_time, losses, box_ap, mask_ap,inconsistency_error]
+    meters = [batch_time, data_time, losses, box_ap, mask_ap, inconsistency_error]
     meters_dict = {meter.name: meter for meter in meters}
 
     with torch.no_grad():
@@ -116,11 +116,11 @@ def validate(cfg, model, data_loader, writer, epoch, ix_to_token, logger, rank,
                 memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
                 logger.info(
                     f'Evaluation on {prefix}: [{idx}/{len(data_loader)}] '
-                    f'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
-                    f'Loss {losses.val:.4f} ({losses.avg:.4f}) '
-                    f'BoxIoU@0.5 {box_ap.val:.4f} ({box_ap.avg:.4f}) '
-                    f'MaskIoU {mask_ap.val:.4f} ({mask_ap.avg:.4f}) '
-                    f'IE {inconsistency_error.val:.4f} ({inconsistency_error.avg:.4f}) '
+                    f'Time {batch_time.val:.3f} ({batch_time.avg_reduce:.3f}) '
+                    f'Loss {losses.val:.4f} ({losses.avg_reduce:.4f}) '
+                    f'BoxIoU@0.5 {box_ap.val:.4f} ({box_ap.avg_reduce:.4f}) '
+                    f'MaskIoU {mask_ap.val:.4f} ({mask_ap.avg_reduce:.4f}) '
+                    f'IE {inconsistency_error.val:.4f} ({inconsistency_error.avg_reduce:.4f}) '
                     f'Mem {memory_used:.0f}MB')
             batch_time.update(time.time() - end)
             end = time.time()
diff --git a/tools/train_engine.py b/tools/train_engine.py
index b7da761..5c1a306 100644
--- a/tools/train_engine.py
+++ b/tools/train_engine.py
@@ -210,7 +210,10 @@ def main(cfg):
             ema = EMA(model, cfg.train.ema.alpha, cfg.train.ema.buffer_ema)
         train_one_epoch(cfg, model, optimizer, scheduler, train_loader, scalar, writer, epoch, dist.get_rank(), ema)
         box_ap, mask_ap = validate(cfg, model, val_loader, writer, epoch, val_set.ix_to_token, logger, dist.get_rank(), save_ids=save_ids, ema=ema)
-
+        max_box_ap = max(best_det_acc, box_ap)
+        max_mask_ap = max(best_seg_acc, mask_ap)
+        logger.info(f"Max BoxIoU@0.5: {max_box_ap:.2f}%, MaskIoU: {max_mask_ap:.2f}%")
+
         # save checkpoints
         if epoch % cfg.train.save_period == 0 or epoch == (cfg.train.epochs - 1):
             logger.info(f"saving checkpoints......")
@@ -249,9 +252,6 @@
     cfg = LazyConfig.load(args.config)
     cfg = LazyConfig.apply_overrides(cfg, args.opts)
 
-    # Environments setting
-    seed_everything(cfg.train.seed)
-
     # Distributed setting
     if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
         rank = int(os.environ["RANK"])
@@ -270,6 +270,10 @@
     )
     torch.distributed.barrier()
 
+    # Environments setting
+    seed = cfg.train.seed + dist.get_rank()
+    seed_everything(seed)
+
    # Path setting
    output_dir = cfg.train.output_dir
    os.makedirs(output_dir, exist_ok=True)
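
Note on the sa_layer.py hunk: the diff itself states no rationale, but lowering the mask fill value from -1e9 to -1e4 is the standard fix for float16/mixed-precision training. float16's largest finite magnitude is about 65504, so filling a half-precision score tensor with -1e9 fails (or, after an implicit cast, turns into -inf and can NaN the softmax), while -1e4 stays representable yet is still effectively minus infinity after softmax. A minimal standalone sketch of the failure mode (illustrative only, not code from this repo):

    import torch

    scores = torch.zeros(1, 4, dtype=torch.float16)
    mask = torch.tensor([[False, False, True, True]])

    # With a float16 tensor (as produced under torch.cuda.amp.autocast),
    # -1e9 overflows float16's finite range:
    try:
        scores.masked_fill(mask, -1e9)
    except RuntimeError as e:
        print(e)  # value cannot be converted to type Half without overflow

    # -1e4 is representable in float16 and still vanishes after softmax:
    safe = scores.masked_fill(mask, -1e4)
    print(torch.softmax(safe.float(), dim=-1))  # tensor([[0.5, 0.5, 0., 0.]])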
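Note on the eval_engine.py hunks: the logged statistics switch from each meter's local .avg to .avg_reduce. The meter class is not part of this diff; the sketch below is a hypothetical implementation of such a property (not necessarily SimREC's), which all-reduces the per-rank mean so that every process logs the same global value instead of its shard-local one:

    import torch
    import torch.distributed as dist

    class AverageMeter:
        """Minimal running-average meter; only what the sketch needs."""
        def __init__(self, name):
            self.name = name
            self.val, self.sum, self.count = 0.0, 0.0, 0

        def update(self, val, n=1):
            self.val = val
            self.sum += val * n
            self.count += n

        @property
        def avg(self):
            return self.sum / max(self.count, 1)

        @property
        def avg_reduce(self):
            # Hypothetical: sum the per-rank means across the process group,
            # then divide by world size. NCCL requires a CUDA tensor.
            t = torch.tensor(self.avg, device="cuda")
            dist.all_reduce(t, op=dist.ReduceOp.SUM)
            return (t / dist.get_world_size()).item()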
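Note on the train_engine.py tail: seeding moves from before distributed initialization to after it, and the seed is offset by the process rank. The ordering matters because dist.get_rank() is only valid once the process group exists, and the per-rank offset decorrelates random streams (e.g. data augmentation) across GPUs instead of having every rank draw identical numbers. For context, a typical seed_everything helper looks like the sketch below; the repo's own version may differ:

    import os
    import random

    import numpy as np
    import torch

    def seed_everything(seed: int):
        """Typical implementation, shown for reference only."""
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        os.environ["PYTHONHASHSEED"] = str(seed)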