From 2ab95734e4985379d7906e15a3cab90079c1a9a1 Mon Sep 17 00:00:00 2001
From: Laurent Fainsin
Date: Fri, 1 Jul 2022 10:27:12 +0200
Subject: [PATCH] feat: even more wandb logging

Former-commit-id: 1a3c28040a734ca2229e33603405054abc8e3000 [formerly 907e4f7cae3c25a84baf0eaa5ec4d03ddaea0bdb]
Former-commit-id: fdfb7dcb7d0573efbff79956e7a4bebfe26e2171
---
 src/train.py | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/train.py b/src/train.py
index 8d4dab1..de2fd48 100644
--- a/src/train.py
+++ b/src/train.py
@@ -40,6 +40,9 @@ def main():
             IMG_SIZE=512,
             SPHERES=5,
         ),
+        settings=wandb.Settings(
+            code_dir="./src/",
+        ),
     )

     # create device
@@ -51,7 +54,6 @@ def main():
     # 0. Create network
     net = UNet(n_channels=wandb.config.N_CHANNELS, n_classes=wandb.config.N_CLASSES, features=wandb.config.FEATURES)
     wandb.config.PARAMETERS = sum(p.numel() for p in net.parameters() if p.requires_grad)
-    wandb.watch(net, log_freq=100)  # TODO: 1/4 epochs

     # transfer network to device
     net.to(device=device)
@@ -125,15 +127,11 @@ def main():
         artifact.add_file("checkpoints/model-0.onnx")
         wandb.run.log_artifact(artifact)

-    # print the config
-    logging.info(
-        f"""wandb config:
-        {yaml.dump(wandb.config.as_dict())}
-        """
-    )
+    # log gradients and weights four times per epoch
+    wandb.watch(net, log_freq=(len(train_loader) + len(val_loader)) // 4)

-    # setup wandb table for saving images
-    table = wandb.Table(columns=["ID", "image", "ground truth", "prediction"])
+    # print the config
+    logging.info(f"wandb config:\n{yaml.dump(wandb.config.as_dict())}")

     try:
         for epoch in range(1, wandb.config.EPOCHS + 1):
@@ -165,6 +163,7 @@ def main():
                     # compute metrics
                     pred_masks_bin = (torch.sigmoid(pred_masks) > 0.5).float()
                     accuracy = (true_masks == pred_masks_bin).float().mean()
+                    dice = dice_coeff(pred_masks_bin, true_masks)
                     mae = torch.nn.functional.l1_loss(pred_masks_bin, true_masks)

                     # update tqdm progress bar
@@ -174,9 +173,10 @@ def main():
                     # log metrics
                     wandb.log(
                         {
-                            "train/epoch": epoch - 1 + step / len(train_loader),
+                            "epoch": epoch - 1 + step / len(train_loader),
                             "train/accuracy": accuracy,
                             "train/bce": train_loss,
+                            "train/dice": dice,
                             "train/mae": mae,
                         }
                     )
@@ -184,6 +184,7 @@ def main():
             # Evaluation round
             net.eval()
             accuracy = 0
+            val_loss = 0
             dice = 0
             mae = 0
             with tqdm(val_loader, total=len(ds_valid), desc="val", unit="img", leave=False) as pbar:
@@ -198,19 +199,22 @@ def main():
                         masks_pred = net(images)

                     # compute metrics
+                    val_loss += criterion(masks_pred, true_masks)
                     masks_pred_bin = (torch.sigmoid(masks_pred) > 0.5).float()
-                    accuracy += (true_masks == pred_masks_bin).float().sum()
-                    dice += dice_coeff(masks_pred_bin, masks_true, reduce_batch_first=False)
-                    mae += torch.nn.functional.l1_loss(pred_masks_bin, true_masks, reduction="sum")
+                    accuracy += (true_masks == masks_pred_bin).float().mean()
+                    dice += dice_coeff(masks_pred_bin, true_masks)
+                    mae += torch.nn.functional.l1_loss(masks_pred_bin, true_masks)

                     # update progress bar
                     pbar.update(images.shape[0])

-            accuracy /= len(ds_valid)
-            dice /= len(val_loader)  # TODO: fix dice_coeff to not average
-            mae /= len(ds_valid)
+            accuracy /= len(val_loader)
+            val_loss /= len(val_loader)
+            dice /= len(val_loader)
+            mae /= len(val_loader)

             # save the last validation batch to table
+            table = wandb.Table(columns=["ID", "image", "ground truth", "prediction"])
             for i, (img, mask, pred) in enumerate(
                 zip(
                     images.to("cpu"),
@@ -223,11 +227,13 @@ def main():
             # log validation metrics
             wandb.log(
                 {
-                    "val/predictions": table,
+                    "predictions": table,
                     "val/accuracy": accuracy,
+                    "val/bce": val_loss,
                     "val/dice": dice,
                     "val/mae": mae,
-                }
+                },
+                commit=False,
             )

             # update hyperparameters