From 40ea1c3191a70e18e5384202aa523dca03822f0f Mon Sep 17 00:00:00 2001
From: Laurent Fainsin <laurentfainsin@protonmail.com>
Date: Tue, 5 Jul 2022 22:31:38 +0200
Subject: [PATCH] feat: working logging, auto_batch/lr still not working

Former-commit-id: 29d4536eb182f84eb2cc9a4e31f31bf19a4ca272 [formerly f5fd5eec9394b81f15986fb6cbabf675b2f05c04]
Former-commit-id: 3de00ee718a761221c1934b7cbaaa0ad5487856d
---
 src/train.py      |  15 ++++--
 src/unet/model.py | 124 +++++++++++++++++++++++++++-------------------
 2 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/src/train.py b/src/train.py
index 2471320..c54988a 100644
--- a/src/train.py
+++ b/src/train.py
@@ -19,7 +19,7 @@ CONFIG = {
     "DIR_TEST_IMG": "/home/lilian/data_disk/lfainsin/test/",
     "DIR_SPHERE_IMG": "/home/lilian/data_disk/lfainsin/spheres/Images/",
     "DIR_SPHERE_MASK": "/home/lilian/data_disk/lfainsin/spheres/Masks/",
-    "FEATURES": [64, 128, 256, 512],
+    "FEATURES": [16, 32, 64, 128],
     "N_CHANNELS": 3,
     "N_CLASSES": 1,
     "AMP": True,
@@ -53,7 +53,13 @@ if __name__ == "__main__":
     pl.seed_everything(69420, workers=True)
 
     # 0. Create network
-    net = UNet(n_channels=CONFIG["N_CHANNELS"], n_classes=CONFIG["N_CLASSES"], features=CONFIG["FEATURES"])
+    net = UNet(
+        n_channels=CONFIG["N_CHANNELS"],
+        n_classes=CONFIG["N_CLASSES"],
+        batch_size=CONFIG["BATCH_SIZE"],
+        learning_rate=CONFIG["LEARNING_RATE"],
+        features=CONFIG["FEATURES"],
+    )
 
     # log gradients and weights regularly
     logger.watch(net, log="all")
@@ -77,7 +83,7 @@ if __name__ == "__main__":
     ds_valid = SphereDataset(image_dir=CONFIG["DIR_TEST_IMG"])
 
     # 2.5. Create subset, if uncommented
-    ds_train = torch.utils.data.Subset(ds_train, list(range(0, len(ds_train), len(ds_train) // 10000)))
+    ds_train = torch.utils.data.Subset(ds_train, list(range(0, len(ds_train), len(ds_train) // 5000)))
     # ds_valid = torch.utils.data.Subset(ds_valid, list(range(0, len(ds_valid), len(ds_valid) // 100)))
     # ds_test = torch.utils.data.Subset(ds_test, list(range(0, len(ds_test), len(ds_test) // 100)))
 
@@ -104,9 +110,12 @@ if __name__ == "__main__":
         accelerator=CONFIG["DEVICE"],
         # precision=16,
         auto_scale_batch_size="binsearch",
+        auto_lr_find=True,
         benchmark=CONFIG["BENCHMARK"],
         val_check_interval=100,
         callbacks=RichProgressBar(),
+        logger=logger,
+        log_every_n_steps=1,
     )
 
     try:
diff --git a/src/unet/model.py b/src/unet/model.py
index b9d6c18..be5712e 100644
--- a/src/unet/model.py
+++ b/src/unet/model.py
@@ -1,6 +1,5 @@
 """ Full assembly of the parts to form the complete network """
 
-import numpy as np
 import pytorch_lightning as pl
 
 import wandb
@@ -14,11 +13,16 @@ class_labels = {
 
 
 class UNet(pl.LightningModule):
-    def __init__(self, n_channels, n_classes, features=[64, 128, 256, 512]):
+    def __init__(self, n_channels, n_classes, learning_rate, batch_size, features=[64, 128, 256, 512]):
         super(UNet, self).__init__()
+
+        # Hyperparameters
         self.n_channels = n_channels
         self.n_classes = n_classes
+        self.learning_rate = learning_rate
+        self.batch_size = batch_size
 
+        # Network
         self.inc = DoubleConv(n_channels, features[0])
 
         self.downs = nn.ModuleList()
@@ -39,6 +43,7 @@ class UNet(pl.LightningModule):
         skips = []
 
         x = x.to(self.device)
+
         x = self.inc(x)
 
         for down in self.downs:
@@ -78,77 +83,97 @@ class UNet(pl.LightningModule):
                 ),
             )
 
-        wandb.log(
-            {
-                log_key: table,
-            }
-        )
+        wandb.log({log_key: table})  # replace by self.log
 
     def training_step(self, batch, batch_idx):
         # unpacking
         images, masks_true = batch
         masks_true = masks_true.unsqueeze(1)
-        masks_pred = self(images)
-        masks_pred_bin = (torch.sigmoid(masks_pred) > 0.5).float()
 
-        # compute metrics
-        loss = F.cross_entropy(masks_pred, masks_true)
+        # forward pass
+        masks_pred = self(images)
+
+        # compute loss
+        bce = F.binary_cross_entropy_with_logits(masks_pred, masks_true)
+
+        # compute other metrics
+        masks_pred_bin = (torch.sigmoid(masks_pred) > 0.5).float()
         mae = torch.nn.functional.l1_loss(masks_pred_bin, masks_true)
         accuracy = (masks_true == masks_pred_bin).float().mean()
         dice = dice_coeff(masks_pred_bin, masks_true)
 
-        self.log(
-            "train",
+        self.log_dict(
             {
-                "accuracy": accuracy,
-                "bce": loss,
-                "dice": dice,
-                "mae": mae,
+                "train/accuracy": accuracy,
+                "train/bce": bce,
+                "train/dice": dice,
+                "train/mae": mae,
             },
         )
 
-        return loss  # , dice, accuracy, mae
+        return dict(
+            loss=bce,
+            dice=dice,
+            accuracy=accuracy,
+            mae=mae,
+        )
 
     def validation_step(self, batch, batch_idx):
         # unpacking
         images, masks_true = batch
         masks_true = masks_true.unsqueeze(1)
-        masks_pred = self(images)
-        masks_pred_bin = (torch.sigmoid(masks_pred) > 0.5).float()
 
-        # compute metrics
-        loss = F.cross_entropy(masks_pred, masks_true)
-        # mae = torch.nn.functional.l1_loss(masks_pred_bin, masks_true)
-        # accuracy = (masks_true == masks_pred_bin).float().mean()
-        # dice = dice_coeff(masks_pred_bin, masks_true)
+        # forward pass
+        masks_pred = self(images)
+
+        # compute loss
+        bce = F.binary_cross_entropy_with_logits(masks_pred, masks_true)
+
+        # compute other metrics
+        masks_pred_bin = (torch.sigmoid(masks_pred) > 0.5).float()
+        mae = torch.nn.functional.l1_loss(masks_pred_bin, masks_true)
+        accuracy = (masks_true == masks_pred_bin).float().mean()
+        dice = dice_coeff(masks_pred_bin, masks_true)
 
         if batch_idx == 0:
             self.save_to_table(images, masks_true, masks_pred, masks_pred_bin, "val/predictions")
 
-        return loss  # , dice, accuracy, mae
+        return dict(
+            loss=bce,
+            dice=dice,
+            accuracy=accuracy,
+            mae=mae,
+        )
 
-    # def validation_step_end(self, validation_outputs):
-    #     # unpacking
-    #     loss, dice, accuracy, mae = validation_outputs
-    #     # optimizer = self.optimizers[0]
-    #     # learning_rate = optimizer.state_dict()["param_groups"][0]["lr"]
+    def validation_epoch_end(self, validation_outputs):
+        # unpacking
+        accuracy = torch.stack([d["accuracy"] for d in validation_outputs]).mean()
+        loss = torch.stack([d["loss"] for d in validation_outputs]).mean()
+        dice = torch.stack([d["dice"] for d in validation_outputs]).mean()
+        mae = torch.stack([d["mae"] for d in validation_outputs]).mean()
 
-    #     wandb.log(
-    #         {
-    #             # "train/learning_rate": learning_rate,
-    #             "val/accuracy": accuracy,
-    #             "val/bce": loss,
-    #             "val/dice": dice,
-    #             "val/mae": mae,
-    #         }
-    #     )
+        # logging
+        wandb.log(
+            {
+                "val/accuracy": accuracy,
+                "val/bce": loss,
+                "val/dice": dice,
+                "val/mae": mae,
+            }
+        )
 
-    #     # export model to onnx
-    #     dummy_input = torch.randn(1, 3, 512, 512, requires_grad=True)
-    #     torch.onnx.export(self, dummy_input, f"checkpoints/model.onnx")
-    #     artifact = wandb.Artifact("onnx", type="model")
-    #     artifact.add_file(f"checkpoints/model.onnx")
-    #     wandb.run.log_artifact(artifact)
+        # export model to pth
+        torch.save(self.state_dict(), f"checkpoints/model.pth")
+        artifact = wandb.Artifact("pth", type="model")
+        artifact.add_file(f"checkpoints/model.pth")
+        wandb.run.log_artifact(artifact)
+
+        # export model to onnx
+        dummy_input = torch.randn(1, 3, 512, 512, requires_grad=True)
+        torch.onnx.export(self, dummy_input, f"checkpoints/model.onnx")
+        artifact = wandb.Artifact("onnx", type="model")
+        artifact.add_file(f"checkpoints/model.onnx")
+        wandb.run.log_artifact(artifact)
 
     # def test_step(self, batch, batch_idx):
     #     # unpacking
@@ -199,10 +224,5 @@ class UNet(pl.LightningModule):
             weight_decay=wandb.config.WEIGHT_DECAY,
             momentum=wandb.config.MOMENTUM,
         )
-        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
-        #     optimizer,
-        #     "max",
-        #     patience=2,
-        # )
 
-        return optimizer  # , scheduler
+        return optimizer