REVA-QCAV/src/train.py

import argparse
import logging
from pathlib import Path

import albumentations as A
import torch
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm

import wandb
from evaluate import evaluate
from src.utils.dataset import SphereDataset
from unet import UNet
from utils.paste import RandomPaste

# Directory where per-epoch weight snapshots are written (relative to the
# current working directory).
CHECKPOINT_DIR = Path("./checkpoints/")
# Background images used to build the training and validation datasets.
# NOTE(review): both constants resolve to the same "val2017" directory, so the
# validation score is computed on the images used for training -- confirm this
# is intended.
DIR_TRAIN_IMG = Path("/home/lilian/data_disk/lfainsin/val2017")
DIR_VALID_IMG = Path("/home/lilian/data_disk/lfainsin/val2017/")
# Sphere images and their masks, handed to the RandomPaste transform.
DIR_SPHERE_IMG = Path("/home/lilian/data_disk/lfainsin/spheres/Images/")
DIR_SPHERE_MASK = Path("/home/lilian/data_disk/lfainsin/spheres/Masks/")


def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument call.

    Returns:
        Namespace with the attributes ``epochs``, ``batch_size``, ``lr``,
        ``load``, ``amp`` and ``classes``.
    """
    parser = argparse.ArgumentParser(
        description="Train the UNet on images and target masks",
    )
    parser.add_argument(
        "--epochs",
        "-e",
        metavar="E",
        type=int,
        default=5,
        help="Number of epochs",
    )
    parser.add_argument(
        "--batch-size",
        "-b",
        dest="batch_size",
        metavar="B",
        type=int,
        default=16,
        help="Batch size",
    )
    parser.add_argument(
        "--learning-rate",
        "-l",
        metavar="LR",
        type=float,
        default=1e-5,
        help="Learning rate",
        dest="lr",
    )
    parser.add_argument(
        "--load",
        "-f",
        type=str,
        default=False,  # falsy sentinel: callers test `if args.load:`
        help="Load model from a .pth file",
    )
    # Mixed precision is on by default. `--amp` is kept so existing command
    # lines keep working; `--no-amp` is the only way to actually switch it off
    # (a lone store_true flag with default=True could never be disabled).
    parser.add_argument(
        "--amp",
        dest="amp",
        action="store_true",
        default=True,
        help="Use mixed precision",
    )
    parser.add_argument(
        "--no-amp",
        dest="amp",
        action="store_false",
        default=True,
        help="Disable mixed precision",
    )
    parser.add_argument(
        "--classes",
        "-c",
        type=int,
        default=1,
        help="Number of classes",
    )

    return parser.parse_args(argv)


def main():
    """Train the UNet and save its weights after every epoch.

    Steps, in order: parse the CLI arguments, build the network (optionally
    loading weights from ``args.load``), create the albumentations pipelines,
    datasets and data loaders, set up the optimizer / LR scheduler / AMP
    gradient scaler / loss, start a wandb run, then loop over the epochs:
    one training pass, one call to ``evaluate`` on the validation loader,
    a scheduler step on the returned score, and a checkpoint written to
    ``CHECKPOINT_DIR``.

    Raises:
        KeyboardInterrupt: re-raised after the current weights have been
            saved to ``INTERRUPTED.pth``.
    """
    # get args from cli
    args = get_args()

    # setup logging
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    # enable cuda, if possible
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device {device}")

    # 0. Create network
    net = UNet(n_channels=3, n_classes=args.classes)
    logging.info(
        f"""Network:
        input channels:  {net.n_channels}
        output channels: {net.n_classes}
        """
    )

    # Load weights, if needed
    if args.load:
        net.load_state_dict(torch.load(args.load, map_location=device))
        logging.info(f"Model loaded from {args.load}")

    # transfer network to device
    net.to(device=device)

    # 1. Create transforms
    tf_train = A.Compose(
        [
            A.Resize(500, 500),
            A.Flip(),
            A.ColorJitter(),
            RandomPaste(5, DIR_SPHERE_IMG, DIR_SPHERE_MASK),
            A.ISONoise(),
            A.ToFloat(max_value=255),
            ToTensorV2(),
        ],
    )
    # validation pipeline: same resize and paste, without the augmentations
    tf_valid = A.Compose(
        [
            A.Resize(500, 500),
            RandomPaste(5, DIR_SPHERE_IMG, DIR_SPHERE_MASK),
            A.ToFloat(max_value=255),
            ToTensorV2(),
        ],
    )

    # 2. Create datasets
    ds_train = SphereDataset(image_dir=DIR_TRAIN_IMG, transform=tf_train)
    ds_valid = SphereDataset(image_dir=DIR_VALID_IMG, transform=tf_valid)

    # 3. Create data loaders
    loader_args = dict(batch_size=args.batch_size, num_workers=5, pin_memory=True)
    train_loader = DataLoader(ds_train, shuffle=True, **loader_args)
    val_loader = DataLoader(ds_valid, shuffle=False, drop_last=True, **loader_args)

    # 4. Set up the optimizer, the loss, the learning rate scheduler and the loss scaling for AMP
    optimizer = optim.RMSprop(net.parameters(), lr=args.lr, weight_decay=1e-8, momentum=0.9)
    # "max" mode: the LR is reduced when the validation score stops increasing
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "max", patience=2)
    grad_scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
    criterion = nn.BCEWithLogitsLoss()

    wandb.init(
        project="U-Net-tmp",
        config=dict(
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            amp=args.amp,
        ),
    )

    logging.info(
        f"""Starting training:
        Epochs:          {args.epochs}
        Batch size:      {args.batch_size}
        Learning rate:   {args.lr}
        Training size:   {len(ds_train)}
        Validation size: {len(ds_valid)}
        Device:          {device.type}
        Mixed Precision: {args.amp}
        """
    )

    # the checkpoint directory only has to be created once, not every epoch
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        for epoch in range(1, args.epochs + 1):
            # Make sure the network is in training mode at the start of every
            # epoch; evaluate() is defined elsewhere and may leave it in eval
            # mode. This is a no-op when the mode is already "train".
            net.train()

            # keeps the end-of-epoch summary well-defined even if the loader
            # yields no batch
            loss_value = float("nan")

            with tqdm(total=len(ds_train), desc=f"{epoch}/{args.epochs}", unit="img") as pbar:

                # Training round
                for step, (images, true_masks) in enumerate(train_loader):
                    assert images.shape[1] == net.n_channels, (
                        f"Network has been defined with {net.n_channels} input channels, "
                        f"but loaded images have {images.shape[1]} channels. Please check that "
                        "the images are loaded correctly."
                    )

                    # transfer images to device
                    images = images.to(device=device)
                    # add a channel axis to the masks before comparing with the predictions
                    true_masks = true_masks.unsqueeze(1).to(device=device)

                    # forward
                    with torch.cuda.amp.autocast(enabled=args.amp):
                        pred_masks = net(images)
                        train_loss = criterion(pred_masks, true_masks)

                    # backward
                    optimizer.zero_grad(set_to_none=True)
                    grad_scaler.scale(train_loss).backward()
                    grad_scaler.step(optimizer)
                    grad_scaler.update()

                    # Pull the scalar out once: a plain float is what the
                    # progress bar, wandb and the summary need, and it avoids
                    # handing wandb a tensor still attached to the graph.
                    loss_value = train_loss.item()

                    # update tqdm progress bar
                    pbar.update(images.shape[0])
                    pbar.set_postfix(**{"loss": loss_value})

                    # log training metrics
                    wandb.log(
                        {
                            # fractional epoch, so steps line up across epochs
                            "train/epoch": epoch - 1 + step / len(train_loader),
                            "train/train_loss": loss_value,
                        }
                    )

                # Evaluation round
                val_score = evaluate(net, val_loader, device)
                scheduler.step(val_score)

                # log validation metrics
                wandb.log(
                    {
                        "val/val_score": val_score,
                    }
                )

            # ".3f" = three decimals (the former "3f" only set a minimum width)
            print(f"Train Loss: {loss_value:.3f}, Valid Score: {val_score:.3f}")

            # save weights when epoch end
            torch.save(net.state_dict(), str(CHECKPOINT_DIR / f"checkpoint_epoch{epoch}.pth"))
            logging.info(f"Checkpoint {epoch} saved!")

    except KeyboardInterrupt:
        # keep the work done so far, then let the interrupt propagate
        torch.save(net.state_dict(), "INTERRUPTED.pth")
        logging.info("Saved interrupt")
        raise


# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00			`import argparse`
			`import logging`
Summer cleanup Former-commit-id: f6185d67a4bc50aa7ec1b8168aab3f92721c4965 2021-08-16 00:53:00 +00:00			`from pathlib import Path`
Removed unused function and general cleanup Former-commit-id: c34a455f1722e0b899e9e92c7766b83a9a641980 2018-04-09 03:15:24 +00:00
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`import albumentations as A`
Created a basic train loop + changed a bit loss and utils 2017-08-17 19:16:19 +00:00			`import torch`
Added simple eval and test CRF 2017-08-19 08:59:51 +00:00			`import torch.nn as nn`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`from albumentations.pytorch import ToTensorV2`
Removed unused function and general cleanup Former-commit-id: c34a455f1722e0b899e9e92c7766b83a9a641980 2018-04-09 03:15:24 +00:00			`from torch import optim`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`from torch.utils.data import DataLoader`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00			`from tqdm import tqdm`
Created a basic train loop + changed a bit loss and utils 2017-08-17 19:16:19 +00:00
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`import wandb`
Summer cleanup Former-commit-id: f6185d67a4bc50aa7ec1b8168aab3f92721c4965 2021-08-16 00:53:00 +00:00			`from evaluate import evaluate`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`from src.utils.dataset import SphereDataset`
			`from unet import UNet`
			`from utils.paste import RandomPaste`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`CHECKPOINT_DIR = Path("./checkpoints/")`
fix: typo Former-commit-id: 837b298afc0a8283bd18d0d4353c8f0491113b9a [formerly d64ed0d75f62f99b0be0a919d6d6bfffb6e5cd6f] Former-commit-id: 964fb5b2dde3e90265dba16aef747b34dcfb6fb6 2022-06-29 08:26:26 +00:00			`DIR_TRAIN_IMG = Path("/home/lilian/data_disk/lfainsin/val2017")`
			`DIR_VALID_IMG = Path("/home/lilian/data_disk/lfainsin/val2017/")`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`DIR_SPHERE_IMG = Path("/home/lilian/data_disk/lfainsin/spheres/Images/")`
			`DIR_SPHERE_MASK = Path("/home/lilian/data_disk/lfainsin/spheres/Masks/")`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00
Created a basic train loop + changed a bit loss and utils 2017-08-17 19:16:19 +00:00
Migration to PyTorch 0.4, code cleanup Former-commit-id: c981801ccc3b74047e94c76e67c4ff1f3097226c 2018-06-08 17:27:32 +00:00			`def get_args():`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`parser = argparse.ArgumentParser(`
			`description="Train the UNet on images and target masks",`
			`)`
			`parser.add_argument(`
			`"--epochs",`
			`"-e",`
			`metavar="E",`
			`type=int,`
			`default=5,`
			`help="Number of epochs",`
			`)`
			`parser.add_argument(`
			`"--batch-size",`
			`"-b",`
			`dest="batch_size",`
			`metavar="B",`
			`type=int,`
feat: better pasting function Former-commit-id: 43fedd3f6bafb51fe604e347f59b70cd5b0cc218 [formerly 51bb06c3b98df613710b329d3ade1febaf2b0b23] Former-commit-id: 46b89acd2b860d272ce8a13cf2c8c955d7545c46 2022-06-28 14:36:50 +00:00			`default=16,`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`help="Batch size",`
			`)`
			`parser.add_argument(`
			`"--learning-rate",`
			`"-l",`
			`metavar="LR",`
			`type=float,`
			`default=1e-5,`
			`help="Learning rate",`
			`dest="lr",`
			`)`
			`parser.add_argument(`
			`"--load",`
			`"-f",`
			`type=str,`
			`default=False,`
			`help="Load model from a .pth file",`
			`)`
			`parser.add_argument(`
			`"--amp",`
			`action="store_true",`
style: formatting Former-commit-id: 2ccef30ce44d33beb611b63adef635ab2c1226bb 2022-06-27 14:40:04 +00:00			`default=True,`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`help="Use mixed precision",`
			`)`
			`parser.add_argument(`
			`"--classes",`
			`"-c",`
			`type=int,`
style: formatting Former-commit-id: 2ccef30ce44d33beb611b63adef635ab2c1226bb 2022-06-27 14:40:04 +00:00			`default=1,`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`help="Number of classes",`
			`)`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00
			`return parser.parse_args()`


wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`def main():`
			`# get args from cli`
Migration to PyTorch 0.4, code cleanup Former-commit-id: c981801ccc3b74047e94c76e67c4ff1f3097226c 2018-06-08 17:27:32 +00:00			`args = get_args()`
Summer cleanup Former-commit-id: f6185d67a4bc50aa7ec1b8168aab3f92721c4965 2021-08-16 00:53:00 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# setup logging`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# enable cuda, if possible`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`device = torch.device("cuda" if torch.cuda.is_available() else "cpu")`
			`logging.info(f"Using device {device}")`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# 0. Create network`
refactor: removed bilinear stuff + simplified the construction of the Downs and Ups Former-commit-id: 4c1e0a5a9fc02047b788b13d9bfc3ad7313413e3 2022-06-27 14:13:38 +00:00			`net = UNet(n_channels=3, n_classes=args.classes)`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`logging.info(`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00			`f"""Network:`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`input channels: {net.n_channels}`
			`output channels: {net.n_classes}`
refactor: removed bilinear stuff + simplified the construction of the Downs and Ups Former-commit-id: 4c1e0a5a9fc02047b788b13d9bfc3ad7313413e3 2022-06-27 14:13:38 +00:00			`"""`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`)`
Added simple eval and test CRF 2017-08-19 08:59:51 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# Load weights, if needed`
Migration to PyTorch 0.4, code cleanup Former-commit-id: c981801ccc3b74047e94c76e67c4ff1f3097226c 2018-06-08 17:27:32 +00:00			`if args.load:`
Summer cleanup Former-commit-id: f6185d67a4bc50aa7ec1b8168aab3f92721c4965 2021-08-16 00:53:00 +00:00			`net.load_state_dict(torch.load(args.load, map_location=device))`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`logging.info(f"Model loaded from {args.load}")`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# transfer network to device`
Global cleanup, better logging and CLI Former-commit-id: ff1ac0936c118d129bc8a8014958948d3b3883be 2019-10-24 19:37:21 +00:00			`net.to(device=device)`
fix: broken Ups Former-commit-id: 9c33326beb0d44e8491b040e25fad57ffa820076 2022-06-28 07:36:21 +00:00
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`# 1. Create transforms`
			`tf_train = A.Compose(`
			`[`
			`A.Resize(500, 500),`
			`A.Flip(),`
			`A.ColorJitter(),`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`RandomPaste(5, DIR_SPHERE_IMG, DIR_SPHERE_MASK),`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`A.ISONoise(),`
			`A.ToFloat(max_value=255),`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`ToTensorV2(),`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`],`
			`)`
			`tf_valid = A.Compose(`
			`[`
			`A.Resize(500, 500),`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`RandomPaste(5, DIR_SPHERE_IMG, DIR_SPHERE_MASK),`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`A.ToFloat(max_value=255),`
			`ToTensorV2(),`
			`],`
			`)`

			`# 2. Create datasets`
			`ds_train = SphereDataset(image_dir=DIR_TRAIN_IMG, transform=tf_train)`
			`ds_valid = SphereDataset(image_dir=DIR_VALID_IMG, transform=tf_valid)`

			`# 3. Create data loaders`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`loader_args = dict(batch_size=args.batch_size, num_workers=5, pin_memory=True)`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`train_loader = DataLoader(ds_train, shuffle=True, **loader_args)`
			`val_loader = DataLoader(ds_valid, shuffle=False, drop_last=True, **loader_args)`

			`# 4. Set up the optimizer, the loss, the learning rate scheduler and the loss scaling for AMP`
			`optimizer = optim.RMSprop(net.parameters(), lr=args.lr, weight_decay=1e-8, momentum=0.9)`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "max", patience=2)`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`grad_scaler = torch.cuda.amp.GradScaler(enabled=args.amp)`
			`criterion = nn.BCEWithLogitsLoss()`

			`wandb.init(`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`project="U-Net-tmp",`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`config=dict(`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`epochs=args.epochs,`
			`batch_size=args.batch_size,`
			`learning_rate=args.lr,`
			`amp=args.amp,`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`),`
			`)`

			`logging.info(`
			`f"""Starting training:`
			`Epochs: {args.epochs}`
			`Batch size: {args.batch_size}`
			`Learning rate: {args.lr}`
			`Training size: {len(ds_train)}`
			`Validation size: {len(ds_valid)}`
			`Device: {device.type}`
			`Mixed Precision: {args.amp}`
			`"""`
			`)`

			`try:`
			`for epoch in range(1, args.epochs + 1):`
			`with tqdm(total=len(ds_train), desc=f"{epoch}/{args.epochs}", unit="img") as pbar:`

			`# Training round`
			`for step, (images, true_masks) in enumerate(train_loader):`
			`assert images.shape[1] == net.n_channels, (`
			`f"Network has been defined with {net.n_channels} input channels, "`
			`f"but loaded images have {images.shape[1]} channels. Please check that "`
			`"the images are loaded correctly."`
			`)`

f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`# transfer images to device`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`images = images.to(device=device)`
			`true_masks = true_masks.unsqueeze(1).to(device=device)`

f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`# forward`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`with torch.cuda.amp.autocast(enabled=args.amp):`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`pred_masks = net(images)`
fix: typo Former-commit-id: 837b298afc0a8283bd18d0d4353c8f0491113b9a [formerly d64ed0d75f62f99b0be0a919d6d6bfffb6e5cd6f] Former-commit-id: 964fb5b2dde3e90265dba16aef747b34dcfb6fb6 2022-06-29 08:26:26 +00:00			`train_loss = criterion(pred_masks, true_masks)`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`# backward`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`optimizer.zero_grad(set_to_none=True)`
			`grad_scaler.scale(train_loss).backward()`
			`grad_scaler.step(optimizer)`
			`grad_scaler.update()`

f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`# update tqdm progress bar`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`pbar.update(images.shape[0])`
			`pbar.set_postfix(**{"loss": train_loss.item()})`

f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`# log training metrics`
			`wandb.log(`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`{`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00			`"train/epoch": epoch - 1 + step / len(train_loader),`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`"train/train_loss": train_loss,`
			`}`
			`)`

			`# Evaluation round`
feat: better pasting function Former-commit-id: 43fedd3f6bafb51fe604e347f59b70cd5b0cc218 [formerly 51bb06c3b98df613710b329d3ade1febaf2b0b23] Former-commit-id: 46b89acd2b860d272ce8a13cf2c8c955d7545c46 2022-06-28 14:36:50 +00:00			`val_score = evaluate(net, val_loader, device)`
			`scheduler.step(val_score)`
f5 Former-commit-id: c3c20fee02acb1c9eec7332ec173e05e9df9fb99 [formerly a0bdb02a4ec4299e767d011e86970412be10d25c] Former-commit-id: 495e626a762ffa4d8df3f66dd6e3d7c896950f94 2022-06-29 08:20:35 +00:00
			`# log validation metrics`
			`wandb.log(`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`{`
feat: better pasting function Former-commit-id: 43fedd3f6bafb51fe604e347f59b70cd5b0cc218 [formerly 51bb06c3b98df613710b329d3ade1febaf2b0b23] Former-commit-id: 46b89acd2b860d272ce8a13cf2c8c955d7545c46 2022-06-28 14:36:50 +00:00			`"val/val_score": val_score,`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00			`}`
			`)`

feat: better pasting function Former-commit-id: 43fedd3f6bafb51fe604e347f59b70cd5b0cc218 [formerly 51bb06c3b98df613710b329d3ade1febaf2b0b23] Former-commit-id: 46b89acd2b860d272ce8a13cf2c8c955d7545c46 2022-06-28 14:36:50 +00:00			`print(f"Train Loss: {train_loss:.3f}, Valid Score: {val_score:3f}")`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00
			`# save weights when epoch end`
			`Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)`
			`torch.save(net.state_dict(), str(CHECKPOINT_DIR / "checkpoint_epoch{}.pth".format(epoch)))`
			`logging.info(f"Checkpoint {epoch} saved!")`

Removed dense_crf and small fixes Former-commit-id: de7507ff08510b48e6a0e11da849e0d1c94d3ac8 2019-12-21 21:04:23 +00:00			`except KeyboardInterrupt:`
style: autoformating Former-commit-id: 8c5c75469afa61e8d3728959390b1354033be462 2022-06-27 13:39:44 +00:00			`torch.save(net.state_dict(), "INTERRUPTED.pth")`
			`logging.info("Saved interrupt")`
Update train.py Former-commit-id: dee78b12ca6810f5e02febfb244dce3885ed49ac 2022-04-06 11:35:02 +00:00			`raise`
wtf am i doing Former-commit-id: dde43cce52408ec8f67372b365796b9014ceee57 2022-06-28 09:36:43 +00:00

			`if __name__ == "__main__":`
			`main()`