LION/train_dist.py

# ---------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
# ---------------------------------------------------------------

import importlib
import argparse
from loguru import logger
from comet_ml import Experiment
import torch
import numpy as np
import os
import sys
import torch.distributed as dist
from torch.multiprocessing import Process
from default_config import cfg as config
from utils import exp_helper, io_helper
from utils import utils


@logger.catch(onerror=lambda _: sys.exit(1), reraise=False)
def main(args, config):
    # -- trainer -- #
    logger.info('use trainer: {}', config.trainer.type)
    trainer_lib = importlib.import_module(config.trainer.type)
    Trainer = trainer_lib.Trainer

    if config.set_detect_anomaly:
        # attention: this makes thing slow
        torch.autograd.set_detect_anomaly(True)
        logger.info(
            '\n\n' + '!'*30 + '\nWARNING: ths set_detect_anomaly is turned on, it can slow down the training! \n' + '!'*30)

    # -- command init -- #
    comet_key = config.comet_key
    _, writer = utils.common_init(args.global_rank,
                                  config.trainer.seed, config.save_dir, comet_key)
    trainer = Trainer(config, args)
    writer.add_hparams(config.to_dict(), vars(args))
    nparam = utils.count_parameters_in_M(trainer.model)
    logger.info('param size = %fM ' % nparam)
    writer.log_other('nparam', nparam)

    if args.global_rank == 0:
        trainer.set_writer(writer)
        writer.set_model_graph('{}'.format(trainer.model), overwrite=True)
        if len(config.bash_name) > 0 and os.path.exists(config.bash_name):
            writer.log_asset(config.bash_name)
        if len(config.bash_name) > 0 and os.path.exists(os.path.join(config.save_dir, config.bash_name.split('/')[-1])):
            writer.log_asset(os.path.join(
                config.save_dir, config.bash_name.split('/')[-1]))
    ckpt_dir = os.path.join(config.save_dir, 'checkpoints')
    snapshot_file = os.path.join(config.save_dir, 'checkpoints', 'snapshot')

    # -- check if prev saved ckpt exist -- #
    if os.path.exists(ckpt_dir) and os.path.exists(snapshot_file):
        logger.info(
            '[Detect saved snapshot at the checkpoint dir] resume from preemption!!! ')
        args.resume = True
        args.pretrained = os.path.join(
            config.save_dir, 'checkpoints', 'snapshot')
    else:
        logger.info('not find any checkpoint: {}, (exist={}), or snapshot {}, (exist={})',
                    ckpt_dir, os.path.exists(ckpt_dir), snapshot_file, os.path.exists(snapshot_file))

    # -- prepare -- #
    if args.resume or args.eval_generation:
        if args.pretrained is not None:
            trainer.start_epoch = trainer.resume(
                args.pretrained, eval_generation=args.eval_generation)
        else:
            raise NotImplementedError
    elif args.pretrained is not None:
        logger.info('Resuming training from {}; if you dont want resume training, edit the cmt to change the exp name',
                args.pretrained)
        trainer.resume(args.pretrained)

    if not args.eval_generation:
        trainer.train_epochs()
    else:
        logger.info('[skip_sample]={}', args.skip_sample)

        save_file = None
        if not args.skip_nll:
            trainer.eval_nll(trainer.step, ntest=args.ntest, save_file=True)
        logger.info('save as : {}', save_file)
        # vis sampled output
        if not args.skip_sample:
            trainer.vis_sample(num_vis=8, writer=trainer.writer,
                               step=trainer.step, include_pred_x0=False,
                               save_file=save_file)
            trainer.eval_sample(trainer.step)
        logger.info('done')

    # make all nodes wait for rank 0 to finish saving the files
    # if args.distributed:
    #    dist.barrier()


def get_args():
    parser = argparse.ArgumentParser('encoder decoder examiner')
    # experimental results
    parser.add_argument('--exp_root', type=str, default='../exp',
                        help='location of the results')
    # parser.add_argument('--save', type=str, default='exp',
    #                     help='id used for storing intermediate results')
    # parser.add_argument('--recont_with_local_prior', type=bool, default=False,
    #                    help='eval nll with local prior sampled from normal distribution')
    parser.add_argument('--skip_sample', type=int, default=0,
                        help='only eval nll, no sampling')
    parser.add_argument('--skip_nll', type=int, default=0,
                        help='skip eval nll ')
    # data
    parser.add_argument('--ntest', type=str, default=None,
                        help='number of samples in eval_nll, if None, eval the whole val set')
    parser.add_argument('--dataset', type=str, default='cifar10',
                        choices=['cifar10', 'celeba_64', 'celeba_256',
                                 'imagenet_32', 'ffhq', 'lsun_bedroom_128'],
                        help='which dataset to use')
    parser.add_argument('--data', type=str, default='/tmp/nvae-diff/data',
                        help='location of the data corpus')
    # DDP.
    parser.add_argument('--autocast_train', action='store_true', default=True,
                        help='This flag enables FP16 in training.')
    parser.add_argument('--autocast_eval', action='store_true', default=True,
                        help='This flag enables FP16 in evaluation.')
    parser.add_argument('--num_proc_node', type=int, default=1,
                        help='The number of nodes in multi node env.')
    parser.add_argument('--node_rank', type=int, default=0,
                        help='The index of node.')
    parser.add_argument('--local_rank', type=int, default=0,
                        help='rank of process in the node')
    parser.add_argument('--global_rank', type=int, default=0,
                        help='rank of process among all the processes')
    parser.add_argument('--num_process_per_node', type=int, default=1,
                        help='number of gpus')
    parser.add_argument('--master_address', type=str, default='127.0.0.1',
                        help='address for master')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed used for initialization')
    parser.add_argument('--config', type=str,
                        help='The configuration file.', default='none')
    parser.add_argument("opt",
                        help="Modify config options using the command-line",
                        default=None,
                        nargs=argparse.REMAINDER)
    # Resume:
    parser.add_argument('--resume', default=False, action='store_true')
    parser.add_argument('--eval_generation',
                        default=False, action='store_true')
    parser.add_argument('--pretrained',
                        default=None,
                        type=str,
                        help="Pretrained cehckpoint")

    args = parser.parse_args()

    # update config
    if args.eval_generation or args.resume:
        logger.info('[pretrained]: {}', args.pretrained)
        args.config = os.path.dirname(args.pretrained) + '/../cfg.yml'
        config.merge_from_file(args.config)
    elif args.config != 'none':
        logger.info('load config: {}', args.config)
        cur_exp_name = config.exp_name
        cur_hash = config.hash

        config.merge_from_file(args.config)
        config.exp_name = cur_exp_name  # not following the exp name here
        config.hash = cur_hash  # not following the exp name here
    config.merge_from_list(args.opt)

    # Create log_name
    EXP_ROOT = args.exp_root  # os.environ.get('EXP_ROOT', '../exp/')
    if config.exp_name == '' or config.exp_name == 'none':
        config.hash = io_helper.hash_str('%s' % config) + 'h'
        cfg_file_name = exp_helper.get_expname(config)
    else:
        cfg_file_name = config.exp_name

    # Currently save dir and log_dir are the same
    if args.eval_generation:
        config.save_dir = config.log_dir = config.log_name = os.path.dirname(
            args.config)
        if config.trainer.type == 'ddim':
            tag = 'eval_ddim'
        else:
            tag = 'eval'
        cfg_file_name += f'/{tag}/'
        config.log_name += f'/{tag}/'
        config.save_dir += f'/{tag}/'
        config.log_dir += f'/{tag}/'
    else:
        config.log_name = os.path.join(EXP_ROOT, cfg_file_name)
        config.save_dir = os.path.join(EXP_ROOT, cfg_file_name)
        config.log_dir = os.path.join(EXP_ROOT, cfg_file_name)
    os.makedirs(config.log_dir, exist_ok=True)

    # save config and log
    if args.global_rank == 0 and not args.eval_generation:
        logger.add(config.log_dir + '/train.log')
        logger.info('EXP_ROOT: {} + exp name: {}, save dir: {}', EXP_ROOT,
                    cfg_file_name, config.save_dir)
        saved_cfg = os.path.join(config.log_dir, 'cfg.yml')
        with open(saved_cfg, 'w') as file:
            file.write(config.dump())
        logger.info('save config at {}', saved_cfg)
    elif args.eval_generation:
        logger.add(config.log_dir + '/eval_gen.log')
    logger.info('log dir: {}', config.log_dir)

    return args, config


if __name__ == '__main__':
    args, config = get_args()
    args.ntest = int(args.ntest) if args.ntest is not None else None
    size = args.num_process_per_node

    if size > 1:
        args.distributed = True
        processes = []
        for rank in range(size):
            logger.info('In Rank={}', rank)
            args.local_rank = rank
            global_rank = rank + args.node_rank * args.num_process_per_node
            global_size = args.num_proc_node * args.num_process_per_node
            args.global_size = global_size
            args.global_rank = global_rank
            logger.info('Node rank %d, local proc %d, global proc %d' %
                        (args.node_rank, rank, global_rank))
            p = Process(target=utils.init_processes,
                        args=(global_rank, global_size, main, args, config))
            p.start()
            processes.append(p)

        for p in processes:
            logger.info('join {}', args.local_rank)
            p.join()
    else:
        # for debugging
        args.distributed = False
        args.global_size = 1
        utils.init_processes(0, size, main, args, config)
    logger.info('should end now')
    # if args.distributed:
    #    logger.info('destroy_process_group')
    #    dist.destroy_process_group()
init 2023-01-23 05:14:49 +00:00			`# ---------------------------------------------------------------`
			`# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
			`#`
			`# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property`
			`# and proprietary rights in and to this software, related documentation`
			`# and any modifications thereto. Any use, reproduction, disclosure or`
			`# distribution of this software and related documentation without an express`
			`# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.`
			`# ---------------------------------------------------------------`

			`import importlib`
			`import argparse`
			`from loguru import logger`
			`from comet_ml import Experiment`
			`import torch`
			`import numpy as np`
			`import os`
			`import sys`
			`import torch.distributed as dist`
			`from torch.multiprocessing import Process`
			`from default_config import cfg as config`
			`from utils import exp_helper, io_helper`
			`from utils import utils`


			`@logger.catch(onerror=lambda _: sys.exit(1), reraise=False)`
			`def main(args, config):`
			`# -- trainer -- #`
			`logger.info('use trainer: {}', config.trainer.type)`
			`trainer_lib = importlib.import_module(config.trainer.type)`
			`Trainer = trainer_lib.Trainer`

			`if config.set_detect_anomaly:`
			`# attention: this makes thing slow`
			`torch.autograd.set_detect_anomaly(True)`
			`logger.info(`
			`'\n\n' + '!'30 + '\nWARNING: ths set_detect_anomaly is turned on, it can slow down the training! \n' + '!'30)`

			`# -- command init -- #`
			`comet_key = config.comet_key`
			`_, writer = utils.common_init(args.global_rank,`
			`config.trainer.seed, config.save_dir, comet_key)`
			`trainer = Trainer(config, args)`
			`writer.add_hparams(config.to_dict(), vars(args))`
			`nparam = utils.count_parameters_in_M(trainer.model)`
			`logger.info('param size = %fM ' % nparam)`
			`writer.log_other('nparam', nparam)`

			`if args.global_rank == 0:`
			`trainer.set_writer(writer)`
			`writer.set_model_graph('{}'.format(trainer.model), overwrite=True)`
			`if len(config.bash_name) > 0 and os.path.exists(config.bash_name):`
			`writer.log_asset(config.bash_name)`
			`if len(config.bash_name) > 0 and os.path.exists(os.path.join(config.save_dir, config.bash_name.split('/')[-1])):`
			`writer.log_asset(os.path.join(`
			`config.save_dir, config.bash_name.split('/')[-1]))`
			`ckpt_dir = os.path.join(config.save_dir, 'checkpoints')`
			`snapshot_file = os.path.join(config.save_dir, 'checkpoints', 'snapshot')`

			`# -- check if prev saved ckpt exist -- #`
			`if os.path.exists(ckpt_dir) and os.path.exists(snapshot_file):`
			`logger.info(`
			`'[Detect saved snapshot at the checkpoint dir] resume from preemption!!! ')`
			`args.resume = True`
			`args.pretrained = os.path.join(`
			`config.save_dir, 'checkpoints', 'snapshot')`
			`else:`
			`logger.info('not find any checkpoint: {}, (exist={}), or snapshot {}, (exist={})',`
			`ckpt_dir, os.path.exists(ckpt_dir), snapshot_file, os.path.exists(snapshot_file))`

			`# -- prepare -- #`
			`if args.resume or args.eval_generation:`
			`if args.pretrained is not None:`
			`trainer.start_epoch = trainer.resume(`
			`args.pretrained, eval_generation=args.eval_generation)`
			`else:`
			`raise NotImplementedError`
			`elif args.pretrained is not None:`
fix training resume from snapshot 2023-03-14 16:30:23 +00:00			`logger.info('Resuming training from {}; if you dont want resume training, edit the cmt to change the exp name',`
			`args.pretrained)`
			`trainer.resume(args.pretrained)`
init 2023-01-23 05:14:49 +00:00
			`if not args.eval_generation:`
			`trainer.train_epochs()`
			`else:`
			`logger.info('[skip_sample]={}', args.skip_sample)`

			`save_file = None`
			`if not args.skip_nll:`
			`trainer.eval_nll(trainer.step, ntest=args.ntest, save_file=True)`
			`logger.info('save as : {}', save_file)`
			`# vis sampled output`
			`if not args.skip_sample:`
			`trainer.vis_sample(num_vis=8, writer=trainer.writer,`
			`step=trainer.step, include_pred_x0=False,`
			`save_file=save_file)`
			`trainer.eval_sample(trainer.step)`
			`logger.info('done')`

			`# make all nodes wait for rank 0 to finish saving the files`
			`# if args.distributed:`
			`# dist.barrier()`


			`def get_args():`
			`parser = argparse.ArgumentParser('encoder decoder examiner')`
			`# experimental results`
			`parser.add_argument('--exp_root', type=str, default='../exp',`
			`help='location of the results')`
			`# parser.add_argument('--save', type=str, default='exp',`
			`# help='id used for storing intermediate results')`
			`# parser.add_argument('--recont_with_local_prior', type=bool, default=False,`
			`# help='eval nll with local prior sampled from normal distribution')`
			`parser.add_argument('--skip_sample', type=int, default=0,`
			`help='only eval nll, no sampling')`
			`parser.add_argument('--skip_nll', type=int, default=0,`
			`help='skip eval nll ')`
			`# data`
			`parser.add_argument('--ntest', type=str, default=None,`
			`help='number of samples in eval_nll, if None, eval the whole val set')`
			`parser.add_argument('--dataset', type=str, default='cifar10',`
			`choices=['cifar10', 'celeba_64', 'celeba_256',`
			`'imagenet_32', 'ffhq', 'lsun_bedroom_128'],`
			`help='which dataset to use')`
			`parser.add_argument('--data', type=str, default='/tmp/nvae-diff/data',`
			`help='location of the data corpus')`
			`# DDP.`
			`parser.add_argument('--autocast_train', action='store_true', default=True,`
			`help='This flag enables FP16 in training.')`
			`parser.add_argument('--autocast_eval', action='store_true', default=True,`
			`help='This flag enables FP16 in evaluation.')`
			`parser.add_argument('--num_proc_node', type=int, default=1,`
			`help='The number of nodes in multi node env.')`
			`parser.add_argument('--node_rank', type=int, default=0,`
			`help='The index of node.')`
			`parser.add_argument('--local_rank', type=int, default=0,`
			`help='rank of process in the node')`
			`parser.add_argument('--global_rank', type=int, default=0,`
			`help='rank of process among all the processes')`
			`parser.add_argument('--num_process_per_node', type=int, default=1,`
			`help='number of gpus')`
			`parser.add_argument('--master_address', type=str, default='127.0.0.1',`
			`help='address for master')`
			`parser.add_argument('--seed', type=int, default=1,`
			`help='seed used for initialization')`
			`parser.add_argument('--config', type=str,`
			`help='The configuration file.', default='none')`
			`parser.add_argument("opt",`
			`help="Modify config options using the command-line",`
			`default=None,`
			`nargs=argparse.REMAINDER)`
			`# Resume:`
			`parser.add_argument('--resume', default=False, action='store_true')`
			`parser.add_argument('--eval_generation',`
			`default=False, action='store_true')`
			`parser.add_argument('--pretrained',`
			`default=None,`
			`type=str,`
			`help="Pretrained cehckpoint")`

			`args = parser.parse_args()`

			`# update config`
			`if args.eval_generation or args.resume:`
			`logger.info('[pretrained]: {}', args.pretrained)`
			`args.config = os.path.dirname(args.pretrained) + '/../cfg.yml'`
			`config.merge_from_file(args.config)`
			`elif args.config != 'none':`
			`logger.info('load config: {}', args.config)`
			`cur_exp_name = config.exp_name`
			`cur_hash = config.hash`

			`config.merge_from_file(args.config)`
			`config.exp_name = cur_exp_name # not following the exp name here`
			`config.hash = cur_hash # not following the exp name here`
			`config.merge_from_list(args.opt)`

			`# Create log_name`
			`EXP_ROOT = args.exp_root # os.environ.get('EXP_ROOT', '../exp/')`
			`if config.exp_name == '' or config.exp_name == 'none':`
			`config.hash = io_helper.hash_str('%s' % config) + 'h'`
			`cfg_file_name = exp_helper.get_expname(config)`
			`else:`
			`cfg_file_name = config.exp_name`

			`# Currently save dir and log_dir are the same`
			`if args.eval_generation:`
			`config.save_dir = config.log_dir = config.log_name = os.path.dirname(`
			`args.config)`
			`if config.trainer.type == 'ddim':`
			`tag = 'eval_ddim'`
			`else:`
			`tag = 'eval'`
			`cfg_file_name += f'/{tag}/'`
			`config.log_name += f'/{tag}/'`
			`config.save_dir += f'/{tag}/'`
			`config.log_dir += f'/{tag}/'`
			`else:`
			`config.log_name = os.path.join(EXP_ROOT, cfg_file_name)`
			`config.save_dir = os.path.join(EXP_ROOT, cfg_file_name)`
			`config.log_dir = os.path.join(EXP_ROOT, cfg_file_name)`
			`os.makedirs(config.log_dir, exist_ok=True)`

			`# save config and log`
			`if args.global_rank == 0 and not args.eval_generation:`
			`logger.add(config.log_dir + '/train.log')`
			`logger.info('EXP_ROOT: {} + exp name: {}, save dir: {}', EXP_ROOT,`
			`cfg_file_name, config.save_dir)`
			`saved_cfg = os.path.join(config.log_dir, 'cfg.yml')`
			`with open(saved_cfg, 'w') as file:`
			`file.write(config.dump())`
			`logger.info('save config at {}', saved_cfg)`
			`elif args.eval_generation:`
			`logger.add(config.log_dir + '/eval_gen.log')`
			`logger.info('log dir: {}', config.log_dir)`

			`return args, config`


			`if __name__ == '__main__':`
			`args, config = get_args()`
			`args.ntest = int(args.ntest) if args.ntest is not None else None`
			`size = args.num_process_per_node`

			`if size > 1:`
			`args.distributed = True`
			`processes = []`
			`for rank in range(size):`
			`logger.info('In Rank={}', rank)`
			`args.local_rank = rank`
			`global_rank = rank + args.node_rank * args.num_process_per_node`
			`global_size = args.num_proc_node * args.num_process_per_node`
			`args.global_size = global_size`
			`args.global_rank = global_rank`
			`logger.info('Node rank %d, local proc %d, global proc %d' %`
			`(args.node_rank, rank, global_rank))`
			`p = Process(target=utils.init_processes,`
			`args=(global_rank, global_size, main, args, config))`
			`p.start()`
			`processes.append(p)`

			`for p in processes:`
			`logger.info('join {}', args.local_rank)`
			`p.join()`
			`else:`
			`# for debugging`
			`args.distributed = False`
			`args.global_size = 1`
			`utils.init_processes(0, size, main, args, config)`
			`logger.info('should end now')`
			`# if args.distributed:`
			`# logger.info('destroy_process_group')`
			`# dist.destroy_process_group()`