# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.

"""copied and modified from https://github.com/NVlabs/LSGM/blob/5eae2f385c014f2250c3130152b6be711f6a3a5a/util/utils.py"""
from loguru import logger
from comet_ml import Experiment, ExistingExperiment
import wandb as WB
import os
import math
import shutil
import json
import time
import sys
import types
from PIL import Image
import torch
import torch.nn as nn
import numpy as np
from torch import optim
import torch.distributed as dist
from torch.cuda.amp import autocast, GradScaler

USE_COMET = int(os.environ.get('USE_COMET', 1))
USE_TFB = int(os.environ.get('USE_TFB', 0))
USE_WB = int(os.environ.get('USE_WB', 0))
print(f'utils/utils.py: USE_COMET={USE_COMET}, USE_WB={USE_WB}')
|
|
|
|
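# Note (added for clarity): sample_normal_jit, log_p_standard_normal and DiscMixLogistic are
# referenced further below but are not imported in this file; in the upstream LSGM repository
# they live in util/distributions.py. If this module is used standalone, import them from the
# corresponding distributions module of this project (the exact path is an assumption).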
class PixelNormal(object):
    def __init__(self, param, fixed_log_scales=None):
        size = param.size()
        C = size[1]
        if fixed_log_scales is None:
            self.num_c = C // 2
            # B, 1 or 3, H, W
            self.means = param[:, :self.num_c, :, :]
            self.log_scales = torch.clamp(
                param[:, self.num_c:, :, :], min=-7.0)  # B, 1 or 3, H, W
            raise NotImplementedError
        else:
            self.num_c = C
            # B, 1 or 3, H, W
            self.means = param
            # B, 1 or 3, H, W
            self.log_scales = view4D(fixed_log_scales, size)
|
|
|
|
|
|
|
|
def get_params(self):
|
|
|
|
return self.means, self.log_scales, self.num_c
|
|
|
|
|
|
|
|
def log_prob(self, samples):
|
|
|
|
B, C, H, W = samples.size()
|
|
|
|
assert C == self.num_c
|
|
|
|
|
|
|
|
log_probs = -0.5 * torch.square(self.means - samples) * torch.exp(-2.0 *
|
|
|
|
self.log_scales) - self.log_scales - 0.9189385332 # -0.5*log(2*pi)
|
|
|
|
return log_probs
|
|
|
|
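    # log_prob above evaluates the diagonal-Gaussian log-density per pixel,
    #   log N(x; mu, sigma) = -0.5 * ((x - mu) / sigma)^2 - log(sigma) - 0.5 * log(2 * pi),
    # with sigma = exp(log_scales); the constant 0.9189385332 is 0.5 * log(2 * pi).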
|
|
|
|
def sample(self, t=1.):
|
|
|
|
z, rho = sample_normal_jit(
|
|
|
|
self.means, torch.exp(self.log_scales)*t) # B, 3, H, W
|
|
|
|
return z
|
|
|
|
|
|
|
|
def log_prob_discrete(self, samples):
|
|
|
|
"""
|
|
|
|
Calculates discrete pixel probabilities.
|
|
|
|
"""
|
|
|
|
# samples should be in [-1, 1] already
|
|
|
|
B, C, H, W = samples.size()
|
|
|
|
assert C == self.num_c
|
|
|
|
|
|
|
|
centered = samples - self.means
|
|
|
|
inv_stdv = torch.exp(- self.log_scales)
|
|
|
|
plus_in = inv_stdv * (centered + 1. / 255.)
|
|
|
|
cdf_plus = torch.distributions.Normal(0, 1).cdf(plus_in)
|
|
|
|
min_in = inv_stdv * (centered - 1. / 255.)
|
|
|
|
cdf_min = torch.distributions.Normal(0, 1).cdf(min_in)
|
|
|
|
log_cdf_plus = torch.log(torch.clamp(cdf_plus, min=1e-12))
|
|
|
|
log_one_minus_cdf_min = torch.log(torch.clamp(1. - cdf_min, min=1e-12))
|
|
|
|
cdf_delta = cdf_plus - cdf_min
|
|
|
|
log_probs = torch.where(samples < -0.999, log_cdf_plus, torch.where(samples > 0.999, log_one_minus_cdf_min,
|
|
|
|
torch.log(torch.clamp(cdf_delta, min=1e-12))))
|
|
|
|
|
|
|
|
assert log_probs.size() == samples.size()
|
|
|
|
return log_probs
|
|
|
|
|
|
|
|
def mean(self):
|
|
|
|
return self.means
|
|
|
|
|
|
|
|
|
|
|
|
class DummyGradScalar(object):
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def scale(self, input):
|
|
|
|
return input
|
|
|
|
|
|
|
|
def update(self):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def state_dict(self):
|
|
|
|
return {}
|
|
|
|
|
|
|
|
def load_state_dict(self, x):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def step(self, opt):
|
|
|
|
opt.step()
|
|
|
|
|
|
|
|
def unscale_(self, x):
|
|
|
|
return x
|
|
|
|
|
|
|
|
|
|
|
|
def get_opt(params, cfgopt, use_ema, other_cfg=None):
|
|
|
|
if cfgopt.type == 'adam':
|
|
|
|
optimizer = optim.Adam(params,
|
|
|
|
lr=float(cfgopt.lr),
|
|
|
|
betas=(cfgopt.beta1, cfgopt.beta2),
|
|
|
|
weight_decay=cfgopt.weight_decay)
|
|
|
|
elif cfgopt.type == 'sgd':
|
|
|
|
optimizer = torch.optim.SGD(params,
|
|
|
|
lr=float(cfgopt.lr),
|
|
|
|
momentum=cfgopt.momentum)
|
|
|
|
elif cfgopt.type == 'adamax':
|
|
|
|
from utils.adamax import Adamax
|
|
|
|
logger.info('[Optimizer] Adamax, lr={}, weight_decay={}, eps={}',
|
|
|
|
cfgopt.lr, cfgopt.weight_decay, 1e-4)
|
|
|
|
        optimizer = Adamax(params, float(cfgopt.lr),
                           weight_decay=cfgopt.weight_decay, eps=1e-4)
|
|
|
|
|
|
|
|
else:
|
|
|
|
assert 0, "Optimizer type should be either 'adam' or 'sgd'"
|
|
|
|
if use_ema:
|
|
|
|
logger.info('use_ema')
|
|
|
|
ema_decay = 0.9999
|
|
|
|
from .ema import EMA
|
|
|
|
optimizer = EMA(optimizer, ema_decay=ema_decay)
|
|
|
|
scheduler = optim.lr_scheduler.LambdaLR(
|
|
|
|
optimizer, lr_lambda=lambda x: 1.0) # constant lr
|
|
|
|
scheduler_type = getattr(cfgopt, "scheduler", None)
|
|
|
|
if scheduler_type is not None and len(scheduler_type) > 0:
|
|
|
|
logger.info('get scheduler_type: {}', scheduler_type)
|
|
|
|
if scheduler_type == 'exponential':
|
|
|
|
decay = float(getattr(cfgopt, "step_decay", 0.1))
|
|
|
|
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, decay)
|
|
|
|
elif scheduler_type == 'step':
|
|
|
|
step_size = int(getattr(cfgopt, "step_epoch", 500))
|
|
|
|
decay = float(getattr(cfgopt, "step_decay", 0.1))
|
|
|
|
scheduler = optim.lr_scheduler.StepLR(optimizer,
|
|
|
|
step_size=step_size,
|
|
|
|
gamma=decay)
|
|
|
|
elif scheduler_type == 'linear': # use default setting from shapeLatent
|
|
|
|
start_epoch = int(getattr(cfgopt, 'sched_start_epoch', 200*1e3))
|
|
|
|
end_epoch = int(getattr(cfgopt, 'sched_end_epoch', 400*1e3))
|
|
|
|
end_lr = float(getattr(cfgopt, 'end_lr', 1e-4))
|
|
|
|
start_lr = cfgopt.lr
|
|
|
|
|
|
|
|
def lambda_rule(epoch):
|
|
|
|
if epoch <= start_epoch:
|
|
|
|
return 1.0
|
|
|
|
elif epoch <= end_epoch:
|
|
|
|
total = end_epoch - start_epoch
|
|
|
|
delta = epoch - start_epoch
|
|
|
|
frac = delta / total
|
|
|
|
return (1 - frac) * 1.0 + frac * (end_lr / start_lr)
|
|
|
|
else:
|
|
|
|
return end_lr / start_lr
|
|
|
|
scheduler = optim.lr_scheduler.LambdaLR(optimizer,
|
|
|
|
lr_lambda=lambda_rule)
|
|
|
|
|
|
|
|
elif scheduler_type == 'lambda': # linear':
|
|
|
|
step_size = int(getattr(cfgopt, "step_epoch", 2000))
|
|
|
|
final_ratio = float(getattr(cfgopt, "final_ratio", 0.01))
|
|
|
|
start_ratio = float(getattr(cfgopt, "start_ratio", 0.5))
|
|
|
|
duration_ratio = float(getattr(cfgopt, "duration_ratio", 0.45))
|
|
|
|
|
|
|
|
def lambda_rule(ep):
|
|
|
|
lr_l = 1.0 - min(
|
|
|
|
1,
|
|
|
|
max(0, ep - start_ratio * step_size) /
|
|
|
|
float(duration_ratio * step_size)) * (1 - final_ratio)
|
|
|
|
return lr_l
|
|
|
|
|
|
|
|
scheduler = optim.lr_scheduler.LambdaLR(optimizer,
|
|
|
|
lr_lambda=lambda_rule)
|
|
|
|
|
|
|
|
elif scheduler_type == 'cosine_anneal_nocycle':
|
|
|
|
## logger.info('scheduler_type: {}', scheduler_type)
|
|
|
|
assert(other_cfg is not None)
|
|
|
|
final_lr_ratio = float(getattr(cfgopt, "final_lr_ratio", 0.01))
|
|
|
|
eta_min = float(cfgopt.lr) * final_lr_ratio
|
|
|
|
eta_max = float(cfgopt.lr)
|
|
|
|
|
|
|
|
total_epoch = int(other_cfg.trainer.epochs)
|
|
|
|
##getattr(cfgopt, "step_epoch", 2000)
|
|
|
|
start_ratio = float(getattr(cfgopt, "start_ratio", 0.6))
|
|
|
|
T_max = total_epoch * (1 - start_ratio)
|
|
|
|
|
|
|
|
def lambda_rule(ep):
|
|
|
|
curr_ep = max(0., ep - start_ratio * total_epoch)
|
|
|
|
lr = eta_min + 0.5 * (eta_max - eta_min) * (
|
|
|
|
1 + np.cos(np.pi * curr_ep / T_max))
|
|
|
|
lr_l = lr / eta_max
|
|
|
|
return lr_l
|
|
|
|
|
|
|
|
scheduler = optim.lr_scheduler.LambdaLR(optimizer,
|
|
|
|
lr_lambda=lambda_rule)
|
|
|
|
|
|
|
|
        else:
            assert 0, "scheduler_type should be 'exponential', 'step', 'linear', 'lambda' or 'cosine_anneal_nocycle'"
|
|
|
|
return optimizer, scheduler
|
|
|
|
|
|
|
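# Illustrative usage sketch for get_opt (not executed; the cfgopt fields below are assumptions
# mirroring the attributes read above, e.g. type/lr/beta1/beta2/weight_decay/scheduler):
#
#   from types import SimpleNamespace
#   cfgopt = SimpleNamespace(type='adam', lr=1e-4, beta1=0.9, beta2=0.999,
#                            weight_decay=0., scheduler='linear',
#                            sched_start_epoch=200e3, sched_end_epoch=400e3, end_lr=1e-5)
#   optimizer, scheduler = get_opt(model.parameters(), cfgopt, use_ema=False)
#   ...  # backward pass, then optimizer.step(); scheduler.step()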
|
|
|
|
|
class AvgrageMeter(object):
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
self.reset()
|
|
|
|
|
|
|
|
def reset(self):
|
|
|
|
self.avg = 0
|
|
|
|
self.sum = 0
|
|
|
|
self.cnt = 0
|
|
|
|
|
|
|
|
def update(self, val, n=1):
|
|
|
|
self.sum += val * n
|
|
|
|
self.cnt += n
|
|
|
|
self.avg = self.sum / self.cnt
|
|
|
|
|
|
|
|
|
|
|
|
class ExpMovingAvgrageMeter(object):
|
|
|
|
|
|
|
|
def __init__(self, momentum=0.9):
|
|
|
|
self.momentum = momentum
|
|
|
|
self.reset()
|
|
|
|
|
|
|
|
def reset(self):
|
|
|
|
self.avg = 0
|
|
|
|
|
|
|
|
def update(self, val):
|
|
|
|
self.avg = (1. - self.momentum) * self.avg + self.momentum * val
|
|
|
|
|
|
|
|
|
|
|
|
class DummyDDP(nn.Module):
|
|
|
|
def __init__(self, model):
|
|
|
|
super(DummyDDP, self).__init__()
|
|
|
|
self.module = model
|
|
|
|
|
|
|
|
def forward(self, *input, **kwargs):
|
|
|
|
return self.module(*input, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def count_parameters_in_M(model):
    # use the builtin sum: calling np.sum on a generator is deprecated
    return sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name) / 1e6
|
|
|
|
|
|
|
|
|
|
|
|
def save_checkpoint(state, is_best, save):
|
|
|
|
filename = os.path.join(save, 'checkpoint.pth.tar')
|
|
|
|
torch.save(state, filename)
|
|
|
|
if is_best:
|
|
|
|
best_filename = os.path.join(save, 'model_best.pth.tar')
|
|
|
|
shutil.copyfile(filename, best_filename)
|
|
|
|
|
|
|
|
|
|
|
|
def save(model, model_path):
|
|
|
|
torch.save(model.state_dict(), model_path)
|
|
|
|
|
|
|
|
|
|
|
|
def load(model, model_path):
|
|
|
|
model.load_state_dict(torch.load(model_path))
|
|
|
|
|
|
|
|
|
|
|
|
# def create_exp_dir(path, scripts_to_save=None):
|
|
|
|
# if not os.path.exists(path):
|
|
|
|
# os.makedirs(path, exist_ok=True)
|
|
|
|
# print('Experiment dir : {}'.format(path))
|
|
|
|
#
|
|
|
|
# if scripts_to_save is not None:
|
|
|
|
# if not os.path.exists(os.path.join(path, 'scripts')):
|
|
|
|
# os.mkdir(os.path.join(path, 'scripts'))
|
|
|
|
# for script in scripts_to_save:
|
|
|
|
# dst_file = os.path.join(path, 'scripts', os.path.basename(script))
|
|
|
|
# shutil.copyfile(script, dst_file)
|
|
|
|
#
|
|
|
|
|
|
|
|
# class Logger(object):
|
|
|
|
# def __init__(self, rank, save):
|
|
|
|
# # other libraries may set logging before arriving at this line.
|
|
|
|
# # by reloading logging, we can get rid of previous configs set by other libraries.
|
|
|
|
# from importlib import reload
|
|
|
|
# reload(logging)
|
|
|
|
# self.rank = rank
|
|
|
|
# if self.rank == 0:
|
|
|
|
# log_format = '%(asctime)s %(message)s'
|
|
|
|
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
|
|
|
# format=log_format, datefmt='%m/%d %I:%M:%S %p')
|
|
|
|
# fh = logging.FileHandler(os.path.join(save, 'log.txt'))
|
|
|
|
# fh.setFormatter(logging.Formatter(log_format))
|
|
|
|
# logging.getLogger().addHandler(fh)
|
|
|
|
# self.start_time = time.time()
|
|
|
|
#
|
|
|
|
# def info(self, string, *args):
|
|
|
|
# if self.rank == 0:
|
|
|
|
# elapsed_time = time.time() - self.start_time
|
|
|
|
# elapsed_time = time.strftime(
|
|
|
|
# '(Elapsed: %H:%M:%S) ', time.gmtime(elapsed_time))
|
|
|
|
# if isinstance(string, str):
|
|
|
|
# string = elapsed_time + string
|
|
|
|
# else:
|
|
|
|
# logging.info(elapsed_time)
|
|
|
|
# logging.info(string, *args)
|
|
|
|
|
|
|
|
def flatten_dict(dd, separator='_', prefix=''):
|
|
|
|
return {prefix + separator + k if prefix else k: v for kk, vv in dd.items()
|
|
|
|
for k, v in flatten_dict(vv, separator, kk).items()} \
|
|
|
|
if isinstance(dd, dict) else {prefix: dd}
|
|
|
|
|
|
|
|
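# Example (sketch) of flatten_dict: nested keys are joined with the separator, so
#   flatten_dict({'a': {'b': 1, 'c': {'d': 2}}})  ->  {'a_b': 1, 'a_c_d': 2}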
|
|
|
|
class Writer(object):
|
|
|
|
def __init__(self, rank=0, save=None, exp=None, wandb=False):
|
|
|
|
self.rank = rank
|
|
|
|
self.exp = None
|
|
|
|
self.wandb = False
|
|
|
|
self.meter_dict = {}
|
|
|
|
if self.rank == 0:
|
|
|
|
self.exp = exp
|
|
|
|
if USE_TFB and save is not None:
|
|
|
|
logger.info('init TFB: {}', save)
|
|
|
|
from torch.utils.tensorboard import SummaryWriter
|
|
|
|
self.writer = SummaryWriter(log_dir=save, flush_secs=20)
|
|
|
|
else:
|
|
|
|
logger.info('Not init TFB')
|
|
|
|
self.writer = None
|
|
|
|
if self.exp is not None and save is not None:
|
|
|
|
with open(os.path.join(save, 'url.txt'), 'a') as f:
|
|
|
|
f.write(self.exp.url)
|
|
|
|
f.write('\n')
|
|
|
|
self.wandb = wandb
|
|
|
|
else:
|
|
|
|
logger.info('rank={}, init writer as a blackhole', rank)
|
|
|
|
|
|
|
|
def set_model_graph(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.exp is not None:
|
|
|
|
self.exp.set_model_graph(*args, **kwargs)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def url(self):
|
|
|
|
if self.exp is not None:
|
|
|
|
return self.exp.url
|
|
|
|
else:
|
|
|
|
return 'none'
|
|
|
|
|
|
|
|
def add_hparams(self, cfg, args): # **kwargs):
|
|
|
|
if self.exp is not None:
|
|
|
|
self.exp.log_parameters(flatten_dict(cfg))
|
|
|
|
self.exp.log_parameters(flatten_dict(args))
|
|
|
|
if self.wandb:
|
|
|
|
WB.config.update(flatten_dict(cfg))
|
|
|
|
WB.config.update(flatten_dict(args))
|
|
|
|
|
|
|
|
def avg_meter(self, name, value, step=None, epoch=None):
|
|
|
|
if self.rank == 0:
|
|
|
|
if name not in self.meter_dict:
|
|
|
|
self.meter_dict[name] = AvgrageMeter()
|
|
|
|
self.meter_dict[name].update(value)
|
|
|
|
|
|
|
|
def upload_meter(self, step=None, epoch=None):
|
|
|
|
for name, value in self.meter_dict.items():
|
|
|
|
self.add_scalar(name, value.avg, step=step, epoch=epoch)
|
|
|
|
self.meter_dict = {}
|
|
|
|
|
|
|
|
def add_scalar(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.writer is not None:
|
|
|
|
if 'step' in kwargs:
|
|
|
|
self.writer.add_scalar(*args,
|
|
|
|
global_step=kwargs['step'])
|
|
|
|
else:
|
|
|
|
self.writer.add_scalar(*args, **kwargs)
|
|
|
|
|
|
|
|
if self.exp is not None:
|
|
|
|
self.exp.log_metric(*args, **kwargs)
|
|
|
|
if self.wandb:
|
|
|
|
name = args[0]
|
|
|
|
v = args[1]
|
|
|
|
WB.log({name: v})
|
|
|
|
|
|
|
|
def log_model(self, name, path):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def log_other(self, name, value):
|
|
|
|
if self.rank == 0 and self.exp is not None:
|
|
|
|
self.exp.log_other(name, value)
|
|
|
|
# if self.rank == 0 and self.exp is not None:
|
|
|
|
# self.exp.log_model(name, path)
|
|
|
|
|
|
|
|
def watch(self, model):
|
|
|
|
if self.wandb:
|
|
|
|
WB.watch(model)
|
|
|
|
|
|
|
|
    def log_points_3d(self, scene_name, points, step=0):
        if self.rank == 0 and self.exp is not None:
            self.exp.log_points_3d(scene_name, points=points, step=step)
        if self.wandb:
            WB.log({"point_cloud": WB.Object3D(points)})
|
|
|
|
|
|
|
|
def add_figure(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.writer is not None:
|
|
|
|
self.writer.add_figure(*args, **kwargs)
|
|
|
|
|
|
|
|
def add_image(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.writer is not None:
|
|
|
|
self.writer.add_image(*args, **kwargs)
|
|
|
|
self.writer.flush()
|
|
|
|
if self.exp is not None:
|
|
|
|
name, img, i = args
|
|
|
|
if isinstance(img, Image.Image):
|
|
|
|
# logger.debug('log PIL Imgae: {}, {}', name, i)
|
|
|
|
self.exp.log_image(img, name, step=i)
|
|
|
|
elif type(img) is str:
|
|
|
|
# logger.debug('log str image: {}, {}: {}', name, i, img)
|
|
|
|
self.exp.log_image(img, name, step=i)
|
|
|
|
elif torch.is_tensor(img):
|
|
|
|
if img.shape[0] in [3, 4] and len(img.shape) == 3: # 3,H,W
|
|
|
|
img = img.permute(1, 2, 0).contiguous() # 3,H,W -> H,W,3
|
|
|
|
if img.max() < 100: # [0-1]
|
|
|
|
ndarr = img.mul(255).add_(0.5).clamp_(
|
|
|
|
0, 255).to('cpu') # .squeeze()
|
|
|
|
ndarr = ndarr.numpy().astype(np.uint8)
|
|
|
|
# .reshape(-1, ndarr.shape[-1]))
|
|
|
|
im = Image.fromarray(ndarr)
|
|
|
|
self.exp.log_image(im, name, step=i)
|
|
|
|
else:
|
|
|
|
im = img.to('cpu').numpy()
|
|
|
|
self.exp.log_image(im, name, step=i)
|
|
|
|
|
|
|
|
elif isinstance(img, (np.ndarray, np.generic)):
|
|
|
|
if img.shape[0] == 3 and len(img.shape) == 3: # 3,H,W
|
|
|
|
img = img.transpose(1, 2, 0)
|
|
|
|
self.exp.log_image(img, name, step=i)
|
|
|
|
if self.wandb and torch.is_tensor(img) and self.rank == 0:
|
|
|
|
## print(img.shape, img.max(), img.type())
|
|
|
|
WB.log({name: WB.Image(img.numpy())})
|
|
|
|
|
|
|
|
def add_histogram(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.writer is not None:
|
|
|
|
self.writer.add_histogram(*args, **kwargs)
|
|
|
|
if self.exp is not None:
|
|
|
|
name, value, step = args
|
|
|
|
self.exp.log_histogram_3d(value, name, step)
|
|
|
|
# *args, **kwargs)
|
|
|
|
|
|
|
|
def add_histogram_if(self, write, *args, **kwargs):
|
|
|
|
if write and False: # Used for debugging.
|
|
|
|
self.add_histogram(*args, **kwargs)
|
|
|
|
|
|
|
|
def close(self, *args, **kwargs):
|
|
|
|
if self.rank == 0 and self.writer is not None:
|
|
|
|
self.writer.close()
|
|
|
|
|
|
|
|
def log_asset(self, *args, **kwargs):
|
|
|
|
if self.exp is not None:
|
|
|
|
self.exp.log_asset(*args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def common_init(rank, seed, save_dir, comet_key=''):
|
|
|
|
# we use different seeds per gpu. But we sync the weights after model initialization.
|
|
|
|
logger.info('[common-init] at rank={}, seed={}', rank, seed)
|
|
|
|
torch.manual_seed(rank + seed)
|
|
|
|
np.random.seed(rank + seed)
|
|
|
|
torch.cuda.manual_seed(rank + seed)
|
|
|
|
torch.cuda.manual_seed_all(rank + seed)
|
|
|
|
torch.backends.cudnn.benchmark = True
|
|
|
|
|
|
|
|
# prepare logging and tensorboard summary
|
|
|
|
#logging = Logger(rank, save_dir)
|
|
|
|
logging = None
|
|
|
|
if rank == 0:
|
|
|
|
if os.path.exists('.comet_api'):
|
|
|
|
comet_args = json.load(open('.comet_api', 'r'))
|
|
|
|
exp = Experiment(display_summary_level=0,
|
|
|
|
disabled=USE_COMET == 0,
|
|
|
|
**comet_args)
|
|
|
|
exp.set_name(save_dir.split('exp/')[-1])
|
|
|
|
exp.set_cmd_args()
|
|
|
|
exp.log_code(folder='./models/')
|
|
|
|
exp.log_code(folder='./trainers/')
|
|
|
|
exp.log_code(folder='./utils/')
|
|
|
|
exp.log_code(folder='./datasets/')
|
|
|
|
else:
|
|
|
|
exp = None
|
|
|
|
|
|
|
|
if os.path.exists('.wandb_api'):
|
|
|
|
wb_args = json.load(open('.wandb_api', 'r'))
|
|
|
|
wb_dir = '../exp/wandb/' if not os.path.exists(
|
|
|
|
'/workspace/result') else '/workspace/result/wandb/'
|
|
|
|
if not os.path.exists(wb_dir):
|
|
|
|
os.makedirs(wb_dir)
|
|
|
|
WB.init(
|
|
|
|
project=wb_args['project'],
|
|
|
|
entity=wb_args['entity'],
|
|
|
|
name=save_dir.split('exp/')[-1],
|
|
|
|
dir=wb_dir
|
|
|
|
)
|
|
|
|
wandb = True
|
|
|
|
else:
|
|
|
|
wandb = False
|
|
|
|
else:
|
|
|
|
exp = None
|
|
|
|
wandb = False
|
|
|
|
writer = Writer(rank, save_dir, exp, wandb)
|
|
|
|
logger.info('[common-init] DONE')
|
|
|
|
|
|
|
|
return logging, writer
|
|
|
|
|
|
|
|
|
|
|
|
def reduce_tensor(tensor, world_size):
|
|
|
|
rt = tensor.clone()
|
|
|
|
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
|
|
|
|
rt /= world_size
|
|
|
|
return rt
|
|
|
|
|
|
|
|
|
|
|
|
def get_stride_for_cell_type(cell_type):
|
|
|
|
if cell_type.startswith('normal') or cell_type.startswith('combiner'):
|
|
|
|
stride = 1
|
|
|
|
elif cell_type.startswith('down'):
|
|
|
|
stride = 2
|
|
|
|
elif cell_type.startswith('up'):
|
|
|
|
stride = -1
|
|
|
|
else:
|
|
|
|
raise NotImplementedError(cell_type)
|
|
|
|
|
|
|
|
return stride
|
|
|
|
|
|
|
|
|
|
|
|
def get_cout(cin, stride):
    if stride == 1:
        cout = cin
    elif stride == -1:
        cout = cin // 2
    elif stride == 2:
        cout = 2 * cin
    else:
        raise NotImplementedError(stride)

    return cout
|
|
|
|
|
|
|
|
|
|
|
|
def kl_balancer_coeff(num_scales, groups_per_scale, fun='square'):
|
|
|
|
if fun == 'equal':
|
|
|
|
coeff = torch.cat([torch.ones(groups_per_scale[num_scales - i - 1])
|
|
|
|
for i in range(num_scales)], dim=0).cuda()
|
|
|
|
elif fun == 'linear':
|
|
|
|
coeff = torch.cat([(2 ** i) * torch.ones(groups_per_scale[num_scales - i - 1]) for i in range(num_scales)],
|
|
|
|
dim=0).cuda()
|
|
|
|
elif fun == 'sqrt':
|
|
|
|
coeff = torch.cat(
|
|
|
|
[np.sqrt(2 ** i) * torch.ones(groups_per_scale[num_scales - i - 1])
|
|
|
|
for i in range(num_scales)],
|
|
|
|
dim=0).cuda()
|
|
|
|
elif fun == 'square':
|
|
|
|
coeff = torch.cat(
|
|
|
|
[np.square(2 ** i) / groups_per_scale[num_scales - i - 1] * torch.ones(groups_per_scale[num_scales - i - 1])
|
|
|
|
for i in range(num_scales)], dim=0).cuda()
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
# convert min to 1.
|
|
|
|
coeff /= torch.min(coeff)
|
|
|
|
return coeff
|
|
|
|
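# kl_balancer_coeff returns one weight per latent group (sum(groups_per_scale) entries in total);
# with fun='linear' the groups taken from the end of groups_per_scale get weight 2**i for
# i = 0, 1, ..., and the result is rescaled so the smallest weight is 1. Example (sketch):
#   num_scales=2, groups_per_scale=[1, 2], fun='linear'  ->  weights [1., 1., 2.]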
|
|
|
|
|
|
|
|
def kl_per_group(kl_all):
|
|
|
|
kl_vals = torch.mean(kl_all, dim=0)
|
|
|
|
kl_coeff_i = torch.abs(kl_all)
|
|
|
|
kl_coeff_i = torch.mean(kl_coeff_i, dim=0, keepdim=True) + 0.01
|
|
|
|
|
|
|
|
return kl_coeff_i, kl_vals
|
|
|
|
|
|
|
|
|
|
|
|
def rec_balancer(rec_all, rec_coeff=1.0, npoints=None):
|
|
|
|
# layer depth increase, alpha_i increase, 1/alpha_i decrease; kl_coeff decrease
|
|
|
|
# the rec with more points should have higher loss
|
|
|
|
min_points = min(npoints)
|
|
|
|
coeff = []
|
|
|
|
rec_loss = 0
|
|
|
|
assert(len(rec_all) == len(npoints))
|
|
|
|
for ni, n in enumerate(npoints):
|
|
|
|
c = rec_coeff*np.sqrt(n/min_points)
|
|
|
|
rec_loss += rec_all[ni] * c
|
|
|
|
coeff.append(c) # the smallest points' loss weight is 1
|
|
|
|
|
|
|
|
return rec_loss, coeff, rec_all
|
|
|
|
|
|
|
|
|
|
|
|
def kl_balancer(kl_all, kl_coeff=1.0, kl_balance=False, alpha_i=None):
|
|
|
|
# layer depth increase, alpha_i increase, 1/alpha_i decrease; kl_coeff decrease
|
|
|
|
if kl_balance and kl_coeff < 1.0:
|
|
|
|
alpha_i = alpha_i.unsqueeze(0)
|
|
|
|
|
|
|
|
kl_all = torch.stack(kl_all, dim=1)
|
|
|
|
kl_coeff_i, kl_vals = kl_per_group(kl_all)
|
|
|
|
total_kl = torch.sum(kl_coeff_i)
|
|
|
|
# kl = ( sum * kl / alpha )
|
|
|
|
kl_coeff_i = kl_coeff_i / alpha_i * total_kl
|
|
|
|
kl_coeff_i = kl_coeff_i / torch.mean(kl_coeff_i, dim=1, keepdim=True)
|
|
|
|
kl = torch.sum(kl_all * kl_coeff_i.detach(), dim=1)
|
|
|
|
|
|
|
|
# for reporting
|
|
|
|
kl_coeffs = kl_coeff_i.squeeze(0)
|
|
|
|
else:
|
|
|
|
kl_all = torch.stack(kl_all, dim=1)
|
|
|
|
kl_vals = torch.mean(kl_all, dim=0)
|
|
|
|
kl = torch.sum(kl_all, dim=1)
|
|
|
|
kl_coeffs = torch.ones(size=(len(kl_vals),))
|
|
|
|
|
|
|
|
return kl_coeff * kl, kl_coeffs, kl_vals
|
|
|
|
|
|
|
|
|
|
|
|
def kl_per_group_vada(all_log_q, all_neg_log_p):
|
|
|
|
assert(len(all_log_q) == len(all_neg_log_p)
|
|
|
|
), f'get len={len(all_log_q)} and {len(all_neg_log_p)}'
|
|
|
|
|
|
|
|
kl_all_list = []
|
|
|
|
kl_diag = []
|
|
|
|
for log_q, neg_log_p in zip(all_log_q, all_neg_log_p):
|
|
|
|
kl_diag.append(torch.mean(
|
|
|
|
torch.sum(neg_log_p + log_q, dim=[2, 3]), dim=0))
|
|
|
|
kl_all_list.append(torch.sum(neg_log_p + log_q,
|
|
|
|
dim=[1, 2, 3])) # sum over D,H,W
|
|
|
|
|
|
|
|
# kl_all = torch.stack(kl_all, dim=1) # batch x num_total_groups
|
|
|
|
kl_vals = torch.mean(torch.stack(kl_all_list, dim=1),
|
|
|
|
dim=0) # mean per group
|
|
|
|
|
|
|
|
return kl_all_list, kl_vals, kl_diag
|
|
|
|
|
|
|
|
|
|
|
|
def kl_coeff(step, total_step, constant_step, min_kl_coeff, max_kl_coeff):
|
|
|
|
# return max(min(max_kl_coeff * (step - constant_step) / total_step, max_kl_coeff), min_kl_coeff)
|
|
|
|
return max(min(min_kl_coeff + (max_kl_coeff - min_kl_coeff) * (step - constant_step) / total_step, max_kl_coeff), min_kl_coeff)
|
|
|
|
|
|
|
|
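# kl_coeff implements a linear warm-up of the KL weight, clamped to [min_kl_coeff, max_kl_coeff].
# Example (sketch): with total_step=1000, constant_step=0, min_kl_coeff=0.0, max_kl_coeff=1.0,
#   step=0 -> 0.0, step=500 -> 0.5, step>=1000 -> 1.0.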
|
|
|
|
def log_iw(decoder, x, log_q, log_p, crop=False):
|
|
|
|
recon = reconstruction_loss(decoder, x, crop)
|
|
|
|
return - recon - log_q + log_p
|
|
|
|
|
|
|
|
|
|
|
|
def reconstruction_loss(decoder, x, crop=False):
|
|
|
|
|
|
|
|
recon = decoder.log_p(x)
|
|
|
|
if crop:
|
|
|
|
recon = recon[:, :, 2:30, 2:30]
|
|
|
|
|
|
|
|
if isinstance(decoder, DiscMixLogistic):
|
|
|
|
return - torch.sum(recon, dim=[1, 2]) # summation over RGB is done.
|
|
|
|
else:
|
|
|
|
return - torch.sum(recon, dim=[1, 2, 3])
|
|
|
|
|
|
|
|
|
|
|
|
def vae_terms(all_log_q, all_eps):
|
|
|
|
|
|
|
|
# compute kl
|
|
|
|
kl_all = []
|
|
|
|
kl_diag = []
|
|
|
|
log_p, log_q = 0., 0.
|
|
|
|
for log_q_conv, eps in zip(all_log_q, all_eps):
|
|
|
|
log_p_conv = log_p_standard_normal(eps)
|
|
|
|
kl_per_var = log_q_conv - log_p_conv
|
|
|
|
kl_diag.append(torch.mean(torch.sum(kl_per_var, dim=[2, 3]), dim=0))
|
|
|
|
kl_all.append(torch.sum(kl_per_var, dim=[1, 2, 3]))
|
|
|
|
log_q += torch.sum(log_q_conv, dim=[1, 2, 3])
|
|
|
|
log_p += torch.sum(log_p_conv, dim=[1, 2, 3])
|
|
|
|
return log_q, log_p, kl_all, kl_diag
|
|
|
|
|
|
|
|
|
|
|
|
def sum_log_q(all_log_q):
|
|
|
|
log_q = 0.
|
|
|
|
for log_q_conv in all_log_q:
|
|
|
|
log_q += torch.sum(log_q_conv, dim=[1, 2, 3])
|
|
|
|
|
|
|
|
return log_q
|
|
|
|
|
|
|
|
|
|
|
|
def cross_entropy_normal(all_eps):
|
|
|
|
|
|
|
|
cross_entropy = 0.
|
|
|
|
neg_log_p_per_group = []
|
|
|
|
for eps in all_eps:
|
|
|
|
neg_log_p_conv = - log_p_standard_normal(eps)
|
|
|
|
neg_log_p = torch.sum(neg_log_p_conv, dim=[1, 2, 3])
|
|
|
|
cross_entropy += neg_log_p
|
|
|
|
neg_log_p_per_group.append(neg_log_p_conv)
|
|
|
|
|
|
|
|
return cross_entropy, neg_log_p_per_group
|
|
|
|
|
|
|
|
|
|
|
|
def tile_image(batch_image, n, m=None):
|
|
|
|
if m is None:
|
|
|
|
m = n
|
|
|
|
assert n * m == batch_image.size(0)
|
|
|
|
channels, height, width = batch_image.size(
|
|
|
|
1), batch_image.size(2), batch_image.size(3)
|
|
|
|
batch_image = batch_image.view(n, m, channels, height, width)
|
|
|
|
batch_image = batch_image.permute(2, 0, 3, 1, 4) # n, height, n, width, c
|
|
|
|
batch_image = batch_image.contiguous().view(channels, n * height, m * width)
|
|
|
|
return batch_image
|
|
|
|
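# tile_image arranges a batch of n*m images into a single image grid. Example (sketch):
#   x = torch.rand(6, 3, 32, 32)          # batch of 6 RGB images
#   grid = tile_image(x, n=2, m=3)        # -> shape (3, 2*32, 3*32)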
|
|
|
|
|
|
|
|
def average_gradients_naive(params, is_distributed):
|
|
|
|
""" Gradient averaging. """
|
|
|
|
if is_distributed:
|
|
|
|
size = float(dist.get_world_size())
|
|
|
|
for param in params:
|
|
|
|
if param.requires_grad:
|
|
|
|
param.grad.data /= size
|
|
|
|
dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
|
|
|
|
|
|
|
|
|
|
|
|
def average_gradients(params, is_distributed):
|
|
|
|
""" Gradient averaging. """
|
|
|
|
if is_distributed:
|
|
|
|
if isinstance(params, types.GeneratorType):
|
|
|
|
params = [p for p in params]
|
|
|
|
|
|
|
|
size = float(dist.get_world_size())
|
|
|
|
grad_data = []
|
|
|
|
grad_size = []
|
|
|
|
grad_shapes = []
|
|
|
|
# Gather all grad values
|
|
|
|
for param in params:
|
|
|
|
if param.requires_grad:
|
|
|
|
if param.grad is not None:
|
|
|
|
grad_size.append(param.grad.data.numel())
|
|
|
|
grad_shapes.append(list(param.grad.data.shape))
|
|
|
|
grad_data.append(param.grad.data.flatten())
|
|
|
|
grad_data = torch.cat(grad_data).contiguous()
|
|
|
|
|
|
|
|
# All-reduce grad values
|
|
|
|
grad_data /= size
|
|
|
|
dist.all_reduce(grad_data, op=dist.ReduceOp.SUM)
|
|
|
|
|
|
|
|
# Put back the reduce grad values to parameters
|
|
|
|
base = 0
|
|
|
|
i = 0
|
|
|
|
for param in params:
|
|
|
|
if param.requires_grad and param.grad is not None:
|
|
|
|
param.grad.data = grad_data[base:base +
|
|
|
|
grad_size[i]].view(grad_shapes[i])
|
|
|
|
base += grad_size[i]
|
|
|
|
i += 1
|
|
|
|
|
|
|
|
|
|
|
|
def average_params(params, is_distributed):
|
|
|
|
""" parameter averaging. """
|
|
|
|
if is_distributed:
|
|
|
|
size = float(dist.get_world_size())
|
|
|
|
for param in params:
|
|
|
|
param.data /= size
|
|
|
|
dist.all_reduce(param.data, op=dist.ReduceOp.SUM)
|
|
|
|
|
|
|
|
|
|
|
|
def average_tensor(t, is_distributed):
|
|
|
|
if is_distributed:
|
|
|
|
size = float(dist.get_world_size())
|
|
|
|
dist.all_reduce(t.data, op=dist.ReduceOp.SUM)
|
|
|
|
t.data /= size
|
|
|
|
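# average_tensor all-reduces t in place and divides by the world size, i.e. every rank ends up
# with the mean of t across ranks. Sketch (assumes torch.distributed is initialized):
#   loss = torch.tensor([local_loss], device='cuda')
#   average_tensor(loss, is_distributed=True)   # loss now holds the global average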
|
|
|
|
|
|
|
|
def broadcast_params(params, is_distributed):
|
|
|
|
if is_distributed:
|
|
|
|
for param in params:
|
|
|
|
dist.broadcast(param.data, src=0)
|
|
|
|
|
|
|
|
|
|
|
|
def num_output(dataset):
|
|
|
|
if dataset in {'mnist', 'omniglot'}:
|
|
|
|
return 28 * 28
|
|
|
|
elif dataset == 'cifar10':
|
|
|
|
return 3 * 32 * 32
|
|
|
|
elif dataset.startswith('celeba') or dataset.startswith('imagenet') or dataset.startswith('lsun'):
|
|
|
|
size = int(dataset.split('_')[-1])
|
|
|
|
return 3 * size * size
|
|
|
|
elif dataset == 'ffhq':
|
|
|
|
return 3 * 256 * 256
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
def get_input_size(dataset):
|
|
|
|
if dataset in {'mnist', 'omniglot'}:
|
|
|
|
return 32
|
|
|
|
elif dataset == 'cifar10':
|
|
|
|
return 32
|
|
|
|
elif dataset.startswith('celeba') or dataset.startswith('imagenet') or dataset.startswith('lsun'):
|
|
|
|
size = int(dataset.split('_')[-1])
|
|
|
|
return size
|
|
|
|
elif dataset == 'ffhq':
|
|
|
|
return 256
|
|
|
|
elif dataset.startswith('shape'):
|
|
|
|
return 1 # 2048
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
def get_bpd_coeff(dataset):
|
|
|
|
n = num_output(dataset)
|
|
|
|
return 1. / np.log(2.) / n
|
|
|
|
|
|
|
|
|
|
|
|
def get_channel_multiplier(dataset, num_scales):
|
|
|
|
if dataset in {'cifar10', 'omniglot'}:
|
|
|
|
mult = (1, 1, 1)
|
|
|
|
elif dataset in {'celeba_256', 'ffhq', 'lsun_church_256'}:
|
|
|
|
if num_scales == 3:
|
|
|
|
mult = (1, 1, 1) # used for prior at 16
|
|
|
|
elif num_scales == 4:
|
|
|
|
mult = (1, 2, 2, 2) # used for prior at 32
|
|
|
|
elif num_scales == 5:
|
|
|
|
mult = (1, 1, 2, 2, 2) # used for prior at 64
|
|
|
|
elif dataset == 'mnist':
|
|
|
|
mult = (1, 1)
|
|
|
|
else:
|
|
|
|
mult = (1, 1)
|
|
|
|
# raise NotImplementedError
|
|
|
|
|
|
|
|
return mult
|
|
|
|
|
|
|
|
|
|
|
|
def get_attention_scales(dataset):
|
|
|
|
if dataset in {'cifar10', 'omniglot'}:
|
|
|
|
attn = (True, False, False)
|
|
|
|
elif dataset in {'celeba_256', 'ffhq', 'lsun_church_256'}:
|
|
|
|
# attn = (False, True, False, False) # used for 32
|
|
|
|
attn = (False, False, True, False, False) # used for 64
|
|
|
|
elif dataset == 'mnist':
|
|
|
|
attn = (True, False)
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
return attn
|
|
|
|
|
|
|
|
|
|
|
|
def change_bit_length(x, num_bits):
|
|
|
|
if num_bits != 8:
|
|
|
|
x = torch.floor(x * 255 / 2 ** (8 - num_bits))
|
|
|
|
x /= (2 ** num_bits - 1)
|
|
|
|
return x
|
|
|
|
|
|
|
|
|
|
|
|
def view4D(t, size, inplace=True):
|
|
|
|
"""
|
|
|
|
Equal to view(-1, 1, 1, 1).expand(size)
|
|
|
|
Designed because of this bug:
|
|
|
|
https://github.com/pytorch/pytorch/pull/48696
|
|
|
|
"""
|
|
|
|
if inplace:
|
|
|
|
return t.unsqueeze_(-1).unsqueeze_(-1).unsqueeze_(-1).expand(size)
|
|
|
|
else:
|
|
|
|
return t.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(size)
|
|
|
|
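# view4D broadcasts a per-sample 1D tensor to a 4D feature-map shape. Example (sketch):
#   t = torch.randn(8)                      # one value per batch element
#   t4 = view4D(t, (8, 3, 32, 32))          # -> shape (8, 3, 32, 32), values repeated over C, H, W
# Note that inplace=True uses unsqueeze_, which modifies t itself.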
|
|
|
|
|
|
|
|
def get_arch_cells(arch_type, use_se):
|
|
|
|
if arch_type == 'res_mbconv':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec'] = {
|
|
|
|
'conv_branch': ['mconv_e6k5g0'], 'se': use_se}
|
|
|
|
arch_cells['up_dec'] = {'conv_branch': ['mconv_e6k5g0'], 'se': use_se}
|
|
|
|
arch_cells['normal_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_post'] = {
|
|
|
|
'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['up_post'] = {'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['ar_nn'] = ['']
|
|
|
|
elif arch_type == 'res_bnswish':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['up_dec'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_post'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['up_post'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['ar_nn'] = ['']
|
|
|
|
elif arch_type == 'res_bnswish2':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['down_enc'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['up_dec'] = {'conv_branch': [
|
|
|
|
'res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['normal_pre'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['down_pre'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['normal_post'] = {
|
|
|
|
'conv_branch': ['res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['up_post'] = {'conv_branch': [
|
|
|
|
'res_bnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['ar_nn'] = ['']
|
|
|
|
elif arch_type == 'res_mbconv_attn':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish', ], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['down_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['normal_dec'] = {'conv_branch': [
|
|
|
|
'mconv_e6k5g0'], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['up_dec'] = {'conv_branch': [
|
|
|
|
'mconv_e6k5g0'], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['normal_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_post'] = {
|
|
|
|
'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['up_post'] = {'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['ar_nn'] = ['']
|
|
|
|
elif arch_type == 'res_mbconv_attn_half':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_enc'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec'] = {'conv_branch': [
|
|
|
|
'mconv_e6k5g0'], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['up_dec'] = {'conv_branch': [
|
|
|
|
'mconv_e6k5g0'], 'se': use_se, 'attn_type': 'attn'}
|
|
|
|
arch_cells['normal_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['down_pre'] = {'conv_branch': [
|
|
|
|
'res_bnswish', 'res_bnswish'], 'se': use_se}
|
|
|
|
arch_cells['normal_post'] = {
|
|
|
|
'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['up_post'] = {'conv_branch': ['mconv_e3k5g0'], 'se': use_se}
|
|
|
|
arch_cells['ar_nn'] = ['']
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
return arch_cells
|
|
|
|
|
|
|
|
|
|
|
|
def get_arch_cells_denoising(arch_type, use_se, apply_sqrt2):
|
|
|
|
if arch_type == 'res_mbconv':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc_diff'] = {
|
|
|
|
'conv_branch': ['mconv_e6k5g0_gn'], 'se': use_se}
|
|
|
|
arch_cells['down_enc_diff'] = {
|
|
|
|
'conv_branch': ['mconv_e6k5g0_gn'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec_diff'] = {
|
|
|
|
'conv_branch': ['mconv_e6k5g0_gn'], 'se': use_se}
|
|
|
|
arch_cells['up_dec_diff'] = {
|
|
|
|
'conv_branch': ['mconv_e6k5g0_gn'], 'se': use_se}
|
|
|
|
elif arch_type == 'res_ho':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['down_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['up_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
elif arch_type == 'res_ho_p1':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2_p1'], 'se': use_se}
|
|
|
|
arch_cells['down_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2_p1'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2_p1'], 'se': use_se}
|
|
|
|
arch_cells['up_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2_p1'], 'se': use_se}
|
|
|
|
elif arch_type == 'res_ho_attn':
|
|
|
|
arch_cells = dict()
|
|
|
|
arch_cells['normal_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['down_enc_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['normal_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
arch_cells['up_dec_diff'] = {
|
|
|
|
'conv_branch': ['res_gnswish_x2'], 'se': use_se}
|
|
|
|
else:
|
|
|
|
raise NotImplementedError
|
|
|
|
|
|
|
|
for k in arch_cells:
|
|
|
|
arch_cells[k]['apply_sqrt2'] = apply_sqrt2
|
|
|
|
|
|
|
|
return arch_cells
|
|
|
|
|
|
|
|
|
|
|
|
def groups_per_scale(num_scales, num_groups_per_scale):
|
|
|
|
g = []
|
|
|
|
n = num_groups_per_scale
|
|
|
|
for s in range(num_scales):
|
|
|
|
assert n >= 1
|
|
|
|
g.append(n)
|
|
|
|
return g
|
|
|
|
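# groups_per_scale simply repeats num_groups_per_scale for every scale (no per-scale decay in
# this modified copy). Example (sketch): groups_per_scale(3, 2) -> [2, 2, 2]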
|
|
|
|
|
|
|
|
#class PositionalEmbedding(nn.Module):
|
|
|
|
# def __init__(self, embedding_dim, scale):
|
|
|
|
# super(PositionalEmbedding, self).__init__()
|
|
|
|
# self.embedding_dim = embedding_dim
|
|
|
|
# self.scale = scale
|
|
|
|
#
|
|
|
|
# def forward(self, timesteps):
|
|
|
|
# assert len(timesteps.shape) == 1
|
|
|
|
# timesteps = timesteps * self.scale
|
|
|
|
# half_dim = self.embedding_dim // 2
|
|
|
|
# emb = math.log(10000) / (half_dim - 1)
|
|
|
|
# emb = torch.exp(torch.arange(half_dim) * -emb)
|
|
|
|
# emb = emb.to(device=timesteps.device)
|
|
|
|
# emb = timesteps[:, None] * emb[None, :]
|
|
|
|
# emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
|
|
|
# return emb
|
|
|
|
#
|
|
|
|
#
|
|
|
|
#class RandomFourierEmbedding(nn.Module):
|
|
|
|
# def __init__(self, embedding_dim, scale):
|
|
|
|
# super(RandomFourierEmbedding, self).__init__()
|
|
|
|
# self.w = nn.Parameter(torch.randn(
|
|
|
|
# size=(1, embedding_dim // 2)) * scale, requires_grad=False)
|
|
|
|
#
|
|
|
|
# def forward(self, timesteps):
|
|
|
|
# emb = torch.mm(timesteps[:, None], self.w * 2 * 3.14159265359)
|
|
|
|
# return torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
|
|
|
#
|
|
|
|
#
|
|
|
|
#def init_temb_fun(embedding_type, embedding_scale, embedding_dim):
|
|
|
|
# if embedding_type == 'positional':
|
|
|
|
# temb_fun = PositionalEmbedding(embedding_dim, embedding_scale)
|
|
|
|
# elif embedding_type == 'fourier':
|
|
|
|
# temb_fun = RandomFourierEmbedding(embedding_dim, embedding_scale)
|
|
|
|
# else:
|
|
|
|
# raise NotImplementedError
|
|
|
|
#
|
|
|
|
# return temb_fun
|
|
|
|
|
|
|
|
|
|
|
|
def symmetrize_image_data(images):
|
|
|
|
return 2.0 * images - 1.0
|
|
|
|
|
|
|
|
|
|
|
|
def unsymmetrize_image_data(images):
|
|
|
|
return (images + 1.) / 2.
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_symmetric(images):
|
|
|
|
"""
|
|
|
|
Normalize images by dividing the largest intensity. Used for visualizing the intermediate steps.
|
|
|
|
"""
|
|
|
|
b = images.shape[0]
|
|
|
|
m, _ = torch.max(torch.abs(images).view(b, -1), dim=1)
|
|
|
|
images /= (m.view(b, 1, 1, 1) + 1e-3)
|
|
|
|
|
|
|
|
return images
|
|
|
|
|
|
|
|
|
|
|
|
@torch.jit.script
|
|
|
|
def soft_clamp5(x: torch.Tensor):
|
|
|
|
# 5. * torch.tanh(x / 5.) <--> soft differentiable clamp between [-5, 5]
|
|
|
|
return x.div(5.).tanh_().mul(5.)
|
|
|
|
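# soft_clamp5 is a smooth, differentiable alternative to torch.clamp(x, -5, 5):
#   soft_clamp5(torch.tensor([0.0, 100.0]))  ->  approximately tensor([0.0, 5.0])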
|
|
|
|
|
|
|
|
@torch.jit.script
|
|
|
|
def soft_clamp(x: torch.Tensor, a: torch.Tensor):
|
|
|
|
return x.div(a).tanh_().mul(a)
|
|
|
|
|
|
|
|
|
|
|
|
class SoftClamp5(nn.Module):
|
|
|
|
def __init__(self):
|
|
|
|
super(SoftClamp5, self).__init__()
|
|
|
|
|
|
|
|
def forward(self, x):
|
|
|
|
return soft_clamp5(x)
|
|
|
|
|
|
|
|
|
|
|
|
def override_architecture_fields(args, stored_args, logging):
|
|
|
|
# list of architecture parameters used in NVAE:
|
|
|
|
architecture_fields = ['arch_instance', 'num_nf', 'num_latent_scales', 'num_groups_per_scale',
|
|
|
|
'num_latent_per_group', 'num_channels_enc', 'num_preprocess_blocks',
|
|
|
|
'num_preprocess_cells', 'num_cell_per_cond_enc', 'num_channels_dec',
|
|
|
|
'num_postprocess_blocks', 'num_postprocess_cells', 'num_cell_per_cond_dec',
|
|
|
|
'decoder_dist', 'num_x_bits', 'log_sig_q_scale', 'latent_grad_cutoff',
|
|
|
|
'progressive_output_vae', 'progressive_input_vae', 'channel_mult']
|
|
|
|
|
|
|
|
# backward compatibility
|
|
|
|
""" We have broken backward compatibility. No need to se these manually
|
|
|
|
if not hasattr(stored_args, 'log_sig_q_scale'):
|
|
|
|
logging.info('*** Setting %s manually ****', 'log_sig_q_scale')
|
|
|
|
setattr(stored_args, 'log_sig_q_scale', 5.)
|
|
|
|
|
|
|
|
if not hasattr(stored_args, 'latent_grad_cutoff'):
|
|
|
|
logging.info('*** Setting %s manually ****', 'latent_grad_cutoff')
|
|
|
|
setattr(stored_args, 'latent_grad_cutoff', 0.)
|
|
|
|
|
|
|
|
if not hasattr(stored_args, 'progressive_input_vae'):
|
|
|
|
logging.info('*** Setting %s manually ****', 'progressive_input_vae')
|
|
|
|
setattr(stored_args, 'progressive_input_vae', 'none')
|
|
|
|
|
|
|
|
if not hasattr(stored_args, 'progressive_output_vae'):
|
|
|
|
logging.info('*** Setting %s manually ****', 'progressive_output_vae')
|
|
|
|
setattr(stored_args, 'progressive_output_vae', 'none')
|
|
|
|
"""
|
|
|
|
|
|
|
|
for f in architecture_fields:
|
|
|
|
if not hasattr(args, f) or getattr(args, f) != getattr(stored_args, f):
|
|
|
|
logging.info('Setting %s from loaded checkpoint', f)
|
|
|
|
setattr(args, f, getattr(stored_args, f))
|
|
|
|
|
|
|
|
|
|
|
|
def init_processes(rank, size, fn, args, config):
|
|
|
|
""" Initialize the distributed environment. """
|
|
|
|
os.environ['MASTER_ADDR'] = args.master_address
|
|
|
|
os.environ['MASTER_PORT'] = '6020'
|
|
|
|
if args.num_proc_node == 1:
|
|
|
|
import socket
|
|
|
|
import errno
|
|
|
|
a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
|
|
for p in range(6010, 6030):
|
|
|
|
location = (args.master_address, p) # "127.0.0.1", p)
|
|
|
|
try:
|
|
|
|
a_socket.bind((args.master_address, p))
|
|
|
|
logger.debug('set port as {}', p)
|
|
|
|
os.environ['MASTER_PORT'] = '%d' % p
|
|
|
|
a_socket.close()
|
|
|
|
break
|
|
|
|
            except socket.error:
                pass  # port already in use; try the next one
|
|
|
|
# if e.errno == errno.EADDRINUSE:
|
|
|
|
# # logger.debug("Port {} is already in use", p)
|
|
|
|
# else:
|
|
|
|
# logger.debug(e)
|
|
|
|
|
|
|
|
logger.info('init_process: rank={}, world_size={}', rank, size)
|
|
|
|
torch.cuda.set_device(args.local_rank)
|
|
|
|
dist.init_process_group(
|
|
|
|
backend='nccl', init_method='env://', rank=rank, world_size=size)
|
|
|
|
fn(args, config)
|
|
|
|
logger.info('barrier: rank={}, world_size={}', rank, size)
|
|
|
|
dist.barrier()
|
|
|
|
logger.info('skip destroy_process_group: rank={}, world_size={}', rank, size)
|
|
|
|
# dist.destroy_process_group()
|
|
|
|
logger.info('skip destroy fini')
|
|
|
|
|
|
|
|
|
|
|
|
def sample_rademacher_like(y):
|
|
|
|
return torch.randint(low=0, high=2, size=y.shape, device='cuda') * 2 - 1
|
|
|
|
|
|
|
|
|
|
|
|
def sample_gaussian_like(y):
|
|
|
|
return torch.randn_like(y, device='cuda')
|
|
|
|
|
|
|
|
|
|
|
|
def trace_df_dx_hutchinson(f, x, noise, no_autograd):
|
|
|
|
"""
|
|
|
|
Hutchinson's trace estimator for Jacobian df/dx, O(1) call to autograd
|
|
|
|
"""
|
|
|
|
if no_autograd:
|
|
|
|
# the following is compatible with checkpointing
|
|
|
|
torch.sum(f * noise).backward()
|
|
|
|
# torch.autograd.backward(tensors=[f], grad_tensors=[noise])
|
|
|
|
jvp = x.grad
|
|
|
|
trJ = torch.sum(jvp * noise, dim=[1, 2, 3])
|
|
|
|
x.grad = None
|
|
|
|
else:
|
|
|
|
jvp = torch.autograd.grad(f, x, noise, create_graph=False)[0]
|
|
|
|
trJ = torch.sum(jvp * noise, dim=[1, 2, 3])
|
|
|
|
# trJ = torch.einsum('bijk,bijk->b', jvp, noise) # we could test if there's a speed difference in einsum vs sum
|
|
|
|
|
|
|
|
return trJ
|
|
|
|
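# Background for trace_df_dx_hutchinson: for noise eps with E[eps eps^T] = I (Gaussian or
# Rademacher), E[eps^T (df/dx) eps] = tr(df/dx), so a single vector-Jacobian product per
# sample gives an unbiased estimate of the trace without ever forming the Jacobian.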
|
|
|
|
|
|
|
|
def calc_jacobian_regularization(pred_params, eps_t, dae, var_t, m_t, f_t, g2_t, var_N_t, args):
|
|
|
|
"""
|
|
|
|
Calculates Jabobian regularization loss. For reference implementations, see
|
|
|
|
https://github.com/facebookresearch/jacobian_regularizer/blob/master/jacobian/jacobian.py or
|
|
|
|
https://github.com/cfinlay/ffjord-rnode/blob/master/lib/layers/odefunc.py.
|
|
|
|
"""
|
|
|
|
# eps_t_jvp = eps_t.detach()
|
|
|
|
# eps_t_jvp = eps_t.detach().requires_grad_()
|
|
|
|
if args.no_autograd_jvp:
|
|
|
|
raise NotImplementedError(
|
|
|
|
"We have not implemented no_autograd_jvp for jacobian reg.")
|
|
|
|
|
|
|
|
jvp_ode_func_norms = []
|
|
|
|
alpha = torch.sigmoid(dae.mixing_logit.detach())
|
|
|
|
for _ in range(args.jac_reg_samples):
|
|
|
|
noise = sample_gaussian_like(eps_t)
|
|
|
|
jvp = torch.autograd.grad(
|
|
|
|
pred_params, eps_t, noise, create_graph=True)[0]
|
|
|
|
|
|
|
|
if args.sde_type in ['geometric_sde', 'vpsde', 'power_vpsde']:
|
|
|
|
jvp_ode_func = alpha * (noise * torch.sqrt(var_t) - jvp)
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
jvp_ode_func = f_t / torch.sqrt(var_t) * jvp_ode_func
|
|
|
|
elif args.sde_type in ['sub_vpsde', 'sub_power_vpsde']:
|
|
|
|
sigma2_N_t = (1.0 - m_t ** 2) ** 2 + m_t ** 2
|
|
|
|
jvp_ode_func = noise * torch.sqrt(var_t) / (1.0 - m_t ** 4) - (
|
|
|
|
(1.0 - alpha) * noise * torch.sqrt(var_t) / sigma2_N_t + alpha * jvp)
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
jvp_ode_func = f_t * (1.0 - m_t ** 4) / \
|
|
|
|
torch.sqrt(var_t) * jvp_ode_func
|
|
|
|
elif args.sde_type in ['vesde']:
|
|
|
|
jvp_ode_func = (1.0 - alpha) * noise * \
|
|
|
|
torch.sqrt(var_t) / var_N_t + alpha * jvp
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
jvp_ode_func = 0.5 * g2_t / torch.sqrt(var_t) * jvp_ode_func
|
|
|
|
else:
|
|
|
|
raise ValueError("Unrecognized SDE type: {}".format(args.sde_type))
|
|
|
|
|
|
|
|
jvp_ode_func_norms.append(jvp_ode_func.view(
|
|
|
|
eps_t.size(0), -1).pow(2).sum(dim=1, keepdim=True))
|
|
|
|
|
|
|
|
jac_reg_loss = torch.cat(jvp_ode_func_norms, dim=1).mean()
|
|
|
|
# jac_reg_loss = torch.mean(jvp_ode_func.view(eps_t.size(0), -1).pow(2).sum(dim=1))
|
|
|
|
return jac_reg_loss
|
|
|
|
|
|
|
|
|
|
|
|
def calc_kinetic_regularization(pred_params, eps_t, dae, var_t, m_t, f_t, g2_t, var_N_t, args):
|
|
|
|
"""
|
|
|
|
Calculates kinetic regularization loss. For a reference implementation, see
|
|
|
|
https://github.com/cfinlay/ffjord-rnode/blob/master/lib/layers/wrappers/cnf_regularization.py
|
|
|
|
"""
|
|
|
|
# eps_t_kin = eps_t.detach()
|
|
|
|
|
|
|
|
alpha = torch.sigmoid(dae.mixing_logit.detach())
|
|
|
|
if args.sde_type in ['geometric_sde', 'vpsde', 'power_vpsde']:
|
|
|
|
ode_func = alpha * (eps_t * torch.sqrt(var_t) - pred_params)
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
ode_func = f_t / torch.sqrt(var_t) * ode_func
|
|
|
|
elif args.sde_type in ['sub_vpsde', 'sub_power_vpsde']:
|
|
|
|
sigma2_N_t = (1.0 - m_t ** 2) ** 2 + m_t ** 2
|
|
|
|
ode_func = eps_t * torch.sqrt(var_t) / (1.0 - m_t ** 4) - (
|
|
|
|
(1.0 - alpha) * eps_t * torch.sqrt(var_t) / sigma2_N_t + alpha * pred_params)
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
ode_func = f_t * (1.0 - m_t ** 4) / torch.sqrt(var_t) * ode_func
|
|
|
|
elif args.sde_type in ['vesde']:
|
|
|
|
ode_func = (1.0 - alpha) * eps_t * torch.sqrt(var_t) / \
|
|
|
|
var_N_t + alpha * pred_params
|
|
|
|
if not args.jac_kin_reg_drop_weights:
|
|
|
|
ode_func = 0.5 * g2_t / torch.sqrt(var_t) * ode_func
|
|
|
|
else:
|
|
|
|
raise ValueError("Unrecognized SDE type: {}".format(args.sde_type))
|
|
|
|
|
|
|
|
kin_reg_loss = torch.mean(ode_func.view(
|
|
|
|
eps_t.size(0), -1).pow(2).sum(dim=1))
|
|
|
|
return kin_reg_loss
|
|
|
|
|
|
|
|
|
|
|
|
def different_p_q_objectives(iw_sample_p, iw_sample_q):
|
|
|
|
assert iw_sample_p in ['ll_uniform', 'drop_all_uniform', 'll_iw', 'drop_all_iw', 'drop_sigma2t_iw', 'rescale_iw',
|
|
|
|
'drop_sigma2t_uniform']
|
|
|
|
assert iw_sample_q in ['reweight_p_samples', 'll_uniform', 'll_iw']
|
|
|
|
# Removed assert below. It may be stupid, but user can still do it. It may make sense for debugging purposes.
|
|
|
|
# assert iw_sample_p != iw_sample_q, 'It does not make sense to use the same objectives for p and q, but train ' \
|
|
|
|
# 'with separated q and p updates. To reuse the p objective for q, specify ' \
|
|
|
|
# '"reweight_p_samples" instead (for the ll-based objectives, the ' \
|
|
|
|
# 'reweighting factor will simply be 1.0 then)!'
|
|
|
|
# In these cases, we reuse the likelihood-based p-objective (either the uniform sampling version or the importance
|
|
|
|
# sampling version) also for q.
|
|
|
|
if iw_sample_p in ['ll_uniform', 'll_iw'] and iw_sample_q == 'reweight_p_samples':
|
|
|
|
return False
|
|
|
|
# In these cases, we are using a non-likelihood-based objective for p, and hence definitly need to use another q
|
|
|
|
# objective.
|
|
|
|
else:
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
def decoder_output(dataset, logits, fixed_log_scales=None):
|
|
|
|
if dataset in {'cifar10', 'celeba_64', 'celeba_256', 'imagenet_32', 'imagenet_64', 'ffhq',
|
|
|
|
'lsun_bedroom_128', 'lsun_bedroom_256', 'mnist', 'omniglot',
|
|
|
|
'lsun_church_256'}:
|
|
|
|
return PixelNormal(logits, fixed_log_scales)
|
|
|
|
else:
|
|
|
|
return PixelNormal(logits, fixed_log_scales)
|
|
|
|
# raise NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
def get_mixed_prediction(mixed_prediction, param, mixing_logit, mixing_component=None):
|
|
|
|
if mixed_prediction:
|
|
|
|
assert mixing_component is not None, 'Provide mixing component when mixed_prediction is enabled.'
|
|
|
|
coeff = torch.sigmoid(mixing_logit)
|
|
|
|
param = (1 - coeff) * mixing_component + coeff * param
|
|
|
|
|
|
|
|
return param
|
|
|
|
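# get_mixed_prediction blends the learned prediction with an analytic mixing component:
#   output = (1 - sigmoid(mixing_logit)) * mixing_component + sigmoid(mixing_logit) * param
# so mixing_logit -> -inf recovers the fixed component and mixing_logit -> +inf the raw network output.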
|
|
|
|
|
|
|
|
def set_vesde_sigma_max(args, vae, train_queue, logging, is_distributed):
|
|
|
|
logging.info('')
|
|
|
|
logging.info(
|
|
|
|
'Calculating max. pairwise distance in latent space to set sigma2_max for VESDE...')
|
|
|
|
|
|
|
|
eps_list = []
|
|
|
|
vae.eval()
|
|
|
|
for step, x in enumerate(train_queue):
|
|
|
|
x = x[0] if len(x) > 1 else x
|
|
|
|
x = x.cuda()
|
|
|
|
x = symmetrize_image_data(x)
|
|
|
|
|
|
|
|
# run vae
|
|
|
|
with autocast(enabled=args.autocast_train):
|
|
|
|
with torch.set_grad_enabled(False):
|
|
|
|
logits, all_log_q, all_eps = vae(x)
|
|
|
|
eps = torch.cat(all_eps, dim=1)
|
|
|
|
|
|
|
|
eps_list.append(eps.detach())
|
|
|
|
|
|
|
|
# if step > 5: ### DEBUG
|
|
|
|
# break ### DEBUG
|
|
|
|
|
|
|
|
# concat eps tensor on each GPU and then gather all on all GPUs
|
|
|
|
eps_this_rank = torch.cat(eps_list, dim=0)
|
|
|
|
if is_distributed:
|
|
|
|
eps_all_gathered = [torch.zeros_like(
|
|
|
|
eps_this_rank)] * dist.get_world_size()
|
|
|
|
dist.all_gather(eps_all_gathered, eps_this_rank)
|
|
|
|
eps_full = torch.cat(eps_all_gathered, dim=0)
|
|
|
|
else:
|
|
|
|
eps_full = eps_this_rank
|
|
|
|
|
|
|
|
# max pairwise distance squared between all latent encodings, is computed on CPU
|
|
|
|
eps_full = eps_full.cpu().float()
|
|
|
|
eps_full = eps_full.flatten(start_dim=1).unsqueeze(0)
|
|
|
|
max_pairwise_dist_sqr = torch.cdist(eps_full, eps_full).square().max()
|
|
|
|
max_pairwise_dist_sqr = max_pairwise_dist_sqr.cuda()
|
|
|
|
|
|
|
|
# to be safe, we broadcast to all GPUs if we are in distributed environment. Shouldn't be necessary in principle.
|
|
|
|
if is_distributed:
|
|
|
|
dist.broadcast(max_pairwise_dist_sqr, src=0)
|
|
|
|
|
|
|
|
args.sigma2_max = max_pairwise_dist_sqr.item()
|
|
|
|
|
|
|
|
logging.info('Done! Set args.sigma2_max set to {}'.format(args.sigma2_max))
|
|
|
|
logging.info('')
|
|
|
|
return args
|
|
|
|
|
|
|
|
|
|
|
|
def mask_inactive_variables(x, is_active):
|
|
|
|
x = x * is_active
|
|
|
|
return x
|
|
|
|
|
|
|
|
|
|
|
|
def common_x_operations(x, num_x_bits):
|
|
|
|
x = x[0] if len(x) > 1 else x
|
|
|
|
x = x.cuda()
|
|
|
|
|
|
|
|
# change bit length
|
|
|
|
x = change_bit_length(x, num_x_bits)
|
|
|
|
x = symmetrize_image_data(x)
|
|
|
|
|
|
|
|
return x
|
|
|
|
|
|
|
|
|
|
|
|
def vae_regularization(args, vae_sn_calculator, loss_weight=None):
|
|
|
|
"""
|
|
|
|
when using hvae_trainer, we pass args=None, and loss_weight value
|
|
|
|
"""
|
|
|
|
regularization_q, vae_norm_loss, vae_bn_loss, vae_wdn_coeff = 0., 0., 0., args.weight_decay_norm_vae if loss_weight is None else loss_weight
|
|
|
|
if loss_weight is not None or args.train_vae:
|
|
|
|
vae_norm_loss = vae_sn_calculator.spectral_norm_parallel()
|
|
|
|
vae_bn_loss = vae_sn_calculator.batchnorm_loss()
|
|
|
|
regularization_q = (vae_norm_loss + vae_bn_loss) * vae_wdn_coeff
|
|
|
|
|
|
|
|
return regularization_q, vae_norm_loss, vae_bn_loss, vae_wdn_coeff
|
|
|
|
|
|
|
|
|
|
|
|
def dae_regularization(args, dae_sn_calculator, diffusion, dae, step, t, pred_params_p, eps_t_p, var_t_p, m_t_p, g2_t_p):
|
|
|
|
dae_wdn_coeff = args.weight_decay_norm_dae
|
|
|
|
dae_norm_loss = dae_sn_calculator.spectral_norm_parallel()
|
|
|
|
dae_bn_loss = dae_sn_calculator.batchnorm_loss()
|
|
|
|
regularization_p = (dae_norm_loss + dae_bn_loss) * dae_wdn_coeff
|
|
|
|
|
|
|
|
# Jacobian regularization
|
|
|
|
jac_reg_loss = 0.
|
|
|
|
if args.jac_reg_coeff > 0.0 and step % args.jac_reg_freq == 0:
|
|
|
|
f_t = diffusion.f(t).view(-1, 1, 1, 1)
|
|
|
|
var_N_t = diffusion.var_N(
|
|
|
|
t).view(-1, 1, 1, 1) if args.sde_type == 'vesde' else None
|
|
|
|
"""
|
|
|
|
# Arash: Please remove the following if it looks correct to you, Karsten.
|
|
|
|
# jac_reg_loss = utils.calc_jacobian_regularization(pred_params, eps_t, dae, var_t, m_t, f_t, args)
|
|
|
|
if args.iw_sample_q in ['ll_uniform', 'll_iw']:
|
|
|
|
pred_params_jac_reg = torch.chunk(pred_params, chunks=2, dim=0)[0]
|
|
|
|
var_t_jac_reg, m_t_jac_reg, f_t_jac_reg = torch.chunk(var_t, chunks=2, dim=0)[0], \
|
|
|
|
torch.chunk(m_t, chunks=2, dim=0)[0], \
|
|
|
|
torch.chunk(f_t, chunks=2, dim=0)[0]
|
|
|
|
g2_t_jac_reg = torch.chunk(g2_t, chunks=2, dim=0)[0]
|
|
|
|
var_N_t_jac_reg = torch.chunk(var_N_t, chunks=2, dim=0)[0] if args.sde_type == 'vesde' else None
|
|
|
|
else:
|
|
|
|
pred_params_jac_reg = pred_params
|
|
|
|
var_t_jac_reg, m_t_jac_reg, f_t_jac_reg, g2_t_jac_reg, var_N_t_jac_reg = var_t, m_t, f_t, g2_t, var_N_t
|
|
|
|
jac_reg_loss = utils.calc_jacobian_regularization(pred_params_jac_reg, eps_t_p, dae, var_t_jac_reg, m_t_jac_reg,
|
|
|
|
f_t_jac_reg, g2_t_jac_reg, var_N_t_jac_reg, args)
|
|
|
|
"""
|
|
|
|
jac_reg_loss = calc_jacobian_regularization(pred_params_p, eps_t_p, dae, var_t_p, m_t_p,
|
|
|
|
f_t, g2_t_p, var_N_t, args)
|
|
|
|
regularization_p += args.jac_reg_coeff * jac_reg_loss
|
|
|
|
|
|
|
|
# Kinetic regularization
|
|
|
|
kin_reg_loss = 0.
|
|
|
|
if args.kin_reg_coeff > 0.0:
|
|
|
|
f_t = diffusion.f(t).view(-1, 1, 1, 1)
|
|
|
|
var_N_t = diffusion.var_N(
|
|
|
|
t).view(-1, 1, 1, 1) if args.sde_type == 'vesde' else None
|
|
|
|
"""
|
|
|
|
# Arash: Please remove the following if it looks correct to you, Karsten.
|
|
|
|
# kin_reg_loss = utils.calc_kinetic_regularization(pred_params, eps_t, dae, var_t, m_t, f_t, args)
|
|
|
|
if args.iw_sample_q in ['ll_uniform', 'll_iw']:
|
|
|
|
pred_params_kin_reg = torch.chunk(pred_params, chunks=2, dim=0)[0]
|
|
|
|
var_t_kin_reg, m_t_kin_reg, f_t_kin_reg = torch.chunk(var_t, chunks=2, dim=0)[0], \
|
|
|
|
torch.chunk(m_t, chunks=2, dim=0)[0], \
|
|
|
|
torch.chunk(f_t, chunks=2, dim=0)[0]
|
|
|
|
g2_t_kin_reg = torch.chunk(g2_t, chunks=2, dim=0)[0]
|
|
|
|
var_N_t_kin_reg = torch.chunk(var_N_t, chunks=2, dim=0)[0] if args.sde_type == 'vesde' else None
|
|
|
|
else:
|
|
|
|
pred_params_kin_reg = pred_params
|
|
|
|
var_t_kin_reg, m_t_kin_reg, f_t_kin_reg, g2_t_kin_reg, var_N_t_kin_reg = var_t, m_t, f_t, g2_t, var_N_t
|
|
|
|
kin_reg_loss = utils.calc_kinetic_regularization(pred_params_kin_reg, eps_t_p, dae, var_t_kin_reg, m_t_kin_reg,
|
|
|
|
f_t_kin_reg, g2_t_kin_reg, var_N_t_kin_reg, args)
|
|
|
|
"""
|
|
|
|
kin_reg_loss = calc_kinetic_regularization(pred_params_p, eps_t_p, dae, var_t_p, m_t_p,
|
|
|
|
f_t, g2_t_p, var_N_t, args)
|
|
|
|
regularization_p += args.kin_reg_coeff * kin_reg_loss
|
|
|
|
|
|
|
|
return regularization_p, dae_norm_loss, dae_bn_loss, dae_wdn_coeff, jac_reg_loss, kin_reg_loss
|
|
|
|
|
|
|
|
|
|
|
|
def update_vae_lr(args, global_step, warmup_iters, vae_optimizer):
|
|
|
|
if global_step < warmup_iters:
|
|
|
|
lr = args.trainer.opt.lr * float(global_step) / warmup_iters
|
|
|
|
for param_group in vae_optimizer.param_groups:
|
|
|
|
param_group['lr'] = lr
|
|
|
|
# use same lr if lr for local-dae is not specified
|
|
|
|
|
|
|
|
|
|
|
|
def update_lr(args, global_step, warmup_iters, dae_optimizer, vae_optimizer, dae_local_optimizer=None):
|
|
|
|
if global_step < warmup_iters:
|
|
|
|
lr = args.learning_rate_dae * float(global_step) / warmup_iters
|
|
|
|
if args.learning_rate_mlogit > 0 and len(dae_optimizer.param_groups) > 1:
|
|
|
|
lr_mlogit = args.learning_rate_mlogit * \
|
|
|
|
float(global_step) / warmup_iters
|
|
|
|
for i, param_group in enumerate(dae_optimizer.param_groups):
|
|
|
|
if i == 0:
|
|
|
|
param_group['lr'] = lr_mlogit
|
|
|
|
else:
|
|
|
|
param_group['lr'] = lr
|
|
|
|
else:
|
|
|
|
for param_group in dae_optimizer.param_groups:
|
|
|
|
param_group['lr'] = lr
|
|
|
|
# use same lr if lr for local-dae is not specified
|
|
|
|
lr = lr if args.learning_rate_dae_local <= 0 else args.learning_rate_dae_local * \
|
|
|
|
float(global_step) / warmup_iters
|
|
|
|
if dae_local_optimizer is not None:
|
|
|
|
for param_group in dae_local_optimizer.param_groups:
|
|
|
|
param_group['lr'] = lr
|
|
|
|
|
|
|
|
if args.train_vae:
|
|
|
|
lr = args.learning_rate_vae * float(global_step) / warmup_iters
|
|
|
|
for param_group in vae_optimizer.param_groups:
|
|
|
|
param_group['lr'] = lr
|
|
|
|
|
|
|
|
|
|
|
|
def start_meters():
|
|
|
|
tr_loss_meter = AvgrageMeter()
|
|
|
|
vae_recon_meter = AvgrageMeter()
|
|
|
|
vae_kl_meter = AvgrageMeter()
|
|
|
|
vae_nelbo_meter = AvgrageMeter()
|
|
|
|
kl_per_group_ema = AvgrageMeter()
|
|
|
|
return tr_loss_meter, vae_recon_meter, vae_kl_meter, vae_nelbo_meter, kl_per_group_ema
|
|
|
|
|
|
|
|
|
|
|
|
def epoch_logging(args, writer, step, vae_recon_meter, vae_kl_meter, vae_nelbo_meter, tr_loss_meter, kl_per_group_ema):
|
|
|
|
average_tensor(vae_recon_meter.avg, args.distributed)
|
|
|
|
average_tensor(vae_kl_meter.avg, args.distributed)
|
|
|
|
average_tensor(vae_nelbo_meter.avg, args.distributed)
|
|
|
|
average_tensor(tr_loss_meter.avg, args.distributed)
|
|
|
|
average_tensor(kl_per_group_ema.avg, args.distributed)
|
|
|
|
|
|
|
|
writer.add_scalar('epoch/vae_recon', vae_recon_meter.avg, step)
|
|
|
|
writer.add_scalar('epoch/vae_kl', vae_kl_meter.avg, step)
|
|
|
|
writer.add_scalar('epoch/vae_nelbo', vae_nelbo_meter.avg, step)
|
|
|
|
writer.add_scalar('epoch/total_loss', tr_loss_meter.avg, step)
|
|
|
|
# add kl value per group to tensorboard
|
|
|
|
for i in range(len(kl_per_group_ema.avg)):
|
|
|
|
writer.add_scalar('kl_value/group_%d' %
|
|
|
|
i, kl_per_group_ema.avg[i], step)
|
|
|
|
|
|
|
|
|
|
|
|
def infer_active_variables(train_queue, vae, args, device, distributed, max_iter=None):
|
|
|
|
kl_meter = AvgrageMeter()
|
|
|
|
vae.eval()
|
|
|
|
for step, x in enumerate(train_queue):
|
|
|
|
if max_iter is not None and step > max_iter:
|
|
|
|
break
|
|
|
|
tr_pts = x['tr_points']
|
|
|
|
with autocast(enabled=args.autocast_train):
|
|
|
|
# apply vae:
|
|
|
|
with torch.set_grad_enabled(False):
|
|
|
|
# output = model.recont(val_x) ## torch.cat([val_x, tr_x]))
|
|
|
|
                # renamed from `dist` to avoid shadowing torch.distributed (imported as dist above)
                q_dist = vae.encode(tr_pts.to(device))
                eps = q_dist.sample()[0]
                all_log_q = [q_dist.log_p(eps)]
|
|
|
|
## _, all_log_q, all_eps = vae(x)
|
|
|
|
## all_eps = vae.concat_eps_per_scale(all_eps)
|
|
|
|
## all_log_q = vae.concat_eps_per_scale(all_log_q)
|
|
|
|
all_eps = [eps]
|
|
|
|
|
|
|
|
def make_4d(xlist): return [
|
|
|
|
x.unsqueeze(-1).unsqueeze(-1) if len(x.shape) == 2 else x.unsqueeze(-1) for x in xlist]
|
|
|
|
|
|
|
|
log_q, log_p, kl_all, kl_diag = vae_terms(
|
|
|
|
make_4d(all_log_q), make_4d(all_eps))
|
|
|
|
kl_meter.update(kl_diag[0], 1) # only the top scale
|
|
|
|
average_tensor(kl_meter.avg, distributed)
|
|
|
|
return kl_meter.avg > 0.1
|