From 531d6956b5c8eb8c3bd217cd685f85d25a8fbd91 Mon Sep 17 00:00:00 2001
From: xzeng
Date: Thu, 16 Mar 2023 12:44:47 -0400
Subject: [PATCH] add comment and exp logger

---
 README.md                | 12 ++++++++++++
 trainers/base_trainer.py |  8 +++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 54747f4..3df927f 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,18 @@ run `python demo.py`, will load the released text2shape model on hugging face an
 * require the vae checkpoint
 * run `bash ./script/train_prior.sh $NGPU` (the released checkpoint is trained with `NGPU=8` with 2 node on V100)
 
+### (Optional) monitor exp
+* (tested) use comet-ml: add a `.comet_api` file under this `LION` folder; an example `.comet_api` file:
+```
+{"api_key": "...", "project_name": "lion", "workspace": "..."}
+```
+* (not tested) use wandb: add a `.wandb_api` file and set the env variable `export USE_WB=1` before training; an example `.wandb_api` file:
+```
+{"project": "...", "entity": "..."}
+```
+* (not tested) use tensorboard: set the env variable `export USE_TFB=1` before training
+* see `utils/utils.py` for the details of the experiment logger; I usually use comet-ml for my experiments
+
 ### evaluate a trained prior
 * download the test data (Table 1) from [here](https://drive.google.com/file/d/1uEp0o6UpRqfYwvRXQGZ5ZgT1IYBQvUSV/view?usp=share_link), unzip and put it as `./datasets/test_data/`
 * download the released checkpoint from above
diff --git a/trainers/base_trainer.py b/trainers/base_trainer.py
index 17d38a8..761c1a2 100644
--- a/trainers/base_trainer.py
+++ b/trainers/base_trainer.py
@@ -721,7 +721,7 @@ class BaseTrainer(ABC):
     def eval_nll(self, step, ntest=None, save_file=False):
         loss_dict = {}
         cfg = self.cfg
-        self.swap_vae_param_if_need()
+        self.swap_vae_param_if_need()  # if using EMA, load the EMA weights
         args = self.args
         device = torch.device('cuda:%d' % args.local_rank)
         tag = exp_helper.get_evalname(self.cfg)
@@ -752,6 +752,8 @@
             output = self.model.get_loss(
                 val_x, it=step, is_eval_nll=1, noisy_input=inputs,
                 **model_kwargs)
+
+            # book-keeping: collect the printable losses into loss_dict
             for k, v in output.items():
                 if 'print/' in k:
                     k = k.split('print/')[-1]
@@ -761,7 +763,7 @@
                     loss_dict[k].update(v)
 
             gen_x = output['final_pred']
-            if gen_x.shape[1] > val_x.shape[1]:
+            if gen_x.shape[1] > val_x.shape[1]:  # downsample points if needed
                 tr_idxs = np.random.permutation(np.arange(gen_x.shape[1]))[
                     :val_x.shape[1]]
                 gen_x = gen_x[:, tr_idxs]
@@ -813,7 +815,7 @@
                 self.writer.add_scalar('eval/%s' % (n), v, step)
             if 'CD' in n:
                 score = v
-        self.swap_vae_param_if_need()
+        self.swap_vae_param_if_need()  # if using EMA, swap back to the non-EMA weights here
         return score
 
     def prepare_clip_model_data(self):
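
For reference, a minimal sketch of how the `.comet_api` / `.wandb_api` files and the `USE_WB` / `USE_TFB` switches described in the README hunk above might be consumed. The names `create_exp_logger` and `_load_json`, and the tensorboard `log_dir` choice, are assumptions for illustration only; the repo's actual logger lives in `utils/utils.py`.

```
# Hedged sketch (assumed helper names, not the repo's actual code): build an
# experiment logger from the config files described in the README section.
import json
import os


def _load_json(path):
    # read a small JSON config file such as .comet_api or .wandb_api
    with open(path) as f:
        return json.load(f)


def create_exp_logger(root='.'):
    if os.environ.get('USE_WB', '0') == '1':
        # wandb (marked "not tested" in the README)
        import wandb
        cfg = _load_json(os.path.join(root, '.wandb_api'))  # {"project": ..., "entity": ...}
        return wandb.init(project=cfg['project'], entity=cfg['entity'])
    if os.environ.get('USE_TFB', '0') == '1':
        # tensorboard; the log_dir here is an assumption
        from torch.utils.tensorboard import SummaryWriter
        return SummaryWriter(log_dir=os.path.join(root, 'tb_logs'))
    # default: comet-ml, configured by the .comet_api JSON file
    from comet_ml import Experiment
    cfg = _load_json(os.path.join(root, '.comet_api'))
    return Experiment(api_key=cfg['api_key'],
                      project_name=cfg.get('project_name', 'lion'),
                      workspace=cfg['workspace'])
```

Usage would then be something like `logger = create_exp_logger()` once at the start of training, with the returned object handling all metric logging.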