xzeng 2023-01-23 00:14:49 -05:00
parent 1118b9b0c0
commit 1d24a7879d
134 changed files with 18308 additions and 10 deletions

.gitignore vendored Normal file

@@ -0,0 +1,14 @@
__pycache__/
*__pycache__
.idea/
*.pyc
*.m
.ipynb_checkpoints
*swp
*swo
*__pycache__*
models/pvcnn/functional/build/
*.sh
lion_ckpt
data/
datasets/test_data

README.md
@@ -1,20 +1,76 @@
## <p align="center">LION: Latent Point Diffusion Models for 3D Shape Generation<br><br> NeurIPS 2022 </p>
<div align="center">
<a href="https://www.cs.utoronto.ca/~xiaohui/" target="_blank">Xiaohui&nbsp;Zeng</a> &emsp; <b>&middot;</b> &emsp;
<a href="http://latentspace.cc/" target="_blank">Arash&nbsp;Vahdat</a> &emsp; <b>&middot;</b> &emsp;
<a href="https://www.fwilliams.info/" target="_blank">Francis&nbsp;Williams</a> &emsp; <b>&middot;</b> &emsp;
<a href="https://zgojcic.github.io/" target="_blank">Zan&nbsp;Gojcic</a> &emsp; <b>&middot;</b> &emsp;
<a href="https://orlitany.github.io/" target="_blank">Or&nbsp;Litany</a> &emsp; <b>&middot;</b> &emsp;
<a href="https://www.cs.utoronto.ca/~fidler/" target="_blank">Sanja&nbsp;Fidler</a> &emsp; <b>&middot;</b> &emsp;
<a href="https://karstenkreis.github.io/" target="_blank">Karsten&nbsp;Kreis</a>
<br> <br>
<a href="https://arxiv.org/abs/2210.06978" target="_blank">Paper</a> &emsp;
<a href="https://nv-tlabs.github.io/LION" target="_blank">Project&nbsp;Page</a>
</div>
<br><br>
<p align="center">
<img width="750" alt="Animation" src="assets/animation.gif"/>
</p>
## Install
* Dependencies:
    * CUDA 11.6
* Set up the environment by installing from the conda file:
```
conda env create --name lion_env --file=env.yaml
conda activate lion_env
# Install some other packages
pip install git+https://github.com/openai/CLIP.git
# build some packages first (optional)
python build_pkg.py
```
Tested with conda version 22.9.0
## Demo
Run `python demo.py`; it will load the released text2shape model from Hugging Face and generate a chair point cloud.
## Released checkpoint and samples
* will be released soon
* put the downloaded files under `./lion_ckpt/`
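Based on the paths used by `demo.py` and the evaluation command below, the expected layout is roughly as follows (the exact contents of the release may differ):
```
lion_ckpt/
├── text2shape/chair/
│   ├── cfg.yml
│   └── checkpoints/model.pt
└── unconditional/airplane/
    └── checkpoints/model.pt
```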
## Training
### data
* ShapeNet can be downloaded [here](https://github.com/stevenygd/PointFlow#dataset).
* Put the downloaded data as `./data/ShapeNetCore.v2.PC15k` *or* edit the `pointflow` entry in `./datasets/data_path.py` for the ShapeNet dataset path.
### train VAE
* run `bash ./script/train_vae.sh $NGPU` (the released checkpoint is trained with `NGPU=4`)
### train diffusion prior
* requires the VAE checkpoint
* run `bash ./script/train_prior.sh $NGPU` (the released checkpoint is trained with `NGPU=8` on 2 nodes)
### evaluate a trained prior
* download the test data from [here](https://drive.google.com/file/d/1uEp0o6UpRqfYwvRXQGZ5ZgT1IYBQvUSV/view?usp=share_link), unzip and put it as `./datasets/test_data/`
* download the released checkpoint from above
```
checkpoint="./lion_ckpt/unconditional/airplane/checkpoints/model.pt"
bash ./script/eval.sh $checkpoint # will take 1-2 hours
```
## Evaluate the samples with the 1-NNA metrics
* download the test data from [here](https://drive.google.com/file/d/1uEp0o6UpRqfYwvRXQGZ5ZgT1IYBQvUSV/view?usp=share_link), unzip and put it as `./datasets/test_data/`
* run `python ./script/compute_score.py`
## Citation
```
@inproceedings{zeng2022lion,
title={LION: Latent Point Diffusion Models for 3D Shape Generation},
author={Xiaohui Zeng and Arash Vahdat and Francis Williams and Zan Gojcic and Or Litany and Sanja Fidler and Karsten Kreis},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
```

build_pkg.py Normal file

@@ -0,0 +1,3 @@
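# Importing these modules triggers compilation of the repo's custom extensions
# (e.g. the PVCNN ops built under models/pvcnn/functional/build/), so running
# this file once pre-builds them; this matches the README's optional
# `python build_pkg.py` step.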
import clip
from models import pvcnn2
from utils import eval_helper

datasets/data_path.py Normal file

@@ -0,0 +1,38 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import os
def get_path(dataname=None):
dataset_path = {}
dataset_path['pointflow'] = [
'./data/ShapeNetCore.v2.PC15k/'
]
if dataname is None:
return dataset_path
else:
assert(
dataname in dataset_path), f'not found {dataname}, only: {list(dataset_path.keys())}'
for p in dataset_path[dataname]:
print(f'searching: {dataname}, get: {p}')
if os.path.exists(p):
return p
raise ValueError(
f'no path found for {dataname}; please double check: {dataset_path[dataname]}, or edit datasets/data_path.py')
def get_cache_path():
cache_list = ['/workspace/data_cache_local/data_stat/',
'/workspace/data_cache/data_stat/']
for p in cache_list:
if os.path.exists(p):
return p
raise ValueError(
f'no cache path found; please double check: {cache_list}, or edit datasets/data_path.py')
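# Usage sketch (assumes the repo root as working directory and the data placed
# as described in the README):
#   from datasets.data_path import get_path
#   root = get_path('pointflow')  # -> './data/ShapeNetCore.v2.PC15k/' if it exists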

datasets/pointflow_datasets.py Normal file

@@ -0,0 +1,404 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
""" copied and modified from https://github.com/stevenygd/PointFlow/blob/master/datasets.py """
import os
import open3d as o3d
import time
import torch
import numpy as np
from loguru import logger
from torch.utils.data import Dataset
from torch.utils import data
import random
import tqdm
from datasets.data_path import get_path
OVERFIT = 0
# taken from https://github.com/optas/latent_3d_points/blob/
# 8e8f29f8124ed5fc59439e8551ba7ef7567c9a37/src/in_out.py
synsetid_to_cate = {
'02691156': 'airplane',
'02773838': 'bag',
'02801938': 'basket',
'02808440': 'bathtub',
'02818832': 'bed',
'02828884': 'bench',
'02876657': 'bottle',
'02880940': 'bowl',
'02924116': 'bus',
'02933112': 'cabinet',
'02747177': 'can',
'02942699': 'camera',
'02954340': 'cap',
'02958343': 'car',
'03001627': 'chair',
'03046257': 'clock',
'03207941': 'dishwasher',
'03211117': 'monitor',
'04379243': 'table',
'04401088': 'telephone',
'02946921': 'tin_can',
'04460130': 'tower',
'04468005': 'train',
'03085013': 'keyboard',
'03261776': 'earphone',
'03325088': 'faucet',
'03337140': 'file',
'03467517': 'guitar',
'03513137': 'helmet',
'03593526': 'jar',
'03624134': 'knife',
'03636649': 'lamp',
'03642806': 'laptop',
'03691459': 'speaker',
'03710193': 'mailbox',
'03759954': 'microphone',
'03761084': 'microwave',
'03790512': 'motorcycle',
'03797390': 'mug',
'03928116': 'piano',
'03938244': 'pillow',
'03948459': 'pistol',
'03991062': 'pot',
'04004475': 'printer',
'04074963': 'remote_control',
'04090263': 'rifle',
'04099429': 'rocket',
'04225987': 'skateboard',
'04256520': 'sofa',
'04330267': 'stove',
'04530566': 'vessel',
'04554684': 'washer',
'02992529': 'cellphone',
'02843684': 'birdhouse',
'02871439': 'bookshelf',
# '02858304': 'boat', no boat in our dataset, merged into vessels
# '02834778': 'bicycle', not in our taxonomy
}
cate_to_synsetid = {v: k for k, v in synsetid_to_cate.items()}
class ShapeNet15kPointClouds(Dataset):
def __init__(self,
categories=['airplane'],
tr_sample_size=10000,
te_sample_size=10000,
split='train',
scale=1.,
normalize_per_shape=False,
normalize_shape_box=False,
random_subsample=False,
sample_with_replacement=1,
normalize_std_per_axis=False,
normalize_global=False,
recenter_per_shape=False,
all_points_mean=None,
all_points_std=None,
input_dim=3,
):
self.normalize_shape_box = normalize_shape_box
root_dir = get_path('pointflow')
self.root_dir = root_dir
logger.info('[DATA] cat: {}, split: {}, full path: {}; norm global={}, norm-box={}',
categories, split, self.root_dir, normalize_global, normalize_shape_box)
self.split = split
assert self.split in ['train', 'test', 'val']
self.tr_sample_size = tr_sample_size
self.te_sample_size = te_sample_size
if type(categories) is str:
categories = [categories]
self.cates = categories
if 'all' in categories:
self.synset_ids = list(cate_to_synsetid.values())
else:
self.synset_ids = [cate_to_synsetid[c] for c in self.cates]
subdirs = self.synset_ids
# assert 'v2' in root_dir, "Only supporting v2 right now."
self.gravity_axis = 1
self.display_axis_order = [0, 2, 1]
self.root_dir = root_dir
self.split = split
self.in_tr_sample_size = tr_sample_size
self.in_te_sample_size = te_sample_size
self.subdirs = subdirs
self.scale = scale
self.random_subsample = random_subsample
self.sample_with_replacement = sample_with_replacement
self.input_dim = input_dim
self.all_cate_mids = []
self.cate_idx_lst = []
self.all_points = []
tic = time.time()
for cate_idx, subd in enumerate(self.subdirs):
# NOTE: [subd] here is synset id
sub_path = os.path.join(root_dir, subd, self.split)
if not os.path.isdir(sub_path):
print("Directory missing: %s" % sub_path)
raise ValueError('check the data path')
if True:
all_mids = []
assert(os.path.exists(sub_path)), f'path missing: {sub_path}'
for x in os.listdir(sub_path):
if not x.endswith('.npy'):
continue
all_mids.append(os.path.join(self.split, x[:-len('.npy')]))
logger.info('[DATA] number of files [{}] under: {} ',
len(os.listdir(sub_path)), sub_path)
# NOTE: [mid] contains the split: i.e. "train/<mid>"
# or "val/<mid>" or "test/<mid>"
all_mids = sorted(all_mids)
for mid in all_mids:
# obj_fname = os.path.join(sub_path, x)
obj_fname = os.path.join(root_dir, subd, mid + ".npy")
point_cloud = np.load(obj_fname) # (15k, 3)
self.all_points.append(point_cloud[np.newaxis, ...])
self.cate_idx_lst.append(cate_idx)
self.all_cate_mids.append((subd, mid))
logger.info('[DATA] Load data time: {:.1f}s | dir: {} | '
'sample_with_replacement: {}; num points: {}', time.time() - tic, self.subdirs,
self.sample_with_replacement, len(self.all_points))
# Shuffle the index deterministically (based on the number of examples)
self.shuffle_idx = list(range(len(self.all_points)))
random.Random(38383).shuffle(self.shuffle_idx)
self.cate_idx_lst = [self.cate_idx_lst[i] for i in self.shuffle_idx]
self.all_points = [self.all_points[i] for i in self.shuffle_idx]
self.all_cate_mids = [self.all_cate_mids[i] for i in self.shuffle_idx]
# Normalization
self.all_points = np.concatenate(self.all_points) # (N, 15000, 3)
self.normalize_per_shape = normalize_per_shape
self.normalize_std_per_axis = normalize_std_per_axis
self.recenter_per_shape = recenter_per_shape
if self.normalize_shape_box: # per shape normalization
B, N = self.all_points.shape[:2]
self.all_points_mean = ( # B,1,3
(np.amax(self.all_points, axis=1)).reshape(B, 1, input_dim) +
(np.amin(self.all_points, axis=1)).reshape(B, 1, input_dim)) / 2
self.all_points_std = np.amax( # B,1,1
((np.amax(self.all_points, axis=1)).reshape(B, 1, input_dim) -
(np.amin(self.all_points, axis=1)).reshape(B, 1, input_dim)),
axis=-1).reshape(B, 1, 1) / 2
elif self.normalize_per_shape: # per shape normalization
B, N = self.all_points.shape[:2]
self.all_points_mean = self.all_points.mean(axis=1).reshape(
B, 1, input_dim)
logger.info('all_points shape: {}. mean over axis=1',
self.all_points.shape)
if normalize_std_per_axis:
self.all_points_std = self.all_points.reshape(
B, N, -1).std(axis=1).reshape(B, 1, input_dim)
else:
self.all_points_std = self.all_points.reshape(
B, -1).std(axis=1).reshape(B, 1, 1)
elif all_points_mean is not None and all_points_std is not None and not self.recenter_per_shape:
# using loaded dataset stats
self.all_points_mean = all_points_mean
self.all_points_std = all_points_std
elif self.recenter_per_shape: # per shape center
# TODO: bounding box scale at the large dim and center
B, N = self.all_points.shape[:2]
self.all_points_mean = (
(np.amax(self.all_points, axis=1)).reshape(B, 1, input_dim) +
(np.amin(self.all_points, axis=1)).reshape(B, 1,
input_dim)) / 2
self.all_points_std = np.amax(
((np.amax(self.all_points, axis=1)).reshape(B, 1, input_dim) -
(np.amin(self.all_points, axis=1)).reshape(B, 1, input_dim)),
axis=-1).reshape(B, 1, 1) / 2
# else: # normalize across the dataset
elif normalize_global: # normalize across the dataset
self.all_points_mean = self.all_points.reshape(
-1, input_dim).mean(axis=0).reshape(1, 1, input_dim)
if normalize_std_per_axis:
self.all_points_std = self.all_points.reshape(
-1, input_dim).std(axis=0).reshape(1, 1, input_dim)
else:
self.all_points_std = self.all_points.reshape(-1).std(
axis=0).reshape(1, 1, 1)
logger.info('[DATA] normalize_global: mean={}, std={}',
self.all_points_mean.reshape(-1),
self.all_points_std.reshape(-1))
else:
raise NotImplementedError('No Normalization')
self.all_points = (self.all_points - self.all_points_mean) / \
self.all_points_std
logger.info('[DATA] shape={}, all_points_mean:={}, std={}, max={:.3f}, min={:.3f}; num-pts={}',
self.all_points.shape,
self.all_points_mean.shape, self.all_points_std.shape,
self.all_points.max(), self.all_points.min(), tr_sample_size)
if OVERFIT:
self.all_points = self.all_points[:40]
# TODO: why do we need this??
self.train_points = self.all_points[:, :min(
10000, self.all_points.shape[1])]
self.tr_sample_size = min(10000, tr_sample_size)
self.te_sample_size = min(5000, te_sample_size)
assert self.scale == 1, "Scale (!= 1) is deprecated"
# Default display axis order
self.display_axis_order = [0, 1, 2]
def get_pc_stats(self, idx):
if self.recenter_per_shape:
m = self.all_points_mean[idx].reshape(1, self.input_dim)
s = self.all_points_std[idx].reshape(1, -1)
return m, s
if self.normalize_per_shape or self.normalize_shape_box:
m = self.all_points_mean[idx].reshape(1, self.input_dim)
s = self.all_points_std[idx].reshape(1, -1)
return m, s
return self.all_points_mean.reshape(1, -1), \
self.all_points_std.reshape(1, -1)
def renormalize(self, mean, std):
self.all_points = self.all_points * self.all_points_std + \
self.all_points_mean
self.all_points_mean = mean
self.all_points_std = std
self.all_points = (self.all_points - self.all_points_mean) / \
self.all_points_std
self.train_points = self.all_points[:, :min(
10000, self.all_points.shape[1])]
## self.test_points = self.all_points[:, 10000:]
def __len__(self):
return len(self.train_points)
def __getitem__(self, idx):
output = {}
tr_out = self.train_points[idx]
if self.random_subsample and self.sample_with_replacement:
tr_idxs = np.random.choice(tr_out.shape[0], self.tr_sample_size)
elif self.random_subsample and not self.sample_with_replacement:
tr_idxs = np.random.permutation(
np.arange(tr_out.shape[0]))[:self.tr_sample_size]
else:
tr_idxs = np.arange(self.tr_sample_size)
tr_out = torch.from_numpy(tr_out[tr_idxs, :]).float()
m, s = self.get_pc_stats(idx)
cate_idx = self.cate_idx_lst[idx]
sid, mid = self.all_cate_mids[idx]
input_pts = tr_out
output.update(
{
'idx': idx,
'select_idx': tr_idxs,
'tr_points': tr_out,
'input_pts': input_pts,
'mean': m,
'std': s,
'cate_idx': cate_idx,
'sid': sid,
'mid': mid,
'display_axis_order': self.display_axis_order
})
return output
def init_np_seed(worker_id):
seed = torch.initial_seed()
np.random.seed(seed % 4294967296)
def get_datasets(cfg, args):
"""
cfg: config.data sub part
"""
if OVERFIT:
random_subsample = 0
else:
random_subsample = cfg.random_subsample
logger.info(f'get_datasets: tr_sample_size={cfg.tr_max_sample_points}, '
f' te_sample_size={cfg.te_max_sample_points}; '
f' random_subsample={random_subsample}'
f' normalize_global={cfg.normalize_global}'
f' normalize_std_per_axis={cfg.normalize_std_per_axis}'
f' normalize_per_shape={cfg.normalize_per_shape}'
f' recenter_per_shape={cfg.recenter_per_shape}'
)
kwargs = {}
tr_dataset = ShapeNet15kPointClouds(
categories=cfg.cates,
split='train',
tr_sample_size=cfg.tr_max_sample_points,
te_sample_size=cfg.te_max_sample_points,
sample_with_replacement=cfg.sample_with_replacement,
scale=cfg.dataset_scale, # root_dir=cfg.data_dir,
normalize_shape_box=cfg.normalize_shape_box,
normalize_per_shape=cfg.normalize_per_shape,
normalize_std_per_axis=cfg.normalize_std_per_axis,
normalize_global=cfg.normalize_global,
recenter_per_shape=cfg.recenter_per_shape,
random_subsample=random_subsample,
**kwargs)
eval_split = getattr(args, "eval_split", "val")
# te_dataset has random_subsample as False, therefore not using sample_with_replacement
te_dataset = ShapeNet15kPointClouds(
categories=cfg.cates,
split=eval_split,
tr_sample_size=cfg.tr_max_sample_points,
te_sample_size=cfg.te_max_sample_points,
scale=cfg.dataset_scale, # root_dir=cfg.data_dir,
normalize_shape_box=cfg.normalize_shape_box,
normalize_per_shape=cfg.normalize_per_shape,
normalize_std_per_axis=cfg.normalize_std_per_axis,
normalize_global=cfg.normalize_global,
recenter_per_shape=cfg.recenter_per_shape,
all_points_mean=tr_dataset.all_points_mean,
all_points_std=tr_dataset.all_points_std,
)
return tr_dataset, te_dataset
def get_data_loaders(cfg, args):
tr_dataset, te_dataset = get_datasets(cfg, args)
kwargs = {}
if args.distributed:
kwargs['sampler'] = data.distributed.DistributedSampler(
tr_dataset, shuffle=True)
else:
kwargs['shuffle'] = True
if args.eval_trainnll:
kwargs['shuffle'] = False
train_loader = data.DataLoader(dataset=tr_dataset,
batch_size=cfg.batch_size,
num_workers=cfg.num_workers,
drop_last=cfg.train_drop_last == 1,
pin_memory=False, **kwargs)
test_loader = data.DataLoader(dataset=te_dataset,
batch_size=cfg.batch_size_test,
shuffle=False,
num_workers=cfg.num_workers,
pin_memory=False,
drop_last=False,
)
logger.info(
f'[Batch Size] train={cfg.batch_size}, test={cfg.batch_size_test}; drop-last={cfg.train_drop_last}')
loaders = {
"test_loader": test_loader,
'train_loader': train_loader,
}
return loaders
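# Usage sketch (assumed wiring): cfg is the `data` sub-config from
# default_config.py, and args must provide .distributed and .eval_trainnll
# (plus optionally .eval_split):
#   from default_config import cfg
#   loaders = get_data_loaders(cfg.data, args)
#   batch = next(iter(loaders['train_loader']))
#   batch['input_pts']  # (B, tr_max_sample_points, 3) normalized points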

default_config.py Normal file

@@ -0,0 +1,450 @@
# ---------------------------------------------------------------
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
# ---------------------------------------------------------------
from third_party.yacs_config import CfgNode as CN
cfg = CN()
cfg.dpm_ckpt = ''
cfg.clipforge = CN()
cfg.clipforge.clip_model = "ViT-B/32"
cfg.clipforge.enable = 0
cfg.clipforge.feat_dim = 512
cfg.eval_trainnll = 0
cfg.exp_name = ''
cfg.cmt = ''
cfg.hash = ''
cfg.ngpu = 1
cfg.snapshot_min = 30 # snapshot every 30 min
cfg.bash_name = ''
cfg.set_detect_anomaly = 0
cfg.weight_recont = 1.0
# vae ckpt
# lns
cfg.use_checkpoint = 0
cfg.num_val_samples = 16 # 24 #12
# config for pointtransformer
cfg.eval = CN()
cfg.eval.need_denoise = 0
cfg.eval.load_other_vae_ckpt = 0
cfg.register_deprecated_key('eval.other_vae_ckpt_path')
cfg.vis_latent_point = 0
cfg.latent_pts = CN()
#cfg.latent_pts.class_embed_layer = ''
cfg.register_deprecated_key('latent_pts.class_embed_layer')
cfg.latent_pts.style_dim = 128 # dim of global style latent variable
cfg.register_deprecated_key('latent_pts.perturb_input')
cfg.register_deprecated_key('latent_pts.perturb_input_scale')
cfg.register_deprecated_key('latent_pts.outlier_input')
# scale of init weights for the mlp in adaGN layer
cfg.latent_pts.ada_mlp_init_scale = 1.0
# models.latent_points_ada.StyleMLP' # style mlp layers
cfg.latent_pts.style_mlp = ''
cfg.latent_pts.pts_sigma_offset = 0.0
cfg.latent_pts.skip_weight = 0.1
cfg.latent_pts.encoder_layer_out_dim = 32
cfg.latent_pts.decoder_layer_out_dim = 32
cfg.register_deprecated_key('latent_pts.encoder_nneighbor')
cfg.register_deprecated_key('latent_pts.decoder_nneighbor')
cfg.latent_pts.style_prior = 'models.score_sde.resnet.PriorSEDrop'
cfg.latent_pts.mask_out_extra_latent = 0 # use only latent coordinates
# latent coordinates directly same as input (not using the decoder and encoder)
cfg.register_deprecated_key('latent_pts.latent_as_pts')
cfg.latent_pts.normalization = 'bn' # BatchNorm or LayerNorm
cfg.latent_pts.pvd_mse_loss = 0
cfg.latent_pts.hid = 64
cfg.register_deprecated_key('latent_pts.knn')
cfg.register_deprecated_key('latent_pts.n5layer')
cfg.register_deprecated_key('latent_pts.dgcnn_last_hid')
cfg.latent_pts.latent_dim_ext = [64] # the global latent dim
cfg.latent_pts.weight_kl_pt = 1.0 # kl ratio of the pts
cfg.latent_pts.weight_kl_feat = 1.0 # kl ratio of the latent feat
cfg.latent_pts.weight_kl_glb = 1.0 # kl ratio of the latent feat
# kl ratio of the latent feat
cfg.latent_pts.style_encoder = 'models.shapelatent_modules.PointNetPlusEncoder'
cfg.latent_pts.use_linear_for_adagn = 0
# cfg.latent_pts.weight_kl_glb = 1.0 # kl ratio of the global latent
# shapelatent:
cfg.has_shapelatent = 1
cfg.shapelatent = CN()
cfg.shapelatent.local_emb_agg = 'mean'
cfg.shapelatent.freeze_vae = 0 # learn vae
cfg.shapelatent.eps_z_global_only = 1
cfg.shapelatent.model = 'flow'
cfg.shapelatent.residual = 1
cfg.shapelatent.encoder_type = 'pointnet'
cfg.shapelatent.prior_type = 'flow'
cfg.shapelatent.decoder_type = 'PointwiseNet'
cfg.shapelatent.loss0_weight = 1.0
cfg.shapelatent.latent_dim = 256
cfg.shapelatent.kl_weight = 1e-3
cfg.shapelatent.decoder_num_points = -1
# offset the sigma towards zero for better init, will use the log_sigma - offset value, better to be positive s.t. - offset < 0 since we'd like to push it towards 0; exp(-0.1)=0.9, exp(-0.8)=0.44, exp(-1)=0.3, exp(-10)=4e-5
cfg.shapelatent.log_sigma_offset = 0.0
cfg.sde = CN()
cfg.sde.ode_sample = 0 #1
# train the prior or not, default is 1, only when we do voxel2pts, will freeze prior
cfg.sde.train_dae = 1
cfg.sde.init_t = 1.0 # start from time = 1.0
cfg.sde.nhead = 4 # number of heads in transformer: multi-head attention layer
cfg.sde.local_prior = 'same_as_global' # architecture for local prior
cfg.sde.drop_inactive_var = 0
cfg.sde.learn_mixing_logit = 1 # freeze it
cfg.sde.regularize_mlogit_margin = 0.0
cfg.sde.share_mlogit = 0 # use same mlogit for all latent variables
cfg.sde.hypara_mixing_logit = 0 # set as hyper-parameter and freeze it?
cfg.sde.bound_mlogit = 0 # clamp or not
cfg.sde.bound_mlogit_value = -5.42 # clamp the max value
cfg.sde.regularize_mlogit = 0 # set the sum of sigmoid(mlogit) as one loss
cfg.sde.attn_mhead = 0 # use multi-head attention in prior model
cfg.sde.attn_mhead_local = -1 # use multi-head attention in prior model
cfg.sde.pos_embed = 'none'
cfg.sde.hier_prior = 0
cfg.sde.is_continues = 0
cfg.sde.time_emb_scales = 1.0 # -> 1k?
cfg.sde.time_eps = 1e-2
cfg.sde.ode_eps = 1e-5 # cut off for ode sampling
cfg.sde.sde_type = 'vpsde' # vada
cfg.sde.sigma2_0 = 0.0
cfg.sde.sigma2_max = 0.99
cfg.sde.sigma2_min = 1e-4
cfg.sde.beta_start = 0.1 # 1e-4 * 1e3
cfg.sde.beta_end = 20.0 # 1e-2 * 1e3
# sampling, always iw # ll: small times; 'll_uniform' # -> ll_iw
cfg.sde.iw_sample_p = 'll_iw'
# drop_all_iw / drop_sigma2t_iw
cfg.sde.iw_subvp_like_vp_sde = False
cfg.sde.prior_model = 'models.latent_points_ada_localprior.PVCNN2Prior'
# -- to train diffusion in latent space -- #
cfg.sde.update_q_ema = False
cfg.sde.iw_sample_q = 'reweight_p_samples'
# ll_iw / reweight_p_samples
cfg.sde.kl_anneal_portion_vada = 0.1
cfg.sde.kl_const_portion_vada = 0.0
cfg.sde.kl_const_coeff_vada = 0.7
cfg.sde.kl_balance_vada = False
cfg.sde.grad_clip_max_norm = 0.0
cfg.sde.cont_kl_anneal = True
# False
cfg.sde.mixing_logit_init = -6
cfg.sde.weight_decay_norm_vae = 0.0 #1e-2
cfg.sde.weight_decay_norm_dae = 0.0 #1e-2
# -> 0, for sn calculator
cfg.sde.train_vae = True
cfg.sde.jac_reg_coeff = 0
cfg.sde.jac_reg_freq = 1
cfg.sde.kin_reg_coeff = 0
cfg.sde.learning_rate_mlogit = -1.0
cfg.sde.learning_rate_dae_local = 3e-4
cfg.sde.learning_rate_min_dae_local = 3e-4
cfg.sde.learning_rate_dae = 3e-4
cfg.sde.learning_rate_min_dae = 3e-4
cfg.sde.learning_rate_min_vae = 1e-5
cfg.sde.learning_rate_vae = 1e-4
cfg.sde.epochs = 800
cfg.sde.warmup_epochs = 20
cfg.sde.weight_decay = 3e-4
cfg.sde.use_adamax = False
cfg.sde.use_adam = True # False
cfg.sde.mixed_prediction = False # True
cfg.sde.vae_checkpoint = ''
cfg.sde.dae_checkpoint = ''
# will be used to multiply with the t value, if ode solver, use 1k, if discrete solver, use 1.0
cfg.sde.embedding_scale = 1.0 # 1000.0
cfg.sde.embedding_type = 'positional'
cfg.sde.train_ode_solver_tol = 1e-5
cfg.sde.num_scales_dae = 2
cfg.sde.autocast_train = False
cfg.sde.diffusion_steps = 1000
cfg.sde.embedding_dim = 128
cfg.sde.num_channels_dae = 256
cfg.sde.num_cell_per_scale_dae = 8
cfg.sde.num_cell_per_scale_dae_local = 0
cfg.sde.dropout = 0.2
cfg.sde.num_preprocess_blocks = 2
cfg.sde.num_latent_scales = 1
cfg.sde.fir = False
cfg.sde.progressive = 'none'
cfg.sde.progressive_input = 'none'
cfg.sde.progressive_combine = 'sum'
cfg.sde.dataset = 'shape'
cfg.sde.denoising_stddevs = 'beta'
cfg.sde.ema_decay = 0.9999
# cfg.sde.is_train_vae=True
cfg.register_deprecated_key("sde.is_train_vae")
cfg.sde.kl_max_coeff_vada = 1.0
# conditional prior input
cfg.sde.condition_add = 1
cfg.sde.condition_cat = 0
cfg.sde.global_prior_ckpt = '' # checkpoint for global prior component
cfg.sde.pool_feat_cat = 0 # the local prior aggregate the feat as extra input channels
# hyperparameter of ddim sampling
cfg.sde.ddim_skip_type = 'uniform'
cfg.sde.ddim_kappa = 1.0 # 1.0: fully ddpm sampling; 0: ode style sampling
cfg.ddpm = CN()
cfg.ddpm.use_p2_weight = 0
cfg.ddpm.p2_k = 1.0
cfg.ddpm.p2_gamma = 1.0
cfg.ddpm.use_new_timeemb = 0
cfg.ddpm.input_dim = 3
cfg.ddpm.dropout = 0.1
cfg.ddpm.num_layers_classifier = 3
cfg.ddpm.use_bn = True
cfg.ddpm.add_point_feat = True
cfg.ddpm.use_gn = False
cfg.ddpm.time_dim = 64
cfg.ddpm.ema = 1
cfg.ddpm.with_se = 0
cfg.ddpm.use_global_attn = 0
cfg.ddpm.num_steps = 1000
cfg.ddpm.beta_1 = 1e-4
cfg.ddpm.beta_T = 2e-2
# ['linear', 'customer'] 'customer' for airplane in PVD
cfg.ddpm.sched_mode = 'linear'
cfg.ddpm.model_var_type = 'fixedlarge'
# define architecture:
cfg.register_deprecated_key("ddpm.pointnet_plus")
cfg.register_deprecated_key("ddpm.pointnet_pp")
cfg.register_deprecated_key("ddpm.pointnet_luo")
# end define architecture
#cfg.ddpm.use_pvc = 1
cfg.register_deprecated_key("ddpm.use_pvc")
cfg.ddpm.clip_denoised = 0
cfg.ddpm.model_mean_type = 'eps'
cfg.ddpm.loss_type = 'mse'
cfg.ddpm.loss_type_0 = ''
cfg.ddpm.loss_weight_emd = 0.02
cfg.ddpm.loss_weight_cdnorm = 1.0
cfg.ddpm.attn = [0, 1, 0, 0]
cfg.ddpm.ncenter = [1024, 256, 64, 16]
#cfg.ddpm.pvc = CN()
#cfg.ddpm.pvc.use_small_model = 0
#cfg.ddpm.pvc.mlp_after_pvc = 0
cfg.register_deprecated_key("ddpm.pvc")
cfg.register_deprecated_key("ddpm.pvc.use_small_model")
cfg.register_deprecated_key("ddpm.pvc.mlp_after_pvc")
cfg.ddpm.ddim_step = 200
cfg.data = CN()
cfg.data.nclass = 55
cfg.data.cond_on_cat = 0
cfg.data.cond_on_voxel = 0
cfg.data.eval_test_split = 0 # eval loader will be using test split
cfg.data.voxel_size = 0.1 # size of voxel for voxel_datasets.py
cfg.data.noise_std = 0.1 # std for the noise added to the input data
cfg.data.noise_type = 'normal' # std for the noise added to the input data
cfg.data.noise_std_min = -1.0 # for range of noise std
cfg.data.clip_forge_enable = 0
cfg.data.clip_model = 'ViT-B/32'
cfg.data.type = "datasets.pointflow_datasets"
# datasets/neuralspline_datasets datasets/shape_curvature
cfg.data.dataset_type = "shapenet15k"
cfg.data.num_workers = 12 # 8
cfg.data.train_drop_last = 1 # drop_last for train data loader
cfg.data.cates = 'chair' # data category
cfg.data.tr_max_sample_points = 2048
cfg.data.te_max_sample_points = 2048
cfg.data.data_dir = "data/ShapeNetCore.v2.PC15k" # deprecated
cfg.data.batch_size = 12
cfg.data.batch_size_test = 10
cfg.data.dataset_scale = 1
# -- the following option in terms of normalization should turn into string -- #
cfg.data.normalize_per_shape = False
cfg.data.normalize_shape_box = False
cfg.data.normalize_global = False
cfg.data.normalize_std_per_axis = False
cfg.data.normalize_range = False # not used
cfg.data.recenter_per_shape = True
# -- for the normal prediction model, used in folder_datasets
cfg.register_deprecated_key('data.load_point_stat')
cfg.register_deprecated_key('data.is_load_pointflow2NS')
cfg.register_deprecated_key('data.data_path')
#
cfg.data.sample_with_replacement = 1
# randomly subsample data.tr_max_sample_points / data.te_max_sample_points points out of the first 15k points
cfg.data.random_subsample = 1
# the data dim, used in dataset worker, if -1, it will be the same as ddpm.input_dim
cfg.data.input_dim = -1
cfg.data.is_encode_whole_dataset_trainer = 0
cfg.register_deprecated_key('data.augment')
cfg.register_deprecated_key('data.aug_translate')
cfg.register_deprecated_key('data.aug_scale')
cfg.register_deprecated_key('data.sub_train_set')
cfg.test_size = 660
cfg.viz = CN()
cfg.viz.log_freq = 10
cfg.viz.viz_freq = 400
cfg.viz.save_freq = 200
cfg.viz.val_freq = -1
cfg.viz.viz_order = [2, 0, 1]
cfg.viz.vis_sample_ddim_step = 0
cfg.trainer = CN()
# when loss 1 is weighted, also weight the kl terms
cfg.trainer.apply_loss_weight_1_kl = 0
cfg.trainer.kl_free = [0, 0] # the value for the threshold
# not back ward kl loss if KL value is smaller than the threshold
cfg.trainer.use_kl_free = 0
cfg.trainer.type = "trainers.ddpm_trainer" # it means dist trainer
cfg.trainer.epochs = 10000
cfg.trainer.warmup_epochs = 0
cfg.trainer.seed = 1
cfg.trainer.use_grad_scalar = 0
cfg.trainer.opt = CN()
cfg.trainer.opt.type = 'adam'
cfg.trainer.opt.lr = 1e-4 # use bs*1e-5/8
cfg.trainer.opt.lr_min = 1e-4 # use bs*1e-5/8
# lr start to anneal after ratio of epochs; used in cosine and lambda lr scheduler
cfg.trainer.opt.start_ratio = 0.6
cfg.trainer.opt.beta1 = 0.9
cfg.trainer.opt.beta2 = 0.999
cfg.trainer.opt.momentum = 0.9 # for SGD
cfg.trainer.opt.weight_decay = 0.
cfg.trainer.opt.ema_decay = 0.9999
cfg.trainer.opt.grad_clip = -1.
cfg.trainer.opt.scheduler = ''
cfg.trainer.opt.step_decay = 0.998
cfg.trainer.opt.vae_lr_warmup_epochs = 0
cfg.trainer.anneal_kl = 0
cfg.trainer.kl_balance = 0
cfg.trainer.rec_balance = 0
cfg.trainer.loss1_weight_anneal_v = 'quad'
cfg.trainer.kl_ratio = [1.0, 1.0]
cfg.trainer.kl_ratio_apply = 0 # apply the fixed kl ratio in the kl_ratio list
# using spectral norm regularization on vae training or not (used in hvae_trainer)
cfg.trainer.sn_reg_vae = 0
cfg.trainer.sn_reg_vae_weight = 0.0 # loss weight for the sn regularization
# [start] set in runtime
cfg.log_name = ''
cfg.save_dir = ''
cfg.log_dir = ''
cfg.comet_key = ''
# [end]
cfg.voxel2pts = CN()
cfg.voxel2pts.init_weight = ''
cfg.voxel2pts.diffusion_steps = [0]
cfg.dpm = CN()
cfg.dpm.train_encoder_only = 0
cfg.num_ref = 0 # manually set the number of references
cfg.eval_ddim_step = 0 # ddim sampling for the model evaluation
cfg.model_config = '' # used for model control, without adding new flags
## --- deprecated --- #
cfg.register_deprecated_key('cls') # CN()
cfg.register_deprecated_key('cls.classifier_type') # 'models.classifier.OneLayer'
cfg.register_deprecated_key('cls.train_on_eps') # 1
cfg.register_deprecated_key('cond_prior') # CN()
cfg.register_deprecated_key('cond_prior.grid_emb_resolution') # 32
cfg.register_deprecated_key('cond_prior.emb_dim') # 64
cfg.register_deprecated_key('cond_prior.use_voxel_feat') # 1
cfg.register_deprecated_key('cond_encoder_prior') # 'models.shapelatent_modules.VoxelGridEncoder'
cfg.register_deprecated_key('cond_prior.pvcconv_concat_3d_feat_input') # 0
cfg.register_deprecated_key('generate_mode_global') # 'interpolate'
cfg.register_deprecated_key('generate_mode_local') # 'freeze'
cfg.register_deprecated_key('normals') # CN()
cfg.register_deprecated_key('normals.model_type') # ''
cfg.register_deprecated_key('save_sample_seq_and_quit') # 0
cfg.register_deprecated_key('lns_loss_weight') # 1.0
cfg.register_deprecated_key('normal_pred_checkpoint') # ''
cfg.register_deprecated_key('lns') # CN()
cfg.register_deprecated_key('lns.override_config') # ''
cfg.register_deprecated_key('lns.wandb_checkpoint') # 'nvidia-toronto/generative_chairs/3m3gc6sz/checkpoint-171.pth'
cfg.register_deprecated_key('lns.num_input_points') # 1000
cfg.register_deprecated_key('lns.num_simulate') # 20
cfg.register_deprecated_key('lns.split_simulate') # 'train'
# use mesh-trainer or not
cfg.register_deprecated_key('with_lns') # 0
cfg.register_deprecated_key('normal_predictor_yaml') # ''
cfg.register_deprecated_key('pointtransformer') # CN()
# number of attention layer in each block
cfg.register_deprecated_key('pointtransformer.blocks') # [2, 3, 4, 6, 3]
cfg.register_deprecated_key('shapelatent.refiner_bp') # 1 # bp gradient to the local-decoder or not
cfg.register_deprecated_key('shapelatent.loss_weight_refiner') # 1.0 # weighted loss for the refiner
cfg.register_deprecated_key('shapelatent.refiner_type') # 'models.pvcnn2.PVCNN2BaseAPI' # mode for the refiner
cfg.register_deprecated_key('shapelatent.encoder_weight_std') # 0.1
cfg.register_deprecated_key('shapelatent.encoder_weight_norm') # 0
cfg.register_deprecated_key('shapelatent.encoder_weight_uniform') # 1
cfg.register_deprecated_key('shapelatent.key_point_gen') # 'mlps'
cfg.register_deprecated_key('shapelatent.add_sub_loss') # 1 # not used
cfg.register_deprecated_key('shapelatent.local_decoder_type') # ''
cfg.register_deprecated_key('shapelatent.local_decoder_type_1') # ''
cfg.register_deprecated_key('shapelatent.local_encoder_ball_radius') # 0.8
cfg.register_deprecated_key('shapelatent.local_encoder_ap_ball_radius') # 1.0
cfg.register_deprecated_key('shapelatent.local_encoder_type') # ''
cfg.register_deprecated_key('shapelatent.local_encoder_type_1') # ''
cfg.register_deprecated_key('shapelatent.local_loss_weight_max') # 50
cfg.register_deprecated_key('shapelatent.num_neighbors') # 0
cfg.register_deprecated_key('shapelatent.extra_centers') # []
# for latent model is flow
cfg.register_deprecated_key('shapelatent.latent_flow_depth') # 14
cfg.register_deprecated_key('shapelatent.latent_flow_hidden_dim') # 256
cfg.register_deprecated_key('shapelatent.bp_to_l0') # True
cfg.register_deprecated_key('shapelatent.global_only_epochs') # 0
cfg.register_deprecated_key('shapelatent.center_local_points') # 1
cfg.register_deprecated_key('shapelatent.hvae') # CN()
# alternatively way to compute the local loss
cfg.register_deprecated_key('shapelatent.hvae.loss_wrt_ori') # 0
# add voxel feature to the latent space; the decoder require pvc conv or query
cfg.register_deprecated_key('shapelatent.add_voxel2z_global') # 0
# reuse the encoder to get local latent
cfg.register_deprecated_key('shapelatent.query_output_local_from_enc') # 0
# check models/shapelatent_modules where the feature will be saved as a dict
cfg.register_deprecated_key('shapelatent.query_local_feat_layer') # 'inter_voxelfeat_0'
# need to check the sa_blocks of the global encoder
cfg.register_deprecated_key('shapelatent.query_local_feat_dim') # 32
# reuse the encoder to get local latent
cfg.register_deprecated_key('shapelatent.query_center_emd_from_enc') # 0 # reuse the encoder for center emd
cfg.register_deprecated_key('shapelatent.prog_dec_gf') # 8 # grow_factor in VaniDecoderProg
cfg.register_deprecated_key('shapelatent.prog_dec_gf_list') # [0, 0] # grow_factor in VaniDecoderProg
cfg.register_deprecated_key('shapelatent.prog_dec_ne') # 2 # num_expand in VaniDecoderProg
# increase number of hierarchy levels, used by hvaemul model
cfg.register_deprecated_key('shapelatent.num_neighbors_per_level') # [64] # number of neighbors for each level
cfg.register_deprecated_key('shapelatent.num_level') # 1 # number of hierarchical latent spaces (local)
cfg.register_deprecated_key('shapelatent.x0_target_fps') # 0 # let the target of global output as the
cfg.register_deprecated_key('shapelatent.downsample_input_ratio') # 1.0
# whether taking other tensor as input to local-encoder of not
cfg.register_deprecated_key('shapelatent.local_enc_input') # 'sim'
# local encoder take z0 as input at which location
cfg.register_deprecated_key('shapelatent.local_encoder_condition_z0') # ''
# output the absolution coordinates or the offset w.r.t centers
cfg.register_deprecated_key('shapelatent.local_decoder_output_offset') # 0
# feed coords of keypoints to the local prior model
cfg.register_deprecated_key('shapelatent.local_prior_need_coords') # 0
# add the time embedding tensor to each encoder layer instead of add to first layer only
cfg.register_deprecated_key('sde.transformer_temb2interlayer') # 0
# normalization used in transformer encoder;
cfg.register_deprecated_key('sde.transformer_norm_type') # 'layer_norm'
cfg.register_deprecated_key('data.has_normal') # 0 # for datasets/pointflow_rgb.py only
cfg.register_deprecated_key('data.has_color') # 0 # for datasets/pointflow_rgb.py only
cfg.register_deprecated_key('data.cls_data_ratio') # 1.0 # ratio of the training data
cfg.register_deprecated_key('data.sample_curvature') # 0 # only for datasets/shape_curvature
cfg.register_deprecated_key('data.ratio_c') # 1.0 # only for datasets/shape_curvature
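# Usage sketch: demo.py loads this default config and overrides it from a
# checkpoint's YAML file:
#   from default_config import cfg
#   cfg.merge_from_file('./lion_ckpt/text2shape/chair/cfg.yml')
#   print(cfg.shapelatent.latent_dim, cfg.sde.prior_model)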

demo.py Normal file

@@ -0,0 +1,45 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
"""
require diffusers-0.11.1
"""
import os
import clip
import torch
from PIL import Image
from default_config import cfg as config
from models.lion import LION
from utils.vis_helper import plot_points
from huggingface_hub import hf_hub_download
model_path = './lion_ckpt/text2shape/chair/checkpoints/model.pt'
model_config = './lion_ckpt/text2shape/chair/cfg.yml'
config.merge_from_file(model_config)
lion = LION(config)
lion.load_model(model_path)
if config.clipforge.enable:
input_t = ["a swivel chair, five wheels"]
device_str = 'cuda'
clip_model, clip_preprocess = clip.load(
config.clipforge.clip_model, device=device_str)
text = clip.tokenize(input_t).to(device_str)
clip_feat = []
clip_feat.append(clip_model.encode_text(text).float())
clip_feat = torch.cat(clip_feat, dim=0)
print('clip_feat', clip_feat.shape)
else:
clip_feat = None
output = lion.sample(1 if clip_feat is None else clip_feat.shape[0], clip_feat=clip_feat)
pts = output['points']
img_name = "/tmp/tmp.png"
plot_points(pts, output_name=img_name)
img = Image.open(img_name)
img.show()

env.yaml Normal file

@@ -0,0 +1,311 @@
name: lion_env
channels:
- pytorch
- nvidia
- anaconda
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- argon2-cffi=20.1.0=py38h27cfd23_1
- async_generator=1.10=pyhd3eb1b0_0
- attrs=21.4.0=pyhd3eb1b0_0
- backcall=0.2.0=pyhd3eb1b0_0
- blas=1.0=mkl
- bleach=4.1.0=pyhd3eb1b0_0
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2020.10.14=0
- certifi=2020.6.20=py38_0
- cffi=1.15.0=py38hd667e15_1
- cmake=3.18.2=ha30ef3c_0
- cudatoolkit=11.1.74=h6bb024c_0
- debugpy=1.5.1=py38h295c915_0
- decorator=5.1.1=pyhd3eb1b0_0
- defusedxml=0.7.1=pyhd3eb1b0_0
- entrypoints=0.3=py38_0
- expat=2.2.10=he6710b0_2
- ffmpeg=4.3=hf484d3e_0
- freetype=2.11.0=h70c0345_0
- giflib=5.2.1=h7b6447c_0
- gmp=6.2.1=h2531618_2
- gnutls=3.6.15=he1e5248_0
- importlib_metadata=4.8.2=hd3eb1b0_0
- intel-openmp=2021.4.0=h06a4308_3561
- ipykernel=6.4.1=py38h06a4308_1
- ipython=7.31.1=py38h06a4308_0
- ipython_genutils=0.2.0=pyhd3eb1b0_1
- ipywidgets=7.6.5=pyhd3eb1b0_1
- jedi=0.18.1=py38h06a4308_0
- jpeg=9d=h7f8727e_0
- jupyter_client=7.1.2=pyhd3eb1b0_0
- jupyter_core=4.9.1=py38h06a4308_0
- jupyterlab_pygments=0.1.2=py_0
- jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
- krb5=1.18.2=h173b8e3_0
- lame=3.100=h7b6447c_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libcurl=7.71.1=h20c2e04_1
- libedit=3.1.20191231=h14c3975_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libiconv=1.15=h63c8f33_5
- libidn2=2.3.2=h7f8727e_0
- libpng=1.6.37=hbc83047_0
- libsodium=1.0.18=h7b6447c_0
- libssh2=1.9.0=h1ba5d50_1
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libtasn1=4.16.0=h27cfd23_0
- libtiff=4.2.0=h85742a9_0
- libunistring=0.9.10=h27cfd23_0
- libuv=1.40.0=h7b6447c_0
- libwebp=1.2.0=h89dd481_0
- libwebp-base=1.2.0=h27cfd23_0
- lz4-c=1.9.3=h295c915_1
- markupsafe=2.0.1=py38h27cfd23_0
- matplotlib-inline=0.1.2=pyhd3eb1b0_2
- mistune=0.8.4=py38h7b6447c_1000
- mkl=2021.4.0=h06a4308_640
- mkl-service=2.4.0=py38h7f8727e_0
- mkl_fft=1.3.1=py38hd3c417c_0
- mkl_random=1.2.2=py38h51133e4_0
- nbclient=0.5.3=pyhd3eb1b0_0
- nbconvert=6.3.0=py38h06a4308_0
- ncurses=6.3=h7f8727e_2
- nest-asyncio=1.5.1=pyhd3eb1b0_0
- nettle=3.7.3=hbbd107a_1
- notebook=6.4.6=py38h06a4308_0
- numpy=1.21.2=py38h20f2e39_0
- numpy-base=1.21.2=py38h79a1101_0
- olefile=0.46=pyhd3eb1b0_0
- openh264=2.1.1=h4ff587b_0
- openssl=1.1.1m=h7f8727e_0
- packaging=21.3=pyhd3eb1b0_0
- pandocfilters=1.5.0=pyhd3eb1b0_0
- parso=0.8.3=pyhd3eb1b0_0
- pexpect=4.8.0=pyhd3eb1b0_3
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=8.4.0=py38h5aabda8_0
- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
- prometheus_client=0.13.1=pyhd3eb1b0_0
- prompt-toolkit=3.0.20=pyhd3eb1b0_0
- ptyprocess=0.7.0=pyhd3eb1b0_2
- pycparser=2.21=pyhd3eb1b0_0
- pygments=2.11.2=pyhd3eb1b0_0
- python=3.8.12=h12debd9_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python-fastjsonschema=2.16.1=pyhd8ed1ab_0
- python_abi=3.8=2_cp38
- pytorch=1.10.2=py3.8_cuda11.1_cudnn8.0.5_0
- pytorch-mutex=1.0=cuda
- pyzmq=22.3.0=py38h295c915_2
- readline=8.1.2=h7f8727e_1
- rhash=1.4.0=h1ba5d50_0
- send2trash=1.8.0=pyhd3eb1b0_1
- six=1.16.0=pyhd3eb1b0_0
- sqlite=3.37.2=hc218d9a_0
- terminado=0.9.4=py38h06a4308_0
- testpath=0.5.0=pyhd3eb1b0_0
- tk=8.6.11=h1ccaba5_0
- torchaudio=0.10.2=py38_cu111
- torchvision=0.11.3=py38_cu111
- tornado=6.1=py38h27cfd23_0
- traitlets=5.1.1=pyhd3eb1b0_0
- wcwidth=0.2.5=pyhd3eb1b0_0
- webencodings=0.5.1=py38_1
- wheel=0.37.1=pyhd3eb1b0_0
- widgetsnbextension=3.5.1=py38_0
- xz=5.2.5=h7b6447c_0
- zeromq=4.3.4=h2531618_0
- zipp=3.7.0=pyhd3eb1b0_0
- zlib=1.2.11=h7f8727e_4
- zstd=1.4.5=h9ceee32_0
- pip:
- about-time==3.1.1
- absl-py==1.0.0
- addict==2.4.0
- aiohttp==3.8.1
- aiosignal==1.2.0
- alive-progress==2.2.0
- antlr4-python3-runtime==4.9.3
- anyio==3.5.0
- astunparse==1.6.3
- async-timeout==4.0.2
- babel==2.9.1
- cachetools==5.0.0
- calmsize==0.1.3
- ccimport==0.3.7
- cftime==1.6.0
- charset-normalizer==2.0.11
- click==8.0.3
- colorama==0.4.4
- comet-ml==3.31.21
- commonmark==0.9.1
- configobj==5.0.6
- crc32c==2.2.post0
- cumm-cu111==0.2.8
- cupy-cuda111==10.2.0
- cycler==0.11.0
- cython==0.29.20
- dataclasses==0.6
- deepspeed==0.6.5
- deprecation==2.1.0
- diffusers==0.11.1
- docker-pycreds==0.4.0
- drjit==0.2.1
- dulwich==0.20.32
- easydict==1.9
- einops==0.4.0
- everett==3.0.0
- fastrlock==0.8
- filelock==3.9.0
- fire==0.4.0
- flatbuffers==2.0
- flatten-dict==0.4.2
- fonttools==4.29.1
- freetype-py==2.3.0
- frozenlist==1.3.0
- fsspec==2022.2.0
- ftfy==6.1.1
- future==0.18.2
- fvcore==0.1.5.post20220512
- gast==0.5.3
- gitdb==4.0.9
- gitpython==3.1.26
- google-auth==2.6.0
- google-auth-oauthlib==0.4.6
- google-pasta==0.2.0
- grapheme==0.6.0
- grpcio==1.43.0
- h5py==3.6.0
- hjson==3.0.2
- huggingface-hub==0.11.1
- idna==3.3
- imageio==2.15.0
- imageio-ffmpeg==0.4.5
- importlib-metadata==4.10.1
- importlib-resources==5.4.0
- iopath==0.1.10
- jinja2==3.1.1
- joblib==1.1.0
- json5==0.9.6
- jsonschema==4.4.0
- jupyter-packaging==0.12.0
- jupyter-server==1.15.6
- jupyterlab==3.3.2
- jupyterlab-server==2.11.2
- keras==2.8.0
- keras-preprocessing==1.1.2
- kiwisolver==1.3.2
- kornia==0.6.6
- lark==1.1.2
- libclang==14.0.1
- llvmlite==0.39.0
- loguru==0.6.0
- markdown==3.3.6
- matplotlib==3.5.1
- matplotlib2tikz==0.7.6
- meshio==5.3.4
- mitsuba==3.0.1
- mrcfile==1.3.0
- multidict==6.0.2
- multipledispatch==0.6.0
- mypy-extensions==0.4.3
- nbclassic==0.3.7
- nbformat==5.2.0
- nestargs==0.5.0
- netcdf4==1.5.8
- networkx==2.6.3
- ninja==1.10.2.3
- notebook-shim==0.1.0
- numba==0.56.0
- nvidia-ml-py3==7.352.0
- oauthlib==3.2.0
- omegaconf==2.2.2
- open3d==0.15.2
- opencv-python==4.5.5.64
- openexr==1.3.7
- opt-einsum==3.3.0
- pandas==1.4.0
- pathtools==0.1.2
- pccm==0.3.4
- pip==22.3.1
- plyfile==0.7.4
- portalocker==2.5.1
- progressbar2==4.0.0
- promise==2.3
- protobuf==3.19.4
- psutil==5.9.0
- py-cpuinfo==8.0.0
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pybind11==2.10.0
- pydeprecate==0.3.1
- pyglet==1.5.23
- pykeops==1.5
- pymcubes==0.1.2
- pyopengl==3.1.0
- pyparsing==3.0.7
- pyquaternion==0.9.9
- pyrr==0.10.3
- pyrsistent==0.18.1
- python-swiftclient==4.0.0
- python-utils==3.3.3
- pytorch-lightning==1.5.1
- pytorch3d==0.3.0
- pytz==2021.3
- pywavelets==1.2.0
- pyyaml==6.0
- regex==2022.3.15
- requests==2.27.1
- requests-oauthlib==1.3.1
- requests-toolbelt==0.9.1
- rich==12.3.0
- rsa==4.8
- ruamel-yaml==0.17.20
- ruamel-yaml-clib==0.2.6
- scikit-image==0.19.1
- scikit-learn==1.0.2
- scipy==1.8.0
- seaborn==0.11.2
- semantic-version==2.9.0
- sentry-sdk==1.5.4
- sharedarray==3.2.1
- shortuuid==1.0.8
- simple-parsing==0.0.18
- simplejson==3.18.0
- sklearn==0.0
- smmap==5.0.0
- sniffio==1.2.0
- tabulate==0.8.9
- tensorboard==2.8.0
- tensorboard-data-server==0.6.1
- tensorboard-plugin-wit==1.8.1
- tensorboardx==2.4.1
- tensorflow-gpu==2.8.0
- tensorflow-io-gcs-filesystem==0.25.0
- termcolor==1.1.0
- tf-estimator-nightly==2.8.0.dev2021122109
- tflearn==0.5.0
- tfrecord==1.14.1
- threadpoolctl==3.1.0
- tifffile==2022.2.2
- tikzplotlib==0.10.1
- tomlkit==0.10.0
- torchmetrics==0.7.2
- tqdm==4.62.3
- trimesh==3.10.1
- typing-extensions==4.2.0
- typing-inspect==0.7.1
- urllib3==1.26.8
- wandb==0.12.10
- webcolors==1.11.1
- websocket-client==1.2.3
- werkzeug==2.0.3
- wrapt==1.13.3
- wurlitzer==3.0.2
- yacs==0.1.8
- yarl==1.7.2
- yaspin==2.1.0

models/adagn.py Normal file

@@ -0,0 +1,67 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
"""
adaptive group norm
"""
from loguru import logger
import torch.nn as nn
import torch
import numpy as np
from utils.checker import *
from .dense import dense
import os
class AdaGN(nn.Module):
'''
adaptive group normalization
'''
def __init__(self, ndim, cfg, n_channel):
"""
ndim: dim of the input features
n_channel: number of channels of the inputs
ndim_style: channel of the style features
"""
super().__init__()
style_dim = cfg.latent_pts.style_dim
init_scale = cfg.latent_pts.ada_mlp_init_scale
self.ndim = ndim
self.n_channel = n_channel
self.style_dim = style_dim
self.out_dim = n_channel * 2
self.norm = nn.GroupNorm(8, n_channel)
in_channel = n_channel
self.emd = dense(style_dim, n_channel*2, init_scale=init_scale)
self.emd.bias.data[:in_channel] = 1
self.emd.bias.data[in_channel:] = 0
def __repr__(self):
return f"AdaGN(GN(8, {self.n_channel}), Linear({self.style_dim}, {self.out_dim}))"
def forward(self, image, style):
# style: B,D
# image: B,D,N,1
CHECK2D(style)
style = self.emd(style)
if self.ndim == 3: #B,D,V,V,V
CHECK5D(image)
style = style.view(style.shape[0], -1, 1, 1, 1) # 5D
elif self.ndim == 2: # B,D,N,1
CHECK4D(image)
style = style.view(style.shape[0], -1, 1, 1) # 4D
elif self.ndim == 1: # B,D,N
CHECK3D(image)
style = style.view(style.shape[0], -1, 1) # 3D
else:
raise NotImplementedError
factor, bias = style.chunk(2, 1)
result = self.norm(image)
result = result * factor + bias
return result
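# Usage sketch (assumed shapes; `cfg` stands in for the repo's yacs config with
# cfg.latent_pts.style_dim = 128 and cfg.latent_pts.ada_mlp_init_scale = 1.0):
#   adagn = AdaGN(ndim=2, cfg=cfg, n_channel=64)
#   image = torch.randn(4, 64, 2048, 1)   # B,D,N,1 point features
#   style = torch.randn(4, 128)           # B,style_dim global latent
#   out = adagn(image, style)             # same shape as image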

models/dense.py Normal file

@@ -0,0 +1,80 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
""" copied and modified from https://github.com/CW-Huang/sdeflow-light/blob/524650bc5ad69522b3e0905672deef0650374512/lib/models/unet.py """
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import _calculate_fan_in_and_fan_out
import numpy as np
def _calculate_correct_fan(tensor, mode):
"""
copied and modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py#L337
"""
mode = mode.lower()
valid_modes = ['fan_in', 'fan_out', 'fan_avg']
if mode not in valid_modes:
raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
return fan_in if mode == 'fan_in' else fan_out
def kaiming_uniform_(tensor, gain=1., mode='fan_in'):
r"""Fills the input `Tensor` with values according to the method
described in `Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification` - He, K. et al. (2015), using a
uniform distribution. The resulting tensor will have values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where
.. math::
\text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}}
Also known as He initialization.
Args:
tensor: an n-dimensional `torch.Tensor`
gain: multiplier to the dispersion
mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
preserves the magnitude of the variance of the weights in the
forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
backwards pass.
Examples:
>>> w = torch.empty(3, 5)
>>> nn.init.kaiming_uniform_(w, mode='fan_in')
"""
fan = _calculate_correct_fan(tensor, mode)
# gain = calculate_gain(nonlinearity, a)
var = gain / max(1., fan)
bound = math.sqrt(3.0 * var) # Calculate uniform bounds from standard deviation
with torch.no_grad():
return tensor.uniform_(-bound, bound)
def variance_scaling_init_(tensor, scale):
return kaiming_uniform_(tensor, gain=1e-10 if scale == 0 else scale, mode='fan_avg')
def dense(in_channels, out_channels, init_scale=1.):
lin = nn.Linear(in_channels, out_channels)
variance_scaling_init_(lin.weight, scale=init_scale)
nn.init.zeros_(lin.bias)
return lin
def conv2d(in_planes, out_planes, kernel_size=(3, 3), stride=1, dilation=1, padding=1, bias=True, padding_mode='zeros',
init_scale=1.):
conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation,
bias=bias, padding_mode=padding_mode)
variance_scaling_init_(conv.weight, scale=init_scale)
if bias:
nn.init.zeros_(conv.bias)
return conv
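# Usage sketch: `init_scale` rescales the fan-avg uniform init, so small values
# give a near-zero initial mapping (models/adagn.py uses this for its
# modulation MLP):
#   layer = dense(128, 256, init_scale=1.0)  # Linear, variance-scaled weights, zero bias
#   conv = conv2d(32, 64, init_scale=0.)     # weights ~ 0 at init (scale 0 -> gain 1e-10)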

models/distributions.py Normal file

@@ -0,0 +1,37 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch
import numpy as np
@torch.jit.script
def sample_normal_jit(mu, sigma):
rho = mu.mul(0).normal_()
z = rho.mul_(sigma).add_(mu)
return z, rho
class Normal:
def __init__(self, mu, log_sigma, sigma=None):
self.mu = mu
self.log_sigma = log_sigma
self.sigma = torch.exp(log_sigma) if sigma is None else sigma
def sample(self, t=1.):
return sample_normal_jit(self.mu, self.sigma * t)
def sample_given_rho(self, rho):
return rho * self.sigma + self.mu
def mean(self):
return self.mu
def log_p(self, samples):
normalized_samples = (samples - self.mu) / self.sigma
log_p = - 0.5 * normalized_samples * normalized_samples - 0.5 * np.log(2 * np.pi) - self.log_sigma
return log_p
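# Usage sketch: reparameterized sampling and elementwise log-density:
#   dist = Normal(mu=torch.zeros(2, 4), log_sigma=torch.zeros(2, 4))
#   z, rho = dist.sample()   # z = mu + sigma * rho, with rho ~ N(0, I)
#   logp = dist.log_p(z)     # same shape as z; sum over dims for a total log-prob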

models/latent_points_ada.py Normal file

@@ -0,0 +1,273 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch
from loguru import logger
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from .pvcnn2_ada import \
create_pointnet2_sa_components, create_pointnet2_fp_modules, LinearAttention, create_mlp_components, SharedMLP
# the building block of encode and decoder for VAE
class PVCNN2Unet(nn.Module):
"""
copied and modified from https://github.com/alexzhou907/PVD/blob/9747265a5f141e5546fd4f862bfa66aa59f1bd33/model/pvcnn_generation.py#L172
"""
def __init__(self,
num_classes, embed_dim, use_att, dropout=0.1,
extra_feature_channels=3,
input_dim=3,
width_multiplier=1,
voxel_resolution_multiplier=1,
time_emb_scales=1.0,
verbose=True,
condition_input=False,
point_as_feat=1, cfg={},
sa_blocks={}, fp_blocks={},
clip_forge_enable=0,
clip_forge_dim=512
):
super().__init__()
logger.info('[Build Unet] extra_feature_channels={}, input_dim={}',
extra_feature_channels, input_dim)
self.input_dim = input_dim
self.clip_forge_enable = clip_forge_enable
self.sa_blocks = sa_blocks
self.fp_blocks = fp_blocks
self.point_as_feat = point_as_feat
self.condition_input = condition_input
assert extra_feature_channels >= 0
self.time_emb_scales = time_emb_scales
self.embed_dim = embed_dim
## assert(self.embed_dim == 0)
if self.embed_dim > 0: # has time embedding
# for prior model, we have time embedding, for VAE model, no time embedding
self.embedf = nn.Sequential(
nn.Linear(embed_dim, embed_dim),
nn.LeakyReLU(0.1, inplace=True),
nn.Linear(embed_dim, embed_dim),
)
if self.clip_forge_enable:
self.clip_forge_mapping = nn.Linear(clip_forge_dim, embed_dim)
style_dim = cfg.latent_pts.style_dim
self.style_clip = nn.Linear(style_dim + embed_dim, style_dim)
self.in_channels = extra_feature_channels + 3
sa_layers, sa_in_channels, channels_sa_features, _ = \
create_pointnet2_sa_components(
input_dim=input_dim,
sa_blocks=self.sa_blocks,
extra_feature_channels=extra_feature_channels,
with_se=True,
embed_dim=embed_dim, # time embedding dim
use_att=use_att, dropout=dropout,
width_multiplier=width_multiplier,
voxel_resolution_multiplier=voxel_resolution_multiplier,
verbose=verbose, cfg=cfg
)
self.sa_layers = nn.ModuleList(sa_layers)
self.global_att = None if not use_att else LinearAttention(channels_sa_features, 8, verbose=verbose)
# only use extra features in the last fp module
sa_in_channels[0] = extra_feature_channels + input_dim - 3
fp_layers, channels_fp_features = create_pointnet2_fp_modules(
fp_blocks=self.fp_blocks, in_channels=channels_sa_features,
sa_in_channels=sa_in_channels,
with_se=True, embed_dim=embed_dim,
use_att=use_att, dropout=dropout,
width_multiplier=width_multiplier, voxel_resolution_multiplier=voxel_resolution_multiplier,
verbose=verbose, cfg=cfg
)
self.fp_layers = nn.ModuleList(fp_layers)
layers, _ = create_mlp_components(
in_channels=channels_fp_features,
out_channels=[128, dropout, num_classes], # was 0.5
classifier=True, dim=2, width_multiplier=width_multiplier,
cfg=cfg)
self.classifier = nn.ModuleList(layers)
def get_timestep_embedding(self, timesteps, device):
if len(timesteps.shape) == 2 and timesteps.shape[1] == 1:
timesteps = timesteps[:,0]
assert(len(timesteps.shape) == 1), f'get shape: {timesteps.shape}'
timesteps = timesteps * self.time_emb_scales
half_dim = self.embed_dim // 2
emb = np.log(10000) / (half_dim - 1)
emb = torch.from_numpy(np.exp(np.arange(0, half_dim) * -emb)).float().to(device)
emb = timesteps[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
if self.embed_dim % 2 == 1: # zero pad
emb = nn.functional.pad(emb, (0, 1), "constant", 0)
assert emb.shape == torch.Size([timesteps.shape[0], self.embed_dim])
return emb
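# worked example: with embed_dim=4 and t=[0], the frequencies are exp([0, -ln(10000)]) = [1, 1e-4],
# so emb = [[sin(0), sin(0), cos(0), cos(0)]] = [[0., 0., 1., 1.]]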
def forward(self, inputs, **kwargs):
# Input: coords: B3N
B = inputs.shape[0]
coords = inputs[:, :self.input_dim, :].contiguous()
features = inputs
temb = kwargs.get('t', None)
if temb is not None:
t = temb
if t.ndim == 0:
t = t.view(1).expand(B)
temb = self.embedf(self.get_timestep_embedding(t, inputs.device))[:, :, None].expand(-1, -1, inputs.shape[-1])
temb_ori = temb # B,embed_dim,Npoint
style = kwargs['style']
if self.clip_forge_enable:
clip_feat = kwargs['clip_feat']
assert(clip_feat is not None), f'require clip_feat as input'
clip_feat = self.clip_forge_mapping(clip_feat)
style = torch.cat([style, clip_feat], dim=1).contiguous()
style = self.style_clip(style)
coords_list, in_features_list = [], []
for i, sa_blocks in enumerate(self.sa_layers):
in_features_list.append(features)
coords_list.append(coords)
if i > 0 and temb is not None:
# TODO: implement a dedicated sa_blocks forward; if the layer is a PVConv
# and kwargs provides grid_emb, take it as an additional input
features = torch.cat([features, temb], dim=1)
features, coords, temb, _ = sa_blocks((features, coords, temb, style))
else: # i == 0 or temb is None
features, coords, temb, _ = sa_blocks((features, coords, temb, style))
in_features_list[0] = inputs[:, 3:, :].contiguous()
if self.global_att is not None:
features = self.global_att(features)
for fp_idx, fp_blocks in enumerate(self.fp_layers):
if temb is not None:
features, coords, temb, _ = fp_blocks((
coords_list[-1-fp_idx], coords,
torch.cat([features,temb],dim=1),
in_features_list[-1-fp_idx], temb, style))
else:
features, coords, temb, _ = fp_blocks((
coords_list[-1-fp_idx], coords,
features,
in_features_list[-1-fp_idx], temb, style))
for l in self.classifier:
if isinstance(l, SharedMLP):
features = l(features, style)
else:
features = l(features)
return features
class PointTransPVC(nn.Module):
# encoder : B,N,3 -> B,N,2*D
sa_blocks = [ # conv_configs, sa_configs
((32, 2, 32), (1024, 0.1, 32, (32, 64))),
((64, 3, 16), (256, 0.2, 32, (64, 128))),
((128, 3, 8), (64, 0.4, 32, (128, 256))),
(None, (16, 0.8, 32, (128, 128, 128))),
]
fp_blocks = [
((128, 128), (128, 3, 8)), # fp_configs, conv_configs
((128, 128), (128, 3, 8)),
((128, 128), (128, 2, 16)),
((128, 128, 64), (64, 2, 32)),
]
def __init__(self, zdim, input_dim, args={}):
super().__init__()
self.zdim = zdim
self.layers = PVCNN2Unet(2*zdim+input_dim*2,
embed_dim=0, use_att=1, extra_feature_channels=0,
input_dim=args.ddpm.input_dim, cfg=args,
sa_blocks=self.sa_blocks, fp_blocks=self.fp_blocks,
dropout=args.ddpm.dropout)
self.skip_weight = args.latent_pts.skip_weight
self.pts_sigma_offset = args.latent_pts.pts_sigma_offset
self.input_dim = input_dim
def forward(self, inputs):
x, style = inputs
B,N,D = x.shape
output = self.layers(x.permute(0,2,1).contiguous(), style=style).permute(0,2,1).contiguous() # BND
pt_mu_1d = output[:,:,:self.input_dim].contiguous()
pt_sigma_1d = output[:,:,self.input_dim:2*self.input_dim].contiguous() - self.pts_sigma_offset
pt_mu_1d = self.skip_weight * pt_mu_1d + x
if self.zdim > 0:
ft_mu_1d = output[:,:,2*self.input_dim:-self.zdim].contiguous()
ft_sigma_1d = output[:,:,-self.zdim:].contiguous()
mu_1d = torch.cat([pt_mu_1d, ft_mu_1d], dim=2).view(B,-1).contiguous()
sigma_1d = torch.cat([pt_sigma_1d, ft_sigma_1d], dim=2).view(B,-1).contiguous()
else:
mu_1d = pt_mu_1d.view(B,-1).contiguous()
sigma_1d = pt_sigma_1d.view(B,-1).contiguous()
return {'mu_1d': mu_1d, 'sigma_1d': sigma_1d}
class LatentPointDecPVC(nn.Module):
""" input x: [B,Npoint,D] with [B,Npoint,3]
"""
sa_blocks = [ # conv_configs, sa_configs
((32, 2, 32), (1024, 0.1, 32, (32, 64))),
((64, 3, 16), (256, 0.2, 32, (64, 128))),
((128, 3, 8), (64, 0.4, 32, (128, 256))),
(None, (16, 0.8, 32, (128, 128, 128))),
]
fp_blocks = [
((128, 128), (128, 3, 8)), # fp_configs, conv_configs
((128, 128), (128, 3, 8)),
((128, 128), (128, 2, 16)),
((128, 128, 64), (64, 2, 32)),
]
def __init__(self, point_dim, context_dim, num_points=None, args={}, **kwargs):
super().__init__()
self.point_dim = point_dim
logger.info('[Build Dec] point_dim={}, context_dim={}', point_dim, context_dim)
self.context_dim = context_dim + self.point_dim
# self.num_points = num_points
if num_points is None:
self.num_points = args.data.tr_max_sample_points
else:
self.num_points = num_points
self.layers = PVCNN2Unet(point_dim, embed_dim=0, use_att=1,
extra_feature_channels=context_dim,
input_dim=args.ddpm.input_dim, cfg=args,
sa_blocks=self.sa_blocks, fp_blocks=self.fp_blocks,
dropout=args.ddpm.dropout)
self.skip_weight = args.latent_pts.skip_weight
def forward(self, x, beta, context, style):
"""
Args:
x: Point clouds at some timestep t, (B, N, d). [not used]
beta: Time. (B, ). [not used]
context: Latent points, (B,N_pts*D_latent_pts), D_latent_pts = D_input + D_extra
style: Shape latents. (B,d).
Returns:
points: (B,N,3)
"""
# CHECKDIM(context, 1, self.num_points*self.context_dim)
assert(context.shape[1] == self.num_points*self.context_dim)
context = context.view(-1,self.num_points,self.context_dim) # BND
x = context[:,:,:self.point_dim]
output = self.layers(context.permute(0,2,1).contiguous(), style=style).permute(0,2,1).contiguous() # BN3
output = output * self.skip_weight + x
return output
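To make the latent layout above concrete, here is a standalone sketch (hypothetical sizes, independent of the repo) of how `PointTransPVC.forward` slices the UNet output into point and feature means/sigmas and flattens them to `(B, N*(input_dim+zdim))`:

```
import torch

B, N, input_dim, zdim = 2, 2048, 3, 1                 # hypothetical sizes
skip_weight, pts_sigma_offset = 0.01, 0.0             # hypothetical cfg values
x = torch.randn(B, N, input_dim)                      # input point cloud
output = torch.randn(B, N, 2 * zdim + 2 * input_dim)  # stand-in for the UNet output

pt_mu = skip_weight * output[:, :, :input_dim] + x    # residual skip to the input points
pt_sigma = output[:, :, input_dim:2 * input_dim] - pts_sigma_offset
ft_mu = output[:, :, 2 * input_dim:-zdim]
ft_sigma = output[:, :, -zdim:]

mu_1d = torch.cat([pt_mu, ft_mu], dim=2).reshape(B, -1)
assert mu_1d.shape == (B, N * (input_dim + zdim))     # (2, 2048*4)
```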

models/latent_points_ada_localprior.py Normal file

@ -0,0 +1,84 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch
from loguru import logger
import torch.nn as nn
import torch.nn.functional as F
from .latent_points_ada import PVCNN2Unet
from .utils import mask_inactive_variables
# diffusion model for latent points
class PVCNN2Prior(PVCNN2Unet):
sa_blocks = [ # conv_configs, sa_configs
((32, 2, 32), (1024, 0.1, 32, (32, 64))),
((64, 3, 16), (256, 0.2, 32, (64, 128))),
((128, 3, 8), (64, 0.4, 32, (128, 128))),
(None, (16, 0.8, 32, (128, 128, 128))),
]
fp_blocks = [
((128, 128), (128, 3, 8)), # fp_configs, conv_configs
((128, 128), (128, 3, 8)),
((128, 128), (128, 2, 16)),
((128, 128, 64), (64, 2, 32)),
]
def __init__(self, args, num_input_channels, cfg):
# only cfg is used
self.clip_forge_enable = cfg.clipforge.enable
clip_forge_dim = cfg.clipforge.feat_dim
num_input_channels = num_classes = cfg.shapelatent.latent_dim + cfg.ddpm.input_dim
self.num_classes = num_classes
embed_dim = cfg.ddpm.time_dim
use_att = True
extra_feature_channels = cfg.shapelatent.latent_dim
self.num_points = cfg.data.tr_max_sample_points
dropout = cfg.ddpm.dropout
time_emb_scales = cfg.sde.embedding_scale # 1k default
logger.info('[Build Prior Model] nclass={}, embed_dim={}, use_att={},'
'extra_feature_channels={}, dropout={}, time_emb_scales={} num_point={}',
num_classes, embed_dim, use_att, extra_feature_channels, dropout, time_emb_scales,
self.num_points)
# note: time_emb_scales is set from cfg.sde.embedding_scale above, not from the ddpm config
super().__init__(
num_classes, embed_dim, use_att, dropout=dropout,
input_dim=cfg.ddpm.input_dim,
extra_feature_channels=extra_feature_channels,
time_emb_scales=time_emb_scales,
verbose=True,
condition_input=False,
cfg=cfg,
sa_blocks=self.sa_blocks,
fp_blocks=self.fp_blocks,
clip_forge_enable=self.clip_forge_enable, clip_forge_dim=clip_forge_dim)
# init mixing logit
self.mixed_prediction = cfg.sde.mixed_prediction # This enables mixed prediction
if self.mixed_prediction:
logger.info('init-mixing_logit = {}, after sigmoid = {}',
cfg.sde.mixing_logit_init, torch.sigmoid(torch.tensor(cfg.sde.mixing_logit_init))
)
init = cfg.sde.mixing_logit_init * torch.ones(size=[1, num_input_channels*self.num_points, 1, 1])
self.mixing_logit = torch.nn.Parameter(init, requires_grad=True)
self.is_active = None
else: # no mixing_logit
self.mixing_logit = None
self.is_active = None
def forward(self, x, t, *args, **kwargs):
# input x: (B, N*D) or (B, N*D, 1, 1); the inner UNet expects (B, D, N)
assert 'condition_input' in kwargs, 'require condition_input'
if self.mixed_prediction and self.is_active is not None:
x = mask_inactive_variables(x, self.is_active)
input_shape = x.shape
x = x.view(-1,self.num_points,self.num_classes).permute(0,2,1).contiguous()
B = x.shape[0]
out = super().forward(x, t=t, style=kwargs['condition_input'].squeeze(-1).squeeze(-1), clip_feat=kwargs.get('clip_feat', None))
return out.permute(0, 2, 1).contiguous().view(input_shape) # (B,D,N) -> (B,N,D) -> input shape
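As a sanity check, the `(B, N*D, 1, 1) -> (B, D, N) -> (B, N*D, 1, 1)` reshape round-trip used in `forward` above, as a standalone sketch with hypothetical sizes:

```
import torch

B, N, D = 4, 2048, 4                 # hypothetical: 2048 latent points, xyz + 1 extra feature
x = torch.randn(B, N * D, 1, 1)      # flat latent, as the diffusion model receives it
input_shape = x.shape

h = x.view(-1, N, D).permute(0, 2, 1).contiguous()       # (B, D, N) for the point-voxel UNet
out = h.permute(0, 2, 1).contiguous().view(input_shape)  # back to the flat layout
assert torch.equal(out, x)           # the round-trip is lossless
```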

models/lion.py Normal file

@ -0,0 +1,91 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
from models.vae_adain import Model as VAE
from models.latent_points_ada_localprior import PVCNN2Prior as LocalPrior
from utils.diffusion_pvd import DiffusionDiscretized
from utils.vis_helper import plot_points
from utils.model_helper import import_model
from diffusers import DDPMScheduler
import torch
import numpy as np
from matplotlib import pyplot as plt
class LION(object):
def __init__(self, cfg):
self.vae = VAE(cfg).cuda()
GlobalPrior = import_model(cfg.latent_pts.style_prior)
global_prior = GlobalPrior(cfg.sde, cfg.latent_pts.style_dim, cfg).cuda()
local_prior = LocalPrior(cfg.sde, cfg.shapelatent.latent_dim, cfg).cuda()
self.priors = torch.nn.ModuleList([global_prior, local_prior])
self.scheduler = DDPMScheduler(clip_sample=False,
beta_start=cfg.ddpm.beta_1, beta_end=cfg.ddpm.beta_T, beta_schedule=cfg.ddpm.sched_mode,
num_train_timesteps=cfg.ddpm.num_steps, variance_type=cfg.ddpm.model_var_type)
self.diffusion = DiffusionDiscretized(None, None, cfg)
# self.load_model(cfg)
def load_model(self, model_path):
# model_path = cfg.ckpt.path
ckpt = torch.load(model_path)
self.priors.load_state_dict(ckpt['dae_state_dict'])
self.vae.load_state_dict(ckpt['vae_state_dict'])
print(f'INFO finish loading from {model_path}')
@torch.no_grad()
def sample(self, num_samples=10, clip_feat=None, save_img=False):
self.scheduler.set_timesteps(1000, device='cuda')
timesteps = self.scheduler.timesteps
latent_shape = self.vae.latent_shape()
global_prior, local_prior = self.priors[0], self.priors[1]
assert(not local_prior.mixed_prediction and not global_prior.mixed_prediction)
sampled_list = []
output_dict = {}
# sample the global shape latent from the global prior
x_T_shape = [num_samples] + latent_shape[0]
x_noisy = torch.randn(size=x_T_shape, device='cuda')
condition_input = None
for i, t in enumerate(timesteps):
t_tensor = torch.ones(num_samples, dtype=torch.int64, device='cuda') * (t+1)
noise_pred = global_prior(x=x_noisy, t=t_tensor.float(),
condition_input=condition_input, clip_feat=clip_feat)
x_noisy = self.scheduler.step(noise_pred, t, x_noisy).prev_sample
sampled_list.append(x_noisy)
output_dict['z_global'] = x_noisy
condition_input = x_noisy
condition_input = self.vae.global2style(condition_input)
# sample the latent points from the local prior
x_T_shape = [num_samples] + latent_shape[1]
x_noisy = torch.randn(size=x_T_shape, device='cuda')
for i, t in enumerate(timesteps):
t_tensor = torch.ones(num_samples, dtype=torch.int64, device='cuda') * (t+1)
noise_pred = local_prior(x=x_noisy, t=t_tensor.float(),
condition_input=condition_input, clip_feat=clip_feat)
x_noisy = self.scheduler.step(noise_pred, t, x_noisy).prev_sample
sampled_list.append(x_noisy)
output_dict['z_local'] = x_noisy
# decode the latent
output = self.vae.sample(num_samples=num_samples, decomposed_eps=sampled_list)
if save_img:
out_name = plot_points(output, "/tmp/tmp.png")
print(f'INFO save plot image at {out_name}')
output_dict['points'] = output
return output_dict
def get_mixing_component(self, noise_pred, t):
# usage:
# if global_prior.mixed_prediction:
# mixing_component = self.get_mixing_component(noise_pred, t)
# coeff = torch.sigmoid(global_prior.mixing_logit)
# noise_pred = (1 - coeff) * mixing_component + coeff * noise_pred
alpha_bar = self.scheduler.alphas_cumprod[t]
one_minus_alpha_bars_sqrt = np.sqrt(1.0 - alpha_bar)
return noise_pred * one_minus_alpha_bars_sqrt
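The two sampling loops above follow the standard `diffusers` reverse-diffusion pattern. A self-contained toy version of one loop, with a dummy network standing in for the trained priors:

```
import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(clip_sample=False, num_train_timesteps=1000)
scheduler.set_timesteps(1000)

toy_prior = lambda x, t: torch.zeros_like(x)  # stand-in for global_prior / local_prior
x = torch.randn(4, 128)                       # x_T ~ N(0, I)
for t in scheduler.timesteps:
    noise_pred = toy_prior(x, t)
    x = scheduler.step(noise_pred, t, x).prev_sample  # one ancestral sampling step
```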

models/pvcnn2.py Normal file

@ -0,0 +1,557 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
"""
copied and modified from source:
https://github.com/alexzhou907/PVD/blob/9747265a5f141e5546fd4f862bfa66aa59f1bd33/model/pvcnn_generation.py
and functions under
https://github.com/alexzhou907/PVD/tree/9747265a5f141e5546fd4f862bfa66aa59f1bd33/modules
"""
import copy
import functools
from loguru import logger
from einops import rearrange
import torch.nn as nn
import torch
import numpy as np
import third_party.pvcnn.functional as F
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
class SE3d(nn.Module):
def __init__(self, channel, reduction=8):
super().__init__()
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel, bias=False),
nn.Sigmoid()
)
self.channel = channel
def __repr__(self):
return f"SE({self.channel}, {self.channel})"
def forward(self, inputs):
return inputs * self.fc(inputs.mean(-1).mean(-1).mean(-1)).view(inputs.shape[0], inputs.shape[1], 1, 1, 1)
class LinearAttention(nn.Module):
"""
copied and modified from https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L159
"""
def __init__(self, dim, heads = 4, dim_head = 32, verbose=True):
super().__init__()
self.heads = heads
hidden_dim = dim_head * heads
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
def forward(self, x):
'''
Args:
x: torch.tensor (B,C,N), C=num-channels, N=num-points
Returns:
out: torch.tensor (B,C,N)
'''
x = x.unsqueeze(-1) # add w dimension
b, c, h, w = x.shape
qkv = self.to_qkv(x)
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
k = k.softmax(dim=-1)
context = torch.einsum('bhdn,bhen->bhde', k, v)
out = torch.einsum('bhde,bhdn->bhen', context, q)
out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
out = self.to_out(out)
out = out.squeeze(-1) # B,C,N,1 -> B,C,N
return out
def swish(input):
return input * torch.sigmoid(input)
class Swish(nn.Module):
def __init__(self):
super().__init__()
def forward(self, input):
return swish(input)
class BallQuery(nn.Module):
def __init__(self, radius, num_neighbors, include_coordinates=True):
super().__init__()
self.radius = radius
self.num_neighbors = num_neighbors
self.include_coordinates = include_coordinates
@custom_bwd
def backward(self, *args, **kwargs):
return super().backward(*args, **kwargs)
@custom_fwd(cast_inputs=torch.float32)
def forward(self, points_coords, centers_coords, points_features=None):
# input: BCN, BCN
# returns:
# neighbor_features: B,D(+3),Ncenter
points_coords = points_coords.contiguous()
centers_coords = centers_coords.contiguous()
neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
neighbor_coordinates = F.grouping(points_coords, neighbor_indices)
neighbor_coordinates = neighbor_coordinates - centers_coords.unsqueeze(-1)
if points_features is None:
assert self.include_coordinates, 'No Features For Grouping'
neighbor_features = neighbor_coordinates
else:
neighbor_features = F.grouping(points_features, neighbor_indices)
if self.include_coordinates:
neighbor_features = torch.cat([neighbor_coordinates, neighbor_features], dim=1)
return neighbor_features
def extra_repr(self):
return 'radius={}, num_neighbors={}{}'.format(
self.radius, self.num_neighbors, ', include coordinates' if self.include_coordinates else '')
class SharedMLP(nn.Module):
def __init__(self, in_channels, out_channels, dim=1):
super().__init__()
if dim==1:
conv = nn.Conv1d
else:
conv = nn.Conv2d
bn = nn.GroupNorm
if not isinstance(out_channels, (list, tuple)):
out_channels = [out_channels]
layers = []
for oc in out_channels:
layers.append( conv(in_channels, oc, 1))
layers.append(bn(8, oc))
layers.append(Swish())
in_channels = oc
self.layers = nn.Sequential(*layers)
def forward(self, inputs):
if isinstance(inputs, (list, tuple)):
return (self.layers(inputs[0]), *inputs[1:])
else:
return self.layers(inputs)
class Voxelization(nn.Module):
def __init__(self, resolution, normalize=True, eps=0):
super().__init__()
self.r = int(resolution)
self.normalize = normalize
self.eps = eps
def forward(self, features, coords):
# features: B,D,N
# coords: B,3,N
coords = coords.detach()
norm_coords = coords - coords.mean(2, keepdim=True)
if self.normalize:
norm_coords = norm_coords / (norm_coords.norm(
dim=1, keepdim=True).max(dim=2, keepdim=True).values * 2.0 +
self.eps) + 0.5
else:
norm_coords = (norm_coords + 1) / 2.0
norm_coords = torch.clamp(norm_coords * self.r, 0, self.r - 1)
vox_coords = torch.round(norm_coords).to(torch.int32)
if features is None:
return features, norm_coords
return F.avg_voxelize(features, vox_coords, self.r), norm_coords
def extra_repr(self):
return 'resolution={}{}'.format(
self.r,
', normalized eps = {}'.format(self.eps) if self.normalize else '')
class PVConv(nn.Module):
def __init__(self, in_channels, out_channels,
kernel_size, resolution,
normalize=1, eps=0, with_se=False,
add_point_feat=True, attention=False,
dropout=0.1, verbose=True
):
super().__init__()
self.resolution = resolution
self.voxelization = Voxelization(resolution,
normalize=normalize,
eps=eps)
# For each PVConv we use (Conv3d, GroupNorm(8), Swish, dropout, Conv3d, GroupNorm(8), Attention)
voxel_layers = [
nn.Conv3d(in_channels,
out_channels,
kernel_size, stride=1,
padding=kernel_size // 2),
nn.GroupNorm(8, out_channels),
Swish(),
nn.Dropout(dropout),
nn.Conv3d(out_channels, out_channels,
kernel_size, stride=1,
padding=kernel_size // 2),
nn.GroupNorm(8, out_channels)
]
if with_se:
voxel_layers.append(SE3d(out_channels))
self.voxel_layers = nn.Sequential(*voxel_layers)
if attention:
self.attn = LinearAttention(out_channels, verbose=verbose)
else:
self.attn = None
if add_point_feat:
self.point_features = SharedMLP(in_channels, out_channels) #, **mlp_kwargs)
self.add_point_feat = add_point_feat
def forward(self, inputs):
'''
Args:
inputs: tuple of features and coords
features: B,feat-dim,num-points
coords: B,3, num-points
Returns:
fused_features: in (B,out-feat-dim,num-points)
coords : in (B, 3, num_points); same as the input coords
'''
features = inputs[0]
coords_input = inputs[1]
time_emb = inputs[2]
## features, coords_input, time_emb = inputs
if coords_input.shape[1] > 3:
coords = coords_input[:,:3] # the last 3 dim are other point attributes if any
else:
coords = coords_input
assert (features.shape[0] == coords.shape[0]
), f'get feat: {features.shape} and {coords.shape}'
assert (features.shape[2] == coords.shape[2]
), f'get feat: {features.shape} and {coords.shape}'
assert (coords.shape[1] == 3
), f'expect coords: B,3,Npoint, get: {coords.shape}'
# features: B,D,N; point_features
# coords: B,3,N
voxel_features_4d, voxel_coords = self.voxelization(features, coords)
r = self.resolution
B = coords.shape[0]
voxel_features_4d = self.voxel_layers(voxel_features_4d)
voxel_features = F.trilinear_devoxelize(voxel_features_4d, voxel_coords,
r, self.training)
fused_features = voxel_features
if self.add_point_feat:
fused_features = fused_features + self.point_features(features)
if self.attn is not None:
fused_features = self.attn(fused_features)
if time_emb is None:
time_emb = {'voxel_features_4d': voxel_features_4d, 'resolution': self.resolution, 'training': self.training}
return fused_features, coords_input, time_emb #inputs[2]
class PointNetAModule(nn.Module):
def __init__(self, in_channels, out_channels, include_coordinates=True):
super().__init__()
if not isinstance(out_channels, (list, tuple)):
out_channels = [[out_channels]]
elif not isinstance(out_channels[0], (list, tuple)):
out_channels = [out_channels]
mlps = []
total_out_channels = 0
for _out_channels in out_channels:
mlps.append(
SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0),
out_channels=_out_channels, dim=1)
)
total_out_channels += _out_channels[-1]
self.include_coordinates = include_coordinates
self.out_channels = total_out_channels
self.mlps = nn.ModuleList(mlps)
def forward(self, inputs):
features, coords, time_emb = inputs
if self.include_coordinates:
features = torch.cat([features, coords], dim=1)
coords = torch.zeros((coords.size(0), 3, 1), device=coords.device)
if len(self.mlps) > 1:
features_list = []
for mlp in self.mlps:
features_list.append(mlp(features).max(dim=-1, keepdim=True).values)
return torch.cat(features_list, dim=1), coords, time_emb
else:
return self.mlps[0](features).max(dim=-1, keepdim=True).values, coords, time_emb
def extra_repr(self):
return f'out_channels={self.out_channels}, include_coordinates={self.include_coordinates}'
class PointNetSAModule(nn.Module):
def __init__(self, num_centers, radius, num_neighbors, in_channels, out_channels, include_coordinates=True):
super().__init__()
if not isinstance(radius, (list, tuple)):
radius = [radius]
if not isinstance(num_neighbors, (list, tuple)):
num_neighbors = [num_neighbors] * len(radius)
assert len(radius) == len(num_neighbors)
if not isinstance(out_channels, (list, tuple)):
out_channels = [[out_channels]] * len(radius)
elif not isinstance(out_channels[0], (list, tuple)):
out_channels = [out_channels] * len(radius)
assert len(radius) == len(out_channels)
groupers, mlps = [], []
total_out_channels = 0
for _radius, _out_channels, _num_neighbors in zip(radius, out_channels, num_neighbors):
groupers.append(
BallQuery(radius=_radius, num_neighbors=_num_neighbors,
include_coordinates=include_coordinates)
)
# logger.info('create MLP: in_channel={}, out_channels={}',
# in_channels + (3 if include_coordinates else 0),_out_channels)
mlps.append(
SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0) ,
out_channels=_out_channels, dim=2)
)
total_out_channels += _out_channels[-1]
self.num_centers = num_centers
self.out_channels = total_out_channels
self.groupers = nn.ModuleList(groupers)
self.mlps = nn.ModuleList(mlps)
def forward(self, inputs):
# features, coords, _ = inputs
features = inputs[0]
coords = inputs[1] # B3N
if coords.shape[1] > 3:
coords = coords[:,:3]
centers_coords = F.furthest_point_sample(coords, self.num_centers)
# centers_coords: (B, 3, num_centers)
S = centers_coords.shape[-1]
time_emb = inputs[2]
if time_emb is not None and not isinstance(time_emb, dict):
time_emb = time_emb[:, :, :S]
features_list = []
for grouper, mlp in zip(self.groupers, self.mlps):
grouper_output = grouper(coords, centers_coords, features)
features_list.append(mlp(grouper_output).max(dim=-1).values)
if len(features_list) > 1:
return torch.cat(features_list, dim=1), centers_coords, time_emb
else:
return features_list[0], centers_coords, time_emb
def extra_repr(self):
return f'num_centers={self.num_centers}, out_channels={self.out_channels}'
class PointNetFPModule(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.mlp = SharedMLP(in_channels=in_channels, out_channels=out_channels, dim=1)
def forward(self, inputs):
if len(inputs) == 4:
points_coords, centers_coords, centers_features, time_emb = inputs
points_features = None
else:
points_coords, centers_coords, centers_features, points_features, time_emb = inputs
interpolated_features = F.nearest_neighbor_interpolate(points_coords, centers_coords, centers_features)
if points_features is not None:
interpolated_features = torch.cat(
[interpolated_features, points_features], dim=1
)
if time_emb is not None:
B,D,S = time_emb.shape
N = points_coords.shape[-1]
time_emb = time_emb[:,:,0:1].expand(-1,-1,N)
return self.mlp(interpolated_features), points_coords, time_emb
def _linear_gn_relu(in_channels, out_channels):
return nn.Sequential(nn.Linear(in_channels, out_channels), nn.GroupNorm(8,out_channels), Swish())
def create_mlp_components(in_channels, out_channels, classifier=False, dim=2, width_multiplier=1):
r = width_multiplier
if dim == 1:
block = _linear_gn_relu
else:
block = SharedMLP
if not isinstance(out_channels, (list, tuple)):
out_channels = [out_channels]
if len(out_channels) == 0 or (len(out_channels) == 1 and out_channels[0] is None):
return nn.Sequential(), in_channels, in_channels
layers = []
for oc in out_channels[:-1]:
if oc < 1:
layers.append(nn.Dropout(oc))
else:
oc = int(r * oc)
layers.append(block(in_channels, oc))
in_channels = oc
if dim == 1:
if classifier:
layers.append(nn.Linear(in_channels, out_channels[-1]))
else:
layers.append(_linear_gn_relu(in_channels, int(r * out_channels[-1])))
else:
if classifier:
layers.append(nn.Conv1d(in_channels, out_channels[-1], 1))
else:
layers.append(SharedMLP(in_channels, int(r * out_channels[-1])))
return layers, out_channels[-1] if classifier else int(r * out_channels[-1])
def create_pointnet_components(blocks, in_channels, embed_dim, with_se=False, normalize=True, eps=0,
width_multiplier=1, voxel_resolution_multiplier=1, verbose=True):
r, vr = width_multiplier, voxel_resolution_multiplier
layers, concat_channels = [], 0
c = 0
for k, (out_channels, num_blocks, voxel_resolution) in enumerate(blocks):
out_channels = int(r * out_channels)
for p in range(num_blocks):
attention = k % 2 == 0 and k > 0 and p == 0
if voxel_resolution is None:
block = SharedMLP
else:
block = functools.partial(PVConv, kernel_size=3, resolution=int(vr * voxel_resolution), attention=attention,
with_se=with_se, normalize=normalize, eps=eps, verbose=verbose)
if c == 0:
layers.append(block(in_channels, out_channels))
else:
layers.append(block(in_channels+embed_dim, out_channels))
in_channels = out_channels
concat_channels += out_channels
c += 1
return layers, in_channels, concat_channels
def create_pointnet2_sa_components(sa_blocks, extra_feature_channels,
input_dim=3,
embed_dim=64, use_att=False, force_att=0,
dropout=0.1, with_se=False, normalize=True, eps=0, has_temb=1,
width_multiplier=1, voxel_resolution_multiplier=1, verbose=True):
"""
Returns:
in_channels: the last output channels of the sa blocks
"""
r, vr = width_multiplier, voxel_resolution_multiplier
in_channels = extra_feature_channels + input_dim
sa_layers, sa_in_channels = [], []
c = 0
num_centers = None
for conv_configs, sa_configs in sa_blocks:
k = 0
sa_in_channels.append(in_channels)
sa_blocks = []
if conv_configs is not None:
out_channels, num_blocks, voxel_resolution = conv_configs
out_channels = int(r * out_channels)
for p in range(num_blocks):
attention = ( (c+1) % 2 == 0 and use_att and p == 0 ) or (force_att and c > 0)
if voxel_resolution is None:
block = SharedMLP
else:
block = functools.partial(
PVConv, kernel_size=3,
resolution=int(vr * voxel_resolution), attention=attention,
dropout=dropout,
with_se=with_se,
normalize=normalize, eps=eps, verbose=verbose)
if c == 0:
sa_blocks.append(block(in_channels, out_channels))
elif k ==0:
sa_blocks.append(block(in_channels+embed_dim*has_temb, out_channels))
in_channels = out_channels
k += 1
extra_feature_channels = in_channels
if sa_configs is not None:
num_centers, radius, num_neighbors, out_channels = sa_configs
_out_channels = []
for oc in out_channels:
if isinstance(oc, (list, tuple)):
_out_channels.append([int(r * _oc) for _oc in oc])
else:
_out_channels.append(int(r * oc))
out_channels = _out_channels
if num_centers is None:
block = PointNetAModule
else:
block = functools.partial(PointNetSAModule, num_centers=num_centers, radius=radius,
num_neighbors=num_neighbors)
sa_blocks.append(block(in_channels=extra_feature_channels+(embed_dim*has_temb if k==0 else 0 ),
out_channels=out_channels,
include_coordinates=True))
in_channels = extra_feature_channels = sa_blocks[-1].out_channels
c += 1
if len(sa_blocks) == 1:
sa_layers.append(sa_blocks[0])
else:
sa_layers.append(nn.Sequential(*sa_blocks))
return sa_layers, sa_in_channels, in_channels, 1 if num_centers is None else num_centers
def create_pointnet2_fp_modules(fp_blocks, in_channels, sa_in_channels, embed_dim=64, use_att=False,
dropout=0.1, has_temb=1,
with_se=False, normalize=True, eps=0,
width_multiplier=1, voxel_resolution_multiplier=1,
verbose=True):
r, vr = width_multiplier, voxel_resolution_multiplier
fp_layers = []
c = 0
for fp_idx, (fp_configs, conv_configs) in enumerate(fp_blocks):
fp_blocks = []
out_channels = tuple(int(r * oc) for oc in fp_configs)
fp_blocks.append(
PointNetFPModule(in_channels=in_channels + sa_in_channels[-1 - fp_idx] + embed_dim*has_temb,
out_channels=out_channels)
)
in_channels = out_channels[-1]
if conv_configs is not None:
out_channels, num_blocks, voxel_resolution = conv_configs
out_channels = int(r * out_channels)
for p in range(num_blocks):
attention = (c+1) % 2 == 0 and c < len(fp_blocks) - 1 and use_att and p == 0
if voxel_resolution is None:
block = SharedMLP
else:
block = functools.partial(PVConv, kernel_size=3,
resolution=int(vr * voxel_resolution), attention=attention,
dropout=dropout,
with_se=with_se, # with_se_relu=True,
normalize=normalize, eps=eps,
verbose=verbose)
fp_blocks.append(block(in_channels, out_channels))
in_channels = out_channels
if len(fp_blocks) == 1:
fp_layers.append(fp_blocks[0])
else:
fp_layers.append(nn.Sequential(*fp_blocks))
c += 1
return fp_layers, in_channels
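`LinearAttention` avoids the O(N^2) attention matrix by contracting keys with values first; the two `einsum` calls are equivalent to two batched matmuls, as this standalone check shows:

```
import torch

b, h, d, n = 2, 4, 32, 1024                        # batch, heads, head dim, points
q = torch.randn(b, h, d, n)
k = torch.randn(b, h, d, n).softmax(dim=-1)
v = torch.randn(b, h, d, n)

context = torch.einsum('bhdn,bhen->bhde', k, v)    # (b, h, d, d): cost O(n * d^2)
out = torch.einsum('bhde,bhdn->bhen', context, q)  # never materializes the n x n matrix

# same result via plain matmuls
assert torch.allclose(out, (k @ v.transpose(-1, -2)).transpose(-1, -2) @ q, atol=1e-4)
```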

models/pvcnn2_ada.py Normal file

@ -0,0 +1,568 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
"""
copied and modified from source:
https://github.com/alexzhou907/PVD/blob/9747265a5f141e5546fd4f862bfa66aa59f1bd33/model/pvcnn_generation.py
and functions under
https://github.com/alexzhou907/PVD/tree/9747265a5f141e5546fd4f862bfa66aa59f1bd33/modules
"""
import copy
import functools
from loguru import logger
from einops import rearrange
import torch.nn as nn
import torch
import numpy as np
import third_party.pvcnn.functional as F
# from utils.checker import *
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
from .adagn import AdaGN
import os
quiet = int(os.environ.get('quiet', 0))
class SE3d(nn.Module):
def __init__(self, channel, reduction=8):
super().__init__()
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel, bias=False),
nn.Sigmoid()
)
self.channel = channel
def __repr__(self):
return f"SE({self.channel}, {self.channel})"
def forward(self, inputs):
return inputs * self.fc(inputs.mean(-1).mean(-1).mean(-1)).view(inputs.shape[0], inputs.shape[1], 1, 1, 1)
class LinearAttention(nn.Module):
"""
copied and modified from https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L159
"""
def __init__(self, dim, heads = 4, dim_head = 32, verbose=True):
super().__init__()
self.heads = heads
hidden_dim = dim_head * heads
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
def forward(self, x):
'''
Args:
x: torch.tensor (B,C,N), C=num-channels, N=num-points
Returns:
out: torch.tensor (B,C,N)
'''
x = x.unsqueeze(-1) # add w dimension
b, c, h, w = x.shape
qkv = self.to_qkv(x)
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
k = k.softmax(dim=-1)
context = torch.einsum('bhdn,bhen->bhde', k, v)
out = torch.einsum('bhde,bhdn->bhen', context, q)
out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
out = self.to_out(out)
out = out.squeeze(-1) # B,C,N,1 -> B,C,N
return out
def swish(input):
return input * torch.sigmoid(input)
class Swish(nn.Module):
def __init__(self):
super().__init__()
def forward(self, input):
return swish(input)
class BallQuery(nn.Module):
def __init__(self, radius, num_neighbors, include_coordinates=True):
super().__init__()
self.radius = radius
self.num_neighbors = num_neighbors
self.include_coordinates = include_coordinates
@custom_bwd
def backward(self, *args, **kwargs):
return super().backward(*args, **kwargs)
@custom_fwd(cast_inputs=torch.float32)
def forward(self, points_coords, centers_coords, points_features=None):
# input: BCN, BCN
# neighbor_features: B,D(+3),Ncenter
points_coords = points_coords.contiguous()
centers_coords = centers_coords.contiguous()
neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
neighbor_coordinates = F.grouping(points_coords, neighbor_indices)
neighbor_coordinates = neighbor_coordinates - centers_coords.unsqueeze(-1)
if points_features is None:
assert self.include_coordinates, 'No Features For Grouping'
neighbor_features = neighbor_coordinates
else:
neighbor_features = F.grouping(points_features, neighbor_indices)
if self.include_coordinates:
neighbor_features = torch.cat([neighbor_coordinates, neighbor_features], dim=1)
return neighbor_features
def extra_repr(self):
return 'radius={}, num_neighbors={}{}'.format(
self.radius, self.num_neighbors, ', include coordinates' if self.include_coordinates else '')
class SharedMLP(nn.Module):
def __init__(self, in_channels, out_channels, dim=1, cfg={}):
assert(len(cfg) > 0), cfg
super().__init__()
if dim==1:
conv = nn.Conv1d
else:
conv = nn.Conv2d
bn = functools.partial(AdaGN, dim, cfg)
if not isinstance(out_channels, (list, tuple)):
out_channels = [out_channels]
layers = []
for oc in out_channels:
layers.append(conv(in_channels, oc, 1))
layers.append(bn(oc))
layers.append(Swish())
in_channels = oc
self.layers = nn.ModuleList(layers)
def forward(self, *inputs):
if len(inputs) == 1 and len(inputs[0]) == 4:
# unpack when SharedMLP is the first layer and receives the full input tuple
inputs = inputs[0]
if len(inputs) == 1:
raise NotImplementedError
elif len(inputs) == 4:
x, _, _, style = inputs
for l in self.layers:
if isinstance(l, AdaGN):
x = l(x, style)
else:
x = l(x)
return (x, *inputs[1:])
elif len(inputs) == 2:
x, style = inputs
for l in self.layers:
if isinstance(l, AdaGN):
x = l(x, style)
else:
x = l(x)
return x
else:
raise NotImplementedError
class Voxelization(nn.Module):
def __init__(self, resolution, normalize=True, eps=0):
super().__init__()
self.r = int(resolution)
self.normalize = normalize
self.eps = eps
def forward(self, features, coords):
# features: B,D,N
# coords: B,3,N
coords = coords.detach()
norm_coords = coords - coords.mean(2, keepdim=True)
if self.normalize:
norm_coords = norm_coords / (norm_coords.norm(
dim=1, keepdim=True).max(dim=2, keepdim=True).values * 2.0 +
self.eps) + 0.5
else:
norm_coords = (norm_coords + 1) / 2.0
norm_coords = torch.clamp(norm_coords * self.r, 0, self.r - 1)
vox_coords = torch.round(norm_coords).to(torch.int32)
if features is None:
return features, norm_coords
return F.avg_voxelize(features, vox_coords, self.r), norm_coords
def extra_repr(self):
return 'resolution={}{}'.format(
self.r,
', normalized eps = {}'.format(self.eps) if self.normalize else '')
class PVConv(nn.Module):
def __init__(self, in_channels, out_channels,
kernel_size, resolution,
normalize=1, eps=0, with_se=False,
add_point_feat=True, attention=False,
dropout=0.1, verbose=True,
cfg={}
):
super().__init__()
assert(len(cfg) > 0), cfg
self.resolution = resolution
self.voxelization = Voxelization(resolution,
normalize=normalize,
eps=eps)
# For each PVConv we use (Conv3d, GroupNorm(8), Swish, dropout, Conv3d, GroupNorm(8), Attention)
NormLayer = functools.partial(AdaGN, 3, cfg)
voxel_layers = [
nn.Conv3d(in_channels ,
out_channels,
kernel_size, stride=1,
padding=kernel_size // 2),
NormLayer(out_channels),
Swish(),
nn.Dropout(dropout),
nn.Conv3d(out_channels, out_channels,
kernel_size, stride=1,
padding=kernel_size // 2),
NormLayer(out_channels)
]
if with_se:
voxel_layers.append(SE3d(out_channels))
self.voxel_layers = nn.ModuleList(voxel_layers)
if attention:
self.attn = LinearAttention(out_channels, verbose=verbose)
else:
self.attn = None
if add_point_feat:
self.point_features = SharedMLP(in_channels, out_channels, cfg=cfg)
self.add_point_feat = add_point_feat
def forward(self, inputs):
'''
Args:
inputs: tuple of features and coords
features: B,feat-dim,num-points
coords: B,3, num-points
time_emb: B,D; time embedding
style: B,D; global latent
Returns:
fused_features: in (B,out-feat-dim,num-points)
coords : in (B, 3 or 6, num_points); same as the input coords
'''
features = inputs[0]
coords_input= inputs[1]
time_emb = inputs[2]
style = inputs[3]
if coords_input.shape[1] > 3:
coords = coords_input[:,:3]
else:
coords = coords_input
assert (features.shape[0] == coords.shape[0]
), f'get feat: {features.shape} and {coords.shape}'
assert (features.shape[2] == coords.shape[2]
), f'get feat: {features.shape} and {coords.shape}'
assert (coords.shape[1] == 3
), f'expect coords: B,3,Npoint, get: {coords.shape}'
# features: B,D,N; point_features
# coords: B,3,N
voxel_features_4d, voxel_coords = self.voxelization(features, coords)
r = self.resolution
B = coords.shape[0]
for voxel_layer in self.voxel_layers:
if isinstance(voxel_layer, AdaGN):
voxel_features_4d = voxel_layer(voxel_features_4d, style)
else:
voxel_features_4d = voxel_layer(voxel_features_4d)
voxel_features = F.trilinear_devoxelize(voxel_features_4d, voxel_coords,
r, self.training)
fused_features = voxel_features
if self.add_point_feat:
fused_features = fused_features + self.point_features(features, style)
if self.attn is not None:
fused_features = self.attn(fused_features)
return fused_features, coords_input, time_emb, style
class PointNetAModule(nn.Module):
def __init__(self, in_channels, out_channels, include_coordinates=True, cfg={}):
super().__init__()
if not isinstance(out_channels, (list, tuple)):
out_channels = [[out_channels]]
elif not isinstance(out_channels[0], (list, tuple)):
out_channels = [out_channels]
mlps = []
total_out_channels = 0
for _out_channels in out_channels:
mlps.append(
SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0),
out_channels=_out_channels, dim=1, cfg=cfg)
)
total_out_channels += _out_channels[-1]
self.include_coordinates = include_coordinates
self.out_channels = total_out_channels
self.mlps = nn.ModuleList(mlps)
def forward(self, inputs):
features, coords, time_emb, style = inputs
if self.include_coordinates:
features = torch.cat([features, coords], dim=1)
coords = torch.zeros((coords.size(0), 3, 1), device=coords.device)
if len(self.mlps) > 1:
features_list = []
for mlp in self.mlps:
features_list.append(mlp(features, style).max(dim=-1, keepdim=True).values)
return torch.cat(features_list, dim=1), coords, time_emb, style
else:
return self.mlps[0](features, style).max(dim=-1, keepdim=True).values, coords, time_emb, style
def extra_repr(self):
return f'out_channels={self.out_channels}, include_coordinates={self.include_coordinates}'
class PointNetSAModule(nn.Module):
def __init__(self, num_centers, radius, num_neighbors, in_channels, out_channels, include_coordinates=True,
cfg={}):
super().__init__()
if not isinstance(radius, (list, tuple)):
radius = [radius]
if not isinstance(num_neighbors, (list, tuple)):
num_neighbors = [num_neighbors] * len(radius)
assert len(radius) == len(num_neighbors)
if not isinstance(out_channels, (list, tuple)):
out_channels = [[out_channels]] * len(radius)
elif not isinstance(out_channels[0], (list, tuple)):
out_channels = [out_channels] * len(radius)
assert len(radius) == len(out_channels)
groupers, mlps = [], []
total_out_channels = 0
for _radius, _out_channels, _num_neighbors in zip(radius, out_channels, num_neighbors):
groupers.append(
BallQuery(radius=_radius, num_neighbors=_num_neighbors,
include_coordinates=include_coordinates)
)
mlps.append(
SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0),
out_channels=_out_channels, dim=2, cfg=cfg)
)
total_out_channels += _out_channels[-1]
self.num_centers = num_centers
self.out_channels = total_out_channels
self.groupers = nn.ModuleList(groupers)
self.mlps = nn.ModuleList(mlps)
def forward(self, inputs):
features = inputs[0]
coords = inputs[1] # B3N
style = inputs[3]
if coords.shape[1] > 3:
coords = coords[:,:3]
centers_coords = F.furthest_point_sample(coords, self.num_centers)
# centers_coords: (B, 3, num_centers)
S = centers_coords.shape[-1]
time_emb = inputs[2]
if time_emb is not None and not isinstance(time_emb, dict):
time_emb = time_emb[:, :, :S]
features_list = []
for grouper, mlp in zip(self.groupers, self.mlps):
grouper_output = grouper(coords, centers_coords, features)
features_list.append(mlp(grouper_output, style).max(dim=-1).values)
if len(features_list) > 1:
return torch.cat(features_list, dim=1), centers_coords, time_emb, style
else:
return features_list[0], centers_coords, time_emb, style
def extra_repr(self):
return f'num_centers={self.num_centers}, out_channels={self.out_channels}'
class PointNetFPModule(nn.Module):
def __init__(self, in_channels, out_channels, cfg={}):
super().__init__()
self.mlp = SharedMLP(in_channels=in_channels, out_channels=out_channels, dim=1, cfg=cfg)
def forward(self, inputs):
if len(inputs) == 5:
points_coords, centers_coords, centers_features, time_emb, style = inputs
points_features = None
elif len(inputs) == 6:
points_coords, centers_coords, centers_features, points_features, time_emb, style = inputs
else:
raise NotImplementedError
interpolated_features = F.nearest_neighbor_interpolate(points_coords, centers_coords, centers_features)
if points_features is not None:
interpolated_features = torch.cat(
[interpolated_features, points_features], dim=1
)
if time_emb is not None:
B,D,S = time_emb.shape
N = points_coords.shape[-1]
time_emb = time_emb[:,:,0:1].expand(-1,-1,N)
return self.mlp(interpolated_features, style), points_coords, time_emb, style
def _linear_gn_relu(in_channels, out_channels):
return nn.Sequential(nn.Linear(in_channels, out_channels), nn.GroupNorm(8,out_channels), Swish())
def create_mlp_components(in_channels, out_channels, classifier=False, dim=2, width_multiplier=1, cfg={}):
r = width_multiplier
if dim == 1:
block = _linear_gn_relu
else:
block = SharedMLP
if not isinstance(out_channels, (list, tuple)):
out_channels = [out_channels]
if len(out_channels) == 0 or (len(out_channels) == 1 and out_channels[0] is None):
return nn.Sequential(), in_channels, in_channels
layers = []
for oc in out_channels[:-1]:
if oc < 1:
layers.append(nn.Dropout(oc))
else:
oc = int(r * oc)
layers.append(block(in_channels, oc, cfg=cfg))
in_channels = oc
if dim == 1:
if classifier:
layers.append(nn.Linear(in_channels, out_channels[-1]))
else:
layers.append(_linear_gn_relu(in_channels, int(r * out_channels[-1])))
else:
if classifier:
layers.append(nn.Conv1d(in_channels, out_channels[-1], 1))
else:
layers.append(SharedMLP(in_channels, int(r * out_channels[-1]), cfg=cfg)) # this SharedMLP requires cfg
return layers, out_channels[-1] if classifier else int(r * out_channels[-1])
def create_pointnet2_sa_components(sa_blocks, extra_feature_channels,
input_dim=3,
embed_dim=64, use_att=False, force_att=0,
dropout=0.1, with_se=False, normalize=True, eps=0, has_temb=1,
width_multiplier=1, voxel_resolution_multiplier=1, verbose=True,
cfg={}):
"""
Returns:
in_channels: the last output channels of the sa blocks
"""
assert(len(cfg) > 0), cfg
r, vr = width_multiplier, voxel_resolution_multiplier
in_channels = extra_feature_channels + input_dim
sa_layers, sa_in_channels = [], []
c = 0
num_centers = None
for conv_configs, sa_configs in sa_blocks:
k = 0
sa_in_channels.append(in_channels)
sa_blocks = []
if conv_configs is not None:
out_channels, num_blocks, voxel_resolution = conv_configs
out_channels = int(r * out_channels)
for p in range(num_blocks):
attention = ( (c+1) % 2 == 0 and use_att and p == 0 ) or (force_att and c > 0)
if voxel_resolution is None:
block = SharedMLP
else:
block = functools.partial(
PVConv, kernel_size=3,
resolution=int(vr * voxel_resolution), attention=attention,
dropout=dropout,
with_se=with_se, # with_se_relu=True,
normalize=normalize, eps=eps, verbose=verbose, cfg=cfg)
if c == 0:
sa_blocks.append(block(in_channels, out_channels, cfg=cfg))
elif k ==0:
sa_blocks.append(block(in_channels+embed_dim*has_temb, out_channels, cfg=cfg))
in_channels = out_channels
k += 1
extra_feature_channels = in_channels
if sa_configs is not None:
num_centers, radius, num_neighbors, out_channels = sa_configs
_out_channels = []
for oc in out_channels:
if isinstance(oc, (list, tuple)):
_out_channels.append([int(r * _oc) for _oc in oc])
else:
_out_channels.append(int(r * oc))
out_channels = _out_channels
if num_centers is None:
block = PointNetAModule
else:
block = functools.partial(PointNetSAModule, num_centers=num_centers, radius=radius,
num_neighbors=num_neighbors)
sa_blocks.append(block(cfg=cfg,
in_channels=extra_feature_channels+(embed_dim*has_temb if k==0 else 0 ),
out_channels=out_channels,
include_coordinates=True))
in_channels = extra_feature_channels = sa_blocks[-1].out_channels
c += 1
if len(sa_blocks) == 1:
sa_layers.append(sa_blocks[0])
else:
sa_layers.append(nn.Sequential(*sa_blocks))
return sa_layers, sa_in_channels, in_channels, 1 if num_centers is None else num_centers
def create_pointnet2_fp_modules(fp_blocks, in_channels, sa_in_channels, embed_dim=64, use_att=False,
dropout=0.1, has_temb=1,
with_se=False, normalize=True, eps=0,
width_multiplier=1, voxel_resolution_multiplier=1,
verbose=True, cfg={}):
assert(len(cfg) > 0), cfg
r, vr = width_multiplier, voxel_resolution_multiplier
fp_layers = []
c = 0
for fp_idx, (fp_configs, conv_configs) in enumerate(fp_blocks):
fp_blocks = []
out_channels = tuple(int(r * oc) for oc in fp_configs)
fp_blocks.append(
PointNetFPModule(
in_channels=in_channels + sa_in_channels[-1 - fp_idx] + embed_dim*has_temb,
out_channels=out_channels,
cfg=cfg)
)
in_channels = out_channels[-1]
if conv_configs is not None:
out_channels, num_blocks, voxel_resolution = conv_configs
out_channels = int(r * out_channels)
for p in range(num_blocks):
attention = (c+1) % 2 == 0 and c < len(fp_blocks) - 1 and use_att and p == 0
if voxel_resolution is None:
block = functools.partial(SharedMLP, cfg=cfg)
else:
block = functools.partial(PVConv, kernel_size=3,
resolution=int(vr * voxel_resolution), attention=attention,
dropout=dropout,
with_se=with_se, # with_se_relu=True,
normalize=normalize, eps=eps,
verbose=verbose,
cfg=cfg)
fp_blocks.append(block(in_channels, out_channels))
in_channels = out_channels
if len(fp_blocks) == 1:
fp_layers.append(fp_blocks[0])
else:
fp_layers.append(nn.Sequential(*fp_blocks))
c += 1
return fp_layers, in_channels
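The `AdaGN` layer imported at the top of this file is not part of this diff; conceptually it is a GroupNorm whose affine parameters are predicted from the style latent. A minimal sketch under that assumption (names and details are illustrative, not the repo's exact module):

```
import torch
import torch.nn as nn

class AdaGNSketch(nn.Module):
    """GroupNorm modulated by a style vector: GN(x) * (1 + scale(s)) + shift(s)."""
    def __init__(self, num_channels, style_dim, num_groups=8):
        super().__init__()
        self.norm = nn.GroupNorm(num_groups, num_channels, affine=False)
        self.affine = nn.Linear(style_dim, 2 * num_channels)  # predicts scale and shift

    def forward(self, x, style):                  # x: (B, C, ...), style: (B, style_dim)
        scale, shift = self.affine(style).chunk(2, dim=1)
        shape = (x.shape[0], x.shape[1]) + (1,) * (x.dim() - 2)
        return self.norm(x) * (1 + scale.view(shape)) + shift.view(shape)

x, s = torch.randn(2, 64, 2048), torch.randn(2, 128)
assert AdaGNSketch(64, 128)(x, s).shape == x.shape
```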

models/score_sde/resnet.py Normal file

@ -0,0 +1,230 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
""" implement the gloabl prior for LION
"""
import torch.nn as nn
from loguru import logger
import functools
import torch
from ..utils import init_temb_fun, mask_inactive_variables
class SE(nn.Module):
def __init__(self, channel, reduction=8):
super().__init__()
self.fc = nn.Sequential(
nn.Conv2d(channel, channel // reduction, 1, 1, bias=False),
nn.ReLU(inplace=True),
nn.Conv2d(channel // reduction, channel, 1, 1, bias=False),
nn.Sigmoid()
)
def forward(self, inputs):
return inputs * self.fc(inputs)
class ResBlockSEClip(nn.Module):
"""
fixed the conv0 not used error in ResBlockSE
"""
def __init__(self, input_dim, output_dim):
super().__init__()
self.non_linearity = nn.ReLU(inplace=True)
self.input_dim = input_dim
self.output_dim = output_dim
self.conv1 = nn.Conv2d(input_dim*2, output_dim, 1, 1)
self.conv2 = nn.Conv2d(output_dim, output_dim, 1, 1)
in_ch = self.output_dim
self.SE = SE(in_ch)
def forward(self, x, t):
## logger.info('x: {}, t: {}, input_dim={}', x.shape, t.shape, self.input_dim)
clip_feat = t[:, self.input_dim:].contiguous()
t = t[:,:self.input_dim].contiguous()
output = x + t
output = torch.cat([output, clip_feat], dim=1).contiguous()
output = self.conv1(output)
output = self.non_linearity(output)
output = self.conv2(output)
output = self.non_linearity(output)
output = self.SE(output)
shortcut = x
return shortcut + output
def __repr__(self):
return "ResBlockSEClip(%d, %d)"%(self.input_dim, self.output_dim)
class ResBlockSEDrop(nn.Module):
"""
fixed the conv0 not used error in ResBlockSE
"""
def __init__(self, input_dim, output_dim, dropout):
super().__init__()
self.non_linearity = nn.ReLU(inplace=True)
self.input_dim = input_dim
self.output_dim = output_dim
self.conv1 = nn.Conv2d(input_dim, output_dim, 1, 1)
self.conv2 = nn.Conv2d(output_dim, output_dim, 1, 1)
in_ch = self.output_dim
self.SE = SE(in_ch)
self.dropout = nn.Dropout(dropout)
self.dropout_ratio = dropout
def forward(self, x, t):
output = x + t
output = self.conv1(output)
output = self.non_linearity(output)
output = self.dropout(output)
output = self.conv2(output)
output = self.non_linearity(output)
output = self.SE(output)
shortcut = x
return shortcut + output
def __repr__(self):
return "ResBlockSE_withdropout(%d, %d, drop=%f)" % (
self.input_dim, self.output_dim, self.dropout_ratio)
class ResBlock(nn.Module):
def __init__(self, input_dim, output_dim):
# resample=None, act=nn.ELU(),
# normalization=nn.InstanceNorm2d, adjust_padding=False, dilation=1):
super().__init__()
self.non_linearity = nn.ELU()
self.input_dim = input_dim
self.output_dim = output_dim
self.conv1 = nn.Conv2d(input_dim, output_dim, 1, 1)
self.conv2 = nn.Conv2d(output_dim, output_dim, 1, 1)
in_ch = self.output_dim
self.normalize1 = nn.GroupNorm(num_groups=min(in_ch // 4, 32),
num_channels=in_ch, eps=1e-6)
self.normalize2 = nn.GroupNorm(num_groups=min(in_ch // 4, 32),
num_channels=in_ch, eps=1e-6)
def forward(self, x, t):
x = x + t
output = self.conv1(x)
output = self.normalize1(output)
output = self.non_linearity(output)
output = self.conv2(output)
output = self.normalize2(output)
output = self.non_linearity(output)
shortcut = x
return shortcut + output
def __repr__(self):
return "ResBlock(%d, %d)" % (self.input_dim, self.output_dim)
class Prior(nn.Module):
building_block = ResBlock
def __init__(self, args, num_input_channels, *oargs, **kwargs):
super().__init__()
# args: cfg.sde
# oargs: other argument: the global argument
self.condition_input = kwargs.get('condition_input', False)
self.cfg = oargs[0]
self.clip_forge_enable = self.cfg.clipforge.enable # kwargs.get('clipforge.enable', 0)
logger.info('[Build Resnet Prior] Has condition input: {}; clipforge {}; '
'learn_mixing_logit={}, ', self.condition_input,
self.clip_forge_enable, args.learn_mixing_logit)
self.act = act = nn.SiLU()
self.num_scales = args.num_scales_dae
self.num_input_channels = num_input_channels
self.nf = nf = args.num_channels_dae
num_cell_per_scale_dae = kwargs.get('num_cell_per_scale_dae', args.num_cell_per_scale_dae)
# take clip feature as input
if self.clip_forge_enable:
self.clip_feat_mapping = nn.Conv1d(self.cfg.clipforge.feat_dim, self.nf, 1)
# mixed_prediction #
self.mixed_prediction = args.mixed_prediction # This enables mixed prediction
if self.mixed_prediction:
logger.info('init-mixing_logit = {}, after sigmoid = {}',
args.mixing_logit_init, torch.sigmoid(torch.tensor(args.mixing_logit_init)))
assert args.mixing_logit_init, 'require learning'
init = args.mixing_logit_init * torch.ones(size=[1, num_input_channels, 1, 1])
self.mixing_logit = torch.nn.Parameter(init, requires_grad=True)
self.is_active = None
else: # no mixing_logit
self.mixing_logit = None
self.is_active = None
self.embedding_dim = args.embedding_dim
self.embedding_dim_mult = 4
self.temb_fun = init_temb_fun(args.embedding_type, args.embedding_scale, args.embedding_dim)
logger.info('[temb_fun] embedding_type={}, embedding_scale={}, embedding_dim={}',
args.embedding_type, args.embedding_scale, args.embedding_dim)
modules = []
modules.append(nn.Conv2d(self.embedding_dim, self.embedding_dim * 4, 1, 1))
modules.append(nn.Conv2d(self.embedding_dim * 4, nf, 1, 1))
self.temb_layer = nn.Sequential(*modules)
modules = []
input_channels = num_input_channels
self.input_layer = nn.Conv2d(input_channels, nf, 1, 1)
in_ch = nf
for i_block in range(num_cell_per_scale_dae):
modules.append(self.building_block(nf, nf))
self.output_layer = nn.Conv2d(nf, input_channels, 1, 1)
self.all_modules = nn.ModuleList(modules)
def forward(self, x, t, **kwargs):
# timestep / noise-level embedding (continuous training)
if t.dim() == 0:
t = t.expand(1)
temb = self.temb_fun(t)[:, :, None, None] # make it 4d
temb = self.temb_layer(temb)
if self.clip_forge_enable:
clip_feat = kwargs['clip_feat']
clip_feat = self.clip_feat_mapping(clip_feat[:, :, None])[:, :, :, None] # B,D -> BD1->B,D,1,1
if temb.shape[0] == 1 and temb.shape[0] < clip_feat.shape[0]:
temb = temb.expand(clip_feat.shape[0], -1, -1, -1)
temb = torch.cat([temb, clip_feat], dim=1) # add to temb feature
# mask out inactive variables
if self.mixed_prediction and self.is_active is not None:
x = mask_inactive_variables(x, self.is_active)
x = self.input_layer(x)
for layer in self.all_modules:
enc_input = x
x = layer(enc_input, temb)
h = self.output_layer(x)
return h
class PriorSEDrop(Prior):
def __init__(self, *args, **kwargs):
self.building_block = functools.partial(ResBlockSEDrop, dropout=args[0].dropout)
super().__init__(*args, **kwargs)
class PriorSEClip(Prior):
building_block = ResBlockSEClip
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
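The global prior treats the shape latent as a 1x1 "image": every block operates on `(B, C, 1, 1)` tensors with 1x1 convolutions. A quick shape check (assumes this repo is on `PYTHONPATH`):

```
import torch
from models.score_sde.resnet import ResBlock

blk = ResBlock(128, 128)
x = torch.randn(4, 128, 1, 1)         # global shape latent
temb = torch.randn(4, 128, 1, 1)      # time embedding mapped to the same width
assert blk(x, temb).shape == x.shape  # residual blocks preserve the shape
```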


@ -0,0 +1,54 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch.nn as nn
from loguru import logger
from .pvcnn2 import create_pointnet2_sa_components
# implement the global encoder for the VAE model
class PointNetPlusEncoder(nn.Module):
sa_blocks = [
[[32, 2, 32], [1024, 0.1, 32, [32, 32]]],
[[32, 1, 16], [256, 0.2, 32, [32, 64]]]
]
force_att = 0 # if set, add attention to all sa layers
def __init__(self, zdim, input_dim, extra_feature_channels=0, args={}):
super().__init__()
sa_blocks = self.sa_blocks
layers, sa_in_channels, channels_sa_features, _ = \
create_pointnet2_sa_components(sa_blocks,
extra_feature_channels, input_dim=input_dim,
embed_dim=0, force_att=self.force_att,
use_att=True, with_se=True)
self.mlp = nn.Linear(channels_sa_features, zdim*2)
self.zdim = zdim
logger.info('[Encoder] zdim={}, out_sigma={}; force_att: {}', zdim, True, self.force_att)
self.layers = nn.ModuleList(layers)
self.voxel_dim = [n[1][-1][-1] for n in self.sa_blocks]
def forward(self, x):
"""
Args:
x: B,N,3
Returns:
mu, sigma: B,D
"""
output = {}
x = x.transpose(1, 2) # B,3,N
xyz = x ## x[:,:3,:]
features = x
for layer_id, layer in enumerate(self.layers):
features, xyz, _ = layer( (features, xyz, None) )
# features: B,D,N; xyz: B,3,N
features = features.max(-1)[0]
features = self.mlp(features)
mu_1d, sigma_1d = features[:, :self.zdim], features[:, self.zdim:]
output.update({'mu_1d': mu_1d, 'sigma_1d': sigma_1d})
return output
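# Usage sketch (hypothetical shapes; assumes the default args suffice):
#   enc = PointNetPlusEncoder(zdim=128, input_dim=3)
#   out = enc(torch.randn(8, 2048, 3))
#   out['mu_1d'], out['sigma_1d']  # each (8, 128)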

52
models/utils.py Normal file
View file

@@ -0,0 +1,52 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch
import math
import torch.nn as nn
def mask_inactive_variables(x, is_active):
x = x * is_active
return x
class PositionalEmbedding(nn.Module):
def __init__(self, embedding_dim, scale):
super(PositionalEmbedding, self).__init__()
self.embedding_dim = embedding_dim
self.scale = scale
def forward(self, timesteps):
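# Transformer-style sinusoidal embedding: frequencies omega_k = 10000^(-k / (half_dim - 1)),
# output is [sin(t * omega_k), cos(t * omega_k)] concatenated along the feature dim.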
assert len(timesteps.shape) == 1
timesteps = timesteps * self.scale
half_dim = self.embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim) * -emb)
emb = emb.to(device=timesteps.device)
emb = timesteps[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
return emb
class RandomFourierEmbedding(nn.Module):
def __init__(self, embedding_dim, scale):
super(RandomFourierEmbedding, self).__init__()
self.w = nn.Parameter(torch.randn(size=(1, embedding_dim // 2)) * scale, requires_grad=False)
def forward(self, timesteps):
emb = torch.mm(timesteps[:, None], self.w * 2 * math.pi)
return torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
def init_temb_fun(embedding_type, embedding_scale, embedding_dim):
if embedding_type == 'positional':
temb_fun = PositionalEmbedding(embedding_dim, embedding_scale)
elif embedding_type == 'fourier':
temb_fun = RandomFourierEmbedding(embedding_dim, embedding_scale)
else:
raise NotImplementedError
return temb_fun
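# Usage sketch (hypothetical values; embedding_dim must be even):
#   temb_fun = init_temb_fun('positional', embedding_scale=1.0, embedding_dim=128)
#   temb = temb_fun(torch.rand(16))  # (16, 128)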

339
models/vae_adain.py Normal file
View file

@@ -0,0 +1,339 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import torch
import numpy as np
from loguru import logger
import importlib
import torch.nn as nn
from .distributions import Normal
from utils.model_helper import import_model
from utils.model_helper import loss_fn
from utils import utils as helper
class Model(nn.Module):
def __init__(self, args):
super().__init__()
self.num_total_iter = 0
self.args = args
self.input_dim = args.ddpm.input_dim
latent_dim = args.shapelatent.latent_dim
self.latent_dim = latent_dim
self.kl_weight = args.shapelatent.kl_weight
self.num_points = args.data.tr_max_sample_points
# ---- global ---- #
# build encoder
self.style_encoder = import_model(args.latent_pts.style_encoder)(
zdim=args.latent_pts.style_dim,
input_dim=self.input_dim,
args=args)
if len(args.latent_pts.style_mlp):
self.style_mlp = import_model(args.latent_pts.style_mlp)(args)
else:
self.style_mlp = None
self.encoder = import_model(args.shapelatent.encoder_type)(
zdim=latent_dim,
input_dim=self.input_dim,
args=args)
# build decoder
self.decoder = import_model(args.shapelatent.decoder_type)(
context_dim=latent_dim,
point_dim=args.ddpm.input_dim,
args=args)
logger.info('[Build Model] style_encoder: {}, encoder: {}, decoder: {}',
args.latent_pts.style_encoder,
args.shapelatent.encoder_type,
args.shapelatent.decoder_type)
@torch.no_grad()
def encode(self, x, class_label=None):
batch_size, _, point_dim = x.size()
assert(x.shape[2] == self.input_dim), f'expect input in ' \
f'[B,Npoint,PointDim={self.input_dim}], got: {x.shape}'
x_0_target = x
latent_list = []
all_eps = []
all_log_q = []
if self.args.data.cond_on_cat:
assert(class_label is not None), f'require class label input for cond on cat'
cls_emb = self.class_embedding(class_label)
enc_input = x, cls_emb
else:
enc_input = x
# ---- global style encoder ---- #
z = self.style_encoder(enc_input)
z_mu, z_sigma = z['mu_1d'], z['sigma_1d'] # log_sigma
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
z_global = dist.sample()[0]
all_eps.append(z_global)
all_log_q.append(dist.log_p(z_global))
latent_list.append( [z_global, z_mu, z_sigma] )
# ---- original encoder ---- #
style = z_global # torch.cat([z_global, cls_emb], dim=1) if self.args.data.cond_on_cat else z_global
style = self.style_mlp(style) if self.style_mlp is not None else style
z = self.encoder([x, style])
z_mu, z_sigma = z['mu_1d'], z['sigma_1d']
z_sigma = z_sigma - self.args.shapelatent.log_sigma_offset
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
z_local = dist.sample()[0]
all_eps.append(z_local)
all_log_q.append(dist.log_p(z_local))
latent_list.append( [z_local, z_mu, z_sigma] )
all_eps = self.compose_eps(all_eps)
if self.args.data.cond_on_cat:
return all_eps, all_log_q, latent_list, cls_emb
else:
return all_eps, all_log_q, latent_list
def compose_eps(self, all_eps):
return torch.cat(all_eps, dim=1) # style: [B,D1], latent pts: [B,ND2]
def decompose_eps(self, all_eps):
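# all_eps is the concatenation [global style (B, style_dim) | latent points
# (B, N*(latent_dim+input_dim))]; split it back (inverse of compose_eps).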
eps_style = all_eps[:,:self.args.latent_pts.style_dim]
eps_local = all_eps[:,self.args.latent_pts.style_dim:]
return [eps_style, eps_local]
def encode_global(self, x, class_label=None):
batch_size, N, point_dim = x.size()
if self.args.data.cond_on_cat:
assert(class_label is not None), f'require class label input for cond on cat'
cls_emb = self.class_embedding(class_label)
enc_input = x, cls_emb
else:
enc_input = x
z = self.style_encoder(enc_input)
z_mu, z_sigma = z['mu_1d'], z['sigma_1d'] # log_sigma
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
return dist
def global2style(self, style): ##, cls_emb=None):
Ndim = len(style.shape)
if Ndim == 4:
style = style.squeeze(-1).squeeze(-1)
style = self.style_mlp(style) if self.style_mlp is not None else style
if Ndim == 4:
style = style.unsqueeze(-1).unsqueeze(-1)
return style
def encode_local(self, x, style):
# ---- original encoder ---- #
z = self.encoder([x, style])
z_mu, z_sigma = z['mu_1d'], z['sigma_1d'] # log_sigma
z_sigma = z_sigma - self.args.shapelatent.log_sigma_offset
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
return dist
def recont(self, x, target=None, class_label=None, cls_emb=None):
batch_size, N, point_dim = x.size()
assert(x.shape[2] == self.input_dim), f'expect input in ' \
f'[B,Npoint,PointDim={self.input_dim}], got: {x.shape}'
x_0_target = x if target is None else target
latent_list = []
all_eps = []
all_log_q = []
# ---- global style encoder ---- #
if self.args.data.cond_on_cat:
if class_label is not None:
assert(class_label is not None)
cls_emb = self.class_embedding(class_label)
else:
assert(cls_emb is not None)
enc_input = x, cls_emb
else:
enc_input = x
z = self.style_encoder(enc_input)
z_mu, z_sigma = z['mu_1d'], z['sigma_1d'] # log_sigma
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
z_global = dist.sample()[0]
all_eps.append(z_global)
all_log_q.append(dist.log_p(z_global))
latent_list.append( [z_global, z_mu, z_sigma] )
# ---- original encoder ---- #
style = torch.cat([z_global, cls_emb], dim=1) if self.args.data.cond_on_cat else z_global
style = self.style_mlp(style) if self.style_mlp is not None else style
z = self.encoder([x, style])
z_mu, z_sigma = z['mu_1d'], z['sigma_1d'] # log_sigma
z_sigma = z_sigma - self.args.shapelatent.log_sigma_offset
dist = Normal(mu=z_mu, log_sigma=z_sigma) # (B, F)
z_local = dist.sample()[0]
all_eps.append(z_local)
all_log_q.append(dist.log_p(z_local))
latent_list.append( [z_local, z_mu, z_sigma] )
# ---- decoder ---- #
x_0_pred = self.decoder(None, beta=None, context=z_local, style=style) # (B,ncenter,3)
make_4d = lambda x: x.unsqueeze(-1).unsqueeze(-1) if len(x.shape) == 2 else x.unsqueeze(-1)
all_eps = [make_4d(e) for e in all_eps]
all_log_q = [make_4d(e) for e in all_log_q]
output = {
'all_eps': all_eps,
'all_log_q': all_log_q,
'latent_list': latent_list,
'x_0_pred':x_0_pred,
'x_0_target': x_0_target,
'x_t': torch.zeros_like(x_0_target),
't': torch.zeros(batch_size),
'x_0': x_0_target
}
output['hist/global_var'] = latent_list[0][2].exp()
if 'LatentPoint' in self.args.shapelatent.decoder_type:
latent_shape = [batch_size, -1, self.latent_dim + self.input_dim]
if 'Hir' in self.args.shapelatent.decoder_type:
latent_pts = z_local[:,:-self.args.latent_pts.latent_dim_ext[0]].view(*latent_shape)[:,:,:3].contiguous().clone()
else:
latent_pts = z_local.view(*latent_shape)[:,:,:self.input_dim].contiguous().clone()
output['vis/latent_pts'] = latent_pts.detach().cpu().view(batch_size,
-1, self.input_dim) # B,N,3
output['final_pred'] = output['x_0_pred']
return output
def get_loss(self, x, writer=None, it=None, ## weight_loss_1=1,
noisy_input=None, class_label=None, **kwargs):
"""
shapelatent z ~ q(z|x_0)
and x_t ~ q(x_t|x_0, t), t ~ Uniform(T)
forward and get x_{t-1} ~ p(x_{t-1} | x_t, z)
Args:
x: Input point clouds, (B, N, d).
"""
## kl_weight = self.kl_weight
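# KL weight schedule: held at kl_const_coeff_vada for the first kl_const_portion_vada
# of training, then annealed up to kl_max_coeff_vada over kl_anneal_portion_vada.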
if self.args.trainer.anneal_kl and self.num_total_iter > 0:
global_step = it
kl_weight = helper.kl_coeff(step=global_step,
total_step=self.args.sde.kl_anneal_portion_vada * self.num_total_iter,
constant_step=self.args.sde.kl_const_portion_vada * self.num_total_iter,
min_kl_coeff=self.args.sde.kl_const_coeff_vada,
max_kl_coeff=self.args.sde.kl_max_coeff_vada)
else:
kl_weight = self.kl_weight
batch_size = x.shape[0]
# CHECKDIM(x, 2, self.input_dim)
assert(x.shape[2] == self.input_dim)
inputs = noisy_input if noisy_input is not None else x
output = self.recont(inputs, target=x, class_label=class_label)
x_0_pred, x_0_target = output['x_0_pred'], output['x_0_target']
loss_0 = loss_fn(x_0_pred, x_0_target, self.args.ddpm.loss_type,
self.input_dim, batch_size).mean()
rec_loss = loss_0
output['print/loss_0'] = loss_0
output['rec_loss'] = rec_loss
# Loss
## z_global, z_sigma, z_mu = output['z_global'], output['z_sigma'], output['z_mu']
kl_term_list = []
weighted_kl_terms = []
for pairs_id, pairs in enumerate(output['latent_list']):
cz, cmu, csigma = pairs
log_sigma = csigma
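# closed-form KL(N(mu, sigma^2) || N(0, I)) per dimension:
# 0.5*sigma^2 + 0.5*mu^2 - log(sigma) - 0.5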
kl_term_close = (0.5*log_sigma.exp()**2 +
0.5*cmu**2 - log_sigma - 0.5).view(
batch_size, -1)
if 'LatentPoint' in self.args.shapelatent.decoder_type and 'Hir' not in self.args.shapelatent.decoder_type:
if pairs_id == 1:
latent_shape = [batch_size, -1, self.latent_dim + self.input_dim]
kl_pt = kl_term_close.view(*latent_shape)[:,:,:self.input_dim]
kl_feat = kl_term_close.view(*latent_shape)[:,:,self.input_dim:]
weighted_kl_terms.append(kl_pt.sum(2).sum(1) * self.args.latent_pts.weight_kl_pt)
weighted_kl_terms.append(kl_feat.sum(2).sum(1) * self.args.latent_pts.weight_kl_feat)
output['print/kl_pt%d'%pairs_id] = kl_pt.sum(2).sum(1)
output['print/kl_feat%d'%pairs_id] = kl_feat.sum(2).sum(1)
output['print/z_var_pt%d'%pairs_id] = (log_sigma.view(*latent_shape)[:,:,:self.input_dim]
).exp()**2
output['print/z_var_feat%d'%pairs_id] = (log_sigma.view(*latent_shape)[:,:,self.input_dim:]
).exp()**2
output['print/z_mean_feat%d'%pairs_id] = cmu.view(*latent_shape)[:,:,self.input_dim:].mean()
elif pairs_id == 0:
kl_style = kl_term_close
weighted_kl_terms.append(kl_style.sum(-1) * self.args.latent_pts.weight_kl_glb)
output['print/kl_glb%d'%pairs_id] = kl_style.sum(-1)
output['print/z_var_glb%d'%pairs_id] = (log_sigma).exp()**2
kl_term_close = kl_term_close.sum(-1)
kl_term_list.append(kl_term_close)
output['print/kl_%d'%pairs_id] = kl_term_close
output['print/z_mean_%d'%pairs_id] = cmu.mean()
output['print/z_mag_%d'%pairs_id] = cmu.abs().max()
# logger.info('log_sigma: {}, mean: {}', log_sigma.shape, (log_sigma.exp()**2).mean())
output['print/z_var_%d'%pairs_id] = (log_sigma).exp()**2
output['print/z_logsigma_%d'%pairs_id] = log_sigma
output['print/kl_weight'] = kl_weight
loss_recons = rec_loss
if len(weighted_kl_terms) > 0:
kl = kl_weight * sum(weighted_kl_terms)
else:
kl = kl_weight * sum(kl_term_list)
loss = kl + loss_recons * self.args.weight_recont
output['msg/kl'] = kl
output['msg/rec'] = loss_recons
output['loss'] = loss
return output
def pz(self, w):
return w
def sample(self, num_samples=10, temp=None, decomposed_eps=[],
enable_autocast=False, device_str='cuda', cls_emb=None):
""" currently not support the samples of local level
Return:
model_output: [B,N,D]
"""
batch_size = num_samples
center_emd = None
if 'LatentPoint' in self.args.shapelatent.decoder_type:
# Latent Point Model: latent shape; B; ND
latent_shape = (num_samples, self.num_points*(self.latent_dim+self.input_dim))
style_latent_shape = (num_samples, self.args.latent_pts.style_dim)
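# z_local flattens N latent points, each of dim (latent_dim + input_dim);
# z_global is the shape-level style vector of dim style_dim.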
else:
raise NotImplementedError
if len(decomposed_eps) == 0:
z_local = torch.zeros(*latent_shape).to(
torch.device(device_str)).normal_()
z_global = torch.zeros(*style_latent_shape).to(
torch.device(device_str)).normal_()
else:
z_global = decomposed_eps[0]
z_local = decomposed_eps[1]
z_local = z_local.view(*latent_shape)
z_global = z_global.view(style_latent_shape)
style = z_global
style = self.style_mlp(style) if self.style_mlp is not None else style
x_0_pred = self.decoder(None, beta=None,
context=z_local, style=style) # (B,ncenter,3); use the MLP-mapped style
## CHECKSIZE(x_0_pred, (batch_size,self.num_points,[3,6]))
return x_0_pred
def latent_shape(self):
return [
[self.args.latent_pts.style_dim, 1, 1],
[self.num_points*(self.latent_dim+self.input_dim),1,1]
]

43
script/compute_score.py Normal file
View file

@@ -0,0 +1,43 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
import sys
sys.path.append('.')
from utils.eval_helper import compute_score
# samples = sys.argv[1]
# ref = sys.argv[2]
samples = './lion_ckpt/unconditional/car/samples.pt'
ref = './datasets/test_data/ref_val_car.pt'
compute_score(samples, ref_name=ref)
"""
will get:
[Test] MinMatDis | CD 0.000913 | EMD 0.007523
[Test] Coverage | CD 0.500000 | EMD 0.565341
[Test] 1NN-Accur | CD 0.534091 | EMD 0.511364
[Test] JsnShnDis | 0.009229
"""
samples = './lion_ckpt/unconditional/chair/samples.pt'
ref = './datasets/test_data/ref_val_chair.pt'
compute_score(samples, ref_name=ref)
"""
[Test] MinMatDis | CD 0.002643 | EMD 0.015516
[Test] Coverage | CD 0.489426 | EMD 0.521148
[Test] 1NN-Accur | CD 0.537009 | EMD 0.523414
[Test] JsnShnDis | 0.013535
"""
samples = './lion_ckpt/unconditional/airplane/samples.pt'
ref = './datasets/test_data/ref_val_airplane.pt'
compute_score(samples, ref_name=ref)
"""
[Test] MinMatDis | CD 0.000221 | EMD 0.003706
[Test] Coverage | CD 0.471605 | EMD 0.496296
[Test] 1NN-Accur | CD 0.674074 | EMD 0.612346
[Test] JsnShnDis | 0.060703
"""

View file

@@ -0,0 +1,3 @@
*__pycache__*
/tmp
tmp/*

View file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 ThibaultGROUEIX
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

104
third_party/ChamferDistancePytorch/README.md vendored Executable file
View file

@@ -0,0 +1,104 @@
* adapted from https://github.com/ThibaultGROUEIX/ChamferDistancePytorch
----------------------------------
# Pytorch Chamfer Distance.
Includes a **CUDA** version and a **PYTHON** version with standard PyTorch operations.
NB: In this repo, dist1 and dist2 are squared point-cloud Euclidean distances, so you should adapt thresholds accordingly.
- [x] F - Score
### CUDA VERSION
- [x] JIT compilation
- [x] Supports multi-gpu
- [x] 2D point clouds.
- [x] 3D point clouds.
- [x] 5D point clouds.
- [x] Contiguous() safe.
### Python Version
- [x] Supports any dimension
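A minimal sketch of the pure-PyTorch fallback (assuming `chamfer_python.py` from this repo is on the import path; shapes are illustrative):

```python
import torch
from chamfer_python import distChamfer  # pure-PyTorch, any dimension, CPU or GPU

a = torch.rand(4, 100, 7)  # 7D points: beyond the 2/3/5/6D CUDA kernels
b = torch.rand(4, 120, 7)
dist1, dist2, idx1, idx2 = distChamfer(a, b)  # squared distances, like the CUDA version
```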
### Usage
```python
import torch, chamfer3D.dist_chamfer_3D, fscore
chamLoss = chamfer3D.dist_chamfer_3D.chamfer_3DDist()
points1 = torch.rand(32, 1000, 3).cuda()
points2 = torch.rand(32, 2000, 3, requires_grad=True).cuda()
dist1, dist2, idx1, idx2 = chamLoss(points1, points2)
f_score, precision, recall = fscore.fscore(dist1, dist2)
```
### Add it to your project as a submodule
```shell
git submodule add https://github.com/ThibaultGROUEIX/ChamferDistancePytorch
```
### Benchmark: [forward + backward] pass
- [x] CUDA 10.1, NVIDIA 435, Pytorch 1.4
- [x] p1 : 32 x 2000 x dim
- [x] p2 : 32 x 1000 x dim
| *Timing (sec * 1000)* | 2D | 3D | 5D |
| ---------- | -------- | ------- | ------- |
| **Cuda Compiled** | **1.2** | 1.4 |1.8 |
| **Cuda JIT** | 1.3 | **1.4** |**1.5** |
| **Python** | 37 | 37 | 37 |
| *Memory (MB)* | 2D | 3D | 5D |
| ---------- | -------- | ------- | ------- |
| **Cuda Compiled** | 529 | 529 | 549 |
| **Cuda JIT** | **520** | **529** |**549** |
| **Python** | 2495 | 2495 | 2495 |
### What is the Chamfer distance?
[Stanford course](http://graphics.stanford.edu/courses/cs468-17-spring/LectureSlides/L14%20-%203d%20deep%20learning%20on%20point%20cloud%20representation%20(analysis).pdf) on 3D deep Learning
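For reference, the (squared) Chamfer distance between point sets $S_1$ and $S_2$ computed here is

$$
d_{CD}(S_1, S_2) = \sum_{x \in S_1} \min_{y \in S_2} \lVert x - y \rVert_2^2 \;+\; \sum_{y \in S_2} \min_{x \in S_1} \lVert x - y \rVert_2^2 ,
$$

where the kernels return the per-point minima (`dist1`, `dist2`) and leave the reduction (sum or mean) to the caller.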
### Acknowledgment
Original backbone from [Fei Xia](https://github.com/fxia22/pointGAN/blob/master/nndistance/src/nnd_cuda.cu).
JIT cool trick from [Christian Diller](https://github.com/chrdiller)
### Troubleshoot
- `Undefined symbol: Zxxxxxxxxxxxxxxxxx `:
--> Fix: Make sure to `import torch` before you `import chamfer`.
--> Use PyTorch >= 1.1.0.
- [RuntimeError: Ninja is required to load C++ extension](https://github.com/zhanghang1989/PyTorch-Encoding/issues/167)
```shell
wget https://github.com/ninja-build/ninja/releases/download/v1.8.2/ninja-linux.zip
sudo unzip ninja-linux.zip -d /usr/local/bin/
sudo update-alternatives --install /usr/bin/ninja ninja /usr/local/bin/ninja 1 --force
```
#### TODO:
* Discuss behaviour of torch.min() and tensor.min() which causes issues in some pytorch versions

View file

@@ -0,0 +1,182 @@
#include <stdio.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
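// Tiled brute-force nearest-neighbor search: each outer iteration stages up to
// `batch` points of xyz2 in shared memory; every thread then scans its xyz points
// against the tile, tracking the best squared distance (inner loop unrolled 4x).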
const int batch=512;
__shared__ float buf[batch*2];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int k2=0;k2<m;k2+=batch){
int end_k=min(m,k2+batch)-k2;
for (int j=threadIdx.x;j<end_k*2;j+=blockDim.x){
buf[j]=xyz2[(i*m+k2)*2+j];
}
__syncthreads();
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz[(i*n+j)*2+0];
float y1=xyz[(i*n+j)*2+1];
int best_i=0;
float best=0;
int end_ka=end_k-(end_k&2);
if (end_ka==batch){
for (int k=0;k<batch;k+=4){
{
float x2=buf[k*2+0]-x1;
float y2=buf[k*2+1]-y1;
float d=x2*x2+y2*y2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*2+2]-x1;
float y2=buf[k*2+3]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*2+4]-x1;
float y2=buf[k*2+5]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*2+6]-x1;
float y2=buf[k*2+7]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}else{
for (int k=0;k<end_ka;k+=4){
{
float x2=buf[k*2+0]-x1;
float y2=buf[k*2+1]-y1;
float d=x2*x2+y2*y2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*2+2]-x1;
float y2=buf[k*2+3]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*2+4]-x1;
float y2=buf[k*2+5]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*2+6]-x1;
float y2=buf[k*2+7]-y1;
float d=x2*x2+y2*y2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}
for (int k=end_ka;k<end_k;k++){
float x2=buf[k*2+0]-x1;
float y2=buf[k*2+1]-y1;
float d=x2*x2+y2*y2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
if (k2==0 || result[(i*n+j)]>best){
result[(i*n+j)]=best;
result_i[(i*n+j)]=best_i;
}
}
__syncthreads();
}
}
}
// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}
__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz1[(i*n+j)*2+0];
float y1=xyz1[(i*n+j)*2+1];
int j2=idx1[i*n+j];
float x2=xyz2[(i*m+j2)*2+0];
float y2=xyz2[(i*m+j2)*2+1];
float g=grad_dist1[i*n+j]*2;
atomicAdd(&(grad_xyz1[(i*n+j)*2+0]),g*(x1-x2));
atomicAdd(&(grad_xyz1[(i*n+j)*2+1]),g*(y1-y2));
atomicAdd(&(grad_xyz2[(i*m+j2)*2+0]),-(g*(x1-x2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*2+1]),-(g*(y1-y2)));
}
}
}
// int chamfer_cuda_backward(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,const float * grad_dist2,const int * idx2,float * grad_xyz1,float * grad_xyz2, cudaStream_t stream){
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
// cudaMemset(grad_xyz1,0,b*n*3*4);
// cudaMemset(grad_xyz2,0,b*m*3*4);
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}

View file

@@ -0,0 +1,33 @@
#include <torch/torch.h>
#include <vector>
///TMP
//#include "common.h"
/// NOT TMP
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
int chamfer_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1,
at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &chamfer_forward, "chamfer forward (CUDA)");
m.def("backward", &chamfer_backward, "chamfer backward (CUDA)");
}

View file

@@ -0,0 +1,80 @@
from torch import nn
from torch.autograd import Function
import torch
import importlib
import os
chamfer_found = importlib.find_loader("chamfer_2D") is not None
if not chamfer_found:
## Cool trick from https://github.com/chrdiller
print("Jitting Chamfer 2D")
cur_path = os.path.dirname(os.path.abspath(__file__))
build_path = cur_path.replace('chamfer2D', 'tmp')
os.makedirs(build_path, exist_ok=True)
from torch.utils.cpp_extension import load
chamfer_2D = load(name="chamfer_2D",
sources=[
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]),
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer2D.cu"]),
], build_directory=build_path)
print("Loaded JIT 2D CUDA chamfer distance")
else:
import chamfer_2D
print("Loaded compiled 2D CUDA chamfer distance")
# Chamfer's distance module @thibaultgroueix
# GPU tensors only
class chamfer_2DFunction(Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
batchsize, n, dim = xyz1.size()
assert dim==2, "Wrong last dimension for the chamfer distance's input! Check with .size()"
_, m, dim = xyz2.size()
assert dim==2, "Wrong last dimension for the chamfer distance's input! Check with .size()"
device = xyz1.device
dist1 = torch.zeros(batchsize, n)
dist2 = torch.zeros(batchsize, m)
idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
dist1 = dist1.to(device)
dist2 = dist2.to(device)
idx1 = idx1.to(device)
idx2 = idx2.to(device)
torch.cuda.set_device(device)
chamfer_2D.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
return dist1, dist2, idx1, idx2
@staticmethod
def backward(ctx, graddist1, graddist2, gradidx1, gradidx2):
xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
graddist1 = graddist1.contiguous()
graddist2 = graddist2.contiguous()
device = graddist1.device
gradxyz1 = torch.zeros(xyz1.size())
gradxyz2 = torch.zeros(xyz2.size())
gradxyz1 = gradxyz1.to(device)
gradxyz2 = gradxyz2.to(device)
chamfer_2D.backward(
xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
)
return gradxyz1, gradxyz2
class chamfer_2DDist(nn.Module):
def __init__(self):
super(chamfer_2DDist, self).__init__()
def forward(self, input1, input2):
input1 = input1.contiguous()
input2 = input2.contiguous()
return chamfer_2DFunction.apply(input1, input2)

View file

@@ -0,0 +1,14 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='chamfer_2D',
ext_modules=[
CUDAExtension('chamfer_2D', [
"/".join(__file__.split('/')[:-1] + ['chamfer_cuda.cpp']),
"/".join(__file__.split('/')[:-1] + ['chamfer2D.cu']),
]),
],
cmdclass={
'build_ext': BuildExtension
})

View file

@@ -0,0 +1,196 @@
#include <stdio.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
const int batch=512;
__shared__ float buf[batch*3];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int k2=0;k2<m;k2+=batch){
int end_k=min(m,k2+batch)-k2;
for (int j=threadIdx.x;j<end_k*3;j+=blockDim.x){
buf[j]=xyz2[(i*m+k2)*3+j];
}
__syncthreads();
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz[(i*n+j)*3+0];
float y1=xyz[(i*n+j)*3+1];
float z1=xyz[(i*n+j)*3+2];
int best_i=0;
float best=0;
int end_ka=end_k-(end_k&3);
if (end_ka==batch){
for (int k=0;k<batch;k+=4){
{
float x2=buf[k*3+0]-x1;
float y2=buf[k*3+1]-y1;
float z2=buf[k*3+2]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*3+3]-x1;
float y2=buf[k*3+4]-y1;
float z2=buf[k*3+5]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*3+6]-x1;
float y2=buf[k*3+7]-y1;
float z2=buf[k*3+8]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*3+9]-x1;
float y2=buf[k*3+10]-y1;
float z2=buf[k*3+11]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}else{
for (int k=0;k<end_ka;k+=4){
{
float x2=buf[k*3+0]-x1;
float y2=buf[k*3+1]-y1;
float z2=buf[k*3+2]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*3+3]-x1;
float y2=buf[k*3+4]-y1;
float z2=buf[k*3+5]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*3+6]-x1;
float y2=buf[k*3+7]-y1;
float z2=buf[k*3+8]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*3+9]-x1;
float y2=buf[k*3+10]-y1;
float z2=buf[k*3+11]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}
for (int k=end_ka;k<end_k;k++){
float x2=buf[k*3+0]-x1;
float y2=buf[k*3+1]-y1;
float z2=buf[k*3+2]-z1;
float d=x2*x2+y2*y2+z2*z2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
if (k2==0 || result[(i*n+j)]>best){
result[(i*n+j)]=best;
result_i[(i*n+j)]=best_i;
}
}
__syncthreads();
}
}
}
// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}
__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz1[(i*n+j)*3+0];
float y1=xyz1[(i*n+j)*3+1];
float z1=xyz1[(i*n+j)*3+2];
int j2=idx1[i*n+j];
float x2=xyz2[(i*m+j2)*3+0];
float y2=xyz2[(i*m+j2)*3+1];
float z2=xyz2[(i*m+j2)*3+2];
float g=grad_dist1[i*n+j]*2;
atomicAdd(&(grad_xyz1[(i*n+j)*3+0]),g*(x1-x2));
atomicAdd(&(grad_xyz1[(i*n+j)*3+1]),g*(y1-y2));
atomicAdd(&(grad_xyz1[(i*n+j)*3+2]),g*(z1-z2));
atomicAdd(&(grad_xyz2[(i*m+j2)*3+0]),-(g*(x1-x2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*3+1]),-(g*(y1-y2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*3+2]),-(g*(z1-z2)));
}
}
}
// int chamfer_cuda_backward(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,const float * grad_dist2,const int * idx2,float * grad_xyz1,float * grad_xyz2, cudaStream_t stream){
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
// cudaMemset(grad_xyz1,0,b*n*3*4);
// cudaMemset(grad_xyz2,0,b*m*3*4);
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}

View file

@@ -0,0 +1,33 @@
#include <torch/torch.h>
#include <vector>
///TMP
//#include "common.h"
/// NOT TMP
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
int chamfer_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1,
at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &chamfer_forward, "chamfer forward (CUDA)");
m.def("backward", &chamfer_backward, "chamfer backward (CUDA)");
}

View file

@@ -0,0 +1,133 @@
from torch import nn
from torch.autograd import Function
import torch
import importlib
import os
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
cur_path = os.path.dirname(os.path.abspath(__file__))
build_path = cur_path.replace('chamfer3D', 'tmp')
os.makedirs(build_path, exist_ok=True)
from torch.utils.cpp_extension import load
chamfer_3D = load(name="chamfer_3D",
sources=[
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]),
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer3D.cu"]),
], build_directory=build_path)
#chamfer_found = importlib.find_loader("chamfer_3D") is not None
#if not chamfer_found:
# ## Cool trick from https://github.com/chrdiller
# print("Jitting Chamfer 3D")
# cur_path = os.path.dirname(os.path.abspath(__file__))
# build_path = cur_path.replace('chamfer3D', 'tmp')
# os.makedirs(build_path, exist_ok=True)
#
# from torch.utils.cpp_extension import load
# chamfer_3D = load(name="chamfer_3D",
# sources=[
# "/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]),
# "/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer3D.cu"]),
# ], build_directory=build_path)
# print("Loaded JIT 3D CUDA chamfer distance")
#
#else:
# import chamfer_3D
# print("Loaded compiled 3D CUDA chamfer distance")
# Chamfer's distance module @thibaultgroueix
# GPU tensors only
class chamfer_3DFunction(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, xyz1, xyz2):
batchsize, n, dim = xyz1.size()
assert dim==3, "Wrong last dimension for the chamfer distance's input! Check with .size()"
_, m, dim = xyz2.size()
assert dim==3, "Wrong last dimension for the chamfer distance's input! Check with .size()"
device = xyz1.device
dist1 = torch.zeros(batchsize, n)
dist2 = torch.zeros(batchsize, m)
idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
dist1 = dist1.to(device)
dist2 = dist2.to(device)
idx1 = idx1.to(device)
idx2 = idx2.to(device)
torch.cuda.set_device(device)
chamfer_3D.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
return dist1, dist2, idx1, idx2
@staticmethod
@custom_bwd
def backward(ctx, graddist1, graddist2, gradidx1, gradidx2):
xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
graddist1 = graddist1.contiguous()
graddist2 = graddist2.contiguous()
device = graddist1.device
gradxyz1 = torch.zeros(xyz1.size())
gradxyz2 = torch.zeros(xyz2.size())
gradxyz1 = gradxyz1.to(device)
gradxyz2 = gradxyz2.to(device)
chamfer_3D.backward(
xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
)
return gradxyz1, gradxyz2
class chamfer_3DDist(nn.Module):
def __init__(self):
super(chamfer_3DDist, self).__init__()
def forward(self, input1, input2):
input1 = input1.contiguous()
input2 = input2.contiguous()
return chamfer_3DFunction.apply(input1, input2)
# Chamfer's distance module @thibaultgroueix
# GPU tensors only
class chamfer_3DFunction_noGrad(Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
batchsize, n, dim = xyz1.size()
assert dim==3, "Wrong last dimension for the chamfer distance's input! Check with .size()"
_, m, dim = xyz2.size()
assert dim==3, "Wrong last dimension for the chamfer distance's input! Check with .size()"
device = xyz1.device
dist1 = torch.zeros(batchsize, n)
dist2 = torch.zeros(batchsize, m)
idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
dist1 = dist1.to(device)
dist2 = dist2.to(device)
idx1 = idx1.to(device)
idx2 = idx2.to(device)
torch.cuda.set_device(device)
chamfer_3D.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
return dist1, dist2, idx1, idx2
class chamfer_3DDist_nograd(nn.Module):
def __init__(self):
super(chamfer_3DDist_nograd, self).__init__()
def forward(self, input1, input2):
input1 = input1.contiguous()
input2 = input2.contiguous()
return chamfer_3DFunction_noGrad.apply(input1, input2)

View file

@@ -0,0 +1,14 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='chamfer_3D',
ext_modules=[
CUDAExtension('chamfer_3D', [
"/".join(__file__.split('/')[:-1] + ['chamfer_cuda.cpp']),
"/".join(__file__.split('/')[:-1] + ['chamfer3D.cu']),
]),
],
cmdclass={
'build_ext': BuildExtension
})

View file

@@ -0,0 +1,223 @@
#include <stdio.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
const int batch=2048;
__shared__ float buf[batch*5];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int k2=0;k2<m;k2+=batch){
int end_k=min(m,k2+batch)-k2;
for (int j=threadIdx.x;j<end_k*5;j+=blockDim.x){
buf[j]=xyz2[(i*m+k2)*5+j];
}
__syncthreads();
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz[(i*n+j)*5+0];
float y1=xyz[(i*n+j)*5+1];
float r1=xyz[(i*n+j)*5+2];
float g1=xyz[(i*n+j)*5+3];
float b1=xyz[(i*n+j)*5+4];
int best_i=0;
float best=0;
int end_ka=end_k-(end_k&5);
if (end_ka==batch){
for (int k=0;k<batch;k+=4){
{
float x2=buf[k*5+0]-x1;
float y2=buf[k*5+1]-y1;
float r2=buf[k*5+2]-r1;
float g2=buf[k*5+3]-g1;
float b2=buf[k*5+4]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*5+5]-x1;
float y2=buf[k*5+6]-y1;
float r2=buf[k*5+7]-r1;
float g2=buf[k*5+8]-g1;
float b2=buf[k*5+9]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*5+10]-x1;
float y2=buf[k*5+11]-y1;
float r2=buf[k*5+12]-r1;
float g2=buf[k*5+13]-g1;
float b2=buf[k*5+14]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*5+15]-x1;
float y2=buf[k*5+16]-y1;
float r2=buf[k*5+17]-r1;
float g2=buf[k*5+18]-g1;
float b2=buf[k*5+19]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}else{
for (int k=0;k<end_ka;k+=4){
{
float x2=buf[k*5+0]-x1;
float y2=buf[k*5+1]-y1;
float r2=buf[k*5+2]-r1;
float g2=buf[k*5+3]-g1;
float b2=buf[k*5+4]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*5+5]-x1;
float y2=buf[k*5+6]-y1;
float r2=buf[k*5+7]-r1;
float g2=buf[k*5+8]-g1;
float b2=buf[k*5+9]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*5+10]-x1;
float y2=buf[k*5+11]-y1;
float r2=buf[k*5+12]-r1;
float g2=buf[k*5+13]-g1;
float b2=buf[k*5+14]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*5+15]-x1;
float y2=buf[k*5+16]-y1;
float r2=buf[k*5+17]-r1;
float g2=buf[k*5+18]-g1;
float b2=buf[k*5+19]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}
for (int k=end_ka;k<end_k;k++){
float x2=buf[k*5+0]-x1;
float y2=buf[k*5+1]-y1;
float r2=buf[k*5+2]-r1;
float g2=buf[k*5+3]-g1;
float b2=buf[k*5+4]-b1;
float d=x2*x2+y2*y2+r2*r2+g2*g2+b2*b2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
if (k2==0 || result[(i*n+j)]>best){
result[(i*n+j)]=best;
result_i[(i*n+j)]=best_i;
}
}
__syncthreads();
}
}
}
// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}
__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz1[(i*n+j)*5+0];
float y1=xyz1[(i*n+j)*5+1];
float r1=xyz1[(i*n+j)*5+2];
float g1=xyz1[(i*n+j)*5+3];
float b1=xyz1[(i*n+j)*5+4];
int j2=idx1[i*n+j];
float x2=xyz2[(i*m+j2)*5+0];
float y2=xyz2[(i*m+j2)*5+1];
float r2=xyz2[(i*m+j2)*5+2];
float g2=xyz2[(i*m+j2)*5+3];
float b2=xyz2[(i*m+j2)*5+4];
float g=grad_dist1[i*n+j]*2;
atomicAdd(&(grad_xyz1[(i*n+j)*5+0]),g*(x1-x2));
atomicAdd(&(grad_xyz1[(i*n+j)*5+1]),g*(y1-y2));
atomicAdd(&(grad_xyz1[(i*n+j)*5+2]),g*(r1-r2));
atomicAdd(&(grad_xyz1[(i*n+j)*5+3]),g*(g1-g2));
atomicAdd(&(grad_xyz1[(i*n+j)*5+4]),g*(b1-b2));
atomicAdd(&(grad_xyz2[(i*m+j2)*5+0]),-(g*(x1-x2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*5+1]),-(g*(y1-y2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*5+2]),-(g*(r1-r2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*5+3]),-(g*(g1-g2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*5+4]),-(g*(b1-b2)));
}
}
}
// int chamfer_cuda_backward(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,const float * grad_dist2,const int * idx2,float * grad_xyz1,float * grad_xyz2, cudaStream_t stream){
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
// cudaMemset(grad_xyz1,0,b*n*3*4);
// cudaMemset(grad_xyz2,0,b*m*3*4);
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}

View file

@@ -0,0 +1,33 @@
#include <torch/torch.h>
#include <vector>
///TMP
//#include "common.h"
/// NOT TMP
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
int chamfer_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1,
at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &chamfer_forward, "chamfer forward (CUDA)");
m.def("backward", &chamfer_backward, "chamfer backward (CUDA)");
}

View file

@@ -0,0 +1,82 @@
from torch import nn
from torch.autograd import Function
import torch
import importlib
import os
chamfer_found = importlib.find_loader("chamfer_5D") is not None
if not chamfer_found:
## Cool trick from https://github.com/chrdiller
print("Jitting Chamfer 5D")
cur_path = os.path.dirname(os.path.abspath(__file__))
build_path = cur_path.replace('chamfer5D', 'tmp')
os.makedirs(build_path, exist_ok=True)
from torch.utils.cpp_extension import load
chamfer_5D = load(name="chamfer_5D",
sources=[
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]),
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer5D.cu"]),
], build_directory=build_path)
print("Loaded JIT 5D CUDA chamfer distance")
else:
import chamfer_5D
print("Loaded compiled 5D CUDA chamfer distance")
# Chamfer's distance module @thibaultgroueix
# GPU tensors only
class chamfer_5DFunction(Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
batchsize, n, dim = xyz1.size()
assert dim==5, "Wrong last dimension for the chamfer distance's input! Check with .size()"
_, m, dim = xyz2.size()
assert dim==5, "Wrong last dimension for the chamfer distance's input! Check with .size()"
device = xyz1.device
dist1 = torch.zeros(batchsize, n)
dist2 = torch.zeros(batchsize, m)
idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
dist1 = dist1.to(device)
dist2 = dist2.to(device)
idx1 = idx1.to(device)
idx2 = idx2.to(device)
torch.cuda.set_device(device)
chamfer_5D.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
return dist1, dist2, idx1, idx2
@staticmethod
def backward(ctx, graddist1, graddist2, gradidx1, gradidx2):
xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
graddist1 = graddist1.contiguous()
graddist2 = graddist2.contiguous()
device = graddist1.device
gradxyz1 = torch.zeros(xyz1.size())
gradxyz2 = torch.zeros(xyz2.size())
gradxyz1 = gradxyz1.to(device)
gradxyz2 = gradxyz2.to(device)
chamfer_5D.backward(
xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
)
return gradxyz1, gradxyz2
class chamfer_5DDist(nn.Module):
def __init__(self):
super(chamfer_5DDist, self).__init__()
def forward(self, input1, input2):
input1 = input1.contiguous()
input2 = input2.contiguous()
return chamfer_5DFunction.apply(input1, input2)

View file

@@ -0,0 +1,14 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='chamfer_5D',
ext_modules=[
CUDAExtension('chamfer_5D', [
"/".join(__file__.split('/')[:-1] + ['chamfer_cuda.cpp']),
"/".join(__file__.split('/')[:-1] + ['chamfer5D.cu']),
]),
],
cmdclass={
'build_ext': BuildExtension
})

View file

@@ -0,0 +1,237 @@
#include <stdio.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
const int batch=2048;
__shared__ float buf[batch*6];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int k2=0;k2<m;k2+=batch){
int end_k=min(m,k2+batch)-k2;
for (int j=threadIdx.x;j<end_k*6;j+=blockDim.x){
buf[j]=xyz2[(i*m+k2)*6+j];
}
__syncthreads();
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz[(i*n+j)*6+0];
float y1=xyz[(i*n+j)*6+1];
float z1=xyz[(i*n+j)*6+2];
float nx1=xyz[(i*n+j)*6+3];
float ny1=xyz[(i*n+j)*6+4];
float nz1=xyz[(i*n+j)*6+5];
int best_i=0;
float best=0;
int end_ka=end_k-(end_k&6);
if (end_ka==batch){
for (int k=0;k<batch;k+=4){
{
float x2=buf[k*6+0]-x1;
float y2=buf[k*6+1]-y1;
float z2=buf[k*6+2]-z1;
float nx2=buf[k*6+3]-nx1;
float ny2=buf[k*6+4]-ny1;
float nz2=buf[k*6+5]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*6+6]-x1;
float y2=buf[k*6+7]-y1;
float z2=buf[k*6+8]-z1;
float nx2=buf[k*6+9]-nx1;
float ny2=buf[k*6+10]-ny1;
float nz2=buf[k*6+11]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*6+12]-x1;
float y2=buf[k*6+13]-y1;
float z2=buf[k*6+14]-z1;
float nx2=buf[k*6+15]-nx1;
float ny2=buf[k*6+16]-ny1;
float nz2=buf[k*6+17]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*6+18]-x1;
float y2=buf[k*6+19]-y1;
float z2=buf[k*6+20]-z1;
float nx2=buf[k*6+21]-nx1;
float ny2=buf[k*6+22]-ny1;
float nz2=buf[k*6+23]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}else{
for (int k=0;k<end_ka;k+=4){
{
float x2=buf[k*6+0]-x1;
float y2=buf[k*6+1]-y1;
float z2=buf[k*6+2]-z1;
float nx2=buf[k*6+3]-nx1;
float ny2=buf[k*6+4]-ny1;
float nz2=buf[k*6+5]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
{
float x2=buf[k*6+6]-x1;
float y2=buf[k*6+7]-y1;
float z2=buf[k*6+8]-z1;
float nx2=buf[k*6+9]-nx1;
float ny2=buf[k*6+10]-ny1;
float nz2=buf[k*6+11]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+1;
}
}
{
float x2=buf[k*6+12]-x1;
float y2=buf[k*6+13]-y1;
float z2=buf[k*6+14]-z1;
float nx2=buf[k*6+15]-nx1;
float ny2=buf[k*6+16]-ny1;
float nz2=buf[k*6+17]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+2;
}
}
{
float x2=buf[k*6+18]-x1;
float y2=buf[k*6+19]-y1;
float z2=buf[k*6+20]-z1;
float nx2=buf[k*6+21]-nx1;
float ny2=buf[k*6+22]-ny1;
float nz2=buf[k*6+23]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (d<best){
best=d;
best_i=k+k2+3;
}
}
}
}
for (int k=end_ka;k<end_k;k++){
float x2=buf[k*6+0]-x1;
float y2=buf[k*6+1]-y1;
float z2=buf[k*6+2]-z1;
float nx2=buf[k*6+3]-nx1;
float ny2=buf[k*6+4]-ny1;
float nz2=buf[k*6+5]-nz1;
float d=x2*x2+y2*y2+z2*z2+nx2*nx2+ny2*ny2+nz2*nz2;
if (k==0 || d<best){
best=d;
best_i=k+k2;
}
}
if (k2==0 || result[(i*n+j)]>best){
result[(i*n+j)]=best;
result_i[(i*n+j)]=best_i;
}
}
__syncthreads();
}
}
}
// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}
__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
float x1=xyz1[(i*n+j)*6+0];
float y1=xyz1[(i*n+j)*6+1];
float z1=xyz1[(i*n+j)*6+2];
float nx1=xyz1[(i*n+j)*6+3];
float ny1=xyz1[(i*n+j)*6+4];
float nz1=xyz1[(i*n+j)*6+5];
int j2=idx1[i*n+j];
float x2=xyz2[(i*m+j2)*6+0];
float y2=xyz2[(i*m+j2)*6+1];
float z2=xyz2[(i*m+j2)*6+2];
float nx2=xyz2[(i*m+j2)*6+3];
float ny2=xyz2[(i*m+j2)*6+4];
float nz2=xyz2[(i*m+j2)*6+5];
float g=grad_dist1[i*n+j]*2;
atomicAdd(&(grad_xyz1[(i*n+j)*6+0]),g*(x1-x2));
atomicAdd(&(grad_xyz1[(i*n+j)*6+1]),g*(y1-y2));
atomicAdd(&(grad_xyz1[(i*n+j)*6+2]),g*(z1-z2));
atomicAdd(&(grad_xyz1[(i*n+j)*6+3]),g*(nx1-nx2));
atomicAdd(&(grad_xyz1[(i*n+j)*6+4]),g*(ny1-ny2));
atomicAdd(&(grad_xyz1[(i*n+j)*6+5]),g*(nz1-nz2));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+0]),-(g*(x1-x2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+1]),-(g*(y1-y2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+2]),-(g*(z1-z2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+3]),-(g*(nx1-nx2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+4]),-(g*(ny1-ny2)));
atomicAdd(&(grad_xyz2[(i*m+j2)*6+5]),-(g*(nz1-nz2)));
}
}
}
// int chamfer_cuda_backward(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,const float * grad_dist2,const int * idx2,float * grad_xyz1,float * grad_xyz2, cudaStream_t stream){
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
// cudaMemset(grad_xyz1,0,b*n*3*4);
// cudaMemset(grad_xyz2,0,b*m*3*4);
const auto batch_size = xyz1.size(0);
const auto n = xyz1.size(1); //num_points point cloud A
const auto m = xyz2.size(1); //num_points point cloud B
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
//THError("aborting");
return 0;
}
return 1;
}

View file

@@ -0,0 +1,33 @@
#include <torch/torch.h>
#include <vector>
///TMP
//#include "common.h"
/// NOT TMP
int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2);
int chamfer_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
int chamfer_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1,
at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) {
return chamfer_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &chamfer_forward, "chamfer forward (CUDA)");
m.def("backward", &chamfer_backward, "chamfer backward (CUDA)");
}

View file

@ -0,0 +1,82 @@
from torch import nn
from torch.autograd import Function
import torch
import importlib
import os
chamfer_found = importlib.find_loader("chamfer_6D") is not None
if not chamfer_found:
## Cool trick from https://github.com/chrdiller
print("Jitting Chamfer 6D")
cur_path = os.path.dirname(os.path.abspath(__file__))
build_path = cur_path.replace('chamfer6D', 'tmp')
os.makedirs(build_path, exist_ok=True)
from torch.utils.cpp_extension import load
chamfer_6D = load(name="chamfer_6D",
sources=[
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer_cuda.cpp"]),
"/".join(os.path.abspath(__file__).split('/')[:-1] + ["chamfer6D.cu"]),
], build_directory=build_path)
print("Loaded JIT 6D CUDA chamfer distance")
else:
import chamfer_6D
print("Loaded compiled 6D CUDA chamfer distance")
# Chamfer's distance module @thibaultgroueix
# GPU tensors only
class chamfer_6DFunction(Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
batchsize, n, dim = xyz1.size()
assert dim==6, "Wrong last dimension for the chamfer distance's input! Check with .size()"
_, m, dim = xyz2.size()
assert dim==6, "Wrong last dimension for the chamfer distance's input! Check with .size()"
device = xyz1.device
dist1 = torch.zeros(batchsize, n)
dist2 = torch.zeros(batchsize, m)
idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
dist1 = dist1.to(device)
dist2 = dist2.to(device)
idx1 = idx1.to(device)
idx2 = idx2.to(device)
torch.cuda.set_device(device)
chamfer_6D.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
return dist1, dist2, idx1, idx2
@staticmethod
def backward(ctx, graddist1, graddist2, gradidx1, gradidx2):
xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
graddist1 = graddist1.contiguous()
graddist2 = graddist2.contiguous()
device = graddist1.device
gradxyz1 = torch.zeros(xyz1.size())
gradxyz2 = torch.zeros(xyz2.size())
gradxyz1 = gradxyz1.to(device)
gradxyz2 = gradxyz2.to(device)
chamfer_6D.backward(
xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
)
return gradxyz1, gradxyz2
class chamfer_6DDist(nn.Module):
def __init__(self):
super(chamfer_6DDist, self).__init__()
def forward(self, input1, input2):
input1 = input1.contiguous()
input2 = input2.contiguous()
return chamfer_6DFunction.apply(input1, input2)
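# A minimal usage sketch (not part of the original file): shapes follow the
# asserts in chamfer_6DFunction.forward; the random inputs are hypothetical
# and a CUDA device is assumed, since this extension is GPU-only.
if __name__ == "__main__":
    cham6D = chamfer_6DDist()
    a = torch.rand(4, 1024, 6, device="cuda", requires_grad=True)  # xyz + normals
    b = torch.rand(4, 2048, 6, device="cuda")
    dist1, dist2, idx1, idx2 = cham6D(a, b)
    loss = dist1.mean() + dist2.mean()  # symmetric chamfer objective
    loss.backward()  # gradients flow back to `a` through the custom Function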

@@ -0,0 +1,14 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='chamfer_6D',
ext_modules=[
CUDAExtension('chamfer_6D', [
"/".join(__file__.split('/')[:-1] + ['chamfer_cuda.cpp']),
"/".join(__file__.split('/')[:-1] + ['chamfer6D.cu']),
]),
],
cmdclass={
'build_ext': BuildExtension
})

@@ -0,0 +1,44 @@
import torch
def pairwise_dist(x, y):
# squared pairwise distances via ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j
xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t())
rx = xx.diag().unsqueeze(0).expand_as(xx)
ry = yy.diag().unsqueeze(0).expand_as(yy)
P = rx.t() + ry - 2 * zz
return P
def NN_loss(x, y, dim=0):
dist = pairwise_dist(x, y)
values, indices = dist.min(dim=dim)
return values.mean()
def batched_pairwise_dist(a, b):
x, y = a.double(), b.double()
bs, num_points_x, points_dim = x.size()
bs, num_points_y, points_dim = y.size()
xx = torch.pow(x, 2).sum(2)
yy = torch.pow(y, 2).sum(2)
zz = torch.bmm(x, y.transpose(2, 1))
rx = xx.unsqueeze(1).expand(bs, num_points_y, num_points_x) # Diagonal elements xx
ry = yy.unsqueeze(1).expand(bs, num_points_x, num_points_y) # Diagonal elements yy
P = rx.transpose(2, 1) + ry - 2 * zz
return P
def distChamfer(a, b):
"""
:param a: Pointclouds Batch x num_points x dim
:param b: Pointclouds Batch x num_points x dim
:return:
-closest point on b of points from a
-closest point on a of points from b
-idx of closest point on b of points from a
-idx of closest point on a of points from b
Works for pointcloud of any dimension
"""
P = batched_pairwise_dist(a, b)
return torch.min(P, 2)[0].float(), torch.min(P, 1)[0].float(), torch.min(P, 2)[1].int(), torch.min(P, 1)[1].int()
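# A small usage sketch (not in the original file); the inputs are hypothetical.
# Note that the returned distances are *squared* Euclidean distances.
if __name__ == "__main__":
    a = torch.rand(2, 100, 3)
    b = torch.rand(2, 120, 3)
    dist_a2b, dist_b2a, idx_a2b, idx_b2a = distChamfer(a, b)
    chamfer = dist_a2b.mean() + dist_b2a.mean()
    print(chamfer.item(), idx_a2b.shape)  # scalar loss, indices of shape (2, 100)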

@@ -0,0 +1,17 @@
import torch
def fscore(dist1, dist2, threshold=0.001):
"""
Calculates the F-score between two point clouds with the corresponding threshold value.
:param dist1: Batch, N-Points
:param dist2: Batch, N-Points
:param threshold: float
:return: fscore, precision, recall
"""
# NB : In this repo, dist1 and dist2 are squared pointcloud euclidean distances, so you should adapt the threshold accordingly.
precision_1 = torch.mean((dist1 < threshold).float(), dim=1)
precision_2 = torch.mean((dist2 < threshold).float(), dim=1)
fscore = 2 * precision_1 * precision_2 / (precision_1 + precision_2)
fscore[torch.isnan(fscore)] = 0
return fscore, precision_1, precision_2
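# A minimal usage sketch (not in the original file): dist1/dist2 stand in for
# the squared chamfer distances mentioned above; the threshold is illustrative.
if __name__ == "__main__":
    d1 = torch.rand(4, 100) * 0.002  # squared distances from cloud A to cloud B
    d2 = torch.rand(4, 100) * 0.002  # squared distances from cloud B to cloud A
    f, p1, p2 = fscore(d1, d2, threshold=0.001)
    print(f.shape, p1.shape, p2.shape)  # one value per batch element: (4,)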

@@ -0,0 +1,69 @@
import torch, time
import chamfer2D.dist_chamfer_2D
import chamfer3D.dist_chamfer_3D
import chamfer5D.dist_chamfer_5D
import chamfer_python
cham2D = chamfer2D.dist_chamfer_2D.chamfer_2DDist()
cham3D = chamfer3D.dist_chamfer_3D.chamfer_3DDist()
cham5D = chamfer5D.dist_chamfer_5D.chamfer_5DDist()
from torch.autograd import Variable
from fscore import fscore
def test_chamfer(distChamfer, dim):
points1 = torch.rand(4, 100, dim).cuda()
points2 = torch.rand(4, 200, dim, requires_grad=True).cuda()
dist1, dist2, idx1, idx2= distChamfer(points1, points2)
loss = torch.sum(dist1)
loss.backward()
mydist1, mydist2, myidx1, myidx2 = chamfer_python.distChamfer(points1, points2)
d1 = (dist1 - mydist1) ** 2
d2 = (dist2 - mydist2) ** 2
assert (
torch.mean(d1) + torch.mean(d2) < 0.00000001
), "chamfer cuda and chamfer normal are not giving the same results"
xd1 = idx1 - myidx1
xd2 = idx2 - myidx2
assert (
torch.norm(xd1.float()) + torch.norm(xd2.float()) == 0
), "chamfer cuda and chamfer normal are not giving the same results"
print(f"fscore :", fscore(dist1, dist2))
print("Unit test passed")
def timings(distChamfer, dim):
p1 = torch.rand(32, 2000, dim).cuda()
p2 = torch.rand(32, 1000, dim).cuda()
print("Timings : Start CUDA version")
start = time.time()
num_it = 100
for i in range(num_it):
points1 = Variable(p1, requires_grad=True)
points2 = Variable(p2)
mydist1, mydist2, idx1, idx2 = distChamfer(points1, points2)
loss = torch.sum(mydist1)
loss.backward()
print(f"Ellapsed time forward backward is {(time.time() - start)/num_it} seconds.")
print("Timings : Start Pythonic version")
start = time.time()
for i in range(num_it):
points1 = Variable(p1, requires_grad=True)
points2 = Variable(p2)
mydist1, mydist2, idx1, idx2 = chamfer_python.distChamfer(points1, points2)
loss = torch.sum(mydist1)
loss.backward()
print(f"Ellapsed time forward backward is {(time.time() - start)/num_it} seconds.")
dims = [2,3,5]
for i,cham in enumerate([cham2D, cham3D, cham5D]):
print(f"testing Chamfer {dims[i]}D")
test_chamfer(cham, dims[i])
timings(cham, dims[i])

5
third_party/PyTorchEMD/.gitignore vendored Normal file
@@ -0,0 +1,5 @@
__pycache__
build
dist
emd_ext.egg-info
*.so

34
third_party/PyTorchEMD/README.md vendored Normal file
@@ -0,0 +1,34 @@
* adapted from https://github.com/daerduoCarey/PyTorchEMD
---------------------------------
# PyTorch Wrapper for Point-cloud Earth-Mover-Distance (EMD)
## Dependency
The code has been tested on Ubuntu 16.04, PyTorch 1.1.0, CUDA 9.0.
## Usage
First compile using
python setup.py install
Then, copy the lib file out to the main directory,
cp build/lib.linux-x86_64-3.6/emd_cuda.cpython-36m-x86_64-linux-gnu.so .
Then, you can use it by simply
from emd import earth_mover_distance
d = earth_mover_distance(p1, p2, transpose=False) # p1: B x N1 x 3, p2: B x N2 x 3
Check `test_emd_loss.py` for example.
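A minimal end-to-end sketch, assuming a CUDA device and a successfully built
extension (the random tensors are purely illustrative):

```python
import torch
from emd import earth_mover_distance

p1 = torch.rand(8, 1024, 3, device="cuda", requires_grad=True)
p2 = torch.rand(8, 1024, 3, device="cuda")
d = earth_mover_distance(p1, p2, transpose=False)  # (8,): one cost per batch element
d.sum().backward()
```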
## Author
The CUDA code was originally written by Haoqiang Fan. The PyTorch wrapper was written by Kaichun Mo. Jiayuan Gu also provided help.
## License
MIT

0
third_party/PyTorchEMD/__init__.py vendored Executable file

21
third_party/PyTorchEMD/backend.py vendored Executable file
@@ -0,0 +1,21 @@
import os
import time
from torch.utils.cpp_extension import load
_src_path = os.path.dirname(os.path.abspath(__file__))
if not os.path.exists(os.path.join(_src_path, 'build_dynamic')):
os.makedirs(os.path.join(_src_path, 'build_dynamic'))
tic = time.time()
emd_cuda_dynamic = load(name='emd_ext',
extra_cflags=['-O3', '-std=c++17'],
## build_directory=os.path.join(_src_path, 'build_dynamic'),
verbose=True,
sources=[
os.path.join(_src_path, f) for f in [
'cuda/emd.cpp',
'cuda/emd_kernel.cu',
]
])
print('load emd_ext time: {:.3f}s'.format(time.time() - tic))
__all__ = ['emd_cuda_dynamic']

29
third_party/PyTorchEMD/cuda/emd.cpp vendored Executable file
@@ -0,0 +1,29 @@
#ifndef _EMD
#define _EMD
#include <vector>
#include <torch/extension.h>
//CUDA declarations
at::Tensor ApproxMatchForward(
const at::Tensor xyz1,
const at::Tensor xyz2);
at::Tensor MatchCostForward(
const at::Tensor xyz1,
const at::Tensor xyz2,
const at::Tensor match);
std::vector<at::Tensor> MatchCostBackward(
const at::Tensor grad_cost,
const at::Tensor xyz1,
const at::Tensor xyz2,
const at::Tensor match);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("approxmatch_forward", &ApproxMatchForward,"ApproxMatch forward (CUDA)");
m.def("matchcost_forward", &MatchCostForward,"MatchCost forward (CUDA)");
m.def("matchcost_backward", &MatchCostBackward,"MatchCost backward (CUDA)");
}
#endif

@@ -0,0 +1,398 @@
/**********************************
* Original Author: Haoqiang Fan
* Modified by: Kaichun Mo
*********************************/
#ifndef _EMD_KERNEL
#define _EMD_KERNEL
#include <cmath>
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAApplyUtils.cuh> // at::cuda::getApplyGrid
#include <THC/THC.h>
#define CHECK_INPUT(x)
/********************************
* Forward kernel for approxmatch
*********************************/
template<typename scalar_t>
__global__ void approxmatch(int b,int n,int m,const scalar_t * __restrict__ xyz1,const scalar_t * __restrict__ xyz2,scalar_t * __restrict__ match,scalar_t * temp){
scalar_t * remainL=temp+blockIdx.x*(n+m)*2, * remainR=temp+blockIdx.x*(n+m)*2+n,*ratioL=temp+blockIdx.x*(n+m)*2+n+m,*ratioR=temp+blockIdx.x*(n+m)*2+n+m+n;
scalar_t multiL,multiR;
if (n>=m){
multiL=1;
multiR=n/m;
}else{
multiL=m/n;
multiR=1;
}
const int Block=1024;
__shared__ scalar_t buf[Block*4];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int j=threadIdx.x;j<n*m;j+=blockDim.x)
match[i*n*m+j]=0;
for (int j=threadIdx.x;j<n;j+=blockDim.x)
remainL[j]=multiL;
for (int j=threadIdx.x;j<m;j+=blockDim.x)
remainR[j]=multiR;
__syncthreads();
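// Explanatory note (not in the original): level = -4^j anneals the softness of
// the matching, from nearly uniform assignments (j=7) towards hard assignments,
// ending with a distance-independent pass at j==-2 where level is forced to 0.
// Each pass redistributes the remaining mass in remainL/remainR via
// exp(level * squared_distance).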
for (int j=7;j>=-2;j--){
scalar_t level=-powf(4.0f,j);
if (j==-2){
level=0;
}
for (int k0=0;k0<n;k0+=blockDim.x){
int k=k0+threadIdx.x;
scalar_t x1=0,y1=0,z1=0;
if (k<n){
x1=xyz1[i*n*3+k*3+0];
y1=xyz1[i*n*3+k*3+1];
z1=xyz1[i*n*3+k*3+2];
}
scalar_t suml=1e-9f;
for (int l0=0;l0<m;l0+=Block){
int lend=min(m,l0+Block)-l0;
for (int l=threadIdx.x;l<lend;l+=blockDim.x){
scalar_t x2=xyz2[i*m*3+l0*3+l*3+0];
scalar_t y2=xyz2[i*m*3+l0*3+l*3+1];
scalar_t z2=xyz2[i*m*3+l0*3+l*3+2];
buf[l*4+0]=x2;
buf[l*4+1]=y2;
buf[l*4+2]=z2;
buf[l*4+3]=remainR[l0+l];
}
__syncthreads();
for (int l=0;l<lend;l++){
scalar_t x2=buf[l*4+0];
scalar_t y2=buf[l*4+1];
scalar_t z2=buf[l*4+2];
scalar_t d=level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1));
scalar_t w=__expf(d)*buf[l*4+3];
suml+=w;
}
__syncthreads();
}
if (k<n)
ratioL[k]=remainL[k]/suml;
}
__syncthreads();
for (int l0=0;l0<m;l0+=blockDim.x){
int l=l0+threadIdx.x;
scalar_t x2=0,y2=0,z2=0;
if (l<m){
x2=xyz2[i*m*3+l*3+0];
y2=xyz2[i*m*3+l*3+1];
z2=xyz2[i*m*3+l*3+2];
}
scalar_t sumr=0;
for (int k0=0;k0<n;k0+=Block){
int kend=min(n,k0+Block)-k0;
for (int k=threadIdx.x;k<kend;k+=blockDim.x){
buf[k*4+0]=xyz1[i*n*3+k0*3+k*3+0];
buf[k*4+1]=xyz1[i*n*3+k0*3+k*3+1];
buf[k*4+2]=xyz1[i*n*3+k0*3+k*3+2];
buf[k*4+3]=ratioL[k0+k];
}
__syncthreads();
for (int k=0;k<kend;k++){
scalar_t x1=buf[k*4+0];
scalar_t y1=buf[k*4+1];
scalar_t z1=buf[k*4+2];
scalar_t w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*buf[k*4+3];
sumr+=w;
}
__syncthreads();
}
if (l<m){
sumr*=remainR[l];
scalar_t consumption=fminf(remainR[l]/(sumr+1e-9f),1.0f);
ratioR[l]=consumption*remainR[l];
remainR[l]=fmaxf(0.0f,remainR[l]-sumr);
}
}
__syncthreads();
for (int k0=0;k0<n;k0+=blockDim.x){
int k=k0+threadIdx.x;
scalar_t x1=0,y1=0,z1=0;
if (k<n){
x1=xyz1[i*n*3+k*3+0];
y1=xyz1[i*n*3+k*3+1];
z1=xyz1[i*n*3+k*3+2];
}
scalar_t suml=0;
for (int l0=0;l0<m;l0+=Block){
int lend=min(m,l0+Block)-l0;
for (int l=threadIdx.x;l<lend;l+=blockDim.x){
buf[l*4+0]=xyz2[i*m*3+l0*3+l*3+0];
buf[l*4+1]=xyz2[i*m*3+l0*3+l*3+1];
buf[l*4+2]=xyz2[i*m*3+l0*3+l*3+2];
buf[l*4+3]=ratioR[l0+l];
}
__syncthreads();
scalar_t rl=ratioL[k];
if (k<n){
for (int l=0;l<lend;l++){
scalar_t x2=buf[l*4+0];
scalar_t y2=buf[l*4+1];
scalar_t z2=buf[l*4+2];
scalar_t w=__expf(level*((x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1)))*rl*buf[l*4+3];
match[i*n*m+(l0+l)*n+k]+=w;
suml+=w;
}
}
__syncthreads();
}
if (k<n)
remainL[k]=fmaxf(0.0f,remainL[k]-suml);
}
__syncthreads();
}
}
}
//void approxmatchLauncher(int b,int n,int m,const scalar_t * xyz1,const scalar_t * xyz2,scalar_t * match,scalar_t * temp){
// approxmatch<<<32,512>>>(b,n,m,xyz1,xyz2,match,temp);
//}
/* ApproxMatch forward interface
Input:
xyz1: (B, N1, 3) # dataset_points
xyz2: (B, N2, 3) # query_points
Output:
match: (B, N2, N1)
*/
at::Tensor ApproxMatchForward(
const at::Tensor xyz1,
const at::Tensor xyz2){
const auto b = xyz1.size(0);
const auto n = xyz1.size(1);
const auto m = xyz2.size(1);
CHECK_EQ(xyz2.size(0), b);
CHECK_EQ(xyz1.size(2), 3);
CHECK_EQ(xyz2.size(2), 3);
CHECK_INPUT(xyz1);
CHECK_INPUT(xyz2);
auto match = at::zeros({b, m, n}, xyz1.type());
auto temp = at::zeros({b, (n+m)*2}, xyz1.type());
AT_DISPATCH_FLOATING_TYPES(xyz1.scalar_type(), "ApproxMatchForward", ([&] {
approxmatch<scalar_t><<<32,512>>>(b, n, m, xyz1.data<scalar_t>(), xyz2.data<scalar_t>(), match.data<scalar_t>(), temp.data<scalar_t>());
}));
THCudaCheck(cudaGetLastError());
return match;
}
/********************************
* Forward kernel for matchcost
*********************************/
template<typename scalar_t>
__global__ void matchcost(int b,int n,int m,const scalar_t * __restrict__ xyz1,const scalar_t * __restrict__ xyz2,const scalar_t * __restrict__ match,scalar_t * __restrict__ out){
__shared__ scalar_t allsum[512];
const int Block=1024;
__shared__ scalar_t buf[Block*3];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
scalar_t subsum=0;
for (int k0=0;k0<n;k0+=blockDim.x){
int k=k0+threadIdx.x;
scalar_t x1=0,y1=0,z1=0;
if (k<n){
x1=xyz1[i*n*3+k*3+0];
y1=xyz1[i*n*3+k*3+1];
z1=xyz1[i*n*3+k*3+2];
}
for (int l0=0;l0<m;l0+=Block){
int lend=min(m,l0+Block)-l0;
for (int l=threadIdx.x;l<lend*3;l+=blockDim.x)
buf[l]=xyz2[i*m*3+l0*3+l];
__syncthreads();
if (k<n){
for (int l=0;l<lend;l++){
scalar_t x2=buf[l*3+0];
scalar_t y2=buf[l*3+1];
scalar_t z2=buf[l*3+2];
scalar_t d=(x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1);
subsum+=d*match[i*n*m+(l0+l)*n+k];
}
}
__syncthreads();
}
}
allsum[threadIdx.x]=subsum;
for (int j=1;j<blockDim.x;j<<=1){
__syncthreads();
if ((threadIdx.x&j)==0 && threadIdx.x+j<blockDim.x){
allsum[threadIdx.x]+=allsum[threadIdx.x+j];
}
}
if (threadIdx.x==0)
out[i]=allsum[0];
__syncthreads();
}
}
//void matchcostLauncher(int b,int n,int m,const scalar_t * xyz1,const scalar_t * xyz2,const scalar_t * match,scalar_t * out){
// matchcost<<<32,512>>>(b,n,m,xyz1,xyz2,match,out);
//}
/* MatchCost forward interface
Input:
xyz1: (B, N1, 3) # dataset_points
xyz2: (B, N2, 3) # query_points
match: (B, N2, N1)
Output:
cost: (B)
*/
at::Tensor MatchCostForward(
const at::Tensor xyz1,
const at::Tensor xyz2,
const at::Tensor match){
const auto b = xyz1.size(0);
const auto n = xyz1.size(1);
const auto m = xyz2.size(1);
CHECK_EQ(xyz2.size(0), b);
CHECK_EQ(xyz1.size(2), 3);
CHECK_EQ(xyz2.size(2), 3);
CHECK_INPUT(xyz1);
CHECK_INPUT(xyz2);
auto cost = at::zeros({b}, xyz1.type());
AT_DISPATCH_FLOATING_TYPES(xyz1.scalar_type(), "MatchCostForward", ([&] {
matchcost<scalar_t><<<32,512>>>(b, n, m, xyz1.data<scalar_t>(), xyz2.data<scalar_t>(), match.data<scalar_t>(), cost.data<scalar_t>());
}));
THCudaCheck(cudaGetLastError());
return cost;
}
/********************************
* matchcostgrad2 kernel
*********************************/
template<typename scalar_t>
__global__ void matchcostgrad2(int b,int n,int m,const scalar_t * __restrict__ grad_cost,const scalar_t * __restrict__ xyz1,const scalar_t * __restrict__ xyz2,const scalar_t * __restrict__ match,scalar_t * __restrict__ grad2){
__shared__ scalar_t sum_grad[256*3];
for (int i=blockIdx.x;i<b;i+=gridDim.x){
int kbeg=m*blockIdx.y/gridDim.y;
int kend=m*(blockIdx.y+1)/gridDim.y;
for (int k=kbeg;k<kend;k++){
scalar_t x2=xyz2[(i*m+k)*3+0];
scalar_t y2=xyz2[(i*m+k)*3+1];
scalar_t z2=xyz2[(i*m+k)*3+2];
scalar_t subsumx=0,subsumy=0,subsumz=0;
for (int j=threadIdx.x;j<n;j+=blockDim.x){
scalar_t x1=x2-xyz1[(i*n+j)*3+0];
scalar_t y1=y2-xyz1[(i*n+j)*3+1];
scalar_t z1=z2-xyz1[(i*n+j)*3+2];
scalar_t d=match[i*n*m+k*n+j]*2;
subsumx+=x1*d;
subsumy+=y1*d;
subsumz+=z1*d;
}
sum_grad[threadIdx.x*3+0]=subsumx;
sum_grad[threadIdx.x*3+1]=subsumy;
sum_grad[threadIdx.x*3+2]=subsumz;
for (int j=1;j<blockDim.x;j<<=1){
__syncthreads();
int j1=threadIdx.x;
int j2=threadIdx.x+j;
if ((j1&j)==0 && j2<blockDim.x){
sum_grad[j1*3+0]+=sum_grad[j2*3+0];
sum_grad[j1*3+1]+=sum_grad[j2*3+1];
sum_grad[j1*3+2]+=sum_grad[j2*3+2];
}
}
if (threadIdx.x==0){
grad2[(i*m+k)*3+0]=sum_grad[0]*grad_cost[i];
grad2[(i*m+k)*3+1]=sum_grad[1]*grad_cost[i];
grad2[(i*m+k)*3+2]=sum_grad[2]*grad_cost[i];
}
__syncthreads();
}
}
}
/********************************
* matchcostgrad1 kernel
*********************************/
template<typename scalar_t>
__global__ void matchcostgrad1(int b,int n,int m,const scalar_t * __restrict__ grad_cost,const scalar_t * __restrict__ xyz1,const scalar_t * __restrict__ xyz2,const scalar_t * __restrict__ match,scalar_t * __restrict__ grad1){
for (int i=blockIdx.x;i<b;i+=gridDim.x){
for (int l=threadIdx.x;l<n;l+=blockDim.x){
scalar_t x1=xyz1[i*n*3+l*3+0];
scalar_t y1=xyz1[i*n*3+l*3+1];
scalar_t z1=xyz1[i*n*3+l*3+2];
scalar_t dx=0,dy=0,dz=0;
for (int k=0;k<m;k++){
scalar_t x2=xyz2[i*m*3+k*3+0];
scalar_t y2=xyz2[i*m*3+k*3+1];
scalar_t z2=xyz2[i*m*3+k*3+2];
scalar_t d=match[i*n*m+k*n+l]*2;
dx+=(x1-x2)*d;
dy+=(y1-y2)*d;
dz+=(z1-z2)*d;
}
grad1[i*n*3+l*3+0]=dx*grad_cost[i];
grad1[i*n*3+l*3+1]=dy*grad_cost[i];
grad1[i*n*3+l*3+2]=dz*grad_cost[i];
}
}
}
//void matchcostgradLauncher(int b,int n,int m,const scalar_t * xyz1,const scalar_t * xyz2,const scalar_t * match,scalar_t * grad1,scalar_t * grad2){
// matchcostgrad1<<<32,512>>>(b,n,m,xyz1,xyz2,match,grad1);
// matchcostgrad2<<<dim3(32,32),256>>>(b,n,m,xyz1,xyz2,match,grad2);
//}
/* MatchCost backward interface
Input:
grad_cost: (B) # gradients on cost
xyz1: (B, N1, 3) # dataset_points
xyz2: (B, N2, 3) # query_points
match: (B, N2, N1)
Output:
grad1: (B, N1, 3)
grad2: (B, N2, 3)
*/
std::vector<at::Tensor> MatchCostBackward(
const at::Tensor grad_cost,
const at::Tensor xyz1,
const at::Tensor xyz2,
const at::Tensor match){
const auto b = xyz1.size(0);
const auto n = xyz1.size(1);
const auto m = xyz2.size(1);
CHECK_EQ(xyz2.size(0), b);
CHECK_EQ(xyz1.size(2), 3);
CHECK_EQ(xyz2.size(2), 3);
CHECK_INPUT(xyz1);
CHECK_INPUT(xyz2);
auto grad1 = at::zeros({b, n, 3}, xyz1.type());
auto grad2 = at::zeros({b, m, 3}, xyz1.type());
AT_DISPATCH_FLOATING_TYPES(xyz1.scalar_type(), "MatchCostBackward", ([&] {
matchcostgrad1<scalar_t><<<32,512>>>(b, n, m, grad_cost.data<scalar_t>(), xyz1.data<scalar_t>(), xyz2.data<scalar_t>(), match.data<scalar_t>(), grad1.data<scalar_t>());
matchcostgrad2<scalar_t><<<dim3(32,32),256>>>(b, n, m, grad_cost.data<scalar_t>(), xyz1.data<scalar_t>(), xyz2.data<scalar_t>(), match.data<scalar_t>(), grad2.data<scalar_t>());
}));
THCudaCheck(cudaGetLastError());
return std::vector<at::Tensor>({grad1, grad2});
}
#endif

52
third_party/PyTorchEMD/emd.py vendored Executable file
@@ -0,0 +1,52 @@
import torch
# from backend import emd_cuda_dynamic as emd_cuda # jit compiling
from third_party.PyTorchEMD.backend import emd_cuda_dynamic as emd_cuda
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
class EarthMoverDistanceFunction(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, xyz1, xyz2):
xyz1 = xyz1.contiguous()
xyz2 = xyz2.contiguous()
assert xyz1.is_cuda and xyz2.is_cuda, "Only CUDA tensors are supported currently."
match = emd_cuda.approxmatch_forward(xyz1, xyz2)
cost = emd_cuda.matchcost_forward(xyz1, xyz2, match)
ctx.save_for_backward(xyz1, xyz2, match)
return cost
@staticmethod
@custom_bwd
def backward(ctx, grad_cost):
xyz1, xyz2, match = ctx.saved_tensors
grad_cost = grad_cost.contiguous()
grad_xyz1, grad_xyz2 = emd_cuda.matchcost_backward(grad_cost, xyz1, xyz2, match)
return grad_xyz1, grad_xyz2
def earth_mover_distance(xyz1, xyz2, transpose=True):
"""Earth Mover Distance (Approx)
Args:
xyz1 (torch.Tensor): (b, 3, n1)
xyz2 (torch.Tensor): (b, 3, n2)
transpose (bool): whether to transpose the inputs, since they may be in
BCN format; the extension only supports BNC format.
Returns:
cost (torch.Tensor): (b)
"""
if xyz1.dim() == 2:
xyz1 = xyz1.unsqueeze(0)
if xyz2.dim() == 2:
xyz2 = xyz2.unsqueeze(0)
if transpose:
xyz1 = xyz1.transpose(1, 2)
xyz2 = xyz2.transpose(1, 2)
# xyz1: B,N,3
N = xyz1.shape[1]
assert(xyz1.shape[-1] == 3), f'expected shape B,N,3; got: {xyz1.shape}'
cost = EarthMoverDistanceFunction.apply(xyz1, xyz2) / float(N)
return cost

9
third_party/PyTorchEMD/emd_cuda.py vendored Normal file
@@ -0,0 +1,9 @@
def __bootstrap__():
global __bootstrap__, __loader__, __file__
import sys, pkg_resources, importlib.util
__file__ = pkg_resources.resource_filename(__name__, 'emd_cuda.cpython-38-x86_64-linux-gnu.so')
__loader__ = None; del __bootstrap__, __loader__
spec = importlib.util.spec_from_file_location(__name__,__file__)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
__bootstrap__()

45
third_party/PyTorchEMD/emd_nograd.py vendored Normal file
@@ -0,0 +1,45 @@
import torch
#import emd_cuda
# from evaluation.PyTorchEMD import emd_cuda
from third_party.PyTorchEMD.backend import emd_cuda_dynamic as emd_cuda
class EarthMoverDistanceFunctionNoGrad(torch.autograd.Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
xyz1 = xyz1.contiguous()
xyz2 = xyz2.contiguous()
assert xyz1.is_cuda and xyz2.is_cuda, "Only CUDA tensors are supported currently."
match = emd_cuda.approxmatch_forward(xyz1, xyz2)
cost = emd_cuda.matchcost_forward(xyz1, xyz2, match)
# ctx.save_for_backward(xyz1, xyz2, match)
return cost
def earth_mover_distance_nograd(xyz1, xyz2, transpose=True):
"""Earth Mover Distance (Approx)
Args:
xyz1 (torch.Tensor): (b, 3, n1)
xyz2 (torch.Tensor): (b, 3, n2)
transpose (bool): whether to transpose the inputs, since they may be in
BCN format; the extension only supports BNC format.
Returns:
cost (torch.Tensor): (b)
"""
if xyz1.dim() == 2:
xyz1 = xyz1.unsqueeze(0)
if xyz2.dim() == 2:
xyz2 = xyz2.unsqueeze(0)
if transpose:
xyz1 = xyz1.transpose(1, 2)
xyz2 = xyz2.transpose(1, 2)
# xyz1: B,N,3
N = xyz1.shape[1]
assert(xyz1.shape[-1] == 3), f'expected shape B,N,3; got: {xyz1.shape}'
#print('xyz1: ', xyz1.shape, xyz2.shape, xyz1.min(), xyz1.max(), xyz2.min(), xyz2.max())
cost = EarthMoverDistanceFunctionNoGrad.apply(xyz1, xyz2) / float(N)
return cost

49
third_party/PyTorchEMD/emd_static.py vendored Executable file
@@ -0,0 +1,49 @@
import torch
import emd_cuda
class EarthMoverDistanceFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, xyz1, xyz2):
xyz1 = xyz1.contiguous()
xyz2 = xyz2.contiguous()
assert xyz1.is_cuda and xyz2.is_cuda, "Only CUDA tensors are supported currently."
match = emd_cuda.approxmatch_forward(xyz1, xyz2)
cost = emd_cuda.matchcost_forward(xyz1, xyz2, match)
ctx.save_for_backward(xyz1, xyz2, match)
return cost
@staticmethod
def backward(ctx, grad_cost):
xyz1, xyz2, match = ctx.saved_tensors
grad_cost = grad_cost.contiguous()
grad_xyz1, grad_xyz2 = emd_cuda.matchcost_backward(grad_cost, xyz1, xyz2, match)
return grad_xyz1, grad_xyz2
def earth_mover_distance(xyz1, xyz2, transpose=True):
"""Earth Mover Distance (Approx)
Args:
xyz1 (torch.Tensor): (b, 3, n1)
xyz2 (torch.Tensor): (b, 3, n2)
transpose (bool): whether to transpose the inputs, since they may be in
BCN format; the extension only supports BNC format.
Returns:
cost (torch.Tensor): (b)
"""
if xyz1.dim() == 2:
xyz1 = xyz1.unsqueeze(0)
if xyz2.dim() == 2:
xyz2 = xyz2.unsqueeze(0)
if transpose:
xyz1 = xyz1.transpose(1, 2)
xyz2 = xyz2.transpose(1, 2)
# xyz1: B,N,3
N = xyz1.shape[1]
assert(xyz1.shape[-1] == 3), f'expected shape B,N,3; got: {xyz1.shape}'
cost = EarthMoverDistanceFunction.apply(xyz1, xyz2) / float(N)
return cost

27
third_party/PyTorchEMD/setup.py vendored Executable file
@@ -0,0 +1,27 @@
"""Setup extension
Notes:
If extra_compile_args is provided, you need to provide different instances for different extensions.
Refer to https://github.com/pytorch/pytorch/issues/20169
"""
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='emd_ext',
ext_modules=[
CUDAExtension(
name='emd_cuda',
sources=[
'cuda/emd.cpp',
'cuda/emd_kernel.cu',
],
extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}
),
],
cmdclass={
'build_ext': BuildExtension
})

44
third_party/PyTorchEMD/test_emd_loss.py vendored Normal file
@@ -0,0 +1,44 @@
import torch
import numpy as np
import time
from emd import earth_mover_distance
# gt
p1 = torch.from_numpy(np.array([[[1.7, -0.1, 0.1], [0.1, 1.2, 0.3]]], dtype=np.float32)).cuda()
p1 = p1.repeat(3, 1, 1)
p2 = torch.from_numpy(np.array([[[0.3, 1.8, 0.2], [1.2, -0.2, 0.3]]], dtype=np.float32)).cuda()
p2 = p2.repeat(3, 1, 1)
print(p1)
print(p2)
p1.requires_grad = True
p2.requires_grad = True
gt_dist = (((p1[0, 0] - p2[0, 1])**2).sum() + ((p1[0, 1] - p2[0, 0])**2).sum()) / 2 + \
(((p1[1, 0] - p2[1, 1])**2).sum() + ((p1[1, 1] - p2[1, 0])**2).sum()) * 2 + \
(((p1[2, 0] - p2[2, 1])**2).sum() + ((p1[2, 1] - p2[2, 0])**2).sum()) / 3
print('gt_dist: ', gt_dist)
gt_dist.backward()
print(p1.grad)
print(p2.grad)
# emd
p1 = torch.from_numpy(np.array([[[1.7, -0.1, 0.1], [0.1, 1.2, 0.3]]], dtype=np.float32)).cuda()
p1 = p1.repeat(3, 1, 1)
p2 = torch.from_numpy(np.array([[[0.3, 1.8, 0.2], [1.2, -0.2, 0.3]]], dtype=np.float32)).cuda()
p2 = p2.repeat(3, 1, 1)
print(p1)
print(p2)
p1.requires_grad = True
p2.requires_grad = True
d = earth_mover_distance(p1, p2, transpose=False)
print(d)
loss = d[0] / 2 + d[1] * 2 + d[2] / 3
print(loss)
loss.backward()
print(p1.grad)
print(p2.grad)

21
third_party/pvcnn/LICENSE vendored Normal file
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Zhijian Liu, Haotian Tang, Yujun Lin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

2
third_party/pvcnn/README.md vendored Normal file
@@ -0,0 +1,2 @@
* All the code under this folder is based on https://github.com/mit-han-lab/pvcnn/tree/master/modules

@@ -0,0 +1,7 @@
from third_party.pvcnn.functional.ball_query import ball_query
from third_party.pvcnn.functional.devoxelization import trilinear_devoxelize
from third_party.pvcnn.functional.grouping import grouping
from third_party.pvcnn.functional.interpolatation import nearest_neighbor_interpolate
from third_party.pvcnn.functional.loss import kl_loss, huber_loss
from third_party.pvcnn.functional.sampling import gather, furthest_point_sample, logits_mask
from third_party.pvcnn.functional.voxelization import avg_voxelize

29
third_party/pvcnn/functional/backend.py vendored Normal file
@@ -0,0 +1,29 @@
import os
from torch.utils.cpp_extension import load
_src_path = os.path.dirname(os.path.abspath(__file__))
if not os.path.exists(os.path.join(_src_path, 'build')):
os.makedirs(os.path.join(_src_path, 'build'))
_backend = load(name='_pvcnn_backend',
extra_cflags=['-O3', '-std=c++17'],
verbose=True,
sources=[
os.path.join(_src_path, 'src', f) for f in [
'ball_query/ball_query.cpp',
'ball_query/ball_query.cu',
'grouping/grouping.cpp',
'grouping/grouping.cu',
'interpolate/neighbor_interpolate.cpp',
'interpolate/neighbor_interpolate.cu',
'interpolate/trilinear_devox.cpp',
'interpolate/trilinear_devox.cu',
'sampling/sampling.cpp',
'sampling/sampling.cu',
'voxelization/vox.cpp',
'voxelization/vox.cu',
'bindings.cpp',
]
])
__all__ = ['_backend']

@@ -0,0 +1,20 @@
from torch.autograd import Function
from third_party.pvcnn.functional.backend import _backend
__all__ = ['ball_query']
def ball_query(centers_coords, points_coords, radius, num_neighbors):
"""
:param centers_coords: coordinates of centers, FloatTensor[B, 3, M]
:param points_coords: coordinates of points, FloatTensor[B, 3, N]
:param radius: float, radius of ball query
:param num_neighbors: int, maximum number of neighbors
:return:
neighbor_indices: indices of neighbors, IntTensor[B, M, U]
"""
centers_coords = centers_coords[:,:3].contiguous()
points_coords = points_coords[:,:3].contiguous()
return _backend.ball_query(centers_coords, points_coords, radius,
num_neighbors)
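# A minimal usage sketch (not in the original file); the tensors are
# hypothetical and a CUDA device is assumed by the backend.
if __name__ == "__main__":
    import torch
    centers = torch.rand(2, 3, 64).cuda()    # M = 64 query centers
    points = torch.rand(2, 3, 1024).cuda()   # N = 1024 points
    idx = ball_query(centers, points, radius=0.2, num_neighbors=16)
    print(idx.shape)  # (2, 64, 16): IntTensor of neighbor indices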

@@ -0,0 +1,45 @@
from torch.autograd import Function
from third_party.pvcnn.functional.backend import _backend
__all__ = ['trilinear_devoxelize']
class TrilinearDevoxelization(Function):
@staticmethod
def forward(ctx, features, coords, resolution, is_training=True):
"""
:param ctx:
:param features: voxel features, FloatTensor[B, C, R, R, R]
:param coords: the coordinates of points, FloatTensor[B, 3, N]
:param resolution: int, the voxel resolution
:param is_training: bool, training mode
:return:
FloatTensor[B, C, N]
"""
B, C = features.shape[:2]
features = features.contiguous().view(B, C, -1)
coords = coords[:,:3].contiguous()
outs, inds, wgts = _backend.trilinear_devoxelize_forward(
resolution, is_training, coords, features)
if is_training:
ctx.save_for_backward(inds, wgts)
ctx.r = resolution
return outs
@staticmethod
def backward(ctx, grad_output):
"""
:param ctx:
:param grad_output: gradient of outputs, FloatTensor[B, C, N]
:return:
gradient of inputs, FloatTensor[B, C, R, R, R]
"""
inds, wgts = ctx.saved_tensors
grad_inputs = _backend.trilinear_devoxelize_backward(
grad_output.contiguous(), inds, wgts, ctx.r)
return grad_inputs.view(grad_output.size(0), grad_output.size(1),
ctx.r, ctx.r, ctx.r), None, None, None
trilinear_devoxelize = TrilinearDevoxelization.apply
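# A minimal usage sketch (not in the original file): `coords` are assumed to be
# voxel-space coordinates in [0, resolution), matching how the kernels index the
# R x R x R feature grid; the tensors are hypothetical and a GPU is assumed.
if __name__ == "__main__":
    import torch
    R = 8
    feats = torch.rand(2, 16, R, R, R).cuda()         # voxel features
    coords = torch.rand(2, 3, 1024).cuda() * (R - 1)  # per-point voxel coords
    out = trilinear_devoxelize(feats, coords, R, True)
    print(out.shape)  # (2, 16, 1024): one interpolated feature per point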

@@ -0,0 +1,33 @@
from torch.autograd import Function
# from modules.functional.backend import _backend
from third_party.pvcnn.functional.backend import _backend
__all__ = ['grouping']
class Grouping(Function):
@staticmethod
def forward(ctx, features, indices):
"""
:param ctx:
:param features: features of points, FloatTensor[B, C, N]
:param indices: neighbor indices of centers, IntTensor[B, M, U], M is #centers, U is #neighbors
:return:
grouped_features: grouped features, FloatTensor[B, C, M, U]
"""
features = features.contiguous()
indices = indices.contiguous()
ctx.save_for_backward(indices)
ctx.num_points = features.size(-1)
return _backend.grouping_forward(features, indices)
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_features = _backend.grouping_backward(grad_output.contiguous(),
indices, ctx.num_points)
return grad_features, None
grouping = Grouping.apply
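# A minimal usage sketch (not in the original file); the random indices stand in
# for real neighbor indices (e.g. from ball_query) and a GPU is assumed.
if __name__ == "__main__":
    import torch
    feats = torch.rand(2, 32, 1024).cuda()  # B, C, N
    idx = torch.randint(0, 1024, (2, 64, 16), dtype=torch.int32).cuda()  # B, M, U
    grouped = grouping(feats, idx)
    print(grouped.shape)  # (2, 32, 64, 16): features gathered per neighbor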

@@ -0,0 +1,54 @@
from torch.autograd import Function
# from modules.functional.backend import _backend
from third_party.pvcnn.functional.backend import _backend
import torch
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
__all__ = ['nearest_neighbor_interpolate']
class NeighborInterpolation(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, points_coords, centers_coords, centers_features):
"""
:param ctx:
:param points_coords: coordinates of points, FloatTensor[B, 3, N]
:param centers_coords: coordinates of centers, FloatTensor[B, 3, M]
:param centers_features: features of centers, FloatTensor[B, C, M]
:return:
points_features: features of points, FloatTensor[B, C, N]
"""
centers_coords = centers_coords[:,:3].contiguous()
points_coords = points_coords[:,:3].contiguous()
centers_features = centers_features.contiguous()
points_features, indices, weights = _backend.three_nearest_neighbors_interpolate_forward(
points_coords, centers_coords, centers_features)
ctx.save_for_backward(indices, weights)
ctx.num_centers = centers_coords.size(-1)
return points_features
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
indices, weights = ctx.saved_tensors
grad_centers_features = _backend.three_nearest_neighbors_interpolate_backward(
grad_output.contiguous(), indices, weights, ctx.num_centers)
return None, None, grad_centers_features
nearest_neighbor_interpolate = NeighborInterpolation.apply
#def nearest_neighbor_interpolate(points_coords, centers_coords, centers_features):
# # points_coords: (B,6, 64)
# # centers_coords: (B,6, 16)
# # centers_features: (B,128,16)
# # interpolated_features: (B,128,64)
# B = points_coords.shape[0]
# D = centers_features.shape[1]
# N = points_coords.shape[2]
# output = torch.zeros(B,D,N).to(points_coords.shape)
# for b in range(B):
# for n in range(N):
# points_coords_cur = points_coords
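# A minimal usage sketch (not in the original file): only the first three
# coordinate channels are used, per the slicing above; the data is hypothetical
# and a GPU is assumed by the backend.
if __name__ == "__main__":
    points = torch.rand(2, 3, 64).cuda()          # B, 3, N query points
    centers = torch.rand(2, 3, 16).cuda()         # B, 3, M centers
    center_feats = torch.rand(2, 128, 16).cuda()  # B, C, M center features
    point_feats = nearest_neighbor_interpolate(points, centers, center_feats)
    print(point_feats.shape)  # (2, 128, 64)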

18
third_party/pvcnn/functional/loss.py vendored Normal file
@@ -0,0 +1,18 @@
import torch
import torch.nn.functional as F
__all__ = ['kl_loss', 'huber_loss']
def kl_loss(x, y):
x = F.softmax(x.detach(), dim=1)
y = F.log_softmax(y, dim=1)
return torch.mean(torch.sum(x * (torch.log(x) - y), dim=1))
def huber_loss(error, delta):
abs_error = torch.abs(error)
quadratic = torch.min(abs_error,
torch.full_like(abs_error, fill_value=delta))
losses = 0.5 * (quadratic**2) + delta * (abs_error - quadratic)
return torch.mean(losses)
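# A minimal usage sketch (not in the original file), with hypothetical inputs;
# both losses run on CPU or GPU.
if __name__ == "__main__":
    logits_a = torch.randn(8, 10)
    logits_b = torch.randn(8, 10)
    print(kl_loss(logits_a, logits_b))  # KL(softmax(a) || softmax(b))
    err = torch.randn(8, 10)
    print(huber_loss(err, delta=1.0))   # quadratic near 0, linear beyond delta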

100
third_party/pvcnn/functional/sampling.py vendored Normal file
@@ -0,0 +1,100 @@
import numpy as np
import torch
from torch.autograd import Function
# from modules.functional.backend import _backend
from third_party.pvcnn.functional.backend import _backend
__all__ = ['gather', 'furthest_point_sample', 'logits_mask']
class Gather(Function):
@staticmethod
def forward(ctx, features, indices):
"""
Gather
:param ctx:
:param features: features of points, FloatTensor[B, C, N]
:param indices: centers' indices in points, IntTensor[b, m]
:return:
centers_coords: coordinates of sampled centers, FloatTensor[B, C, M]
"""
features = features.contiguous()
indices = indices.int().contiguous()
ctx.save_for_backward(indices)
ctx.num_points = features.size(-1)
return _backend.gather_features_forward(features, indices)
@staticmethod
def backward(ctx, grad_output):
indices, = ctx.saved_tensors
grad_features = _backend.gather_features_backward(
grad_output.contiguous(), indices, ctx.num_points)
return grad_features, None
gather = Gather.apply
def furthest_point_sample(coords, num_samples, normals=None):
"""
Uses iterative furthest point sampling to select a set of num_samples points that have the largest
minimum distance to the already-sampled point set
:param coords: coordinates of points, FloatTensor[B, 3, N]
:param num_samples: int, M
:param normals: optional per-point normals, FloatTensor[B, 3, N]
:return:
centers_coords: coordinates of sampled centers, FloatTensor[B, 3, M]
(and center_normals, FloatTensor[B, 3, M], when normals are given)
"""
assert(len(coords.shape) == 3 and coords.shape[1] == 3), f'expected input of shape B,3,N; got: {coords.shape}'
coords = coords.contiguous()
indices = _backend.furthest_point_sampling(coords, num_samples)
centers_coords = gather(coords, indices)
if normals is not None:
center_normals = gather(normals, indices)
return centers_coords if normals is None else (centers_coords, center_normals)
def logits_mask(coords, logits, num_points_per_object):
"""
Use logits to sample points
:param coords: coords of points, FloatTensor[B, 3, N]
:param logits: binary classification logits, FloatTensor[B, 2, N]
:param num_points_per_object: M, #points per object after masking, int
:return:
selected_coords: FloatTensor[B, 3, M]
masked_coords_mean: mean coords of selected points, FloatTensor[B, 3]
mask: mask to select points, BoolTensor[B, N]
"""
batch_size, _, num_points = coords.shape
mask = torch.lt(logits[:, 0, :], logits[:, 1, :]) # [B, N]
num_candidates = torch.sum(mask, dim=-1, keepdim=True) # [B, 1]
masked_coords = coords * mask.view(batch_size, 1, num_points) # [B, C, N]
masked_coords_mean = torch.sum(masked_coords, dim=-1) / torch.max(
num_candidates, torch.ones_like(num_candidates)).float() # [B, C]
selected_indices = torch.zeros((batch_size, num_points_per_object),
device=coords.device,
dtype=torch.int32)
for i in range(batch_size):
current_mask = mask[i] # [N]
current_candidates = current_mask.nonzero().view(-1)
current_num_candidates = current_candidates.numel()
if current_num_candidates >= num_points_per_object:
choices = np.random.choice(current_num_candidates,
num_points_per_object,
replace=False)
selected_indices[i] = current_candidates[choices]
elif current_num_candidates > 0:
choices = np.concatenate([
np.arange(current_num_candidates).repeat(
num_points_per_object // current_num_candidates),
np.random.choice(current_num_candidates,
num_points_per_object %
current_num_candidates,
replace=False)
])
np.random.shuffle(choices)
selected_indices[i] = current_candidates[choices]
selected_coords = gather(
masked_coords - masked_coords_mean.view(batch_size, -1, 1),
selected_indices)
return selected_coords, masked_coords_mean, mask
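# A minimal usage sketch (not in the original file); the coordinates are
# hypothetical and a GPU is assumed by the backend.
if __name__ == "__main__":
    coords = torch.rand(2, 3, 4096).cuda()        # B, 3, N
    centers = furthest_point_sample(coords, 512)  # B, 3, 512
    print(centers.shape)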

@@ -0,0 +1,30 @@
#include "ball_query.hpp"
#include "ball_query.cuh"
#include "../utils.hpp"
at::Tensor ball_query_forward(at::Tensor centers_coords,
at::Tensor points_coords, const float radius,
const int num_neighbors) {
CHECK_CUDA(centers_coords);
CHECK_CUDA(points_coords);
CHECK_CONTIGUOUS(centers_coords);
CHECK_CONTIGUOUS(points_coords);
CHECK_IS_FLOAT(centers_coords);
CHECK_IS_FLOAT(points_coords);
int b = centers_coords.size(0);
int m = centers_coords.size(2);
int n = points_coords.size(2);
at::Tensor neighbors_indices = torch::zeros(
{b, m, num_neighbors},
at::device(centers_coords.device()).dtype(at::ScalarType::Int));
ball_query(b, n, m, radius * radius, num_neighbors,
centers_coords.data_ptr<float>(),
points_coords.data_ptr<float>(),
neighbors_indices.data_ptr<int>());
return neighbors_indices;
}

@@ -0,0 +1,59 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: ball query
Args:
b : batch size
n : number of points in point clouds
m : number of query centers
r2 : ball query radius ** 2
u : maximum number of neighbors
centers_coords: coordinates of centers, FloatTensor[b, 3, m]
points_coords : coordinates of points, FloatTensor[b, 3, n]
neighbors_indices : neighbor indices in points, IntTensor[b, m, u]
*/
__global__ void ball_query_kernel(int b, int n, int m, float r2, int u,
const float *__restrict__ centers_coords,
const float *__restrict__ points_coords,
int *__restrict__ neighbors_indices) {
int batch_index = blockIdx.x;
int index = threadIdx.x;
int stride = blockDim.x;
points_coords += batch_index * n * 3;
centers_coords += batch_index * m * 3;
neighbors_indices += batch_index * m * u;
for (int j = index; j < m; j += stride) {
float center_x = centers_coords[j];
float center_y = centers_coords[j + m];
float center_z = centers_coords[j + m + m];
for (int k = 0, cnt = 0; k < n && cnt < u; ++k) {
float dx = center_x - points_coords[k];
float dy = center_y - points_coords[k + n];
float dz = center_z - points_coords[k + n + n];
float d2 = dx * dx + dy * dy + dz * dz;
if (d2 < r2) {
if (cnt == 0) {
for (int v = 0; v < u; ++v) {
neighbors_indices[j * u + v] = k;
}
}
neighbors_indices[j * u + cnt] = k;
++cnt;
}
}
}
}
void ball_query(int b, int n, int m, float r2, int u,
const float *centers_coords, const float *points_coords,
int *neighbors_indices) {
ball_query_kernel<<<b, optimal_num_threads(m), 0,
at::cuda::getCurrentCUDAStream()>>>(
b, n, m, r2, u, centers_coords, points_coords, neighbors_indices);
CUDA_CHECK_ERRORS();
}

@@ -0,0 +1,8 @@
#ifndef _BALL_QUERY_CUH
#define _BALL_QUERY_CUH
void ball_query(int b, int n, int m, float r2, int u,
const float *centers_coords, const float *points_coords,
int *neighbors_indices);
#endif

@@ -0,0 +1,10 @@
#ifndef _BALL_QUERY_HPP
#define _BALL_QUERY_HPP
#include <torch/extension.h>
at::Tensor ball_query_forward(at::Tensor centers_coords,
at::Tensor points_coords, const float radius,
const int num_neighbors);
#endif

@@ -0,0 +1,37 @@
#include <pybind11/pybind11.h>
#include "ball_query/ball_query.hpp"
#include "grouping/grouping.hpp"
#include "interpolate/neighbor_interpolate.hpp"
#include "interpolate/trilinear_devox.hpp"
#include "sampling/sampling.hpp"
#include "voxelization/vox.hpp"
PYBIND11_MODULE(_pvcnn_backend, m) {
m.def("gather_features_forward", &gather_features_forward,
"Gather Centers' Features forward (CUDA)");
m.def("gather_features_backward", &gather_features_backward,
"Gather Centers' Features backward (CUDA)");
m.def("furthest_point_sampling", &furthest_point_sampling_forward,
"Furthest Point Sampling (CUDA)");
m.def("ball_query", &ball_query_forward, "Ball Query (CUDA)");
m.def("grouping_forward", &grouping_forward,
"Grouping Features forward (CUDA)");
m.def("grouping_backward", &grouping_backward,
"Grouping Features backward (CUDA)");
m.def("three_nearest_neighbors_interpolate_forward",
&three_nearest_neighbors_interpolate_forward,
"3 Nearest Neighbors Interpolate forward (CUDA)");
m.def("three_nearest_neighbors_interpolate_backward",
&three_nearest_neighbors_interpolate_backward,
"3 Nearest Neighbors Interpolate backward (CUDA)");
m.def("trilinear_devoxelize_forward", &trilinear_devoxelize_forward,
"Trilinear Devoxelization forward (CUDA)");
m.def("trilinear_devoxelize_backward", &trilinear_devoxelize_backward,
"Trilinear Devoxelization backward (CUDA)");
m.def("avg_voxelize_forward", &avg_voxelize_forward,
"Voxelization forward with average pooling (CUDA)");
m.def("avg_voxelize_backward", &avg_voxelize_backward,
"Voxelization backward (CUDA)");
}

@@ -0,0 +1,39 @@
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#define MAXIMUM_THREADS 512
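// Explanatory note (not in the original): round the work size down to a power
// of two and clamp it to [1, MAXIMUM_THREADS], so each block launches a thread
// count matched to its per-batch workload.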
inline int optimal_num_threads(int work_size) {
const int pow_2 = std::log2(static_cast<double>(work_size));
return max(min(1 << pow_2, MAXIMUM_THREADS), 1);
}
inline dim3 optimal_block_config(int x, int y) {
const int x_threads = optimal_num_threads(x);
const int y_threads =
max(min(optimal_num_threads(y), MAXIMUM_THREADS / x_threads), 1);
dim3 block_config(x_threads, y_threads, 1);
return block_config;
}
#define CUDA_CHECK_ERRORS() \
{ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \
cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
__FILE__); \
exit(-1); \
} \
}
#endif

@@ -0,0 +1,44 @@
#include "grouping.hpp"
#include "grouping.cuh"
#include "../utils.hpp"
at::Tensor grouping_forward(at::Tensor features, at::Tensor indices) {
CHECK_CUDA(features);
CHECK_CUDA(indices);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(indices);
CHECK_IS_FLOAT(features);
CHECK_IS_INT(indices);
int b = features.size(0);
int c = features.size(1);
int n = features.size(2);
int m = indices.size(1);
int u = indices.size(2);
at::Tensor output = torch::zeros(
{b, c, m, u}, at::device(features.device()).dtype(at::ScalarType::Float));
grouping(b, c, n, m, u, features.data_ptr<float>(), indices.data_ptr<int>(),
output.data_ptr<float>());
return output;
}
at::Tensor grouping_backward(at::Tensor grad_y, at::Tensor indices,
const int n) {
CHECK_CUDA(grad_y);
CHECK_CUDA(indices);
CHECK_CONTIGUOUS(grad_y);
CHECK_CONTIGUOUS(indices);
CHECK_IS_FLOAT(grad_y);
CHECK_IS_INT(indices);
int b = grad_y.size(0);
int c = grad_y.size(1);
int m = indices.size(1);
int u = indices.size(2);
at::Tensor grad_x = torch::zeros(
{b, c, n}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
grouping_grad(b, c, n, m, u, grad_y.data_ptr<float>(),
indices.data_ptr<int>(), grad_x.data_ptr<float>());
return grad_x;
}

@@ -0,0 +1,85 @@
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: grouping features of neighbors (forward)
Args:
b : batch size
c : #channels of features
n : number of points in point clouds
m : number of query centers
u : maximum number of neighbors
features: points' features, FloatTensor[b, c, n]
indices : neighbor indices in points, IntTensor[b, m, u]
out : gathered features, FloatTensor[b, c, m, u]
*/
__global__ void grouping_kernel(int b, int c, int n, int m, int u,
const float *__restrict__ features,
const int *__restrict__ indices,
float *__restrict__ out) {
int batch_index = blockIdx.x;
features += batch_index * n * c;
indices += batch_index * m * u;
out += batch_index * m * u * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * m; i += stride) {
const int l = i / m;
const int j = i % m;
for (int k = 0; k < u; ++k) {
out[(l * m + j) * u + k] = features[l * n + indices[j * u + k]];
}
}
}
void grouping(int b, int c, int n, int m, int u, const float *features,
const int *indices, float *out) {
grouping_kernel<<<b, optimal_block_config(m, c), 0,
at::cuda::getCurrentCUDAStream()>>>(b, c, n, m, u, features,
indices, out);
CUDA_CHECK_ERRORS();
}
/*
Function: grouping features of neighbors (backward)
Args:
b : batch size
c : #channels of features
n : number of points in point clouds
m : number of query centers
u : maximum number of neighbors
grad_y : grad of gathered features, FloatTensor[b, c, m, u]
indices : neighbor indices in points, IntTensor[b, m, u]
grad_x: grad of points' features, FloatTensor[b, c, n]
*/
__global__ void grouping_grad_kernel(int b, int c, int n, int m, int u,
const float *__restrict__ grad_y,
const int *__restrict__ indices,
float *__restrict__ grad_x) {
int batch_index = blockIdx.x;
grad_y += batch_index * m * u * c;
indices += batch_index * m * u;
grad_x += batch_index * n * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * m; i += stride) {
const int l = i / m;
const int j = i % m;
for (int k = 0; k < u; ++k) {
atomicAdd(grad_x + l * n + indices[j * u + k],
grad_y[(l * m + j) * u + k]);
}
}
}
void grouping_grad(int b, int c, int n, int m, int u, const float *grad_y,
const int *indices, float *grad_x) {
grouping_grad_kernel<<<b, optimal_block_config(m, c), 0,
at::cuda::getCurrentCUDAStream()>>>(
b, c, n, m, u, grad_y, indices, grad_x);
CUDA_CHECK_ERRORS();
}

@@ -0,0 +1,9 @@
#ifndef _GROUPING_CUH
#define _GROUPING_CUH
void grouping(int b, int c, int n, int m, int u, const float *features,
const int *indices, float *out);
void grouping_grad(int b, int c, int n, int m, int u, const float *grad_y,
const int *indices, float *grad_x);
#endif

@@ -0,0 +1,10 @@
#ifndef _GROUPING_HPP
#define _GROUPING_HPP
#include <torch/extension.h>
at::Tensor grouping_forward(at::Tensor features, at::Tensor indices);
at::Tensor grouping_backward(at::Tensor grad_y, at::Tensor indices,
const int n);
#endif

@@ -0,0 +1,65 @@
#include "neighbor_interpolate.hpp"
#include "neighbor_interpolate.cuh"
#include "../utils.hpp"
std::vector<at::Tensor>
three_nearest_neighbors_interpolate_forward(at::Tensor points_coords,
at::Tensor centers_coords,
at::Tensor centers_features) {
CHECK_CUDA(points_coords);
CHECK_CUDA(centers_coords);
CHECK_CUDA(centers_features);
CHECK_CONTIGUOUS(points_coords);
CHECK_CONTIGUOUS(centers_coords);
CHECK_CONTIGUOUS(centers_features);
CHECK_IS_FLOAT(points_coords);
CHECK_IS_FLOAT(centers_coords);
CHECK_IS_FLOAT(centers_features);
int b = centers_features.size(0);
int c = centers_features.size(1);
int m = centers_features.size(2);
int n = points_coords.size(2);
at::Tensor indices = torch::zeros(
{b, 3, n}, at::device(points_coords.device()).dtype(at::ScalarType::Int));
at::Tensor weights = torch::zeros(
{b, 3, n},
at::device(points_coords.device()).dtype(at::ScalarType::Float));
at::Tensor output = torch::zeros(
{b, c, n},
at::device(centers_features.device()).dtype(at::ScalarType::Float));
three_nearest_neighbors_interpolate(
b, c, m, n, points_coords.data_ptr<float>(),
centers_coords.data_ptr<float>(), centers_features.data_ptr<float>(),
indices.data_ptr<int>(), weights.data_ptr<float>(),
output.data_ptr<float>());
return {output, indices, weights};
}
at::Tensor three_nearest_neighbors_interpolate_backward(at::Tensor grad_y,
at::Tensor indices,
at::Tensor weights,
const int m) {
CHECK_CUDA(grad_y);
CHECK_CUDA(indices);
CHECK_CUDA(weights);
CHECK_CONTIGUOUS(grad_y);
CHECK_CONTIGUOUS(indices);
CHECK_CONTIGUOUS(weights);
CHECK_IS_FLOAT(grad_y);
CHECK_IS_INT(indices);
CHECK_IS_FLOAT(weights);
int b = grad_y.size(0);
int c = grad_y.size(1);
int n = grad_y.size(2);
at::Tensor grad_x = torch::zeros(
{b, c, m}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
three_nearest_neighbors_interpolate_grad(
b, c, n, m, grad_y.data_ptr<float>(), indices.data_ptr<int>(),
weights.data_ptr<float>(), grad_x.data_ptr<float>());
return grad_x;
}

@@ -0,0 +1,181 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: three nearest neighbors
Args:
b : batch size
n : number of points in point clouds
m : number of query centers
points_coords : coordinates of points, FloatTensor[b, 3, n]
centers_coords: coordinates of centers, FloatTensor[b, 3, m]
weights : weights of nearest 3 centers to the point,
FloatTensor[b, 3, n]
indices : indices of nearest 3 centers to the point,
IntTensor[b, 3, n]
*/
__global__ void three_nearest_neighbors_kernel(
int b, int n, int m, const float *__restrict__ points_coords,
const float *__restrict__ centers_coords, float *__restrict__ weights,
int *__restrict__ indices) {
int batch_index = blockIdx.x;
int index = threadIdx.x;
int stride = blockDim.x;
points_coords += batch_index * 3 * n;
weights += batch_index * 3 * n;
indices += batch_index * 3 * n;
centers_coords += batch_index * 3 * m;
for (int j = index; j < n; j += stride) {
float ux = points_coords[j];
float uy = points_coords[j + n];
float uz = points_coords[j + n + n];
double best0 = 1e40, best1 = 1e40, best2 = 1e40;
int besti0 = 0, besti1 = 0, besti2 = 0;
for (int k = 0; k < m; ++k) {
float x = centers_coords[k];
float y = centers_coords[k + m];
float z = centers_coords[k + m + m];
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
if (d < best2) {
best2 = d;
besti2 = k;
if (d < best1) {
best2 = best1;
besti2 = besti1;
best1 = d;
besti1 = k;
if (d < best0) {
best1 = best0;
besti1 = besti0;
best0 = d;
besti0 = k;
}
}
}
}
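// Explanatory note (not in the original): the weights below implement
// inverse-distance interpolation, w_i = (1/d_i) / (1/d_0 + 1/d_1 + 1/d_2),
// rewritten division-free as w_0 = d_1*d_2 / (d_0*d_1 + d_0*d_2 + d_1*d_2);
// the distances are first clamped to [1e-10, 1e10] to keep the products finite.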
best0 = max(min(1e10f, best0), 1e-10f);
best1 = max(min(1e10f, best1), 1e-10f);
best2 = max(min(1e10f, best2), 1e-10f);
float d0d1 = best0 * best1;
float d0d2 = best0 * best2;
float d1d2 = best1 * best2;
float d0d1d2 = 1.0f / (d0d1 + d0d2 + d1d2);
weights[j] = d1d2 * d0d1d2;
indices[j] = besti0;
weights[j + n] = d0d2 * d0d1d2;
indices[j + n] = besti1;
weights[j + n + n] = d0d1 * d0d1d2;
indices[j + n + n] = besti2;
}
}
/*
Function: interpolate three nearest neighbors (forward)
Args:
b : batch size
c : #channels of features
m : number of query centers
n : number of points in point clouds
centers_features: features of centers, FloatTensor[b, c, m]
indices : indices of nearest 3 centers to the point,
IntTensor[b, 3, n]
weights : weights for interpolation, FloatTensor[b, 3, n]
out : features of points, FloatTensor[b, c, n]
*/
__global__ void three_nearest_neighbors_interpolate_kernel(
int b, int c, int m, int n, const float *__restrict__ centers_features,
const int *__restrict__ indices, const float *__restrict__ weights,
float *__restrict__ out) {
int batch_index = blockIdx.x;
centers_features += batch_index * m * c;
indices += batch_index * n * 3;
weights += batch_index * n * 3;
out += batch_index * n * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * n; i += stride) {
const int l = i / n;
const int j = i % n;
float w1 = weights[j];
float w2 = weights[j + n];
float w3 = weights[j + n + n];
int i1 = indices[j];
int i2 = indices[j + n];
int i3 = indices[j + n + n];
out[i] = centers_features[l * m + i1] * w1 +
centers_features[l * m + i2] * w2 +
centers_features[l * m + i3] * w3;
}
}
void three_nearest_neighbors_interpolate(int b, int c, int m, int n,
const float *points_coords,
const float *centers_coords,
const float *centers_features,
int *indices, float *weights,
float *out) {
three_nearest_neighbors_kernel<<<b, optimal_num_threads(n), 0,
at::cuda::getCurrentCUDAStream()>>>(
b, n, m, points_coords, centers_coords, weights, indices);
three_nearest_neighbors_interpolate_kernel<<<
b, optimal_block_config(n, c), 0, at::cuda::getCurrentCUDAStream()>>>(
b, c, m, n, centers_features, indices, weights, out);
CUDA_CHECK_ERRORS();
}
/*
Function: interpolate three nearest neighbors (backward)
Args:
b : batch size
c : #channels of features
m : number of query centers
n : number of points in point clouds
grad_y : grad of features of points, FloatTensor[b, c, n]
indices : indices of nearest 3 centers to the point, IntTensor[b, 3, n]
weights : weights for interpolation, FloatTensor[b, 3, n]
grad_x : grad of features of centers, FloatTensor[b, c, m]
*/
__global__ void three_nearest_neighbors_interpolate_grad_kernel(
int b, int c, int n, int m, const float *__restrict__ grad_y,
const int *__restrict__ indices, const float *__restrict__ weights,
float *__restrict__ grad_x) {
int batch_index = blockIdx.x;
grad_y += batch_index * n * c;
indices += batch_index * n * 3;
weights += batch_index * n * 3;
grad_x += batch_index * m * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * n; i += stride) {
const int l = i / n;
const int j = i % n;
float w1 = weights[j];
float w2 = weights[j + n];
float w3 = weights[j + n + n];
int i1 = indices[j];
int i2 = indices[j + n];
int i3 = indices[j + n + n];
atomicAdd(grad_x + l * m + i1, grad_y[i] * w1);
atomicAdd(grad_x + l * m + i2, grad_y[i] * w2);
atomicAdd(grad_x + l * m + i3, grad_y[i] * w3);
}
}
void three_nearest_neighbors_interpolate_grad(int b, int c, int n, int m,
const float *grad_y,
const int *indices,
const float *weights,
float *grad_x) {
three_nearest_neighbors_interpolate_grad_kernel<<<
b, optimal_block_config(n, c), 0, at::cuda::getCurrentCUDAStream()>>>(
b, c, n, m, grad_y, indices, weights, grad_x);
CUDA_CHECK_ERRORS();
}

@@ -0,0 +1,16 @@
#ifndef _NEIGHBOR_INTERPOLATE_CUH
#define _NEIGHBOR_INTERPOLATE_CUH
void three_nearest_neighbors_interpolate(int b, int c, int m, int n,
const float *points_coords,
const float *centers_coords,
const float *centers_features,
int *indices, float *weights,
float *out);
void three_nearest_neighbors_interpolate_grad(int b, int c, int n, int m,
const float *grad_y,
const int *indices,
const float *weights,
float *grad_x);
#endif

@@ -0,0 +1,16 @@
#ifndef _NEIGHBOR_INTERPOLATE_HPP
#define _NEIGHBOR_INTERPOLATE_HPP
#include <torch/extension.h>
#include <vector>
std::vector<at::Tensor>
three_nearest_neighbors_interpolate_forward(at::Tensor points_coords,
at::Tensor centers_coords,
at::Tensor centers_features);
at::Tensor three_nearest_neighbors_interpolate_backward(at::Tensor grad_y,
at::Tensor indices,
at::Tensor weights,
const int m);
#endif

@@ -0,0 +1,91 @@
#include "trilinear_devox.hpp"
#include "trilinear_devox.cuh"
#include "../utils.hpp"
/*
Function: trilinear devoxelization (forward)
Args:
r : voxel resolution
training : whether in training mode
coords : the coordinates of points, FloatTensor[b, 3, n]
features : features, FloatTensor[b, c, s], s = r ** 3
Return:
outs : outputs, FloatTensor[b, c, n]
inds : the voxel coordinates of point cube, IntTensor[b, 8, n]
wgts : weight for trilinear interpolation, FloatTensor[b, 8, n]
*/
std::vector<at::Tensor>
trilinear_devoxelize_forward(const int r, const bool is_training,
const at::Tensor coords,
const at::Tensor features) {
CHECK_CUDA(features);
CHECK_CUDA(coords);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(coords);
CHECK_IS_FLOAT(features);
CHECK_IS_FLOAT(coords);
int b = features.size(0);
int c = features.size(1);
int n = coords.size(2);
int r2 = r * r;
int r3 = r2 * r;
at::Tensor outs = torch::zeros(
{b, c, n}, at::device(features.device()).dtype(at::ScalarType::Float));
if (is_training) {
at::Tensor inds = torch::zeros(
{b, 8, n}, at::device(features.device()).dtype(at::ScalarType::Int));
at::Tensor wgts = torch::zeros(
{b, 8, n}, at::device(features.device()).dtype(at::ScalarType::Float));
trilinear_devoxelize(b, c, n, r, r2, r3, true, coords.data_ptr<float>(),
features.data_ptr<float>(), inds.data_ptr<int>(),
wgts.data_ptr<float>(), outs.data_ptr<float>());
return {outs, inds, wgts};
} else {
at::Tensor inds = torch::zeros(
{1}, at::device(features.device()).dtype(at::ScalarType::Int));
at::Tensor wgts = torch::zeros(
{1}, at::device(features.device()).dtype(at::ScalarType::Float));
trilinear_devoxelize(b, c, n, r, r2, r3, false, coords.data_ptr<float>(),
features.data_ptr<float>(), inds.data_ptr<int>(),
wgts.data_ptr<float>(), outs.data_ptr<float>());
return {outs, inds, wgts};
}
}
/*
Function: trilinear devoxelization (backward)
Args:
grad_y : grad outputs, FloatTensor[b, c, n]
indices : the voxel coordinates of point cube, IntTensor[b, 8, n]
weights : weight for trilinear interpolation, FloatTensor[b, 8, n]
r : voxel resolution
Return:
grad_x : grad inputs, FloatTensor[b, c, s], s = r ** 3
*/
at::Tensor trilinear_devoxelize_backward(const at::Tensor grad_y,
const at::Tensor indices,
const at::Tensor weights,
const int r) {
CHECK_CUDA(grad_y);
CHECK_CUDA(weights);
CHECK_CUDA(indices);
CHECK_CONTIGUOUS(grad_y);
CHECK_CONTIGUOUS(weights);
CHECK_CONTIGUOUS(indices);
CHECK_IS_FLOAT(grad_y);
CHECK_IS_FLOAT(weights);
CHECK_IS_INT(indices);
int b = grad_y.size(0);
int c = grad_y.size(1);
int n = grad_y.size(2);
int r3 = r * r * r;
at::Tensor grad_x = torch::zeros(
{b, c, r3}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
trilinear_devoxelize_grad(b, c, n, r3, indices.data_ptr<int>(),
weights.data_ptr<float>(), grad_y.data_ptr<float>(),
grad_x.data_ptr<float>());
return grad_x;
}
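A small usage sketch of the wrapper above (the `_backend` module name and import path are assumptions; in eval mode the returned `inds`/`wgts` are 1-element dummies and only `outs` is meaningful):

```
import torch
from third_party.pvcnn.functional.backend import _backend  # assumed path

r = 8
coords = torch.rand(2, 3, 1024, device='cuda') * (r - 1)  # [B, 3, N], in [0, r-1]
feats = torch.rand(2, 16, r ** 3, device='cuda')          # [B, C, r**3]
outs, inds, wgts = _backend.trilinear_devoxelize_forward(r, True, coords, feats)
outs, _, _ = _backend.trilinear_devoxelize_forward(r, False, coords, feats)
```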

@@ -0,0 +1,178 @@
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: trilinear devoxelization (forward)
Args:
b : batch size
c : #channels
n : number of points
r : voxel resolution
r2 : r ** 2
r3 : r ** 3
coords : the coordinates of points, FloatTensor[b, 3, n]
feat : features, FloatTensor[b, c, r3]
inds : the voxel indices of point cube, IntTensor[b, 8, n]
wgts : weight for trilinear interpolation, FloatTensor[b, 8, n]
outs : outputs, FloatTensor[b, c, n]
*/
__global__ void trilinear_devoxelize_kernel(int b, int c, int n, int r, int r2,
int r3, bool is_training,
const float *__restrict__ coords,
const float *__restrict__ feat,
int *__restrict__ inds,
float *__restrict__ wgts,
float *__restrict__ outs) {
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
coords += batch_index * n * 3;
inds += batch_index * n * 8;
wgts += batch_index * n * 8;
feat += batch_index * c * r3;
outs += batch_index * c * n;
for (int i = index; i < n; i += stride) {
float x = coords[i];
float y = coords[i + n];
float z = coords[i + n + n];
float x_lo_f = floorf(x);
float y_lo_f = floorf(y);
float z_lo_f = floorf(z);
float x_d_1 = x - x_lo_f; // / (x_hi_f - x_lo_f + 1e-8f)
float y_d_1 = y - y_lo_f;
float z_d_1 = z - z_lo_f;
float x_d_0 = 1.0f - x_d_1;
float y_d_0 = 1.0f - y_d_1;
float z_d_0 = 1.0f - z_d_1;
float wgt000 = x_d_0 * y_d_0 * z_d_0;
float wgt001 = x_d_0 * y_d_0 * z_d_1;
float wgt010 = x_d_0 * y_d_1 * z_d_0;
float wgt011 = x_d_0 * y_d_1 * z_d_1;
float wgt100 = x_d_1 * y_d_0 * z_d_0;
float wgt101 = x_d_1 * y_d_0 * z_d_1;
float wgt110 = x_d_1 * y_d_1 * z_d_0;
float wgt111 = x_d_1 * y_d_1 * z_d_1;
int x_lo = static_cast<int>(x_lo_f);
int y_lo = static_cast<int>(y_lo_f);
int z_lo = static_cast<int>(z_lo_f);
int x_hi = (x_d_1 > 0) ? -1 : 0;
int y_hi = (y_d_1 > 0) ? -1 : 0;
int z_hi = (z_d_1 > 0) ? 1 : 0;
int idx000 = x_lo * r2 + y_lo * r + z_lo;
int idx001 = idx000 + z_hi; // x_lo * r2 + y_lo * r + z_hi;
int idx010 = idx000 + (y_hi & r); // x_lo * r2 + y_hi * r + z_lo;
int idx011 = idx010 + z_hi; // x_lo * r2 + y_hi * r + z_hi;
int idx100 = idx000 + (x_hi & r2); // x_hi * r2 + y_lo * r + z_lo;
int idx101 = idx100 + z_hi; // x_hi * r2 + y_lo * r + z_hi;
int idx110 = idx100 + (y_hi & r); // x_hi * r2 + y_hi * r + z_lo;
int idx111 = idx110 + z_hi; // x_hi * r2 + y_hi * r + z_hi;
if (is_training) {
wgts[i] = wgt000;
wgts[i + n] = wgt001;
wgts[i + n * 2] = wgt010;
wgts[i + n * 3] = wgt011;
wgts[i + n * 4] = wgt100;
wgts[i + n * 5] = wgt101;
wgts[i + n * 6] = wgt110;
wgts[i + n * 7] = wgt111;
inds[i] = idx000;
inds[i + n] = idx001;
inds[i + n * 2] = idx010;
inds[i + n * 3] = idx011;
inds[i + n * 4] = idx100;
inds[i + n * 5] = idx101;
inds[i + n * 6] = idx110;
inds[i + n * 7] = idx111;
}
for (int j = 0; j < c; j++) {
int jr3 = j * r3;
outs[j * n + i] =
wgt000 * feat[jr3 + idx000] + wgt001 * feat[jr3 + idx001] +
wgt010 * feat[jr3 + idx010] + wgt011 * feat[jr3 + idx011] +
wgt100 * feat[jr3 + idx100] + wgt101 * feat[jr3 + idx101] +
wgt110 * feat[jr3 + idx110] + wgt111 * feat[jr3 + idx111];
}
}
}
/*
Function: trilinear devoxelization (backward)
Args:
b : batch size
c : #channels
n : number of points
r3 : voxel cube size = voxel resolution ** 3
inds : the voxel indices of point cube, IntTensor[b, 8, n]
wgts : weight for trilinear interpolation, FloatTensor[b, 8, n]
grad_y : grad outputs, FloatTensor[b, c, n]
grad_x : grad inputs, FloatTensor[b, c, r3]
*/
__global__ void trilinear_devoxelize_grad_kernel(
int b, int c, int n, int r3, const int *__restrict__ inds,
const float *__restrict__ wgts, const float *__restrict__ grad_y,
float *__restrict__ grad_x) {
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
inds += batch_index * n * 8;
wgts += batch_index * n * 8;
grad_x += batch_index * c * r3;
grad_y += batch_index * c * n;
for (int i = index; i < n; i += stride) {
int idx000 = inds[i];
int idx001 = inds[i + n];
int idx010 = inds[i + n * 2];
int idx011 = inds[i + n * 3];
int idx100 = inds[i + n * 4];
int idx101 = inds[i + n * 5];
int idx110 = inds[i + n * 6];
int idx111 = inds[i + n * 7];
float wgt000 = wgts[i];
float wgt001 = wgts[i + n];
float wgt010 = wgts[i + n * 2];
float wgt011 = wgts[i + n * 3];
float wgt100 = wgts[i + n * 4];
float wgt101 = wgts[i + n * 5];
float wgt110 = wgts[i + n * 6];
float wgt111 = wgts[i + n * 7];
for (int j = 0; j < c; j++) {
int jr3 = j * r3;
float g = grad_y[j * n + i];
atomicAdd(grad_x + jr3 + idx000, wgt000 * g);
atomicAdd(grad_x + jr3 + idx001, wgt001 * g);
atomicAdd(grad_x + jr3 + idx010, wgt010 * g);
atomicAdd(grad_x + jr3 + idx011, wgt011 * g);
atomicAdd(grad_x + jr3 + idx100, wgt100 * g);
atomicAdd(grad_x + jr3 + idx101, wgt101 * g);
atomicAdd(grad_x + jr3 + idx110, wgt110 * g);
atomicAdd(grad_x + jr3 + idx111, wgt111 * g);
}
}
}
void trilinear_devoxelize(int b, int c, int n, int r, int r2, int r3,
bool training, const float *coords, const float *feat,
int *inds, float *wgts, float *outs) {
trilinear_devoxelize_kernel<<<b, optimal_num_threads(n)>>>(
b, c, n, r, r2, r3, training, coords, feat, inds, wgts, outs);
CUDA_CHECK_ERRORS();
}
void trilinear_devoxelize_grad(int b, int c, int n, int r3, const int *inds,
const float *wgts, const float *grad_y,
float *grad_x) {
trilinear_devoxelize_grad_kernel<<<b, optimal_num_threads(n)>>>(
b, c, n, r3, inds, wgts, grad_y, grad_x);
CUDA_CHECK_ERRORS();
}
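The corner indexing above relies on a bit trick: `x_hi = -1` is an all-ones mask, so `x_hi & r2` adds `r2` exactly when the fractional part is nonzero (likewise `y_hi & r`), which also keeps reads in bounds for points sitting on the upper grid boundary. A plain-PyTorch reference of the forward interpolation, assuming coords lie in `[0, r-1]` (function name is mine):

```
import torch

def trilinear_devoxelize_ref(coords, feat, r):
    # coords: [B, 3, N] float; feat: [B, C, r**3]
    b, _, n = coords.shape
    c = feat.shape[1]
    lo = coords.floor()
    d1 = coords - lo              # fractional offset toward the upper corner
    d0 = 1.0 - d1
    lo = lo.long()
    hi = (d1 > 0).long()          # step to the upper corner only when needed
    out = torch.zeros(b, c, n, dtype=feat.dtype, device=feat.device)
    for dx in (0, 1):
        for dy in (0, 1):
            for dz in (0, 1):
                w = ((d1 if dx else d0)[:, 0] *
                     (d1 if dy else d0)[:, 1] *
                     (d1 if dz else d0)[:, 2])        # [B, N] corner weight
                x = lo[:, 0] + dx * hi[:, 0]
                y = lo[:, 1] + dy * hi[:, 1]
                z = lo[:, 2] + dz * hi[:, 2]
                idx = (x * r + y) * r + z             # flattened voxel index
                out += w.unsqueeze(1) * feat.gather(
                    2, idx.unsqueeze(1).expand(b, c, n))
    return out
```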

@@ -0,0 +1,13 @@
#ifndef _TRILINEAR_DEVOX_CUH
#define _TRILINEAR_DEVOX_CUH
// CUDA function declarations
void trilinear_devoxelize(int b, int c, int n, int r, int r2, int r3,
bool is_training, const float *coords,
const float *feat, int *inds, float *wgts,
float *outs);
void trilinear_devoxelize_grad(int b, int c, int n, int r3, const int *inds,
const float *wgts, const float *grad_y,
float *grad_x);
#endif

@@ -0,0 +1,16 @@
#ifndef _TRILINEAR_DEVOX_HPP
#define _TRILINEAR_DEVOX_HPP
#include <torch/torch.h>
#include <vector>
std::vector<at::Tensor> trilinear_devoxelize_forward(const int r,
const bool is_training,
const at::Tensor coords,
const at::Tensor features);
at::Tensor trilinear_devoxelize_backward(const at::Tensor grad_y,
const at::Tensor indices,
const at::Tensor weights, const int r);
#endif

@@ -0,0 +1,58 @@
#include "sampling.hpp"
#include "sampling.cuh"
#include "../utils.hpp"
at::Tensor gather_features_forward(at::Tensor features, at::Tensor indices) {
CHECK_CUDA(features);
CHECK_CUDA(indices);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(indices);
CHECK_IS_FLOAT(features);
CHECK_IS_INT(indices);
int b = features.size(0);
int c = features.size(1);
int n = features.size(2);
int m = indices.size(1);
at::Tensor output = torch::zeros(
{b, c, m}, at::device(features.device()).dtype(at::ScalarType::Float));
gather_features(b, c, n, m, features.data_ptr<float>(),
indices.data_ptr<int>(), output.data_ptr<float>());
return output;
}
at::Tensor gather_features_backward(at::Tensor grad_y, at::Tensor indices,
const int n) {
CHECK_CUDA(grad_y);
CHECK_CUDA(indices);
CHECK_CONTIGUOUS(grad_y);
CHECK_CONTIGUOUS(indices);
CHECK_IS_FLOAT(grad_y);
CHECK_IS_INT(indices);
int b = grad_y.size(0);
int c = grad_y.size(1);
at::Tensor grad_x = torch::zeros(
{b, c, n}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
gather_features_grad(b, c, n, indices.size(1), grad_y.data_ptr<float>(),
indices.data_ptr<int>(), grad_x.data_ptr<float>());
return grad_x;
}
at::Tensor furthest_point_sampling_forward(at::Tensor coords,
const int num_samples) {
CHECK_CUDA(coords);
CHECK_CONTIGUOUS(coords);
CHECK_IS_FLOAT(coords);
int b = coords.size(0);
int n = coords.size(2);
at::Tensor indices = torch::zeros(
{b, num_samples}, at::device(coords.device()).dtype(at::ScalarType::Int));
at::Tensor distances = torch::full(
{b, n}, 1e38f, at::device(coords.device()).dtype(at::ScalarType::Float));
furthest_point_sampling(b, n, num_samples, coords.data_ptr<float>(),
distances.data_ptr<float>(), indices.data_ptr<int>());
return indices;
}
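A hedged usage sketch of the two host wrappers above (the `_backend` module name and import path are assumptions):

```
import torch
from third_party.pvcnn.functional.backend import _backend  # assumed path

coords = torch.rand(2, 3, 4096, device='cuda')   # [B, 3, N]
feats = torch.rand(2, 64, 4096, device='cuda')   # [B, C, N]
idx = _backend.furthest_point_sampling_forward(coords, 1024)  # IntTensor[B, 1024]
centers = _backend.gather_features_forward(feats, idx)        # [B, C, 1024]
```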

@@ -0,0 +1,174 @@
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: gather centers' features (forward)
Args:
b : batch size
c : #channels of features
n : number of points in point clouds
m : number of query/sampled centers
features: points' features, FloatTensor[b, c, n]
indices : centers' indices in points, IntTensor[b, m]
out : gathered features, FloatTensor[b, c, m]
*/
__global__ void gather_features_kernel(int b, int c, int n, int m,
const float *__restrict__ features,
const int *__restrict__ indices,
float *__restrict__ out) {
int batch_index = blockIdx.x;
int channel_index = blockIdx.y;
int temp_index = batch_index * c + channel_index;
features += temp_index * n;
indices += batch_index * m;
out += temp_index * m;
for (int j = threadIdx.x; j < m; j += blockDim.x) {
out[j] = features[indices[j]];
}
}
void gather_features(int b, int c, int n, int m, const float *features,
const int *indices, float *out) {
gather_features_kernel<<<dim3(b, c, 1), optimal_num_threads(m), 0,
at::cuda::getCurrentCUDAStream()>>>(
b, c, n, m, features, indices, out);
CUDA_CHECK_ERRORS();
}
/*
Function: gather centers' features (backward)
Args:
b : batch size
c : #channels of features
n : number of points in point clouds
m : number of query/sampled centers
grad_y : grad of gathered features, FloatTensor[b, c, m]
indices : centers' indices in points, IntTensor[b, m]
grad_x : grad of points' features, FloatTensor[b, c, n]
*/
__global__ void gather_features_grad_kernel(int b, int c, int n, int m,
const float *__restrict__ grad_y,
const int *__restrict__ indices,
float *__restrict__ grad_x) {
int batch_index = blockIdx.x;
int channel_index = blockIdx.y;
int temp_index = batch_index * c + channel_index;
grad_y += temp_index * m;
indices += batch_index * m;
grad_x += temp_index * n;
for (int j = threadIdx.x; j < m; j += blockDim.x) {
atomicAdd(grad_x + indices[j], grad_y[j]);
}
}
void gather_features_grad(int b, int c, int n, int m, const float *grad_y,
const int *indices, float *grad_x) {
gather_features_grad_kernel<<<dim3(b, c, 1), optimal_num_threads(m), 0,
at::cuda::getCurrentCUDAStream()>>>(
b, c, n, m, grad_y, indices, grad_x);
CUDA_CHECK_ERRORS();
}
/*
Function: furthest point sampling
Args:
b : batch size
n : number of points in point clouds
m : number of query/sampled centers
coords : points' coords, FloatTensor[b, 3, n]
distances : minimum squared distance of each point to the sampled set, FloatTensor[b, n]
indices : sampled centers' indices in points, IntTensor[b, m]
*/
__global__ void furthest_point_sampling_kernel(int b, int n, int m,
const float *__restrict__ coords,
float *__restrict__ distances,
int *__restrict__ indices) {
if (m <= 0)
return;
int batch_index = blockIdx.x;
coords += batch_index * n * 3;
distances += batch_index * n;
indices += batch_index * m;
const int BlockSize = 512;
__shared__ float dists[BlockSize];
__shared__ int dists_i[BlockSize];
const int BufferSize = 3072;
__shared__ float buf[BufferSize * 3];
int old = 0;
if (threadIdx.x == 0)
indices[0] = old;
for (int j = threadIdx.x; j < min(BufferSize, n); j += blockDim.x) {
buf[j] = coords[j];
buf[j + BufferSize] = coords[j + n];
buf[j + BufferSize + BufferSize] = coords[j + n + n];
}
__syncthreads();
for (int j = 1; j < m; j++) {
int besti = 0; // best index
float best = -1; // farthest distance
// calculating the distance with the latest sampled point
float x1 = coords[old];
float y1 = coords[old + n];
float z1 = coords[old + n + n];
for (int k = threadIdx.x; k < n; k += blockDim.x) {
// fetch the current minimum squared distance for point k
float td = distances[k];
float x2, y2, z2;
if (k < BufferSize) {
x2 = buf[k];
y2 = buf[k + BufferSize];
z2 = buf[k + BufferSize + BufferSize];
} else {
x2 = coords[k];
y2 = coords[k + n];
z2 = coords[k + n + n];
}
float d =
(x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
float d2 = min(d, td);
// update "point-to-set" distance
if (d2 != td)
distances[k] = d2;
// update the farthest distance at sample step j
if (d2 > best) {
best = d2;
besti = k;
}
}
dists[threadIdx.x] = best;
dists_i[threadIdx.x] = besti;
for (int u = 0; (1 << u) < blockDim.x; u++) {
__syncthreads();
if (threadIdx.x < (blockDim.x >> (u + 1))) {
int i1 = (threadIdx.x * 2) << u;
int i2 = (threadIdx.x * 2 + 1) << u;
if (dists[i1] < dists[i2]) {
dists[i1] = dists[i2];
dists_i[i1] = dists_i[i2];
}
}
}
__syncthreads();
// finish sample step j; old is the sampled index
old = dists_i[0];
if (threadIdx.x == 0)
indices[j] = old;
}
}
void furthest_point_sampling(int b, int n, int m, const float *coords,
float *distances, int *indices) {
furthest_point_sampling_kernel<<<b, 512>>>(b, n, m, coords, distances,
indices);
CUDA_CHECK_ERRORS();
}
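A compact pure-PyTorch reference of the same sampling loop, handy for verifying the kernel (names are mine; like the kernel, it seeds with point 0 and tracks minimum squared distances to the sampled set):

```
import torch

def furthest_point_sampling_ref(coords, m):
    # coords: [B, 3, N] -> indices [B, m]
    b, _, n = coords.shape
    pts = coords.transpose(1, 2)                          # [B, N, 3]
    dist = torch.full((b, n), 1e38, device=coords.device)
    idx = torch.zeros(b, m, dtype=torch.long, device=coords.device)
    batch = torch.arange(b, device=coords.device)
    for j in range(1, m):
        last = pts[batch, idx[:, j - 1]]                  # latest sampled point
        d = ((pts - last.unsqueeze(1)) ** 2).sum(-1)      # squared distances
        dist = torch.minimum(dist, d)                     # point-to-set distance
        idx[:, j] = dist.argmax(dim=1)                    # farthest point wins
    return idx
```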

@@ -0,0 +1,11 @@
#ifndef _SAMPLING_CUH
#define _SAMPLING_CUH
void gather_features(int b, int c, int n, int m, const float *features,
const int *indices, float *out);
void gather_features_grad(int b, int c, int n, int m, const float *grad_y,
const int *indices, float *grad_x);
void furthest_point_sampling(int b, int n, int m, const float *coords,
float *distances, int *indices);
#endif

@@ -0,0 +1,12 @@
#ifndef _SAMPLING_HPP
#define _SAMPLING_HPP
#include <torch/extension.h>
at::Tensor gather_features_forward(at::Tensor features, at::Tensor indices);
at::Tensor gather_features_backward(at::Tensor grad_y, at::Tensor indices,
const int n);
at::Tensor furthest_point_sampling_forward(at::Tensor coords,
const int num_samples);
#endif

@@ -0,0 +1,20 @@
#ifndef _UTILS_HPP
#define _UTILS_HPP
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
#define CHECK_IS_INT(x) \
TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, \
#x " must be an int tensor")
#define CHECK_IS_FLOAT(x) \
TORCH_CHECK(x.scalar_type() == at::ScalarType::Float, \
#x " must be a float tensor")
#endif

@@ -0,0 +1,76 @@
#include "vox.hpp"
#include "vox.cuh"
#include "../utils.hpp"
/*
Function: average pool voxelization (forward)
Args:
features: features, FloatTensor[b, c, n]
coords : coords of each point, IntTensor[b, 3, n]
resolution : voxel resolution
Return:
out : outputs, FloatTensor[b, c, s], s = r ** 3
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
*/
std::vector<at::Tensor> avg_voxelize_forward(const at::Tensor features,
const at::Tensor coords,
const int resolution) {
CHECK_CUDA(features);
CHECK_CUDA(coords);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(coords);
CHECK_IS_FLOAT(features);
CHECK_IS_INT(coords);
int b = features.size(0);
int c = features.size(1);
int n = features.size(2);
int r = resolution;
int r2 = r * r;
int r3 = r2 * r;
at::Tensor ind = torch::zeros(
{b, n}, at::device(features.device()).dtype(at::ScalarType::Int));
at::Tensor out = torch::zeros(
{b, c, r3}, at::device(features.device()).dtype(at::ScalarType::Float));
at::Tensor cnt = torch::zeros(
{b, r3}, at::device(features.device()).dtype(at::ScalarType::Int));
avg_voxelize(b, c, n, r, r2, r3, coords.data_ptr<int>(),
features.data_ptr<float>(), ind.data_ptr<int>(),
cnt.data_ptr<int>(), out.data_ptr<float>());
return {out, ind, cnt};
}
/*
Function: average pool voxelization (backward)
Args:
grad_y : grad outputs, FloatTensor[b, c, s]
indices: voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
Return:
grad_x : grad inputs, FloatTensor[b, c, n]
*/
at::Tensor avg_voxelize_backward(const at::Tensor grad_y,
const at::Tensor indices,
const at::Tensor cnt) {
CHECK_CUDA(grad_y);
CHECK_CUDA(indices);
CHECK_CUDA(cnt);
CHECK_CONTIGUOUS(grad_y);
CHECK_CONTIGUOUS(indices);
CHECK_CONTIGUOUS(cnt);
CHECK_IS_FLOAT(grad_y);
CHECK_IS_INT(indices);
CHECK_IS_INT(cnt);
int b = grad_y.size(0);
int c = grad_y.size(1);
int s = grad_y.size(2);
int n = indices.size(1);
at::Tensor grad_x = torch::zeros(
{b, c, n}, at::device(grad_y.device()).dtype(at::ScalarType::Float));
avg_voxelize_grad(b, c, n, s, indices.data_ptr<int>(), cnt.data_ptr<int>(),
grad_y.data_ptr<float>(), grad_x.data_ptr<float>());
return grad_x;
}

@@ -0,0 +1,126 @@
#include <stdio.h>
#include <stdlib.h>
#include "../cuda_utils.cuh"
/*
Function: get how many points in each voxel grid
Args:
b : batch size
n : number of points
r : voxel resolution
r2 : = r * r
r3 : s, voxel cube size = r ** 3
coords : coords of each point, IntTensor[b, 3, n]
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
*/
__global__ void grid_stats_kernel(int b, int n, int r, int r2, int r3,
const int *__restrict__ coords,
int *__restrict__ ind, int *cnt) {
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
coords += batch_index * n * 3;
ind += batch_index * n;
cnt += batch_index * r3;
for (int i = index; i < n; i += stride) {
// if (ind[i] == -1)
// continue;
ind[i] = coords[i] * r2 + coords[i + n] * r + coords[i + n + n];
atomicAdd(cnt + ind[i], 1);
}
}
/*
Function: average pool voxelization (forward)
Args:
b : batch size
c : #channels
n : number of points
s : voxel cube size = voxel resolution ** 3
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
feat: features, FloatTensor[b, c, n]
out : outputs, FloatTensor[b, c, s]
*/
__global__ void avg_voxelize_kernel(int b, int c, int n, int s,
const int *__restrict__ ind,
const int *__restrict__ cnt,
const float *__restrict__ feat,
float *__restrict__ out) {
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
ind += batch_index * n;
feat += batch_index * c * n;
out += batch_index * c * s;
cnt += batch_index * s;
for (int i = index; i < n; i += stride) {
int pos = ind[i];
// if (pos == -1)
// continue;
int cur_cnt = cnt[pos];
if (cur_cnt > 0) {
float div_cur_cnt = 1.0f / static_cast<float>(cur_cnt);
for (int j = 0; j < c; j++) {
atomicAdd(out + j * s + pos, feat[j * n + i] * div_cur_cnt);
}
}
}
}
/*
Function: average pool voxelization (backward)
Args:
b : batch size
c : #channels
n : number of points
r3 : voxel cube size = voxel resolution ** 3
ind : voxel index of each point, IntTensor[b, n]
cnt : #points in each voxel index, IntTensor[b, s]
grad_y : grad outputs, FloatTensor[b, c, s]
grad_x : grad inputs, FloatTensor[b, c, n]
*/
__global__ void avg_voxelize_grad_kernel(int b, int c, int n, int r3,
const int *__restrict__ ind,
const int *__restrict__ cnt,
const float *__restrict__ grad_y,
float *__restrict__ grad_x) {
int batch_index = blockIdx.x;
int stride = blockDim.x;
int index = threadIdx.x;
ind += batch_index * n;
grad_x += batch_index * c * n;
grad_y += batch_index * c * r3;
cnt += batch_index * r3;
for (int i = index; i < n; i += stride) {
int pos = ind[i];
// if (pos == -1)
// continue;
int cur_cnt = cnt[pos];
if (cur_cnt > 0) {
float div_cur_cnt = 1.0f / static_cast<float>(cur_cnt);
for (int j = 0; j < c; j++) {
atomicAdd(grad_x + j * n + i, grad_y[j * r3 + pos] * div_cur_cnt);
}
}
}
}
void avg_voxelize(int b, int c, int n, int r, int r2, int r3, const int *coords,
const float *feat, int *ind, int *cnt, float *out) {
grid_stats_kernel<<<b, optimal_num_threads(n)>>>(b, n, r, r2, r3, coords, ind,
cnt);
avg_voxelize_kernel<<<b, optimal_num_threads(n)>>>(b, c, n, r3, ind, cnt,
feat, out);
CUDA_CHECK_ERRORS();
}
void avg_voxelize_grad(int b, int c, int n, int s, const int *ind,
const int *cnt, const float *grad_y, float *grad_x) {
avg_voxelize_grad_kernel<<<b, optimal_num_threads(n)>>>(b, c, n, s, ind, cnt,
grad_y, grad_x);
CUDA_CHECK_ERRORS();
}
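For checking, a pure-PyTorch reference of the two kernels above (names are mine). It scatter-adds features per flattened voxel index and divides by the count afterwards, whereas the kernel divides each contribution before its atomicAdd; the results agree up to floating-point summation order:

```
import torch

def avg_voxelize_ref(feat, coords, r):
    # feat: [B, C, N] float; coords: [B, 3, N] int in [0, r-1]
    b, c, n = feat.shape
    ind = ((coords[:, 0] * r + coords[:, 1]) * r + coords[:, 2]).long()  # [B, N]
    cnt = torch.zeros(b, r ** 3, dtype=feat.dtype, device=feat.device)
    cnt.scatter_add_(1, ind, torch.ones_like(ind, dtype=feat.dtype))
    out = torch.zeros(b, c, r ** 3, dtype=feat.dtype, device=feat.device)
    out.scatter_add_(2, ind.unsqueeze(1).expand(b, c, n), feat)
    out = out / cnt.clamp(min=1).unsqueeze(1)   # average; empty voxels stay 0
    return out, ind, cnt                        # cnt kept as float here
```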

@@ -0,0 +1,10 @@
#ifndef _VOX_CUH
#define _VOX_CUH
// CUDA function declarations
void avg_voxelize(int b, int c, int n, int r, int r2, int r3, const int *coords,
const float *feat, int *ind, int *cnt, float *out);
void avg_voxelize_grad(int b, int c, int n, int s, const int *idx,
const int *cnt, const float *grad_y, float *grad_x);
#endif

@@ -0,0 +1,15 @@
#ifndef _VOX_HPP
#define _VOX_HPP
#include <torch/torch.h>
#include <vector>
std::vector<at::Tensor> avg_voxelize_forward(const at::Tensor features,
const at::Tensor coords,
const int resolution);
at::Tensor avg_voxelize_backward(const at::Tensor grad_y,
const at::Tensor indices,
const at::Tensor cnt);
#endif

@@ -0,0 +1,47 @@
from torch.autograd import Function
import torch
# from modules.functional.backend import _backend
from third_party.pvcnn.functional.backend import _backend
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
__all__ = ['avg_voxelize']
class AvgVoxelization(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, features, coords, resolution):
"""
:param ctx:
:param features: Features of the point cloud, FloatTensor[B, C, N]
:param coords: Voxel coordinates of each point, IntTensor[B, 3, N]
:param resolution: Voxel resolution
:return:
Voxelized Features, FloatTensor[B, C, R, R, R]
"""
features = features.contiguous()
coords = coords.int()[:,:3].contiguous()
b, c, _ = features.shape
out, indices, counts = _backend.avg_voxelize_forward(
features, coords, resolution)
ctx.save_for_backward(indices, counts)
return out.view(b, c, resolution, resolution, resolution)
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
"""
:param ctx:
:param grad_output: gradient of output, FloatTensor[B, C, R, R, R]
:return:
gradient of inputs, FloatTensor[B, C, N]
"""
b, c = grad_output.shape[:2]
indices, counts = ctx.saved_tensors
grad_features = _backend.avg_voxelize_backward(
grad_output.contiguous().view(b, c, -1), indices, counts)
return grad_features, None, None
avg_voxelize = AvgVoxelization.apply
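Hedged usage sketch of the `avg_voxelize` Function defined above (shapes follow its docstring; the tensors are arbitrary):

```
import torch

points = torch.rand(4, 64, 2048, device='cuda', requires_grad=True)  # [B, C, N]
vcoords = (torch.rand(4, 3, 2048, device='cuda') * 31).int()         # [B, 3, N]
voxels = avg_voxelize(points, vcoords, 32)   # [B, C, 32, 32, 32]
voxels.sum().backward()                      # gradients flow back to `points`
```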

third_party/torchdiffeq/LICENSE vendored Normal file
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Ricky Tian Qi Chen
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

third_party/torchdiffeq/README.md vendored Normal file
@@ -0,0 +1 @@
adapted from `https://github.com/rtqichen/torchdiffeq/tree/master/torchdiffeq`

@@ -0,0 +1,4 @@
from ._impl import odeint
from ._impl import odeint_adjoint
from ._impl import odeint_event
__version__ = "0.2.2"

@@ -0,0 +1,2 @@
from .odeint import odeint, odeint_event
from .adjoint import odeint_adjoint

@@ -0,0 +1,25 @@
import torch
from .rk_common import _ButcherTableau, RKAdaptiveStepsizeODESolver
_ADAPTIVE_HEUN_TABLEAU = _ButcherTableau(
alpha=torch.tensor([1.], dtype=torch.float64),
beta=[
torch.tensor([1.], dtype=torch.float64),
],
c_sol=torch.tensor([0.5, 0.5], dtype=torch.float64),
c_error=torch.tensor([
0.5,
-0.5,
], dtype=torch.float64),
)
_AH_C_MID = torch.tensor([
0.5, 0.
], dtype=torch.float64)
class AdaptiveHeunSolver(RKAdaptiveStepsizeODESolver):
order = 2
tableau = _ADAPTIVE_HEUN_TABLEAU
mid = _AH_C_MID
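A minimal usage sketch: this solver is selected by name through `odeint` (the `'adaptive_heun'` method string follows upstream torchdiffeq, and the import path assumes the vendored package location in this diff):

```
import torch
from third_party.torchdiffeq import odeint

def f(t, y):                 # dy/dt = -y, so y(t) = exp(-t)
    return -y

y0 = torch.tensor([1.0])
t = torch.linspace(0., 2., 10)
ys = odeint(f, y0, t, method='adaptive_heun')   # ys[i] approximates exp(-t[i])
```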

@@ -0,0 +1,280 @@
import warnings
import torch
import torch.nn as nn
from .odeint import SOLVERS, odeint
from .misc import _check_inputs, _flat_to_shape
from .misc import _mixed_norm
class OdeintAdjointMethod(torch.autograd.Function):
@staticmethod
def forward(ctx, shapes, func, y0, t, rtol, atol, method, options, event_fn, adjoint_rtol, adjoint_atol, adjoint_method,
adjoint_options, t_requires_grad, *adjoint_params):
ctx.shapes = shapes
ctx.func = func
ctx.adjoint_rtol = adjoint_rtol
ctx.adjoint_atol = adjoint_atol
ctx.adjoint_method = adjoint_method
ctx.adjoint_options = adjoint_options
ctx.t_requires_grad = t_requires_grad
ctx.event_mode = event_fn is not None
with torch.no_grad():
ans = odeint(func, y0, t, rtol=rtol, atol=atol, method=method, options=options, event_fn=event_fn)
if event_fn is None:
y = ans
else:
event_t, y = ans
ctx.event_t = event_t
ctx.save_for_backward(t, y, *adjoint_params)
return ans
@staticmethod
def backward(ctx, *grad_y):
with torch.no_grad():
func = ctx.func
adjoint_rtol = ctx.adjoint_rtol
adjoint_atol = ctx.adjoint_atol
adjoint_method = ctx.adjoint_method
adjoint_options = ctx.adjoint_options
t_requires_grad = ctx.t_requires_grad
t, y, *adjoint_params = ctx.saved_tensors
adjoint_params = tuple(adjoint_params)
# Backprop as if integrating up to event time.
# Does NOT backpropagate through the event time.
event_mode = ctx.event_mode
if event_mode:
event_t = ctx.event_t
_t = t
t = torch.cat([t[0].reshape(-1), event_t.reshape(-1)])
grad_y = grad_y[1]
else:
grad_y = grad_y[0]
##################################
# Set up initial state #
##################################
# [-1] because y and grad_y are both of shape (len(t), *y0.shape)
aug_state = [torch.zeros((), dtype=y.dtype, device=y.device), y[-1], grad_y[-1]] # vjp_t, y, vjp_y
aug_state.extend([torch.zeros_like(param) for param in adjoint_params]) # vjp_params
##################################
# Set up backward ODE func #
##################################
# TODO: use a nn.Module and call odeint_adjoint to implement higher order derivatives.
def augmented_dynamics(t, y_aug):
# Dynamics of the original system augmented with
# the adjoint wrt y, and an integrator wrt t and args.
y = y_aug[1]
adj_y = y_aug[2]
# ignore gradients wrt time and parameters
with torch.enable_grad():
t_ = t.detach()
t = t_.requires_grad_(True)
y = y.detach().requires_grad_(True)
# If using an adaptive solver we don't want to waste time resolving dL/dt unless we need it (which
# doesn't necessarily even exist if there is piecewise structure in time), so turning off gradients
# wrt t here means we won't compute that if we don't need it.
func_eval = func(t if t_requires_grad else t_, y)
# Workaround for PyTorch bug #39784
_t = torch.as_strided(t, (), ()) # noqa
_y = torch.as_strided(y, (), ()) # noqa
_params = tuple(torch.as_strided(param, (), ()) for param in adjoint_params) # noqa
vjp_t, vjp_y, *vjp_params = torch.autograd.grad(
func_eval, (t, y) + adjoint_params, -adj_y,
allow_unused=True, retain_graph=True
)
# autograd.grad returns None if no gradient, set to zero.
vjp_t = torch.zeros_like(t) if vjp_t is None else vjp_t
vjp_y = torch.zeros_like(y) if vjp_y is None else vjp_y
vjp_params = [torch.zeros_like(param) if vjp_param is None else vjp_param
for param, vjp_param in zip(adjoint_params, vjp_params)]
return (vjp_t, func_eval, vjp_y, *vjp_params)
##################################
# Solve adjoint ODE #
##################################
if t_requires_grad:
time_vjps = torch.empty(len(t), dtype=t.dtype, device=t.device)
else:
time_vjps = None
for i in range(len(t) - 1, 0, -1):
if t_requires_grad:
# Compute the effect of moving the current time measurement point.
# We don't compute this unless we need to, to save some computation.
func_eval = func(t[i], y[i])
dLd_cur_t = func_eval.reshape(-1).dot(grad_y[i].reshape(-1))
aug_state[0] -= dLd_cur_t
time_vjps[i] = dLd_cur_t
# Run the augmented system backwards in time.
aug_state = odeint(
augmented_dynamics, tuple(aug_state),
t[i - 1:i + 1].flip(0),
rtol=adjoint_rtol, atol=adjoint_atol, method=adjoint_method, options=adjoint_options
)
aug_state = [a[1] for a in aug_state] # extract just the t[i - 1] value
aug_state[1] = y[i - 1] # update to use our forward-pass estimate of the state
aug_state[2] += grad_y[i - 1] # update any gradients wrt state at this time point
if t_requires_grad:
time_vjps[0] = aug_state[0]
# Only compute gradient wrt initial time when in event handling mode.
if event_mode and t_requires_grad:
time_vjps = torch.cat([time_vjps[0].reshape(-1), torch.zeros_like(_t[1:])])
adj_y = aug_state[2]
adj_params = aug_state[3:]
return (None, None, adj_y, time_vjps, None, None, None, None, None, None, None, None, None, None, *adj_params)
def odeint_adjoint(func, y0, t, *, rtol=1e-7, atol=1e-9, method=None, options=None, event_fn=None,
adjoint_rtol=None, adjoint_atol=None, adjoint_method=None, adjoint_options=None, adjoint_params=None):
# We need this in order to access the variables inside this module,
# since we have no other way of getting variables along the execution path.
if adjoint_params is None and not isinstance(func, nn.Module):
raise ValueError('func must be an instance of nn.Module to specify the adjoint parameters; alternatively they '
'can be specified explicitly via the `adjoint_params` argument. If there are no parameters '
'then it is allowable to set `adjoint_params=()`.')
# Must come before _check_inputs as we don't want to use normalised input (in particular any changes to options)
if adjoint_rtol is None:
adjoint_rtol = rtol
if adjoint_atol is None:
adjoint_atol = atol
if adjoint_method is None:
adjoint_method = method
if adjoint_method != method and options is not None and adjoint_options is None:
raise ValueError("If `adjoint_method != method` then we cannot infer `adjoint_options` from `options`. So as "
"`options` has been passed then `adjoint_options` must be passed as well.")
if adjoint_options is None:
adjoint_options = {k: v for k, v in options.items() if k != "norm"} if options is not None else {}
else:
# Avoid in-place modifying a user-specified dict.
adjoint_options = adjoint_options.copy()
if adjoint_params is None:
adjoint_params = tuple(find_parameters(func))
else:
adjoint_params = tuple(adjoint_params) # in case adjoint_params is a generator.
# Filter params that don't require gradients.
oldlen_ = len(adjoint_params)
adjoint_params = tuple(p for p in adjoint_params if p.requires_grad)
if len(adjoint_params) != oldlen_:
# Some params were excluded.
# Issue a warning if a user-specified norm is specified.
if 'norm' in adjoint_options and callable(adjoint_options['norm']):
warnings.warn("An adjoint parameter was passed without requiring gradient. For efficiency this will be "
"excluded from the adjoint pass, and will not appear as a tensor in the adjoint norm.")
# Convert to flattened state.
shapes, func, y0, t, rtol, atol, method, options, event_fn, decreasing_time = _check_inputs(func, y0, t, rtol, atol, method, options, event_fn, SOLVERS)
# Handle the adjoint norm function.
state_norm = options["norm"]
handle_adjoint_norm_(adjoint_options, shapes, state_norm)
ans = OdeintAdjointMethod.apply(shapes, func, y0, t, rtol, atol, method, options, event_fn, adjoint_rtol, adjoint_atol,
adjoint_method, adjoint_options, t.requires_grad, *adjoint_params)
if event_fn is None:
solution = ans
else:
event_t, solution = ans
event_t = event_t.to(t)
if decreasing_time:
event_t = -event_t
if shapes is not None:
solution = _flat_to_shape(solution, (len(t),), shapes)
if event_fn is None:
return solution
else:
return event_t, solution
def find_parameters(module):
assert isinstance(module, nn.Module)
# If called within DataParallel, parameters won't appear in module.parameters().
if getattr(module, '_is_replica', False):
def find_tensor_attributes(module):
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v) and v.requires_grad]
return tuples
gen = module._named_members(get_members_fn=find_tensor_attributes)
return [param for _, param in gen]
else:
return list(module.parameters())
def handle_adjoint_norm_(adjoint_options, shapes, state_norm):
"""In-place modifies the adjoint options to choose or wrap the norm function."""
# This is the default adjoint norm on the backward pass: a mixed norm over the tuple of inputs.
def default_adjoint_norm(tensor_tuple):
t, y, adj_y, *adj_params = tensor_tuple
# (If the state is actually a flattened tuple then this will be unpacked again in state_norm.)
return max(t.abs(), state_norm(y), state_norm(adj_y), _mixed_norm(adj_params))
if "norm" not in adjoint_options:
# `adjoint_options` was not explicitly specified by the user. Use the default norm.
adjoint_options["norm"] = default_adjoint_norm
else:
# `adjoint_options` was explicitly specified by the user...
try:
adjoint_norm = adjoint_options['norm']
except KeyError:
# ...but they did not specify the norm argument. Back to plan A: use the default norm.
adjoint_options['norm'] = default_adjoint_norm
else:
# ...and they did specify the norm argument.
if adjoint_norm == 'seminorm':
# They told us they want to use seminorms. Slight modification to plan A: use the default norm,
# but ignore the parameter state
def adjoint_seminorm(tensor_tuple):
t, y, adj_y, *adj_params = tensor_tuple
# (If the state is actually a flattened tuple then this will be unpacked again in state_norm.)
return max(t.abs(), state_norm(y), state_norm(adj_y))
adjoint_options['norm'] = adjoint_seminorm
else:
# And they're using their own custom norm.
if shapes is None:
# The state on the forward pass was a tensor, not a tuple. We don't need to do anything, they're
# already going to get given the full adjoint state as (t, y, adj_y, adj_params)
pass # this branch included for clarity
else:
# This is the bit that is tuple/tensor abstraction-breaking, because the odeint machinery
# doesn't know about the tupled nature of the forward state. We need to tell the user's adjoint
# norm about that ourselves.
def _adjoint_norm(tensor_tuple):
t, y, adj_y, *adj_params = tensor_tuple
y = _flat_to_shape(y, (), shapes)
adj_y = _flat_to_shape(adj_y, (), shapes)
return adjoint_norm((t, *y, *adj_y, *adj_params))
adjoint_options['norm'] = _adjoint_norm
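A minimal usage sketch of `odeint_adjoint` as defined above; `func` must be an `nn.Module` (or `adjoint_params` must be passed explicitly) so the adjoint pass can find its parameters:

```
import torch
import torch.nn as nn
from third_party.torchdiffeq import odeint_adjoint

class ODEFunc(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(2, 2)

    def forward(self, t, y):
        return self.net(y)

func = ODEFunc()
y0 = torch.rand(8, 2)
t = torch.linspace(0., 1., 5)
ys = odeint_adjoint(func, y0, t)   # [len(t), 8, 2]
ys[-1].sum().backward()            # memory-efficient grads via the adjoint ODE
```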

@@ -0,0 +1,22 @@
import torch
from .rk_common import _ButcherTableau, RKAdaptiveStepsizeODESolver
_BOGACKI_SHAMPINE_TABLEAU = _ButcherTableau(
alpha=torch.tensor([1 / 2, 3 / 4, 1.], dtype=torch.float64),
beta=[
torch.tensor([1 / 2], dtype=torch.float64),
torch.tensor([0., 3 / 4], dtype=torch.float64),
torch.tensor([2 / 9, 1 / 3, 4 / 9], dtype=torch.float64)
],
c_sol=torch.tensor([2 / 9, 1 / 3, 4 / 9, 0.], dtype=torch.float64),
c_error=torch.tensor([2 / 9 - 7 / 24, 1 / 3 - 1 / 4, 4 / 9 - 1 / 3, -1 / 8], dtype=torch.float64),
)
_BS_C_MID = torch.tensor([0., 0.5, 0., 0.], dtype=torch.float64)
class Bosh3Solver(RKAdaptiveStepsizeODESolver):
order = 3
tableau = _BOGACKI_SHAMPINE_TABLEAU
mid = _BS_C_MID

Some files were not shown because too many files have changed in this diff.