# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.

import torch
from loguru import logger
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from .pvcnn2_ada import (
    create_pointnet2_sa_components, create_pointnet2_fp_modules,
    LinearAttention, create_mlp_components, SharedMLP,
)

# Building blocks of the encoder and decoder for the VAE.

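# PVCNN2Unet below is the shared point-voxel U-Net backbone: PointTransPVC
# wraps it as the VAE encoder (built with embed_dim=0, hence no time
# embedding) and LatentPointDecPVC wraps it as the decoder.
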
class PVCNN2Unet(nn.Module):
    """
    Copied and modified from
    https://github.com/alexzhou907/PVD/blob/9747265a5f141e5546fd4f862bfa66aa59f1bd33/model/pvcnn_generation.py#L172
    """

    def __init__(self,
                 num_classes, embed_dim, use_att, dropout=0.1,
                 extra_feature_channels=3,
                 input_dim=3,
                 width_multiplier=1,
                 voxel_resolution_multiplier=1,
                 time_emb_scales=1.0,
                 verbose=True,
                 condition_input=False,
                 point_as_feat=1, cfg={},
                 sa_blocks={}, fp_blocks={},
                 clip_forge_enable=0,
                 clip_forge_dim=512
                 ):
        super().__init__()
        logger.info('[Build Unet] extra_feature_channels={}, input_dim={}',
                    extra_feature_channels, input_dim)
        self.input_dim = input_dim

        self.clip_forge_enable = clip_forge_enable
        self.sa_blocks = sa_blocks
        self.fp_blocks = fp_blocks
        self.point_as_feat = point_as_feat
        self.condition_input = condition_input
        assert extra_feature_channels >= 0
        self.time_emb_scales = time_emb_scales
        self.embed_dim = embed_dim
        if self.embed_dim > 0:  # has time embedding
            # The prior model uses a time embedding; the VAE model does not
            # (it is built with embed_dim == 0).
            self.embedf = nn.Sequential(
                nn.Linear(embed_dim, embed_dim),
                nn.LeakyReLU(0.1, inplace=True),
                nn.Linear(embed_dim, embed_dim),
            )

        if self.clip_forge_enable:
            self.clip_forge_mapping = nn.Linear(clip_forge_dim, embed_dim)
            style_dim = cfg.latent_pts.style_dim
            self.style_clip = nn.Linear(style_dim + embed_dim, style_dim)

        self.in_channels = extra_feature_channels + 3

        sa_layers, sa_in_channels, channels_sa_features, _ = \
            create_pointnet2_sa_components(
                input_dim=input_dim,
                sa_blocks=self.sa_blocks,
                extra_feature_channels=extra_feature_channels,
                with_se=True,
                embed_dim=embed_dim,  # time embedding dim
                use_att=use_att, dropout=dropout,
                width_multiplier=width_multiplier,
                voxel_resolution_multiplier=voxel_resolution_multiplier,
                verbose=verbose, cfg=cfg
            )
        self.sa_layers = nn.ModuleList(sa_layers)

        self.global_att = None if not use_att else LinearAttention(channels_sa_features, 8, verbose=verbose)

        # Only use extra features in the last fp module.
        sa_in_channels[0] = extra_feature_channels + input_dim - 3
        fp_layers, channels_fp_features = create_pointnet2_fp_modules(
            fp_blocks=self.fp_blocks, in_channels=channels_sa_features,
            sa_in_channels=sa_in_channels,
            with_se=True, embed_dim=embed_dim,
            use_att=use_att, dropout=dropout,
            width_multiplier=width_multiplier, voxel_resolution_multiplier=voxel_resolution_multiplier,
            verbose=verbose, cfg=cfg
        )
        self.fp_layers = nn.ModuleList(fp_layers)

        layers, _ = create_mlp_components(
            in_channels=channels_fp_features,
            out_channels=[128, dropout, num_classes],  # dropout entry was 0.5
            classifier=True, dim=2, width_multiplier=width_multiplier,
            cfg=cfg)
        self.classifier = nn.ModuleList(layers)

    def get_timestep_embedding(self, timesteps, device):
        if len(timesteps.shape) == 2 and timesteps.shape[1] == 1:
            timesteps = timesteps[:, 0]
        assert len(timesteps.shape) == 1, f'get shape: {timesteps.shape}'
        timesteps = timesteps * self.time_emb_scales

        half_dim = self.embed_dim // 2
        emb = np.log(10000) / (half_dim - 1)
        emb = torch.from_numpy(np.exp(np.arange(0, half_dim) * -emb)).float().to(device)
        emb = timesteps[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if self.embed_dim % 2 == 1:  # zero pad
            emb = F.pad(emb, (0, 1), "constant", 0)
        assert emb.shape == torch.Size([timesteps.shape[0], self.embed_dim])
        return emb
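
    # The embedding above is the standard transformer sinusoidal scheme: with
    # w_j = exp(-j * log(10000) / (half_dim - 1)), row i is
    # [sin(t_i * w_0), ..., sin(t_i * w_{half_dim-1}),
    #  cos(t_i * w_0), ..., cos(t_i * w_{half_dim-1})],
    # zero-padded by one column when embed_dim is odd.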

    def forward(self, inputs, **kwargs):
        # Input: coords: (B, 3, N).
        B = inputs.shape[0]
        coords = inputs[:, :self.input_dim, :].contiguous()
        features = inputs
        temb = kwargs.get('t', None)
        if temb is not None:
            t = temb
            if t.ndim == 0:  # broadcast a scalar timestep over the batch
                t = t.view(1).expand(B)
            temb = self.embedf(self.get_timestep_embedding(t, inputs.device))[:, :, None].expand(-1, -1, inputs.shape[-1])
        temb_ori = temb  # (B, embed_dim, Npoint)

        style = kwargs['style']
        if self.clip_forge_enable:
            clip_feat = kwargs['clip_feat']
            assert clip_feat is not None, 'require clip_feat as input'
            clip_feat = self.clip_forge_mapping(clip_feat)
            style = torch.cat([style, clip_feat], dim=1).contiguous()
            style = self.style_clip(style)

        coords_list, in_features_list = [], []
        for i, sa_blocks in enumerate(self.sa_layers):
            in_features_list.append(features)
            coords_list.append(coords)
            if i > 0 and temb is not None:
                # TODO: implement a sa_blocks forward function; check whether it
                # is a PVConv layer and, if kwargs contains grid_emb, take it as
                # an additional input.
                features = torch.cat([features, temb], dim=1)
            features, coords, temb, _ = sa_blocks((features, coords, temb, style))

        # Only the extra (non-coordinate) input channels feed the last fp module.
        in_features_list[0] = inputs[:, 3:, :].contiguous()
        if self.global_att is not None:
            features = self.global_att(features)
        for fp_idx, fp_blocks in enumerate(self.fp_layers):
            if temb is not None:
                features, coords, temb, _ = fp_blocks((
                    coords_list[-1 - fp_idx], coords,
                    torch.cat([features, temb], dim=1),
                    in_features_list[-1 - fp_idx], temb, style))
            else:
                features, coords, temb, _ = fp_blocks((
                    coords_list[-1 - fp_idx], coords,
                    features,
                    in_features_list[-1 - fp_idx], temb, style))

        for layer in self.classifier:
            if isinstance(layer, SharedMLP):
                features = layer(features, style)
            else:
                features = layer(features)
        return features
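

# Illustrative call pattern (a sketch, not part of the original API): the
# wrappers below drive this U-Net roughly as
#   out = unet(inputs, style=style)        # VAE use, embed_dim == 0
#   out = unet(inputs, t=t, style=style)   # prior use, with time embedding
# where inputs is (B, C, N) with the first input_dim channels holding
# coordinates, style is (B, style_dim), and out is (B, num_classes, N).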


class PointTransPVC(nn.Module):
    # Encoder: (B, N, 3) -> (B, N, 2*D).
    sa_blocks = [  # conv_configs, sa_configs
        ((32, 2, 32), (1024, 0.1, 32, (32, 64))),
        ((64, 3, 16), (256, 0.2, 32, (64, 128))),
        ((128, 3, 8), (64, 0.4, 32, (128, 256))),
        (None, (16, 0.8, 32, (128, 128, 128))),
    ]
    fp_blocks = [  # fp_configs, conv_configs
        ((128, 128), (128, 3, 8)),
        ((128, 128), (128, 3, 8)),
        ((128, 128), (128, 2, 16)),
        ((128, 128, 64), (64, 2, 32)),
    ]
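
    # Interpretive note (not in the original file): following the PVCNN block
    # convention, each conv_configs tuple reads (out_channels, num_blocks,
    # voxel_resolution) and each sa_configs tuple reads (num_centers, radius,
    # num_neighbors, out_channels). LatentPointDecPVC below reuses the same
    # block layout.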

    def __init__(self, zdim, input_dim, args={}):
        super().__init__()
        self.zdim = zdim
        # Output channels: (mu, sigma) for the input_dim point offsets plus the
        # zdim per-point features.
        self.layers = PVCNN2Unet(2 * zdim + input_dim * 2,
                                 embed_dim=0, use_att=1, extra_feature_channels=0,
                                 input_dim=args.ddpm.input_dim, cfg=args,
                                 sa_blocks=self.sa_blocks, fp_blocks=self.fp_blocks,
                                 dropout=args.ddpm.dropout)
        self.skip_weight = args.latent_pts.skip_weight
        self.pts_sigma_offset = args.latent_pts.pts_sigma_offset
        self.input_dim = input_dim

    def forward(self, inputs):
        x, style = inputs
        B, N, D = x.shape
        output = self.layers(x.permute(0, 2, 1).contiguous(), style=style).permute(0, 2, 1).contiguous()  # (B, N, D_out)

        pt_mu_1d = output[:, :, :self.input_dim].contiguous()
        pt_sigma_1d = output[:, :, self.input_dim:2 * self.input_dim].contiguous() - self.pts_sigma_offset

        # Residual connection on the predicted point means.
        pt_mu_1d = self.skip_weight * pt_mu_1d + x
        if self.zdim > 0:
            ft_mu_1d = output[:, :, 2 * self.input_dim:-self.zdim].contiguous()
            ft_sigma_1d = output[:, :, -self.zdim:].contiguous()

            mu_1d = torch.cat([pt_mu_1d, ft_mu_1d], dim=2).view(B, -1).contiguous()
            sigma_1d = torch.cat([pt_sigma_1d, ft_sigma_1d], dim=2).view(B, -1).contiguous()
        else:
            mu_1d = pt_mu_1d.view(B, -1).contiguous()
            sigma_1d = pt_sigma_1d.view(B, -1).contiguous()
        return {'mu_1d': mu_1d, 'sigma_1d': sigma_1d}
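

# A minimal sampling sketch for the encoder output above (an assumption, for
# illustration only: downstream code decides how sigma_1d is interpreted; a
# common choice is to treat it as a log-variance):
def _reparameterize_sketch(mu_1d, sigma_1d):
    # Standard VAE reparameterization: z = mu + std * eps.
    return mu_1d + torch.exp(0.5 * sigma_1d) * torch.randn_like(mu_1d)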


class LatentPointDecPVC(nn.Module):
    """ Decoder: takes latent points [B, Npoint, D], whose first point_dim
    channels are coordinates [B, Npoint, 3], and predicts the output points.
    """
    sa_blocks = [  # conv_configs, sa_configs
        ((32, 2, 32), (1024, 0.1, 32, (32, 64))),
        ((64, 3, 16), (256, 0.2, 32, (64, 128))),
        ((128, 3, 8), (64, 0.4, 32, (128, 256))),
        (None, (16, 0.8, 32, (128, 128, 128))),
    ]
    fp_blocks = [  # fp_configs, conv_configs
        ((128, 128), (128, 3, 8)),
        ((128, 128), (128, 3, 8)),
        ((128, 128), (128, 2, 16)),
        ((128, 128, 64), (64, 2, 32)),
    ]

    def __init__(self, point_dim, context_dim, num_points=None, args={}, **kwargs):
        super().__init__()
        self.point_dim = point_dim
        logger.info('[Build Dec] point_dim={}, context_dim={}', point_dim, context_dim)
        self.context_dim = context_dim + self.point_dim
        if num_points is None:
            self.num_points = args.data.tr_max_sample_points
        else:
            self.num_points = num_points
        self.layers = PVCNN2Unet(point_dim, embed_dim=0, use_att=1,
                                 extra_feature_channels=context_dim,
                                 input_dim=args.ddpm.input_dim, cfg=args,
                                 sa_blocks=self.sa_blocks, fp_blocks=self.fp_blocks,
                                 dropout=args.ddpm.dropout)
        self.skip_weight = args.latent_pts.skip_weight

    def forward(self, x, beta, context, style):
        """
        Args:
            x: Point clouds at some timestep t, (B, N, d). [not used]
            beta: Time. (B,). [not used]
            context: Latent points, (B, N_pts * D_latent_pts), with
                D_latent_pts = D_input + D_extra.
            style: Shape latents, (B, d).
        Returns:
            points: (B, N, 3)
        """
        assert context.shape[1] == self.num_points * self.context_dim
        context = context.view(-1, self.num_points, self.context_dim)  # (B, N, D)
        # The first point_dim channels of the latent points serve as the skip
        # connection for the predicted points.
        x = context[:, :, :self.point_dim]
        output = self.layers(context.permute(0, 2, 1).contiguous(), style=style).permute(0, 2, 1).contiguous()  # (B, N, 3)
        output = output * self.skip_weight + x
        return output
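

# Illustrative decoder call (a sketch; `args` must provide the ddpm/latent_pts
# fields consumed above, and the names z, s, D here are for illustration only):
#   dec = LatentPointDecPVC(point_dim=3, context_dim=D, args=args)
#   pts = dec(x=None, beta=None, context=z, style=s)
# with z of shape (B, num_points * (3 + D)) packing per-point [xyz, feature]
# vectors and pts of shape (B, num_points, 3); x and beta are unused.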