LION/models/pvcnn2_ada.py

# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
"""
copied and modified from source:
    https://github.com/alexzhou907/PVD/blob/9747265a5f141e5546fd4f862bfa66aa59f1bd33/model/pvcnn_generation.py
    and functions under
    https://github.com/alexzhou907/PVD/tree/9747265a5f141e5546fd4f862bfa66aa59f1bd33/modules
"""
import copy
import functools
from loguru import logger
from einops import rearrange
import torch.nn as nn
import torch
import numpy as np
import third_party.pvcnn.functional as F
# from utils.checker import *
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
from .adagn import AdaGN
import os
quiet = int(os.environ.get('quiet', 0))
class SE3d(nn.Module):
    def __init__(self, channel, reduction=8):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

        self.channel = channel
    def __repr__(self):
        return f"SE({self.channel}, {self.channel})"
    def forward(self, inputs):
        return inputs * self.fc(inputs.mean(-1).mean(-1).mean(-1)).view(inputs.shape[0], inputs.shape[1], 1, 1, 1)

class LinearAttention(nn.Module):
    """
    copied and modified from https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L159
    """
    def __init__(self, dim, heads = 4, dim_head = 32, verbose=True):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        '''
        Args:
            x: torch.tensor (B,C,N), C=num-channels, N=num-points
        Returns:
            out: torch.tensor (B,C,N)
        '''
        x = x.unsqueeze(-1) # add w dimension
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        out = torch.einsum('bhde,bhdn->bhen', context, q)
        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
        out = self.to_out(out)
        out = out.squeeze(-1) # B,C,N,1 -> B,C,N
        return out


def swish(input):
    return input * torch.sigmoid(input)


class Swish(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return swish(input)


class BallQuery(nn.Module):
    def __init__(self, radius, num_neighbors, include_coordinates=True):
        super().__init__()
        self.radius = radius
        self.num_neighbors = num_neighbors
        self.include_coordinates = include_coordinates

    @custom_bwd
    def backward(self, *args, **kwargs):
        return super().backward(*args, **kwargs)

    @custom_fwd(cast_inputs=torch.float32)
    def forward(self, points_coords, centers_coords, points_features=None):
        # input: BCN, BCN
        # neighbor_features: B,D(+3),Ncenter
        points_coords = points_coords.contiguous()
        centers_coords = centers_coords.contiguous()
        neighbor_indices = F.ball_query(centers_coords, points_coords, self.radius, self.num_neighbors)
        neighbor_coordinates = F.grouping(points_coords, neighbor_indices)
        neighbor_coordinates = neighbor_coordinates - centers_coords.unsqueeze(-1)

        if points_features is None:
            assert self.include_coordinates, 'No Features For Grouping'
            neighbor_features = neighbor_coordinates
        else:
            neighbor_features = F.grouping(points_features, neighbor_indices)
            if self.include_coordinates:
                neighbor_features = torch.cat([neighbor_coordinates, neighbor_features], dim=1)
        return neighbor_features

    def extra_repr(self):
        return 'radius={}, num_neighbors={}{}'.format(
            self.radius, self.num_neighbors, ', include coordinates' if self.include_coordinates else '')

class SharedMLP(nn.Module):
    def __init__(self, in_channels, out_channels, dim=1, cfg={}):

        assert(len(cfg) > 0), cfg
        super().__init__()
        if dim==1:
            conv = nn.Conv1d
        else:
            conv = nn.Conv2d
        bn = functools.partial(AdaGN, dim, cfg)
        if not isinstance(out_channels, (list, tuple)):
            out_channels = [out_channels]
        layers = []
        for oc in out_channels:
            layers.append(conv(in_channels, oc, 1))
            layers.append(bn(oc))
            layers.append(Swish())
            in_channels = oc
        self.layers = nn.ModuleList(layers)

    def forward(self, *inputs):
        if len(inputs) == 1 and len(inputs[0]) == 4:
            # try to fix thwn SharedMLP is the first layer
            inputs = inputs[0]
        if len(inputs) == 1:
            raise NotImplementedError
        elif len(inputs) == 4:
            assert(len(inputs) == 4), 'input, style'
            x, _, _, style = inputs
            for l in self.layers:
                if isinstance(l, AdaGN):
                    x = l(x, style)
                else:
                    x = l(x)
            return (x, *inputs[1:])
        elif len(inputs) == 2:
            x, style = inputs
            for l in self.layers:
                if isinstance(l, AdaGN):
                    x = l(x, style)
                else:
                    x = l(x)
            return x
        else:
            raise NotImplementedError

class Voxelization(nn.Module):
    def __init__(self, resolution, normalize=True, eps=0):
        super().__init__()
        self.r = int(resolution)
        self.normalize = normalize
        self.eps = eps

    def forward(self, features, coords):
        # features: B,D,N
        # coords:   B,3,N
        coords = coords.detach()
        norm_coords = coords - coords.mean(2, keepdim=True)
        if self.normalize:
            norm_coords = norm_coords / (norm_coords.norm(
                dim=1, keepdim=True).max(dim=2, keepdim=True).values * 2.0 +
                                         self.eps) + 0.5
        else:
            norm_coords = (norm_coords + 1) / 2.0
        norm_coords = torch.clamp(norm_coords * self.r, 0, self.r - 1)
        vox_coords = torch.round(norm_coords).to(torch.int32)
        if features is None:
            return features, norm_coords
        return F.avg_voxelize(features, vox_coords, self.r), norm_coords

    def extra_repr(self):
        return 'resolution={}{}'.format(
            self.r,
            ', normalized eps = {}'.format(self.eps) if self.normalize else '')

class PVConv(nn.Module):
    def __init__(self, in_channels, out_channels,
        kernel_size, resolution,
        normalize=1, eps=0, with_se=False,
        add_point_feat=True, attention=False,
        dropout=0.1, verbose=True,
        cfg={}
        ):
        super().__init__()
        assert(len(cfg) > 0), cfg
        self.resolution = resolution
        self.voxelization = Voxelization(resolution,
                                         normalize=normalize,
                                         eps=eps)
        # For each PVConv we use (Conv3d, GroupNorm(8), Swish, dropout, Conv3d, GroupNorm(8), Attention)
        NormLayer = functools.partial(AdaGN, 3, cfg)
        voxel_layers = [
            nn.Conv3d(in_channels ,
                      out_channels,
                      kernel_size, stride=1,
                      padding=kernel_size // 2),
            NormLayer(out_channels),
            Swish(),
            nn.Dropout(dropout),
            nn.Conv3d(out_channels, out_channels,
                        kernel_size, stride=1,
                        padding=kernel_size // 2),
            NormLayer(out_channels)
            ]
        if with_se:
            voxel_layers.append(SE3d(out_channels))
        self.voxel_layers = nn.ModuleList(voxel_layers)
        if attention:
            self.attn = LinearAttention(out_channels, verbose=verbose)
        else:
            self.attn = None
        if add_point_feat:
            self.point_features = SharedMLP(in_channels, out_channels, cfg=cfg)
        self.add_point_feat = add_point_feat

    def forward(self, inputs):
        '''
        Args:
            inputs: tuple of features and coords
                features: B,feat-dim,num-points
                coords:   B,3, num-points
                time_emd: B,D; time embedding
                style:    B,D; global latent
        Returns:
            fused_features: in (B,out-feat-dim,num-points)
            coords        : in (B, 3 or 6, num_points); same as the input coords
        '''
        features    = inputs[0]
        coords_input= inputs[1]
        time_emb    = inputs[2]
        style       = inputs[3]
        if coords_input.shape[1] > 3:
            coords = coords_input[:,:3]
        else:
            coords = coords_input
        assert (features.shape[0] == coords.shape[0]
                ), f'get feat: {features.shape} and {coords.shape}'
        assert (features.shape[2] == coords.shape[2]
                ), f'get feat: {features.shape} and {coords.shape}'
        assert (coords.shape[1] == 3
                ), f'expect coords: B,3,Npoint, get: {coords.shape}'
        # features: B,D,N; point_features
        # coords:   B,3,N
        voxel_features_4d, voxel_coords = self.voxelization(features, coords)
        r = self.resolution
        B = coords.shape[0]

        for voxel_layers in self.voxel_layers:
            if isinstance(voxel_layers, AdaGN):
                voxel_features_4d = voxel_layers(voxel_features_4d, style)
            else:
                voxel_features_4d = voxel_layers(voxel_features_4d)
        voxel_features = F.trilinear_devoxelize(voxel_features_4d, voxel_coords,
                                                r, self.training)

        fused_features = voxel_features
        if self.add_point_feat:
            fused_features = fused_features + self.point_features(features, style)
        if self.attn is not None:
            fused_features = self.attn(fused_features)
        return fused_features, coords_input, time_emb, style


class PointNetAModule(nn.Module):
    def __init__(self, in_channels, out_channels, include_coordinates=True, cfg={}):
        super().__init__()
        if not isinstance(out_channels, (list, tuple)):
            out_channels = [[out_channels]]
        elif not isinstance(out_channels[0], (list, tuple)):
            out_channels = [out_channels]

        mlps = []
        total_out_channels = 0
        for _out_channels in out_channels:
            mlps.append(
                SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0),
                          out_channels=_out_channels, dim=1, cfg=cfg)
            )
            total_out_channels += _out_channels[-1]

        self.include_coordinates = include_coordinates
        self.out_channels = total_out_channels
        self.mlps = nn.ModuleList(mlps)

    def forward(self, inputs):
        features, coords, time_emb, style = inputs
        if self.include_coordinates:
            features = torch.cat([features, coords], dim=1)
        coords = torch.zeros((coords.size(0), 3, 1), device=coords.device)
        if len(self.mlps) > 1:
            features_list = []
            for mlp in self.mlps:
                features_list.append(mlp(features, style).max(dim=-1, keepdim=True).values)
            return torch.cat(features_list, dim=1), coords, time_emb
        else:
            return self.mlps[0](features, style).max(dim=-1, keepdim=True).values, coords, time_emb

    def extra_repr(self):
        return f'out_channels={self.out_channels}, include_coordinates={self.include_coordinates}'


class PointNetSAModule(nn.Module):
    def __init__(self, num_centers, radius, num_neighbors, in_channels, out_channels, include_coordinates=True,
            cfg={}):
        super().__init__()
        if not isinstance(radius, (list, tuple)):
            radius = [radius]
        if not isinstance(num_neighbors, (list, tuple)):
            num_neighbors = [num_neighbors] * len(radius)
        assert len(radius) == len(num_neighbors)
        if not isinstance(out_channels, (list, tuple)):
            out_channels = [[out_channels]] * len(radius)
        elif not isinstance(out_channels[0], (list, tuple)):
            out_channels = [out_channels] * len(radius)
        assert len(radius) == len(out_channels)

        groupers, mlps = [], []
        total_out_channels = 0
        for _radius, _out_channels, _num_neighbors in zip(radius, out_channels, num_neighbors):
            groupers.append(
                BallQuery(radius=_radius, num_neighbors=_num_neighbors,
                    include_coordinates=include_coordinates)
            )
            mlps.append(
                SharedMLP(in_channels=in_channels + (3 if include_coordinates else 0),
                          out_channels=_out_channels, dim=2, cfg=cfg)
            )
            total_out_channels += _out_channels[-1]

        self.num_centers = num_centers
        self.out_channels = total_out_channels
        self.groupers = nn.ModuleList(groupers)
        self.mlps = nn.ModuleList(mlps)

    def forward(self, inputs):
        features = inputs[0]
        coords = inputs[1]  # B3N
        style = inputs[3]
        if coords.shape[1] > 3:
            coords = coords[:,:3]

        centers_coords = F.furthest_point_sample(coords, self.num_centers)
        # centers_coords: B,D,N
        S = centers_coords.shape[-1]
        time_emb = inputs[2]
        time_emb = time_emb[:,:,:S] if \
            time_emb is not None and type(time_emb) is not dict \
            else time_emb

        features_list = []
        c = 0
        for grouper, mlp in zip(self.groupers, self.mlps):
            c += 1
            grouper_output = grouper(coords, centers_coords, features )
            features_list.append(
                    mlp(grouper_output, style
                        ).max(dim=-1).values
                    )

        if len(features_list) > 1:
            return torch.cat(features_list, dim=1), centers_coords, time_emb, style
        else:
            return features_list[0], centers_coords, time_emb, style

    def extra_repr(self):
        return f'num_centers={self.num_centers}, out_channels={self.out_channels}'


class PointNetFPModule(nn.Module):
    def __init__(self, in_channels, out_channels, cfg={}):
        super().__init__()
        self.mlp = SharedMLP(in_channels=in_channels, out_channels=out_channels, dim=1, cfg=cfg)

    def forward(self, inputs):
        if len(inputs) == 5:
            points_coords, centers_coords, centers_features, time_emb, style = inputs
            points_features = None
        elif len(inputs) == 6:
            points_coords, centers_coords, centers_features, points_features, time_emb, style = inputs
        else:
            raise NotImplementedError

        interpolated_features = F.nearest_neighbor_interpolate(points_coords, centers_coords, centers_features)
        if points_features is not None:
            interpolated_features = torch.cat(
                [interpolated_features, points_features], dim=1
            )
        if time_emb is not None:
            B,D,S = time_emb.shape
            N = points_coords.shape[-1]
            time_emb = time_emb[:,:,0:1].expand(-1,-1,N)
        return self.mlp(interpolated_features, style), points_coords, time_emb, style

def _linear_gn_relu(in_channels, out_channels):
    return nn.Sequential(nn.Linear(in_channels, out_channels), nn.GroupNorm(8,out_channels), Swish())

def create_mlp_components(in_channels, out_channels, classifier=False, dim=2, width_multiplier=1, cfg={}):
    r = width_multiplier

    if dim == 1:
        block = _linear_gn_relu
    else:
        block = SharedMLP
    if not isinstance(out_channels, (list, tuple)):
        out_channels = [out_channels]
    if len(out_channels) == 0 or (len(out_channels) == 1 and out_channels[0] is None):
        return nn.Sequential(), in_channels, in_channels

    layers = []
    for oc in out_channels[:-1]:
        if oc < 1:
            layers.append(nn.Dropout(oc))
        else:
            oc = int(r * oc)
            layers.append(block(in_channels, oc, cfg=cfg))
            in_channels = oc
    if dim == 1:
        if classifier:
            layers.append(nn.Linear(in_channels, out_channels[-1]))
        else:
            layers.append(_linear_gn_relu(in_channels, int(r * out_channels[-1])))
    else:
        if classifier:
            layers.append(nn.Conv1d(in_channels, out_channels[-1], 1))
        else:
            layers.append(SharedMLP(in_channels, int(r * out_channels[-1])))
    return layers, out_channels[-1] if classifier else int(r * out_channels[-1])

def create_pointnet2_sa_components(sa_blocks, extra_feature_channels,
        input_dim=3,
        embed_dim=64, use_att=False, force_att=0,
        dropout=0.1, with_se=False, normalize=True, eps=0, has_temb=1,
        width_multiplier=1, voxel_resolution_multiplier=1, verbose=True,
        cfg={}):
    """
    Returns:
        in_channels: the last output channels of the sa blocks
    """
    assert(len(cfg) > 0), cfg
    r, vr = width_multiplier, voxel_resolution_multiplier
    in_channels = extra_feature_channels + input_dim

    sa_layers, sa_in_channels = [], []
    c = 0
    num_centers = None
    for conv_configs, sa_configs in sa_blocks:
        k = 0
        sa_in_channels.append(in_channels)
        sa_blocks = []
        if conv_configs is not None:
            out_channels, num_blocks, voxel_resolution = conv_configs
            out_channels = int(r * out_channels)
            for p in range(num_blocks):
                attention = ( (c+1) % 2 == 0 and use_att and p == 0 ) or (force_att and c > 0)
                if voxel_resolution is None:
                    block = SharedMLP
                else:
                    block = functools.partial(
                        PVConv, kernel_size=3,
                        resolution=int(vr * voxel_resolution), attention=attention,
                        dropout=dropout,
                        with_se=with_se, # with_se_relu=True,
                        normalize=normalize, eps=eps, verbose=verbose, cfg=cfg)

                if c == 0:
                    sa_blocks.append(block(in_channels, out_channels, cfg=cfg))
                elif k ==0:
                    sa_blocks.append(block(in_channels+embed_dim*has_temb, out_channels, cfg=cfg))
                in_channels = out_channels
                k += 1
            extra_feature_channels = in_channels
        if sa_configs is not None:
            num_centers, radius, num_neighbors, out_channels = sa_configs
            _out_channels = []
            for oc in out_channels:
                if isinstance(oc, (list, tuple)):
                    _out_channels.append([int(r * _oc) for _oc in oc])
                else:
                    _out_channels.append(int(r * oc))
            out_channels = _out_channels
            if num_centers is None:
                block = PointNetAModule
            else:
                block = functools.partial(PointNetSAModule, num_centers=num_centers, radius=radius,
                                          num_neighbors=num_neighbors)
            sa_blocks.append(block(cfg=cfg,
                in_channels=extra_feature_channels+(embed_dim*has_temb if k==0 else 0 ),
                out_channels=out_channels,
                include_coordinates=True))
            in_channels = extra_feature_channels = sa_blocks[-1].out_channels
        c += 1

        if len(sa_blocks) == 1:
            sa_layers.append(sa_blocks[0])
        else:
            sa_layers.append(nn.Sequential(*sa_blocks))

    return sa_layers, sa_in_channels, in_channels, 1 if num_centers is None else num_centers


def create_pointnet2_fp_modules(fp_blocks, in_channels, sa_in_channels, embed_dim=64, use_att=False,
                                dropout=0.1, has_temb=1,
                                with_se=False, normalize=True, eps=0,
                                width_multiplier=1, voxel_resolution_multiplier=1,
                                verbose=True, cfg={}):
    assert(len(cfg) > 0), cfg
    r, vr = width_multiplier, voxel_resolution_multiplier

    fp_layers = []
    c = 0

    for fp_idx, (fp_configs, conv_configs) in enumerate(fp_blocks):
        fp_blocks = []
        out_channels = tuple(int(r * oc) for oc in fp_configs)
        fp_blocks.append(
            PointNetFPModule(
                in_channels=in_channels + sa_in_channels[-1 - fp_idx] + embed_dim*has_temb,
                out_channels=out_channels,
                cfg=cfg)
        )
        in_channels = out_channels[-1]

        if conv_configs is not None:
            out_channels, num_blocks, voxel_resolution = conv_configs
            out_channels = int(r * out_channels)
            for p in range(num_blocks):
                attention = (c+1) % 2 == 0 and c < len(fp_blocks) - 1 and use_att and p == 0
                if voxel_resolution is None:
                    block = functools.partial(SharedMLP, cfg=cfg)
                else:
                    block = functools.partial(PVConv, kernel_size=3,
                            resolution=int(vr * voxel_resolution), attention=attention,
                            dropout=dropout,
                            with_se=with_se, # with_se_relu=True,
                            normalize=normalize, eps=eps,
                            verbose=verbose,
                            cfg=cfg)

                fp_blocks.append(block(in_channels, out_channels))
                in_channels = out_channels
        if len(fp_blocks) == 1:
            fp_layers.append(fp_blocks[0])
        else:
            fp_layers.append(nn.Sequential(*fp_blocks))

        c += 1

    return fp_layers, in_channels