#include #include #include #include #include __global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){ const int batch=2048; __shared__ float buf[batch*6]; for (int i=blockIdx.x;ibest){ result[(i*n+j)]=best; result_i[(i*n+j)]=best_i; } } __syncthreads(); } } } // int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){ int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){ const auto batch_size = xyz1.size(0); const auto n = xyz1.size(1); //num_points point cloud A const auto m = xyz2.size(1); //num_points point cloud B NmDistanceKernel<<>>(batch_size, n, xyz1.data(), m, xyz2.data(), dist1.data(), idx1.data()); NmDistanceKernel<<>>(batch_size, m, xyz2.data(), n, xyz1.data(), dist2.data(), idx2.data()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err)); //THError("aborting"); return 0; } return 1; } __global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){ for (int i=blockIdx.x;i>>(batch_size,n,xyz1.data(),m,xyz2.data(),graddist1.data(),idx1.data(),gradxyz1.data(),gradxyz2.data()); NmDistanceGradKernel<<>>(batch_size,m,xyz2.data(),n,xyz1.data(),graddist2.data(),idx2.data(),gradxyz2.data(),gradxyz1.data()); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in nnd get grad: %s\n", cudaGetErrorString(err)); //THError("aborting"); return 0; } return 1; }