// ----------------------------------------------------------------------------
// -                      CloudViewer: www.cloudViewer.org                    -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#include <cub/cub.cuh>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/core/ParallelFor.h"
#include "cloudViewer/core/Tensor.h"
#include "cloudViewer/t/pipelines/kernel/RegistrationImpl.h"
#include "cloudViewer/t/pipelines/kernel/TransformationConverter.h"
#include "cloudViewer/t/pipelines/registration/RobustKernel.h"
#include "cloudViewer/t/pipelines/registration/RobustKernelImpl.h"
#include "cloudViewer/utility/MiniVec.h"

namespace cloudViewer {
namespace t {
namespace pipelines {
namespace kernel {

const int kThread1DUnit = 256;
const int kReduceDim = 29;  // 21 (JtJ) + 6 (Jtr) + 1 (residual) + 1 (inlier count)
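
// Layout of the 29-wide reduction vector accumulated per correspondence:
//   [0, 20]  the 21 unique entries of the symmetric 6x6 JtJ,
//   [21, 26] Jtr,
//   [27]     sum of squared residuals,
//   [28]     inlier count.
// Each block reduces its threads' vectors with cub::BlockReduce and thread 0
// atomically adds the block result into global_sum; DecodeAndSolve6x6 later
// unpacks this buffer and solves the 6x6 linear system.

// Accumulates the point-to-plane JtJ / Jtr terms for every valid
// correspondence, weighting each residual with the dispatched robust kernel.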
template <typename scalar_t, typename func_t>
__global__ void ComputePosePointToPlaneKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum,
        func_t GetWeightFromRobustKernel) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_ij[6] = {0};
        scalar_t r = 0;

        const bool valid = GetJacobianPointToPlane<scalar_t>(
                workload_idx, source_points_ptr, target_points_ptr,
                target_normals_ptr, correspondence_indices, J_ij, r);

        if (valid) {
            const scalar_t w = GetWeightFromRobustKernel(r);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] += J_ij[j] * w * J_ij[k];
                    ++i;
                }
                local_sum[21 + j] += J_ij[j] * w * r;
            }
            local_sum[27] += r * r;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

void ComputePosePointToPlaneCUDA(const core::Tensor &source_points,
                                 const core::Tensor &target_points,
                                 const core::Tensor &target_normals,
                                 const core::Tensor &correspondence_indices,
                                 core::Tensor &pose,
                                 float &residual,
                                 int &inlier_count,
                                 const core::Dtype &dtype,
                                 const core::Device &device,
                                 const registration::RobustKernel &kernel) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    ComputePosePointToPlaneKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(), n,
                            global_sum_ptr, GetWeightFromRobustKernel);
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}
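
// Accumulates the Colored ICP terms: a geometric (point-to-plane) residual
// r_G and a photometric (color) residual r_I per correspondence. Both share
// the same robust kernel; the sqrt(lambda) weights are applied via
// GetJacobianColoredICP.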
template <typename scalar_t, typename funct_t>
__global__ void ComputePoseColoredICPKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *source_colors_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const scalar_t *target_colors_ptr,
        const scalar_t *target_color_gradients_ptr,
        const int64_t *correspondence_indices,
        const scalar_t sqrt_lambda_geometric,
        const scalar_t sqrt_lambda_photometric,
        const int n,
        scalar_t *global_sum,
        funct_t GetWeightFromRobustKernel) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_G[6] = {0}, J_I[6] = {0};
        scalar_t r_G = 0, r_I = 0;

        const bool valid = GetJacobianColoredICP<scalar_t>(
                workload_idx, source_points_ptr, source_colors_ptr,
                target_points_ptr, target_normals_ptr, target_colors_ptr,
                target_color_gradients_ptr, correspondence_indices,
                sqrt_lambda_geometric, sqrt_lambda_photometric, J_G, J_I, r_G,
                r_I);

        if (valid) {
            const scalar_t w_G = GetWeightFromRobustKernel(r_G);
            const scalar_t w_I = GetWeightFromRobustKernel(r_I);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_G[j] * w_G * J_G[k] + J_I[j] * w_I * J_I[k];
                    ++i;
                }
                local_sum[21 + j] += J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
            }
            local_sum[27] += r_G * r_G + r_I * r_I;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
                               const core::Tensor &source_colors,
                               const core::Tensor &target_points,
                               const core::Tensor &target_normals,
                               const core::Tensor &target_colors,
                               const core::Tensor &target_color_gradients,
                               const core::Tensor &correspondence_indices,
                               core::Tensor &pose,
                               float &residual,
                               int &inlier_count,
                               const core::Dtype &dtype,
                               const core::Device &device,
                               const registration::RobustKernel &kernel,
                               const double &lambda_geometric) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t sqrt_lambda_geometric =
                static_cast<scalar_t>(sqrt(lambda_geometric));
        scalar_t sqrt_lambda_photometric =
                static_cast<scalar_t>(sqrt(1.0 - lambda_geometric));

        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    ComputePoseColoredICPKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            source_colors.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            target_colors.GetDataPtr<scalar_t>(),
                            target_color_gradients.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            sqrt_lambda_geometric, sqrt_lambda_photometric, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernel);
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}
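
// Accumulates the Doppler ICP terms: a geometric (point-to-plane) residual
// r_G and a Doppler velocity residual r_D per correspondence, each weighted
// by its own robust kernel. The dynamic-outlier flags are forwarded to
// GetJacobianDopplerICP.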
template <typename scalar_t, typename funct1_t, typename funct2_t>
__global__ void ComputePoseDopplerICPKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *source_dopplers_ptr,
        const scalar_t *source_directions_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const scalar_t *R_S_to_V,
        const scalar_t *r_v_to_s_in_V,
        const scalar_t *v_s_in_S,
        const bool reject_dynamic_outliers,
        const scalar_t doppler_outlier_threshold,
        const scalar_t sqrt_lambda_geometric,
        const scalar_t sqrt_lambda_doppler,
        const scalar_t sqrt_lambda_doppler_by_dt,
        const int n,
        scalar_t *global_sum,
        funct1_t GetWeightFromRobustKernelFirst,  // Geometric kernel
        funct2_t GetWeightFromRobustKernelSecond  // Doppler kernel
) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_G[6] = {0}, J_D[6] = {0};
        scalar_t r_G = 0, r_D = 0;

        const bool valid = GetJacobianDopplerICP<scalar_t>(
                workload_idx, source_points_ptr, source_dopplers_ptr,
                source_directions_ptr, target_points_ptr, target_normals_ptr,
                correspondence_indices, R_S_to_V, r_v_to_s_in_V, v_s_in_S,
                reject_dynamic_outliers, doppler_outlier_threshold,
                sqrt_lambda_geometric, sqrt_lambda_doppler,
                sqrt_lambda_doppler_by_dt, J_G, J_D, r_G, r_D);

        if (valid) {
            const scalar_t w_G = GetWeightFromRobustKernelFirst(r_G);
            const scalar_t w_D = GetWeightFromRobustKernelSecond(r_D);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_G[j] * w_G * J_G[k] + J_D[j] * w_D * J_D[k];
                    ++i;
                }
                local_sum[21 + j] += J_G[j] * w_G * r_G + J_D[j] * w_D * r_D;
            }
            local_sum[27] += r_G * r_G + r_D * r_D;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
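
// Single-thread helper kernel: runs PreComputeForDopplerICP once on the
// device so that the precomputed v_s_in_S vector stays on the GPU for the
// main Doppler ICP kernel.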
template <typename scalar_t>
__global__ void PreComputeForDopplerICPKernelCUDA(const scalar_t *R_S_to_V,
                                                  const scalar_t *r_v_to_s_in_V,
                                                  const scalar_t *w_v_in_V,
                                                  const scalar_t *v_v_in_V,
                                                  scalar_t *v_s_in_S) {
    PreComputeForDopplerICP(R_S_to_V, r_v_to_s_in_V, w_v_in_V, v_v_in_V,
                            v_s_in_S);
}

void ComputePoseDopplerICPCUDA(
        const core::Tensor &source_points,
        const core::Tensor &source_dopplers,
        const core::Tensor &source_directions,
        const core::Tensor &target_points,
        const core::Tensor &target_normals,
        const core::Tensor &correspondence_indices,
        core::Tensor &output_pose,
        float &residual,
        int &inlier_count,
        const core::Dtype &dtype,
        const core::Device &device,
        const core::Tensor &R_S_to_V,
        const core::Tensor &r_v_to_s_in_V,
        const core::Tensor &w_v_in_V,
        const core::Tensor &v_v_in_V,
        const double period,  // time between frames; scales the Doppler term
        const bool reject_dynamic_outliers,
        const double doppler_outlier_threshold,
        const registration::RobustKernel &kernel_geometric,
        const registration::RobustKernel &kernel_doppler,
        const double lambda_doppler) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        const scalar_t sqrt_lambda_geometric =
                sqrt(1.0 - static_cast<scalar_t>(lambda_doppler));
        const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler);
        const scalar_t sqrt_lambda_doppler_by_dt =
                sqrt_lambda_doppler / static_cast<scalar_t>(period);

        PreComputeForDopplerICPKernelCUDA<scalar_t>
                <<<1, 1, 0, core::cuda::GetStream()>>>(
                        R_S_to_V.GetDataPtr<scalar_t>(),
                        r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                        w_v_in_V.GetDataPtr<scalar_t>(),
                        v_v_in_V.GetDataPtr<scalar_t>(),
                        v_s_in_S.GetDataPtr<scalar_t>());

        DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(
                scalar_t, kernel_geometric.type_,
                kernel_geometric.scaling_parameter_, kernel_doppler.type_,
                kernel_doppler.scaling_parameter_, [&]() {
                    ComputePoseDopplerICPKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            source_dopplers.GetDataPtr<scalar_t>(),
                            source_directions.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            R_S_to_V.GetDataPtr<scalar_t>(),
                            r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                            v_s_in_S.GetDataPtr<scalar_t>(),
                            reject_dynamic_outliers,
                            static_cast<scalar_t>(doppler_outlier_threshold),
                            sqrt_lambda_geometric, sqrt_lambda_doppler,
                            sqrt_lambda_doppler_by_dt, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernelFirst,   // Geometric kernel
                            GetWeightFromRobustKernelSecond  // Doppler kernel
                    );
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count);
}
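
// Accumulates the 21 unique entries of the symmetric 6x6 information matrix
// (GtG) from the per-correspondence Jacobians J_x, J_y, J_z of the target
// points, using the same block-reduce + atomicAdd pattern as the pose kernels.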
template <typename scalar_t>
__global__ void ComputeInformationMatrixKernelCUDA(
        const scalar_t *target_points_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum) {
    // Reduction dimension for this function is 21.
    typedef utility::MiniVec<scalar_t, 21> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
        const bool valid = GetInformationJacobians<scalar_t>(
                workload_idx, target_points_ptr, correspondence_indices, J_x,
                J_y, J_z);

        if (valid) {
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_x[j] * J_x[k] + J_y[j] * J_y[k] + J_z[j] * J_z[k];
                    ++i;
                }
            }
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
        for (int i = 0; i < 21; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
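
// Launches the reduction kernel, copies the 21-element sum to the CPU as
// Float64, and expands it into the full symmetric 6x6 information_matrix
// (which is expected to already reside on the CPU with dtype Float64).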
void ComputeInformationMatrixCUDA(const core::Tensor &target_points,
                                  const core::Tensor &correspondence_indices,
                                  core::Tensor &information_matrix,
                                  const core::Dtype &dtype,
                                  const core::Device &device) {
    core::CUDAScopedDevice scoped_device(target_points.GetDevice());
    int n = correspondence_indices.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        ComputeInformationMatrixKernelCUDA<<<blocks, threads, 0,
                                             core::cuda::GetStream()>>>(
                target_points.GetDataPtr<scalar_t>(),
                correspondence_indices.GetDataPtr<int64_t>(), n,
                global_sum_ptr);
    });

    core::cuda::Synchronize();

    core::Tensor global_sum_cpu =
            global_sum.To(core::Device("CPU:0"), core::Float64);
    double *sum_ptr = global_sum_cpu.GetDataPtr<double>();

    // Information matrix is on CPU of type Float64.
    double *GTG_ptr = information_matrix.GetDataPtr<double>();

    int i = 0;
    for (int j = 0; j < 6; j++) {
        for (int k = 0; k <= j; k++) {
            GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i];
            ++i;
        }
    }
}

}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace cloudViewer