ACloudViewer 3.9.4
A Modern Library for 3D Data Processing
RegistrationCUDA.cu
// ----------------------------------------------------------------------------
// -                  CloudViewer: www.cloudViewer.org                        -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include <cuda.h>

#include <cub/cub.cuh>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/core/ParallelFor.h"
#include "cloudViewer/core/Tensor.h"
#include "cloudViewer/t/pipelines/kernel/RegistrationImpl.h"
#include "cloudViewer/t/pipelines/kernel/TransformationConverter.h"
#include "cloudViewer/t/pipelines/registration/RobustKernel.h"
#include "cloudViewer/t/pipelines/registration/RobustKernelImpl.h"
#include "cloudViewer/utility/MiniVec.h"

namespace cloudViewer {
namespace t {
namespace pipelines {
namespace kernel {

const int kThread1DUnit = 256;
const int kReduceDim = 29;  // 21 (JtJ) + 6 (Jtr) + 1 (residual) + 1 (inlier count)
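// Layout of the kReduceDim-wide accumulator used by the pose kernels below
// (DecodeAndSolve6x6 is assumed to rebuild the full symmetric 6x6 system from
// this packed form):
//   [ 0..20]  lower-triangular half of JtJ, packed row by row (k <= j),
//   [21..26]  the 6-element Jtr vector,
//   [27]      the accumulated residual term,
//   [28]      the inlier count.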

template <typename scalar_t, typename func_t>
__global__ void ComputePosePointToPlaneKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum,
        func_t GetWeightFromRobustKernel) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_ij[6] = {0};
        scalar_t r = 0;
        const bool valid = GetJacobianPointToPlane<scalar_t>(
                workload_idx, source_points_ptr, target_points_ptr,
                target_normals_ptr, correspondence_indices, J_ij, r);

        if (valid) {
            const scalar_t w = GetWeightFromRobustKernel(r);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] += J_ij[j] * w * J_ij[k];
                    ++i;
                }
                local_sum[21 + j] += J_ij[j] * w * r;
            }
            local_sum[27] += r;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
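// Note: in the kernel above, each valid correspondence contributes
// w * J * J^T to JtJ and w * r * J to Jtr, i.e. one robustly weighted
// Gauss-Newton term. With the packed triangular storage, element (j, k)
// with k <= j lands at linear index i = j * (j + 1) / 2 + k.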

void ComputePosePointToPlaneCUDA(const core::Tensor &source_points,
                                 const core::Tensor &target_points,
                                 const core::Tensor &target_normals,
                                 const core::Tensor &correspondence_indices,
                                 core::Tensor &pose,
                                 float &residual,
                                 int &inlier_count,
                                 const core::Dtype &dtype,
                                 const core::Device &device,
                                 const registration::RobustKernel &kernel) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    ComputePosePointToPlaneKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(), n,
                            global_sum_ptr, GetWeightFromRobustKernel);
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}
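// Illustration (not part of the original file): the packed 29-element
// global_sum produced above can be expanded on the host into the full
// symmetric 6x6 JtJ and the 6-vector Jtr as sketched below, mirroring the
// unpacking loop in ComputeInformationMatrixCUDA at the end of this file.
// Here "sum" stands for a hypothetical host-side copy of global_sum (e.g.
// converted to Float64 on the CPU); DecodeAndSolve6x6 is assumed to perform
// an equivalent decode before solving for the 6-DoF pose.
//
//     double JtJ[6][6] = {};
//     double Jtr[6] = {};
//     int i = 0;
//     for (int j = 0; j < 6; ++j) {
//         for (int k = 0; k <= j; ++k) {
//             JtJ[j][k] = JtJ[k][j] = sum[i++];  // mirror the lower triangle
//         }
//         Jtr[j] = sum[21 + j];
//     }
//     const double residual = sum[27];
//     const int inlier_count = static_cast<int>(sum[28]);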

template <typename scalar_t, typename funct_t>
__global__ void ComputePoseColoredICPKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *source_colors_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const scalar_t *target_colors_ptr,
        const scalar_t *target_color_gradients_ptr,
        const int64_t *correspondence_indices,
        const scalar_t sqrt_lambda_geometric,
        const scalar_t sqrt_lambda_photometric,
        const int n,
        scalar_t *global_sum,
        funct_t GetWeightFromRobustKernel) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_G[6] = {0}, J_I[6] = {0};
        scalar_t r_G = 0, r_I = 0;

        const bool valid = GetJacobianColoredICP<scalar_t>(
                workload_idx, source_points_ptr, source_colors_ptr,
                target_points_ptr, target_normals_ptr, target_colors_ptr,
                target_color_gradients_ptr, correspondence_indices,
                sqrt_lambda_geometric, sqrt_lambda_photometric, J_G, J_I, r_G,
                r_I);

        if (valid) {
            const scalar_t w_G = GetWeightFromRobustKernel(r_G);
            const scalar_t w_I = GetWeightFromRobustKernel(r_I);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_G[j] * w_G * J_G[k] + J_I[j] * w_I * J_I[k];
                    ++i;
                }
                local_sum[21 + j] += J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
            }
            local_sum[27] += r_G * r_G + r_I * r_I;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
                               const core::Tensor &source_colors,
                               const core::Tensor &target_points,
                               const core::Tensor &target_normals,
                               const core::Tensor &target_colors,
                               const core::Tensor &target_color_gradients,
                               const core::Tensor &correspondence_indices,
                               core::Tensor &pose,
                               float &residual,
                               int &inlier_count,
                               const core::Dtype &dtype,
                               const core::Device &device,
                               const registration::RobustKernel &kernel,
                               const double &lambda_geometric) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t sqrt_lambda_geometric =
                static_cast<scalar_t>(sqrt(lambda_geometric));
        scalar_t sqrt_lambda_photometric =
                static_cast<scalar_t>(sqrt(1.0 - lambda_geometric));

        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    ComputePoseColoredICPKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            source_colors.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            target_colors.GetDataPtr<scalar_t>(),
                            target_color_gradients.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            sqrt_lambda_geometric, sqrt_lambda_photometric, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernel);
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}
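// Note on the lambda split used above: the geometric and photometric terms
// reach the kernel pre-scaled by sqrt(lambda_geometric) and
// sqrt(1 - lambda_geometric) respectively (presumably applied inside
// GetJacobianColoredICP), so the accumulated system corresponds to blending
// the two objectives as
// lambda_geometric * E_geometric + (1 - lambda_geometric) * E_photometric.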

template <typename scalar_t, typename funct1_t, typename funct2_t>
__global__ void ComputePoseDopplerICPKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *source_dopplers_ptr,
        const scalar_t *source_directions_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const scalar_t *R_S_to_V,
        const scalar_t *r_v_to_s_in_V,
        const scalar_t *v_s_in_S,
        const bool reject_dynamic_outliers,
        const scalar_t doppler_outlier_threshold,
        const scalar_t sqrt_lambda_geometric,
        const scalar_t sqrt_lambda_doppler,
        const scalar_t sqrt_lambda_doppler_by_dt,
        const int n,
        scalar_t *global_sum,
        funct1_t GetWeightFromRobustKernelFirst,   // Geometric kernel
        funct2_t GetWeightFromRobustKernelSecond   // Doppler kernel
) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_G[6] = {0}, J_D[6] = {0};
        scalar_t r_G = 0, r_D = 0;

        const bool valid = GetJacobianDopplerICP<scalar_t>(
                workload_idx, source_points_ptr, source_dopplers_ptr,
                source_directions_ptr, target_points_ptr, target_normals_ptr,
                correspondence_indices, R_S_to_V, r_v_to_s_in_V, v_s_in_S,
                reject_dynamic_outliers, doppler_outlier_threshold,
                sqrt_lambda_geometric, sqrt_lambda_doppler,
                sqrt_lambda_doppler_by_dt, J_G, J_D, r_G, r_D);

        if (valid) {
            const scalar_t w_G = GetWeightFromRobustKernelFirst(r_G);
            const scalar_t w_D = GetWeightFromRobustKernelSecond(r_D);

            // Dump J, r into JtJ and Jtr
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_G[j] * w_G * J_G[k] + J_D[j] * w_D * J_D[k];
                    ++i;
                }
                local_sum[21 + j] += J_G[j] * w_G * r_G + J_D[j] * w_D * r_D;
            }
            local_sum[27] += r_G * r_G + r_D * r_D;
            local_sum[28] += 1;
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

template <typename scalar_t>
__global__ void PreComputeForDopplerICPKernelCUDA(const scalar_t *R_S_to_V,
                                                  const scalar_t *r_v_to_s_in_V,
                                                  const scalar_t *w_v_in_V,
                                                  const scalar_t *v_v_in_V,
                                                  scalar_t *v_s_in_S) {
    PreComputeForDopplerICP(R_S_to_V, r_v_to_s_in_V, w_v_in_V, v_v_in_V,
                            v_s_in_S);
}
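// The wrapper above is launched with a single thread (<<<1, 1>>>) below so
// that PreComputeForDopplerICP can fill the 3-element v_s_in_S tensor
// directly on the device, presumably to avoid a host round trip for this
// small quantity.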

void ComputePoseDopplerICPCUDA(
        const core::Tensor &source_points,
        const core::Tensor &source_dopplers,
        const core::Tensor &source_directions,
        const core::Tensor &target_points,
        const core::Tensor &target_normals,
        const core::Tensor &correspondence_indices,
        core::Tensor &output_pose,
        float &residual,
        int &inlier_count,
        const core::Dtype &dtype,
        const core::Device &device,
        const core::Tensor &R_S_to_V,
        const core::Tensor &r_v_to_s_in_V,
        const core::Tensor &w_v_in_V,
        const core::Tensor &v_v_in_V,
        const double period,
        const bool reject_dynamic_outliers,
        const double doppler_outlier_threshold,
        const registration::RobustKernel &kernel_geometric,
        const registration::RobustKernel &kernel_doppler,
        const double lambda_doppler) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        const scalar_t sqrt_lambda_geometric =
                sqrt(1.0 - static_cast<scalar_t>(lambda_doppler));
        const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler);
        const scalar_t sqrt_lambda_doppler_by_dt =
                sqrt_lambda_doppler / static_cast<scalar_t>(period);

        PreComputeForDopplerICPKernelCUDA<scalar_t>
                <<<1, 1, 0, core::cuda::GetStream()>>>(
                        R_S_to_V.GetDataPtr<scalar_t>(),
                        r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                        w_v_in_V.GetDataPtr<scalar_t>(),
                        v_v_in_V.GetDataPtr<scalar_t>(),
                        v_s_in_S.GetDataPtr<scalar_t>());

        DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(
                scalar_t, kernel_geometric.type_,
                kernel_geometric.scaling_parameter_, kernel_doppler.type_,
                kernel_doppler.scaling_parameter_, [&]() {
                    ComputePoseDopplerICPKernelCUDA<<<
                            blocks, threads, 0, core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            source_dopplers.GetDataPtr<scalar_t>(),
                            source_directions.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            R_S_to_V.GetDataPtr<scalar_t>(),
                            r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                            v_s_in_S.GetDataPtr<scalar_t>(),
                            reject_dynamic_outliers,
                            static_cast<scalar_t>(doppler_outlier_threshold),
                            sqrt_lambda_geometric, sqrt_lambda_doppler,
                            sqrt_lambda_doppler_by_dt, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernelFirst,   // Geometric kernel
                            GetWeightFromRobustKernelSecond   // Doppler kernel
                    );
                });
    });

    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count);
}

template <typename scalar_t>
__global__ void ComputeInformationMatrixKernelCUDA(
        const scalar_t *target_points_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum) {
    // The reduction dimension for this kernel is 21 (lower triangle of G^T G).
    typedef utility::MiniVec<scalar_t, 21> ReduceVec;
    // Create shared memory.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
        const bool valid = GetInformationJacobians<scalar_t>(
                workload_idx, target_points_ptr, correspondence_indices, J_x,
                J_y, J_z);

        if (valid) {
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    local_sum[i] +=
                            J_x[j] * J_x[k] + J_y[j] * J_y[k] + J_z[j] * J_z[k];
                    ++i;
                }
            }
        }
    }

    // Reduction.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum.
    if (threadIdx.x == 0) {
#pragma unroll
        for (int i = 0; i < 21; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

void ComputeInformationMatrixCUDA(const core::Tensor &target_points,
                                  const core::Tensor &correspondence_indices,
                                  core::Tensor &information_matrix,
                                  const core::Dtype &dtype,
                                  const core::Device &device) {
    core::CUDAScopedDevice scoped_device(target_points.GetDevice());
    int n = correspondence_indices.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device);
    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
    const dim3 threads(kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        ComputeInformationMatrixKernelCUDA<<<blocks, threads, 0,
                                             core::cuda::GetStream()>>>(
                target_points.GetDataPtr<scalar_t>(),
                correspondence_indices.GetDataPtr<int64_t>(), n,
                global_sum_ptr);

        core::cuda::Synchronize();

        core::Tensor global_sum_cpu =
                global_sum.To(core::Device("CPU:0"), core::Float64);
        double *sum_ptr = global_sum_cpu.GetDataPtr<double>();

        // The information matrix is a CPU tensor of type Float64.
        double *GTG_ptr = information_matrix.GetDataPtr<double>();

        // Unpack the 21-element lower triangle into the full symmetric 6x6.
        int i = 0;
        for (int j = 0; j < 6; j++) {
            for (int k = 0; k <= j; k++) {
                GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i];
                ++i;
            }
        }
    });
}

}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace cloudViewer