RegistrationCPU.cpp
// ----------------------------------------------------------------------------
// -                   CloudViewer: www.cloudViewer.org                        -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include <Parallel.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>

#include <cmath>
#include <functional>
#include <vector>

namespace cloudViewer {
namespace t {
namespace pipelines {
namespace kernel {

template <typename scalar_t, typename func_t>
static void ComputePosePointToPlaneKernelCPU(
        const scalar_t *source_points_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum,
        func_t GetWeightFromRobustKernel) {
    // As AtA is a symmetric matrix, we only need 21 elements instead of 36.
    // Atb is of shape {6,1}. Combining both, A_1x29 is a temporary storage
    // with elements [0:21] as AtA, [21:27] as Atb, element 27 as the residual
    // and element 28 as the inlier count.
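    // The 21 AtA entries are the lower triangle packed row-major, i.e.
    // (0,0), (1,0), (1,1), (2,0), ..., (5,5), as produced by the j/k
    // accumulation loop below.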
    std::vector<scalar_t> A_1x29(29, 0.0);

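    // Two parallelization paths: on Windows the reduction runs through TBB's
    // parallel_reduce, elsewhere through an OpenMP array-section reduction
    // (reduction(+ : A_reduction[:29])). The array-section form is an
    // OpenMP 4.5 feature that MSVC's OpenMP support generally lacks, which is
    // presumably why the Windows build takes the TBB route.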
#ifdef _WIN32
    std::vector<scalar_t> zeros_29(29, 0.0);
    A_1x29 = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_29,
            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     ++workload_idx) {
#else
    scalar_t *A_reduction = A_1x29.data();
#pragma omp parallel for reduction(+ : A_reduction[ : 29]) schedule(static) \
        num_threads(utility::EstimateMaxThreads())
    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
#endif
                    scalar_t J_ij[6];
                    scalar_t r = 0;

                    bool valid = kernel::GetJacobianPointToPlane<scalar_t>(
                            workload_idx, source_points_ptr, target_points_ptr,
                            target_normals_ptr, correspondence_indices, J_ij,
                            r);

                    scalar_t w = GetWeightFromRobustKernel(r);

                    if (valid) {
                        // Dump J, r into JtJ and Jtr
                        int i = 0;
                        for (int j = 0; j < 6; ++j) {
                            for (int k = 0; k <= j; ++k) {
                                A_reduction[i] += J_ij[j] * w * J_ij[k];
                                ++i;
                            }
                            A_reduction[21 + j] += J_ij[j] * w * r;
                        }
                        A_reduction[27] += r;
                        A_reduction[28] += 1;
                    }
                }
#ifdef _WIN32
                return A_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(29);
                for (int j = 0; j < 29; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });
#endif

    for (int i = 0; i < 29; ++i) {
        global_sum[i] = A_1x29[i];
    }
}

void ComputePosePointToPlaneCPU(const core::Tensor &source_points,
                                const core::Tensor &target_points,
                                const core::Tensor &target_normals,
                                const core::Tensor &correspondence_indices,
                                core::Tensor &pose,
                                float &residual,
                                int &inlier_count,
                                const core::Dtype &dtype,
                                const core::Device &device,
                                const registration::RobustKernel &kernel) {
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    kernel::ComputePosePointToPlaneKernelCPU(
                            source_points.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(), n,
                            global_sum_ptr, GetWeightFromRobustKernel);
                });
    });
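
    // DecodeAndSolve6x6 unpacks the 29-element reduction (21 packed AtA
    // entries, 6 Atb entries, residual, inlier count) into a 6x6 linear
    // system and solves it for the 6-DoF pose update.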
    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}

template <typename scalar_t, typename funct_t>
static void ComputePoseColoredICPKernelCPU(
        const scalar_t *source_points_ptr,
        const scalar_t *source_colors_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const scalar_t *target_colors_ptr,
        const scalar_t *target_color_gradients_ptr,
        const int64_t *correspondence_indices,
        const scalar_t &sqrt_lambda_geometric,
        const scalar_t &sqrt_lambda_photometric,
        const int n,
        scalar_t *global_sum,
        funct_t GetWeightFromRobustKernel) {
    // As AtA is a symmetric matrix, we only need 21 elements instead of 36.
    // Atb is of shape {6,1}. Combining both, A_1x29 is a temporary storage
    // with elements [0:21] as AtA, [21:27] as Atb, element 27 as the residual
    // and element 28 as the inlier count.
    std::vector<scalar_t> A_1x29(29, 0.0);

#ifdef _WIN32
    std::vector<scalar_t> zeros_29(29, 0.0);
    A_1x29 = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_29,
            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     ++workload_idx) {
#else
    scalar_t *A_reduction = A_1x29.data();
#pragma omp parallel for reduction(+ : A_reduction[ : 29]) schedule(static) \
        num_threads(utility::EstimateMaxThreads())
    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
#endif
                    scalar_t J_G[6] = {0}, J_I[6] = {0};
                    scalar_t r_G = 0, r_I = 0;

                    bool valid = GetJacobianColoredICP<scalar_t>(
                            workload_idx, source_points_ptr, source_colors_ptr,
                            target_points_ptr, target_normals_ptr,
                            target_colors_ptr, target_color_gradients_ptr,
                            correspondence_indices, sqrt_lambda_geometric,
                            sqrt_lambda_photometric, J_G, J_I, r_G, r_I);

                    scalar_t w_G = GetWeightFromRobustKernel(r_G);
                    scalar_t w_I = GetWeightFromRobustKernel(r_I);

                    if (valid) {
                        // Dump J, r into JtJ and Jtr
                        int i = 0;
                        for (int j = 0; j < 6; ++j) {
                            for (int k = 0; k <= j; ++k) {
                                A_reduction[i] += J_G[j] * w_G * J_G[k] +
                                                  J_I[j] * w_I * J_I[k];
                                ++i;
                            }
                            A_reduction[21 + j] +=
                                    J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
                        }
                        A_reduction[27] += r_G * r_G + r_I * r_I;
                        A_reduction[28] += 1;
                    }
                }
#ifdef _WIN32
                return A_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(29);
                for (int j = 0; j < 29; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });
#endif

    for (int i = 0; i < 29; ++i) {
        global_sum[i] = A_1x29[i];
    }
}

void ComputePoseColoredICPCPU(const core::Tensor &source_points,
                              const core::Tensor &source_colors,
                              const core::Tensor &target_points,
                              const core::Tensor &target_normals,
                              const core::Tensor &target_colors,
                              const core::Tensor &target_color_gradients,
                              const core::Tensor &correspondence_indices,
                              core::Tensor &pose,
                              float &residual,
                              int &inlier_count,
                              const core::Dtype &dtype,
                              const core::Device &device,
                              const registration::RobustKernel &kernel,
                              const double &lambda_geometric) {
    int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t sqrt_lambda_geometric =
                static_cast<scalar_t>(sqrt(lambda_geometric));
        scalar_t sqrt_lambda_photometric =
                static_cast<scalar_t>(sqrt(1.0 - lambda_geometric));
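        // lambda_geometric in [0, 1] trades the geometric (point-to-plane)
        // term off against the photometric (color) term; the square roots are
        // passed down so the weighting is applied at the per-term
        // Jacobian/residual level inside GetJacobianColoredICP.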
        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    kernel::ComputePoseColoredICPKernelCPU(
                            source_points.GetDataPtr<scalar_t>(),
                            source_colors.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            target_colors.GetDataPtr<scalar_t>(),
                            target_color_gradients.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            sqrt_lambda_geometric, sqrt_lambda_photometric, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernel);
                });
    });

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}

template <typename scalar_t, typename funct1_t, typename funct2_t>
static void ComputePoseDopplerICPKernelCPU(
        const scalar_t *source_points_ptr,
        const scalar_t *source_dopplers_ptr,
        const scalar_t *source_directions_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const scalar_t *R_S_to_V,
        const scalar_t *r_v_to_s_in_V,
        const scalar_t *v_s_in_S,
        const bool reject_dynamic_outliers,
        const scalar_t doppler_outlier_threshold,
        const scalar_t sqrt_lambda_geometric,
        const scalar_t sqrt_lambda_doppler,
        const scalar_t sqrt_lambda_doppler_by_dt,
        const int n,
        scalar_t *global_sum,
        funct1_t GetWeightFromRobustKernelFirst,   // Geometric kernel
        funct2_t GetWeightFromRobustKernelSecond   // Doppler kernel
) {
    // As AtA is a symmetric matrix, we only need 21 elements instead of 36.
    // Atb is of shape {6,1}. Combining both, A_1x29 is a temporary storage
    // with elements [0:21] as AtA, [21:27] as Atb, element 27 as the residual
    // and element 28 as the inlier count.
    std::vector<scalar_t> A_1x29(29, 0.0);

#ifdef _WIN32
    std::vector<scalar_t> zeros_29(29, 0.0);
    A_1x29 = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_29,
            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     ++workload_idx) {
#else
    scalar_t *A_reduction = A_1x29.data();
#pragma omp parallel for reduction(+ : A_reduction[ : 29]) schedule(static) \
        num_threads(utility::EstimateMaxThreads())
    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
#endif
                    scalar_t J_G[6] = {0}, J_D[6] = {0};
                    scalar_t r_G = 0, r_D = 0;

                    bool valid = GetJacobianDopplerICP<scalar_t>(
                            workload_idx, source_points_ptr,
                            source_dopplers_ptr, source_directions_ptr,
                            target_points_ptr, target_normals_ptr,
                            correspondence_indices, R_S_to_V, r_v_to_s_in_V,
                            v_s_in_S, reject_dynamic_outliers,
                            doppler_outlier_threshold, sqrt_lambda_geometric,
                            sqrt_lambda_doppler, sqrt_lambda_doppler_by_dt, J_G,
                            J_D, r_G, r_D);

                    scalar_t w_G = GetWeightFromRobustKernelFirst(r_G);
                    scalar_t w_D = GetWeightFromRobustKernelSecond(r_D);

                    if (valid) {
                        // Dump J, r into JtJ and Jtr
                        int i = 0;
                        for (int j = 0; j < 6; ++j) {
                            for (int k = 0; k <= j; ++k) {
                                A_reduction[i] += J_G[j] * w_G * J_G[k] +
                                                  J_D[j] * w_D * J_D[k];
                                ++i;
                            }
                            A_reduction[21 + j] +=
                                    J_G[j] * w_G * r_G + J_D[j] * w_D * r_D;
                        }
                        A_reduction[27] += r_G * r_G + r_D * r_D;
                        A_reduction[28] += 1;
                    }
                }
#ifdef _WIN32
                return A_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(29);
                for (int j = 0; j < 29; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });
#endif

    for (int i = 0; i < 29; ++i) {
        global_sum[i] = A_1x29[i];
    }
}

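// Precomputes v_s_in_S (by its naming, the sensor velocity expressed in the
// sensor frame S) from the vehicle-frame angular/linear velocities w_v_in_V
// and v_v_in_V and the sensor extrinsics R_S_to_V, r_v_to_s_in_V. The math
// itself lives in the shared host/device helper PreComputeForDopplerICP.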
template <typename scalar_t>
void PreComputeForDopplerICPKernelCPU(const scalar_t *R_S_to_V,
                                      const scalar_t *r_v_to_s_in_V,
                                      const scalar_t *w_v_in_V,
                                      const scalar_t *v_v_in_V,
                                      scalar_t *v_s_in_S) {
    PreComputeForDopplerICP(R_S_to_V, r_v_to_s_in_V, w_v_in_V, v_v_in_V,
                            v_s_in_S);
}

void ComputePoseDopplerICPCPU(
        const core::Tensor &source_points,
        const core::Tensor &source_dopplers,
        const core::Tensor &source_directions,
        const core::Tensor &target_points,
        const core::Tensor &target_normals,
        const core::Tensor &correspondence_indices,
        core::Tensor &output_pose,
        float &residual,
        int &inlier_count,
        const core::Dtype &dtype,
        const core::Device &device,
        const core::Tensor &R_S_to_V,
        const core::Tensor &r_v_to_s_in_V,
        const core::Tensor &w_v_in_V,
        const core::Tensor &v_v_in_V,
        const double period,
        const bool reject_dynamic_outliers,
        const double doppler_outlier_threshold,
        const registration::RobustKernel &kernel_geometric,
        const registration::RobustKernel &kernel_doppler,
        const double lambda_doppler) {
    const int n = source_points.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
    core::Tensor v_s_in_S = core::Tensor::Zeros({3}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        const scalar_t sqrt_lambda_geometric =
                sqrt(1.0 - static_cast<scalar_t>(lambda_doppler));
        const scalar_t sqrt_lambda_doppler = sqrt(lambda_doppler);
        const scalar_t sqrt_lambda_doppler_by_dt =
                sqrt_lambda_doppler / static_cast<scalar_t>(period);
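        // lambda_doppler in [0, 1] weights the Doppler (radial velocity) term
        // against the geometric term. The additional 1 / period factor
        // accounts for the scan period dt, presumably converting the pose
        // increment into a velocity inside GetJacobianDopplerICP.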

        PreComputeForDopplerICPKernelCPU<scalar_t>(
                R_S_to_V.GetDataPtr<scalar_t>(),
                r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                w_v_in_V.GetDataPtr<scalar_t>(),
                v_v_in_V.GetDataPtr<scalar_t>(),
                v_s_in_S.GetDataPtr<scalar_t>());

        DISPATCH_DUAL_ROBUST_KERNEL_FUNCTION(
                scalar_t, kernel_geometric.type_,
                kernel_geometric.scaling_parameter_, kernel_doppler.type_,
                kernel_doppler.scaling_parameter_, [&]() {
                    kernel::ComputePoseDopplerICPKernelCPU(
                            source_points.GetDataPtr<scalar_t>(),
                            source_dopplers.GetDataPtr<scalar_t>(),
                            source_directions.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            R_S_to_V.GetDataPtr<scalar_t>(),
                            r_v_to_s_in_V.GetDataPtr<scalar_t>(),
                            v_s_in_S.GetDataPtr<scalar_t>(),
                            reject_dynamic_outliers,
                            static_cast<scalar_t>(doppler_outlier_threshold),
                            sqrt_lambda_geometric, sqrt_lambda_doppler,
                            sqrt_lambda_doppler_by_dt, n,
                            global_sum.GetDataPtr<scalar_t>(),
                            GetWeightFromRobustKernelFirst,   // Geometric kernel
                            GetWeightFromRobustKernelSecond   // Doppler kernel
                    );
                });
    });

    DecodeAndSolve6x6(global_sum, output_pose, residual, inlier_count);
}

template <typename scalar_t>
static void Get3x3SxyLinearSystem(const scalar_t *source_points_ptr,
                                  const scalar_t *target_points_ptr,
                                  const int64_t *correspondence_indices,
                                  const int &n,
                                  const core::Dtype &dtype,
                                  const core::Device &device,
                                  core::Tensor &Sxy,
                                  core::Tensor &target_mean,
                                  core::Tensor &source_mean,
                                  int &inlier_count) {
    // Calculating source_mean and target_mean, which are the mean(x, y, z) of
    // the source and target points respectively.
    // mean_1x7[6] is the number of total valid correspondences.
    std::vector<scalar_t> mean_1x7(7, 0.0);
    // Identity element for the running-total reduction variable: zeros_7.
    std::vector<scalar_t> zeros_7(7, 0.0);

    mean_1x7 = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_7,
            [&](tbb::blocked_range<int> r,
                std::vector<scalar_t> mean_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     ++workload_idx) {
                    if (correspondence_indices[workload_idx] != -1) {
                        int64_t target_idx =
                                3 * correspondence_indices[workload_idx];
                        mean_reduction[0] +=
                                source_points_ptr[3 * workload_idx];
                        mean_reduction[1] +=
                                source_points_ptr[3 * workload_idx + 1];
                        mean_reduction[2] +=
                                source_points_ptr[3 * workload_idx + 2];

                        mean_reduction[3] += target_points_ptr[target_idx];
                        mean_reduction[4] += target_points_ptr[target_idx + 1];
                        mean_reduction[5] += target_points_ptr[target_idx + 2];

                        mean_reduction[6] += 1;
                    }
                }
                return mean_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(7);
                for (int j = 0; j < 7; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });

    if (mean_1x7[6] == 0) {
        utility::LogError("No valid correspondence present.");
    }

    for (int i = 0; i < 6; ++i) {
        mean_1x7[i] = mean_1x7[i] / mean_1x7[6];
    }

    // Calculating the Sxy for SVD.
    std::vector<scalar_t> sxy_1x9(9, 0.0);
    // Identity element for the running-total reduction variable: zeros_9.
    std::vector<scalar_t> zeros_9(9, 0.0);

    sxy_1x9 = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_9,
            [&](tbb::blocked_range<int> r,
                std::vector<scalar_t> sxy_1x9_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     workload_idx++) {
                    if (correspondence_indices[workload_idx] != -1) {
                        for (int i = 0; i < 9; ++i) {
                            const int row = i % 3;
                            const int col = i / 3;
                            const int source_idx = 3 * workload_idx + row;
                            const int target_idx =
                                    3 * correspondence_indices[workload_idx] +
                                    col;
                            sxy_1x9_reduction[i] +=
                                    (source_points_ptr[source_idx] -
                                     mean_1x7[row]) *
                                    (target_points_ptr[target_idx] -
                                     mean_1x7[3 + col]);
                        }
                    }
                }
                return sxy_1x9_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(9);
                for (int j = 0; j < 9; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });

    source_mean = core::Tensor::Empty({1, 3}, dtype, device);
    scalar_t *source_mean_ptr = source_mean.GetDataPtr<scalar_t>();

    target_mean = core::Tensor::Empty({1, 3}, dtype, device);
    scalar_t *target_mean_ptr = target_mean.GetDataPtr<scalar_t>();

    Sxy = core::Tensor::Empty({3, 3}, dtype, device);
    scalar_t *sxy_ptr = Sxy.GetDataPtr<scalar_t>();

    // Build the Tensors Sxy {3,3}, source_mean {1,3} and target_mean {1,3}
    // from the temporary reduction variables. These shapes are chosen because
    // they are later needed in the equation:
    // t = target_mean.Reshape({-1}) - R.Matmul(source_mean.T()).Reshape({-1}).
    int i = 0;
    for (int j = 0; j < 3; ++j) {
        for (int k = 0; k < 3; ++k) {
            sxy_ptr[j * 3 + k] = sxy_1x9[i] / mean_1x7[6];
            ++i;
        }
        source_mean_ptr[j] = mean_1x7[j];
        target_mean_ptr[j] = mean_1x7[j + 3];
    }

    inlier_count = static_cast<int>(mean_1x7[6]);
}

void ComputeRtPointToPointCPU(const core::Tensor &source_points,
                              const core::Tensor &target_points,
                              const core::Tensor &corres,
                              core::Tensor &R,
                              core::Tensor &t,
                              int &inlier_count,
                              const core::Dtype &dtype,
                              const core::Device &device) {
    core::Tensor Sxy, target_mean, source_mean;

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        const scalar_t *source_points_ptr =
                source_points.GetDataPtr<scalar_t>();
        const scalar_t *target_points_ptr =
                target_points.GetDataPtr<scalar_t>();
        const int64_t *correspondence_indices = corres.GetDataPtr<int64_t>();

        int n = source_points.GetLength();

        Get3x3SxyLinearSystem(source_points_ptr, target_points_ptr,
                              correspondence_indices, n, dtype, device, Sxy,
                              target_mean, source_mean, inlier_count);
    });

    core::Tensor U, D, VT;
    std::tie(U, D, VT) = Sxy.SVD();
    core::Tensor S = core::Tensor::Eye(3, dtype, device);
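    // Kabsch/Umeyama reflection handling: if det(U) * det(V) < 0 the best-fit
    // orthogonal matrix would be a reflection, so the last singular direction
    // is negated to keep R a proper rotation.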
    if (U.Det() * (VT.T()).Det() < 0) {
        S[-1][-1] = -1;
    }

    R = U.Matmul(S.Matmul(VT));
    t = (target_mean.Reshape({-1}) - R.Matmul(source_mean.T()).Reshape({-1}))
                .To(dtype);
}

template <typename scalar_t>
void ComputeInformationMatrixKernelCPU(const scalar_t *target_points_ptr,
                                       const int64_t *correspondence_indices,
                                       const int n,
                                       scalar_t *global_sum) {
    // As AtA is a symmetric matrix, we only need 21 elements instead of 36.
    std::vector<scalar_t> AtA(21, 0.0);

#ifdef _WIN32
    std::vector<scalar_t> zeros_21(21, 0.0);
    AtA = tbb::parallel_reduce(
            tbb::blocked_range<int>(0, n), zeros_21,
            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
                for (int workload_idx = r.begin(); workload_idx < r.end();
                     ++workload_idx) {
#else
    scalar_t *A_reduction = AtA.data();
#pragma omp parallel for reduction(+ : A_reduction[ : 21]) schedule(static) \
        num_threads(utility::EstimateMaxThreads())
    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
#endif
                    scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};

                    bool valid = GetInformationJacobians<scalar_t>(
                            workload_idx, target_points_ptr,
                            correspondence_indices, J_x, J_y, J_z);

                    if (valid) {
                        int i = 0;
                        for (int j = 0; j < 6; ++j) {
                            for (int k = 0; k <= j; ++k) {
                                A_reduction[i] += J_x[j] * J_x[k] +
                                                  J_y[j] * J_y[k] +
                                                  J_z[j] * J_z[k];
                                ++i;
                            }
                        }
                    }
                }
#ifdef _WIN32
                return A_reduction;
            },
            // TBB: Defining reduction operation.
            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
                std::vector<scalar_t> result(21);
                for (int j = 0; j < 21; ++j) {
                    result[j] = a[j] + b[j];
                }
                return result;
            });
#endif

    for (int i = 0; i < 21; ++i) {
        global_sum[i] = AtA[i];
    }
}

void ComputeInformationMatrixCPU(const core::Tensor &target_points,
                                 const core::Tensor &correspondence_indices,
                                 core::Tensor &information_matrix,
                                 const core::Dtype &dtype,
                                 const core::Device &device) {
    int n = correspondence_indices.GetLength();

    core::Tensor global_sum = core::Tensor::Zeros({21}, dtype, device);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();

        kernel::ComputeInformationMatrixKernelCPU(
                target_points.GetDataPtr<scalar_t>(),
                correspondence_indices.GetDataPtr<int64_t>(), n,
                global_sum_ptr);

        core::Tensor global_sum_cpu =
                global_sum.To(core::Device("CPU:0"), core::Float64);
        double *sum_ptr = global_sum_cpu.GetDataPtr<double>();

        // Information matrix is on CPU of type Float64.
        double *GTG_ptr = information_matrix.GetDataPtr<double>();

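        // Expand the 21 packed lower-triangular entries into the full
        // symmetric 6x6 information matrix.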
        int i = 0;
        for (int j = 0; j < 6; j++) {
            for (int k = 0; k <= j; k++) {
                GTG_ptr[j * 6 + k] = GTG_ptr[k * 6 + j] = sum_ptr[i];
                ++i;
            }
        }
    });
}

}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace cloudViewer