RGBDOdometryCUDA.cu
// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                  -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include <cuda.h>

#include <cub/cub.cuh>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/t/geometry/kernel/GeometryIndexer.h"
#include "cloudViewer/t/geometry/kernel/GeometryMacros.h"
#include "cloudViewer/t/pipelines/kernel/RGBDOdometryImpl.h"
#include "cloudViewer/t/pipelines/kernel/RGBDOdometryJacobianImpl.h"
#include "cloudViewer/t/pipelines/kernel/TransformationConverter.h"
#include "cloudViewer/utility/MiniVec.h"
#include "core/Dispatch.h"
#include "core/ParallelFor.h"
#include "core/Tensor.h"

namespace cloudViewer {
namespace t {
namespace pipelines {
namespace kernel {
namespace odometry {

const int kBlockSize = 256;
const int kJtJDim = 21;
const int kJtrDim = 6;
const int kReduceDim =
        kJtJDim + kJtrDim + 1 + 1;  // 21 (JtJ) + 6 (Jtr) + 1 (r) + 1 (inlier)
typedef utility::MiniVec<float, kReduceDim> ReduceVec;
typedef cub::BlockReduce<ReduceVec, kBlockSize> BlockReduce;

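// Each thread packs its per-pixel contribution into a single ReduceVec:
// entries [0, 21) hold the lower triangle of JtJ in row-major order,
// [21, 27) hold Jtr, entry 27 the robust loss, and entry 28 the inlier
// indicator. A CUB block reduction sums these vectors, and thread 0 of each
// block atomically adds the block total into the global accumulator.

// One thread per source pixel: evaluates the point-to-plane Jacobian and
// residual, robustifies them with a Huber kernel, and accumulates the packed
// JtJ/Jtr contribution.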
__global__ void ComputeOdometryResultPointToPlaneCUDAKernel(
        NDArrayIndexer source_vertex_indexer,
        NDArrayIndexer target_vertex_indexer,
        NDArrayIndexer target_normal_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float depth_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J[6] = {0};
        float r = 0;
        bool valid = GetJacobianPointToPlane(
                x, y, depth_outlier_trunc, source_vertex_indexer,
                target_vertex_indexer, target_normal_indexer, ti, J, r);

        float d_huber = HuberDeriv(r, depth_huber_delta);
        float r_huber = HuberLoss(r, depth_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] = valid ? J[i] * J[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] = valid ? J[i] * d_huber : 0;
        }
        local_sum[offset++] = valid ? r_huber : 0;
        local_sum[offset++] = valid;
    }

    auto result = BlockReduce(temp_storage).Sum(local_sum);
    if (tid == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

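// Host-side driver: wraps the maps (typically (rows, cols, 3) Float32) in
// indexers, launches the kernel on the current CUDA stream, then decodes the
// 29-float reduction into a 6x6 system and solves for the 6-DoF update.
//
// A rough call sketch; the variable names and threshold values below are
// hypothetical, and all tensors are assumed to live on the same CUDA device:
//
//     core::Tensor delta;  // out: 6-element pose update
//     float inlier_residual;
//     int inlier_count;
//     ComputeOdometryResultPointToPlaneCUDA(
//             src_vertex_map, tgt_vertex_map, tgt_normal_map, intrinsics,
//             init_source_to_target, delta, inlier_residual, inlier_count,
//             /*depth_outlier_trunc=*/0.07f, /*depth_huber_delta=*/0.05f);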
void ComputeOdometryResultPointToPlaneCUDA(
        const core::Tensor& source_vertex_map,
        const core::Tensor& target_vertex_map,
        const core::Tensor& target_normal_map,
        const core::Tensor& intrinsics,
        const core::Tensor& init_source_to_target,
        core::Tensor& delta,
        float& inlier_residual,
        int& inlier_count,
        const float depth_outlier_trunc,
        const float depth_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_vertex_map.GetDevice());

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
    NDArrayIndexer target_normal_indexer(target_normal_map, 2);

    core::Device device = source_vertex_map.GetDevice();

    core::Tensor trans = init_source_to_target;
    TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((rows * cols + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultPointToPlaneCUDAKernel<<<blocks, threads, 0,
                                                  core::cuda::GetStream()>>>(
            source_vertex_indexer, target_vertex_indexer,
            target_normal_indexer, ti, global_sum_ptr, rows, cols,
            depth_outlier_trunc, depth_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}

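// One thread per source pixel: warps the source pixel into the target image,
// evaluates the photometric (intensity) residual and its Jacobian using the
// target intensity gradients, robustifies with a Huber kernel, and
// accumulates the packed JtJ/Jtr contribution.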
__global__ void ComputeOdometryResultIntensityCUDAKernel(
        NDArrayIndexer source_depth_indexer,
        NDArrayIndexer target_depth_indexer,
        NDArrayIndexer source_intensity_indexer,
        NDArrayIndexer target_intensity_indexer,
        NDArrayIndexer target_intensity_dx_indexer,
        NDArrayIndexer target_intensity_dy_indexer,
        NDArrayIndexer source_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float intensity_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J[6] = {0};
        float r = 0;
        bool valid = GetJacobianIntensity(
                x, y, depth_outlier_trunc, source_depth_indexer,
                target_depth_indexer, source_intensity_indexer,
                target_intensity_indexer, target_intensity_dx_indexer,
                target_intensity_dy_indexer, source_vertex_indexer, ti, J, r);

        float d_huber = HuberDeriv(r, intensity_huber_delta);
        float r_huber = HuberLoss(r, intensity_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] = valid ? J[i] * J[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] = valid ? J[i] * d_huber : 0;
        }
        local_sum[offset++] = valid ? r_huber : 0;
        local_sum[offset++] = valid;
    }

    auto result = BlockReduce(temp_storage).Sum(local_sum);
    if (tid == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

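// Host-side driver for the photometric term: wraps the depth, intensity,
// gradient, and vertex maps in indexers, launches the kernel, and solves the
// reduced 6x6 system for `delta`.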
void ComputeOdometryResultIntensityCUDA(
        const core::Tensor& source_depth,
        const core::Tensor& target_depth,
        const core::Tensor& source_intensity,
        const core::Tensor& target_intensity,
        const core::Tensor& target_intensity_dx,
        const core::Tensor& target_intensity_dy,
        const core::Tensor& source_vertex_map,
        const core::Tensor& intrinsics,
        const core::Tensor& init_source_to_target,
        core::Tensor& delta,
        float& inlier_residual,
        int& inlier_count,
        const float depth_outlier_trunc,
        const float intensity_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_depth.GetDevice());

    NDArrayIndexer source_depth_indexer(source_depth, 2);
    NDArrayIndexer target_depth_indexer(target_depth, 2);

    NDArrayIndexer source_intensity_indexer(source_intensity, 2);
    NDArrayIndexer target_intensity_indexer(target_intensity, 2);

    NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2);
    NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2);

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = init_source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultIntensityCUDAKernel<<<blocks, threads, 0,
                                               core::cuda::GetStream()>>>(
            source_depth_indexer, target_depth_indexer,
            source_intensity_indexer, target_intensity_indexer,
            target_intensity_dx_indexer, target_intensity_dy_indexer,
            source_vertex_indexer, ti, global_sum_ptr, rows, cols,
            depth_outlier_trunc, intensity_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}

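// One thread per source pixel: evaluates both the depth (geometric) and
// intensity (photometric) residuals for the hybrid term, robustifies each
// with its own Huber delta, and accumulates the summed JtJ/Jtr contributions.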
__global__ void ComputeOdometryResultHybridCUDAKernel(
        NDArrayIndexer source_depth_indexer,
        NDArrayIndexer target_depth_indexer,
        NDArrayIndexer source_intensity_indexer,
        NDArrayIndexer target_intensity_indexer,
        NDArrayIndexer target_depth_dx_indexer,
        NDArrayIndexer target_depth_dy_indexer,
        NDArrayIndexer target_intensity_dx_indexer,
        NDArrayIndexer target_intensity_dy_indexer,
        NDArrayIndexer source_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float depth_huber_delta,
        const float intensity_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J_I[6] = {0}, J_D[6] = {0};
        float r_I = 0, r_D = 0;
        bool valid = GetJacobianHybrid(
                x, y, depth_outlier_trunc, source_depth_indexer,
                target_depth_indexer, source_intensity_indexer,
                target_intensity_indexer, target_depth_dx_indexer,
                target_depth_dy_indexer, target_intensity_dx_indexer,
                target_intensity_dy_indexer, source_vertex_indexer, ti, J_I,
                J_D, r_I, r_D);

        float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
        float d_huber_I = HuberDeriv(r_I, intensity_huber_delta);

        float r_huber_D = HuberLoss(r_D, depth_huber_delta);
        float r_huber_I = HuberLoss(r_I, intensity_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] =
                        valid ? J_I[i] * J_I[j] + J_D[i] * J_D[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] =
                    valid ? J_I[i] * d_huber_I + J_D[i] * d_huber_D : 0;
        }
        local_sum[offset++] = valid ? r_huber_D + r_huber_I : 0;
        local_sum[offset++] = valid;
    }

    auto result = BlockReduce(temp_storage).Sum(local_sum);
    if (tid == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

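// Host-side driver for the hybrid (depth + intensity) term. Mirrors the
// intensity driver but additionally passes the target depth gradients and a
// separate Huber delta for the depth residual.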
void ComputeOdometryResultHybridCUDA(const core::Tensor& source_depth,
                                     const core::Tensor& target_depth,
                                     const core::Tensor& source_intensity,
                                     const core::Tensor& target_intensity,
                                     const core::Tensor& target_depth_dx,
                                     const core::Tensor& target_depth_dy,
                                     const core::Tensor& target_intensity_dx,
                                     const core::Tensor& target_intensity_dy,
                                     const core::Tensor& source_vertex_map,
                                     const core::Tensor& intrinsics,
                                     const core::Tensor& init_source_to_target,
                                     core::Tensor& delta,
                                     float& inlier_residual,
                                     int& inlier_count,
                                     const float depth_outlier_trunc,
                                     const float depth_huber_delta,
                                     const float intensity_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_depth.GetDevice());

    NDArrayIndexer source_depth_indexer(source_depth, 2);
    NDArrayIndexer target_depth_indexer(target_depth, 2);

    NDArrayIndexer source_intensity_indexer(source_intensity, 2);
    NDArrayIndexer target_intensity_indexer(target_intensity, 2);

    NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2);
    NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2);
    NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2);
    NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2);

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = init_source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultHybridCUDAKernel<<<blocks, threads, 0,
                                            core::cuda::GetStream()>>>(
            source_depth_indexer, target_depth_indexer,
            source_intensity_indexer, target_intensity_indexer,
            target_depth_dx_indexer, target_depth_dy_indexer,
            target_intensity_dx_indexer, target_intensity_dy_indexer,
            source_vertex_indexer, ti, global_sum_ptr, rows, cols,
            depth_outlier_trunc, depth_huber_delta, intensity_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}

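// One thread per source pixel: evaluates the point-to-point Jacobians of the
// x, y, and z residual components and accumulates only the 21
// lower-triangular JtJ entries; the remaining slots of the ReduceVec stay
// zero, so only kJtJDim entries are written back.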
__global__ void ComputeOdometryInformationMatrixCUDAKernel(
        NDArrayIndexer source_vertex_indexer,
        NDArrayIndexer target_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float square_dist_thr) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J_x[6], J_y[6], J_z[6];
        float rx = 0, ry = 0, rz = 0;
        bool valid = GetJacobianPointToPoint(
                x, y, square_dist_thr, source_vertex_indexer,
                target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz);

        // Dump the per-axis Jacobians into the lower triangle of JtJ
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset] = valid ? J_x[i] * J_x[j] : 0;
                local_sum[offset] += valid ? J_y[i] * J_y[j] : 0;
                local_sum[offset] += valid ? J_z[i] * J_z[j] : 0;
                offset++;
            }
        }
    }

    auto result = BlockReduce(temp_storage).Sum(local_sum);
    if (tid == 0) {
#pragma unroll
        for (int i = 0; i < kJtJDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}

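// Host-side driver: accumulates the 21 lower-triangular JtJ entries on the
// GPU, then expands them on the CPU into a symmetric 6x6 Float64 information
// matrix.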
void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_vertex_map,
                                          const core::Tensor& target_vertex_map,
                                          const core::Tensor& intrinsic,
                                          const core::Tensor& source_to_target,
                                          const float square_dist_thr,
                                          core::Tensor& information) {
    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsic, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kJtJDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryInformationMatrixCUDAKernel<<<blocks, threads, 0,
                                                 core::cuda::GetStream()>>>(
            source_vertex_indexer, target_vertex_indexer, ti, global_sum_ptr,
            rows, cols, square_dist_thr);
    core::cuda::Synchronize();

    // Expand the 21 lower-triangular entries into a symmetric 6x6 matrix.
    const core::Device host("CPU:0");
    information = core::Tensor::Empty({6, 6}, core::Float64, host);
    global_sum = global_sum.To(host, core::Float64);

    double* info_ptr = information.GetDataPtr<double>();
    double* reduction_ptr = global_sum.GetDataPtr<double>();
    for (int j = 0; j < 6; j++) {
        const int64_t reduction_idx = ((j * (j + 1)) / 2);
        for (int k = 0; k <= j; k++) {
            info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
            info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
        }
    }
}
}  // namespace odometry
}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace cloudViewer