// ----------------------------------------------------------------------------
// -                   CloudViewer: www.cloudViewer.org                        -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#include <cub/cub.cuh>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/t/geometry/kernel/GeometryIndexer.h"
#include "cloudViewer/t/geometry/kernel/GeometryMacros.h"
#include "cloudViewer/t/pipelines/kernel/RGBDOdometryImpl.h"
#include "cloudViewer/t/pipelines/kernel/RGBDOdometryJacobianImpl.h"
#include "cloudViewer/t/pipelines/kernel/TransformationConverter.h"
#include "cloudViewer/utility/MiniVec.h"
#include "core/Dispatch.h"
#include "core/ParallelFor.h"
#include "core/Tensor.h"
namespace cloudViewer {
namespace t {
namespace pipelines {
namespace kernel {
namespace odometry {
const int kBlockSize = 256;
const int kJtJDim = 21;
const int kJtrDim = 6;
const int kReduceDim =
        kJtJDim + kJtrDim + 1 + 1;  // 21 (JtJ) + 6 (Jtr) + 1 (inlier) + 1 (r)
typedef utility::MiniVec<float, kReduceDim> ReduceVec;
typedef cub::BlockReduce<ReduceVec, kBlockSize> BlockReduce;
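
// Per-thread accumulator layout: the 21 lower-triangular entries of JtJ, the
// 6 entries of Jtr, the robust loss, and an inlier flag (29 floats total).
// Each block reduces its threads' vectors with cub::BlockReduce, and thread 0
// merges the block's partial sum into the global accumulator via atomicAdd.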
__global__ void ComputeOdometryResultPointToPlaneCUDAKernel(
        NDArrayIndexer source_vertex_indexer,
        NDArrayIndexer target_vertex_indexer,
        NDArrayIndexer target_normal_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float depth_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J[6] = {0}, r = 0;

        bool valid = GetJacobianPointToPlane(
                x, y, depth_outlier_trunc, source_vertex_indexer,
                target_vertex_indexer, target_normal_indexer, ti, J, r);

        // Huber-robustified residual derivative and loss.
        float d_huber = HuberDeriv(r, depth_huber_delta);
        float r_huber = HuberLoss(r, depth_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] = valid ? J[i] * J[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] = valid ? J[i] * d_huber : 0;
        }
        local_sum[offset++] = valid ? r_huber : 0;
        local_sum[offset++] = valid;
    }
    // Reduction
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    if (tid == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
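
// Point-to-plane odometry: the kernel above accumulates JtJ/Jtr over all
// valid pixels; DecodeAndSolve6x6 then solves the 6x6 normal equations for
// the 6-DoF pose update `delta`, the inlier residual, and the inlier count.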
void ComputeOdometryResultPointToPlaneCUDA(
        const core::Tensor& source_vertex_map,
        const core::Tensor& target_vertex_map,
        const core::Tensor& target_normal_map,
        const core::Tensor& intrinsics,
        const core::Tensor& init_source_to_target,
        core::Tensor& delta,
        float& inlier_residual,
        int& inlier_count,
        const float depth_outlier_trunc,
        const float depth_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_vertex_map.GetDevice());

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
    NDArrayIndexer target_normal_indexer(target_normal_map, 2);

    core::Device device = source_vertex_map.GetDevice();

    core::Tensor trans = init_source_to_target;
    TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((rows * cols + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultPointToPlaneCUDAKernel<<<blocks, threads, 0,
                                                  core::cuda::GetStream()>>>(
            source_vertex_indexer, target_vertex_indexer, target_normal_indexer,
            ti, global_sum_ptr, rows, cols, depth_outlier_trunc,
            depth_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}
__global__ void ComputeOdometryResultIntensityCUDAKernel(
        NDArrayIndexer source_depth_indexer,
        NDArrayIndexer target_depth_indexer,
        NDArrayIndexer source_intensity_indexer,
        NDArrayIndexer target_intensity_indexer,
        NDArrayIndexer target_intensity_dx_indexer,
        NDArrayIndexer target_intensity_dy_indexer,
        NDArrayIndexer source_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float intensity_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J[6] = {0}, r = 0;

        bool valid = GetJacobianIntensity(
                x, y, depth_outlier_trunc, source_depth_indexer,
                target_depth_indexer, source_intensity_indexer,
                target_intensity_indexer, target_intensity_dx_indexer,
                target_intensity_dy_indexer, source_vertex_indexer, ti, J, r);

        // Huber-robustified residual derivative and loss.
        float d_huber = HuberDeriv(r, intensity_huber_delta);
        float r_huber = HuberLoss(r, intensity_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] = valid ? J[i] * J[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] = valid ? J[i] * d_huber : 0;
        }
        local_sum[offset++] = valid ? r_huber : 0;
        local_sum[offset++] = valid;
    }
    // Reduction
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    if (tid == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
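
// Intensity (photometric) odometry: per-pixel intensity residuals,
// robustified with a Huber loss, are accumulated into the same reduction
// layout and solved with DecodeAndSolve6x6.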
void ComputeOdometryResultIntensityCUDA(
        const core::Tensor& source_depth,
        const core::Tensor& target_depth,
        const core::Tensor& source_intensity,
        const core::Tensor& target_intensity,
        const core::Tensor& target_intensity_dx,
        const core::Tensor& target_intensity_dy,
        const core::Tensor& source_vertex_map,
        const core::Tensor& intrinsics,
        const core::Tensor& init_source_to_target,
        core::Tensor& delta,
        float& inlier_residual,
        int& inlier_count,
        const float depth_outlier_trunc,
        const float intensity_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_depth.GetDevice());

    NDArrayIndexer source_depth_indexer(source_depth, 2);
    NDArrayIndexer target_depth_indexer(target_depth, 2);

    NDArrayIndexer source_intensity_indexer(source_intensity, 2);
    NDArrayIndexer target_intensity_indexer(target_intensity, 2);

    NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2);
    NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2);

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = init_source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultIntensityCUDAKernel<<<blocks, threads, 0,
                                               core::cuda::GetStream()>>>(
            source_depth_indexer, target_depth_indexer,
            source_intensity_indexer, target_intensity_indexer,
            target_intensity_dx_indexer, target_intensity_dy_indexer,
            source_vertex_indexer, ti, global_sum_ptr, rows, cols,
            depth_outlier_trunc, intensity_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}
__global__ void ComputeOdometryResultHybridCUDAKernel(
        NDArrayIndexer source_depth_indexer,
        NDArrayIndexer target_depth_indexer,
        NDArrayIndexer source_intensity_indexer,
        NDArrayIndexer target_intensity_indexer,
        NDArrayIndexer target_depth_dx_indexer,
        NDArrayIndexer target_depth_dy_indexer,
        NDArrayIndexer target_intensity_dx_indexer,
        NDArrayIndexer target_intensity_dy_indexer,
        NDArrayIndexer source_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float depth_outlier_trunc,
        const float depth_huber_delta,
        const float intensity_huber_delta) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J_I[6] = {0}, J_D[6] = {0};
        float r_I = 0, r_D = 0;
        bool valid = GetJacobianHybrid(
                x, y, depth_outlier_trunc, source_depth_indexer,
                target_depth_indexer, source_intensity_indexer,
                target_intensity_indexer, target_depth_dx_indexer,
                target_depth_dy_indexer, target_intensity_dx_indexer,
                target_intensity_dy_indexer, source_vertex_indexer, ti, J_I,
                J_D, r_I, r_D);

        // Huber-robustified residual derivatives and losses for the depth
        // (D) and intensity (I) terms.
        float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
        float d_huber_I = HuberDeriv(r_I, intensity_huber_delta);

        float r_huber_D = HuberLoss(r_D, depth_huber_delta);
        float r_huber_I = HuberLoss(r_I, intensity_huber_delta);

        // Dump J, r into JtJ and Jtr
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset++] =
                        valid ? J_I[i] * J_I[j] + J_D[i] * J_D[j] : 0;
            }
        }
        for (int i = 0; i < 6; ++i) {
            local_sum[offset++] =
                    valid ? J_I[i] * d_huber_I + J_D[i] * d_huber_D : 0;
        }
        local_sum[offset++] = valid ? r_huber_D + r_huber_I : 0;
        local_sum[offset++] = valid;
    }
    // Reduction
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    if (tid == 0) {
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
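
// Hybrid odometry: each pixel contributes a depth term and an intensity term
// to the same 6x6 system, each robustified with its own Huber delta.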
void ComputeOdometryResultHybridCUDA(const core::Tensor& source_depth,
                                     const core::Tensor& target_depth,
                                     const core::Tensor& source_intensity,
                                     const core::Tensor& target_intensity,
                                     const core::Tensor& target_depth_dx,
                                     const core::Tensor& target_depth_dy,
                                     const core::Tensor& target_intensity_dx,
                                     const core::Tensor& target_intensity_dy,
                                     const core::Tensor& source_vertex_map,
                                     const core::Tensor& intrinsics,
                                     const core::Tensor& init_source_to_target,
                                     core::Tensor& delta,
                                     float& inlier_residual,
                                     int& inlier_count,
                                     const float depth_outlier_trunc,
                                     const float depth_huber_delta,
                                     const float intensity_huber_delta) {
    core::CUDAScopedDevice scoped_device(source_depth.GetDevice());

    NDArrayIndexer source_depth_indexer(source_depth, 2);
    NDArrayIndexer target_depth_indexer(target_depth, 2);

    NDArrayIndexer source_intensity_indexer(source_intensity, 2);
    NDArrayIndexer target_intensity_indexer(target_intensity, 2);

    NDArrayIndexer target_depth_dx_indexer(target_depth_dx, 2);
    NDArrayIndexer target_depth_dy_indexer(target_depth_dy, 2);
    NDArrayIndexer target_intensity_dx_indexer(target_intensity_dx, 2);
    NDArrayIndexer target_intensity_dy_indexer(target_intensity_dy, 2);

    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = init_source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kReduceDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryResultHybridCUDAKernel<<<blocks, threads, 0,
                                            core::cuda::GetStream()>>>(
            source_depth_indexer, target_depth_indexer,
            source_intensity_indexer, target_intensity_indexer,
            target_depth_dx_indexer, target_depth_dy_indexer,
            target_intensity_dx_indexer, target_intensity_dy_indexer,
            source_vertex_indexer, ti, global_sum_ptr, rows, cols,
            depth_outlier_trunc, depth_huber_delta, intensity_huber_delta);
    core::cuda::Synchronize();
    DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}
__global__ void ComputeOdometryInformationMatrixCUDAKernel(
        NDArrayIndexer source_vertex_indexer,
        NDArrayIndexer target_vertex_indexer,
        TransformIndexer ti,
        float* global_sum,
        int rows,
        int cols,
        const float square_dist_thr) {
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
    int y = workload / cols;
    int x = workload % cols;
    const int tid = threadIdx.x;

    ReduceVec local_sum(0.0f);
    if (workload < rows * cols) {
        float J_x[6], J_y[6], J_z[6];
        float rx = 0, ry = 0, rz = 0;
        bool valid = GetJacobianPointToPoint(
                x, y, square_dist_thr, source_vertex_indexer,
                target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz);

        // Dump J into JtJ (only the information matrix is needed here).
        int offset = 0;
        for (int i = 0; i < 6; ++i) {
            for (int j = 0; j <= i; ++j) {
                local_sum[offset] = valid ? J_x[i] * J_x[j] : 0;
                local_sum[offset] += valid ? J_y[i] * J_y[j] : 0;
                local_sum[offset] += valid ? J_z[i] * J_z[j] : 0;
                offset++;
            }
        }
    }
    // Reduction
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    if (tid == 0) {
        for (int i = 0; i < kJtJDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
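
// Accumulates only the lower-triangular JtJ (no Jtr) of the point-to-point
// residuals; the host code below unpacks it into the symmetric 6x6
// information matrix.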
void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_vertex_map,
                                          const core::Tensor& target_vertex_map,
                                          const core::Tensor& intrinsic,
                                          const core::Tensor& source_to_target,
                                          const float square_dist_thr,
                                          core::Tensor& information) {
    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);

    core::Device device = source_vertex_map.GetDevice();
    core::Tensor trans = source_to_target;
    t::geometry::kernel::TransformIndexer ti(intrinsic, trans);

    const int64_t rows = source_vertex_indexer.GetShape(0);
    const int64_t cols = source_vertex_indexer.GetShape(1);

    core::Tensor global_sum =
            core::Tensor::Zeros({kJtJDim}, core::Float32, device);
    float* global_sum_ptr = global_sum.GetDataPtr<float>();

    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
    const dim3 threads(kBlockSize);
    ComputeOdometryInformationMatrixCUDAKernel<<<blocks, threads, 0,
                                                 core::cuda::GetStream()>>>(
            source_vertex_indexer, target_vertex_indexer, ti, global_sum_ptr,
            rows, cols, square_dist_thr);
    core::cuda::Synchronize();
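
    // Unpack the 21 lower-triangular entries (row-major; row j starts at
    // offset j * (j + 1) / 2) into a symmetric 6x6 matrix on the host.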
    const core::Device host(core::Device("CPU:0"));
    information = core::Tensor::Empty({6, 6}, core::Float64, host);
    global_sum = global_sum.To(host, core::Float64);

    double* info_ptr = information.GetDataPtr<double>();
    double* reduction_ptr = global_sum.GetDataPtr<double>();
    for (int j = 0; j < 6; j++) {
        const int64_t reduction_idx = ((j * (j + 1)) / 2);
        for (int k = 0; k <= j; k++) {
            info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
            info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
        }
    }
}
}  // namespace odometry
}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace cloudViewer