ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
IndexReductionCUDA.cu
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 #include <type_traits>
8 
9 #include "cloudViewer/core/CUDAUtils.h"
10 #include "cloudViewer/core/Dispatch.h"
11 #include "cloudViewer/core/Indexer.h"
12 #include "cloudViewer/core/ParallelFor.h"
13 #include "cloudViewer/core/Tensor.h"
14 #include "cloudViewer/t/geometry/kernel/GeometryMacros.h"
15 
16 namespace cloudViewer {
17 namespace core {
18 namespace kernel {
19 
// Runs `element_kernel(src_elem_ptr, dst_elem_ptr)` once for every element of
// `src`, where the destination element is selected through `index`.
//
// Layout handled by this function (see the index arithmetic below):
//   index: [N]     int64 entries, read through a raw pointer
//   src:   [N, D...] every trailing dimension is flattened into
//                    `broadcasting_elems`
//   dst:   [M, D...] row `index[i]` receives the contribution of src row `i`
//
// Parameters:
//   dim            - reduction dimension requested by the caller. It is not
//                    used here: the arithmetic below always treats dimension 0
//                    as the reduction axis.
//                    NOTE(review): presumably the caller permutes src/dst so
//                    that `dim` becomes dimension 0 - confirm at the call site.
//   device         - device forwarded to ParallelFor.
//   index          - 1D index tensor.
//                    NOTE(review): accessed as a dense int64 array, so it is
//                    assumed to be contiguous and to hold values in [0, M) -
//                    neither is validated here.
//   src            - source tensor; its element count defines the workload.
//   dst            - destination tensor, modified in place by element_kernel.
//   element_kernel - host/device lambda `(const void* src, void* dst)` that
//                    combines one source element into one destination element.
//                    Several workloads may target the same dst element, so the
//                    lambda is responsible for doing the update atomically.
template <typename func_t>
void LaunchIndexReductionKernel(int64_t dim,
                                const Device& device,
                                const Tensor& index,
                                const Tensor& src,
                                Tensor& dst,
                                const func_t& element_kernel) {
    CLOUDVIEWER_ASSERT_HOST_DEVICE_LAMBDA(func_t);

    // Kept for interface compatibility; the reduction axis is always 0 here.
    (void)dim;

    // index: [N,], src: [N, D], dst: [M, D]
    // In Indexer, output shape defines the actual primary strides.
    // However, in IndexAdd_, input dominates the iterations.
    // So put dst (output) at indexer's input, and src (input) at output.
    Indexer indexer({dst}, src, DtypePolicy::NONE);

    // Index is simply a 1D contiguous tensor, with a different stride
    // behavior to src. So use raw pointer for simplicity.
    auto index_ptr = index.GetDataPtr<int64_t>();

    // Number of elements in one "row" of src, i.e. the product of all
    // dimensions except the leading (reduction) one.
    int64_t broadcasting_elems = 1;
    for (int64_t d = 1; d < src.NumDims(); ++d) {
        broadcasting_elems *= src.GetShape(d);
    }

    auto element_func = [=] CLOUDVIEWER_HOST_DEVICE(int64_t workload_idx) {
        // Keep everything in 64 bit: narrowing these to `int` would wrap
        // around for tensors holding 2^31 or more elements and address the
        // wrong index entry / destination element.
        const int64_t reduction_idx = workload_idx / broadcasting_elems;
        const int64_t broadcasting_idx = workload_idx % broadcasting_elems;

        // Row of dst that receives this src row, then the flat workload index
        // of the matching element inside that row.
        const int64_t idx = index_ptr[reduction_idx];
        const int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;

        void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
        void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
        // Note input and output is switched here to adapt to the indexer.
        element_kernel(src_ptr, dst_ptr);
    };

    ParallelFor(device, indexer.NumWorkloads(), element_func);
    CLOUDVIEWER_GET_LAST_CUDA_ERROR("LaunchIndexReductionKernel failed.");
}
59 
// Adds the scalar_t value stored at `src` onto the scalar_t value stored at
// `dst` using atomicAdd, so that concurrent contributions to the same
// destination element are not lost.
//
// Both pointers are type-erased and reinterpreted as scalar_t here.
// NOTE(review): atomicAdd on double requires compute capability 6.0+ - confirm
// against the architectures this file is built for.
template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDASumKernel(const void* src, void* dst) {
    const scalar_t* addend = static_cast<const scalar_t*>(src);
    atomicAdd(static_cast<scalar_t*>(dst), *addend);
}
66 
// CUDA entry point for in-place index-add: every element of `src` is summed
// into the element of `dst` selected through `index`.
//
// The element type is taken from `src` and dispatched over the floating-point
// dtypes supported by DISPATCH_FLOAT_DTYPE_TO_TEMPLATE; the kernel is launched
// on the device that holds `src`. All arguments are forwarded unchanged to
// LaunchIndexReductionKernel.
void IndexAddCUDA_(int64_t dim,
                   const Tensor& index,
                   const Tensor& src,
                   Tensor& dst) {
    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
        // Stateless per-element accumulator: dst_elem += src_elem (atomic).
        auto accumulate = [] CLOUDVIEWER_HOST_DEVICE(const void* src_elem,
                                                     void* dst_elem) {
            CUDASumKernel<scalar_t>(src_elem, dst_elem);
        };
        LaunchIndexReductionKernel(dim, src.GetDevice(), index, src, dst,
                                   accumulate);
    });
}
79 
80 } // namespace kernel
81 } // namespace core
82 } // namespace cloudViewer