1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
9 #include "cloudViewer/core/CUDAUtils.h"
10 #include "cloudViewer/core/Dispatch.h"
11 #include "cloudViewer/core/Indexer.h"
12 #include "cloudViewer/core/ParallelFor.h"
13 #include "cloudViewer/core/Tensor.h"
14 #include "cloudViewer/t/geometry/kernel/GeometryMacros.h"
16 namespace cloudViewer {
// Runs `element_kernel` once per workload of an Indexer built over `dst` and
// `src`. Each invocation receives a pointer to one element of `src` and a
// pointer to the element of `dst` selected through the values stored in
// `index`. What the pair of pointers is used for is decided entirely by
// `element_kernel`.
//
// NOTE(review): `dim` is not read anywhere in the visible body, and the
// slice-size loop below starts at d = 1, so the code appears to always treat
// axis 0 as the indexed axis — confirm that callers only pass dim == 0.
20 template <typename func_t>
21 void LaunchIndexReductionKernel(int64_t dim,
26 const func_t& element_kernel) {
// Presumably a compile-time check that func_t is a host/device lambda —
// confirm against the macro definition.
27 CLOUDVIEWER_ASSERT_HOST_DEVICE_LAMBDA(func_t);
29 // index: [N,], src: [N, D], dst: [M, D]
30 // In Indexer, output shape defines the actual primary strides.
31 // However, in IndexAdd_, input dominates the iterations.
32 // So put dst (output) at indexer's input, and src (input) at output.
33 Indexer indexer({dst}, src, DtypePolicy::NONE);
35 // Index is simply a 1D contiguous tensor, with a different stride
36 // behavior to src. So use raw pointer for simplicity.
37 auto index_ptr = index.GetDataPtr<int64_t>();
// Product of every dimension of src except dimension 0, i.e. the number of
// elements that share a single entry of `index`.
39 int64_t broadcasting_elems = 1;
40 for (int64_t d = 1; d < src.NumDims(); ++d) {
41 broadcasting_elems *= src.GetShape(d);
// Per-workload body. Everything it uses (index_ptr, broadcasting_elems,
// indexer, element_kernel) is captured by value through [=].
43 auto element_func = [=] CLOUDVIEWER_HOST_DEVICE(int64_t workload_idx) {
// Split the flat workload index into (position in `index`, offset inside
// the slice).
// NOTE(review): both results are int64_t expressions stored in `int`; this
// looks like it could truncate once workload_idx / broadcasting_elems no
// longer fits in int — confirm the expected tensor sizes.
44 int reduction_idx = workload_idx / broadcasting_elems;
45 int broadcasting_idx = workload_idx % broadcasting_elems;
// Map to the flat destination element: slice `idx` of dst, same offset
// within the slice.
// NOTE(review): idx is used without a visible range check; presumably the
// caller validates `index` against dst's first dimension — confirm.
47 const int64_t idx = index_ptr[reduction_idx];
48 int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;
// src was registered as the indexer's output and dst as its input, so the
// source element is fetched with GetOutputPtr and the destination element
// with GetInputPtr.
50 void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
51 void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
52 // Note input and output is switched here to adapt to the indexer.
53 element_kernel(src_ptr, dst_ptr);
// Execute element_func for indexer.NumWorkloads() indices on `device`, then
// run the CUDA error-check macro with a message naming this function.
56 ParallelFor(device, indexer.NumWorkloads(), element_func);
57 CLOUDVIEWER_GET_LAST_CUDA_ERROR("LaunchIndexReductionKernel failed.");
// Element kernel for summation: reinterprets both untyped pointers as
// scalar_t and adds the value read from `src` into the location `dst` points
// to by calling atomicAdd.
//
// src: pointer to the value to add (read only).
// dst: pointer to the accumulator that is updated in place.
//
// NOTE(review): presumably the atomic is needed because several workloads can
// target the same dst element when `index` holds repeated values — confirm.
// NOTE(review): relies on an atomicAdd overload existing for every scalar_t
// this template is instantiated with — confirm for the dispatched dtypes and
// the targeted GPU architectures.
60 template <typename scalar_t>
61 static CLOUDVIEWER_HOST_DEVICE void CUDASumKernel(const void* src, void* dst) {
62 scalar_t* dst_s_ptr = static_cast<scalar_t*>(dst);
63 const scalar_t* src_s_ptr = static_cast<const scalar_t*>(src);
64 atomicAdd(dst_s_ptr, *src_s_ptr);
// Entry point for index-add: selects an element type from src.GetDtype() via
// DISPATCH_FLOAT_DTYPE_TO_TEMPLATE and launches LaunchIndexReductionKernel
// with a lambda that forwards each (src, dst) pointer pair to
// CUDASumKernel<scalar_t>.
//
// The kernel is launched on the device reported by src.GetDevice(); `dim`,
// `index`, `src` and `dst` are passed through unchanged.
// `scalar_t` is not declared in this function; presumably the dispatch macro
// introduces it — confirm against the macro definition.
67 void IndexAddCUDA_(int64_t dim,
71 DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
72 LaunchIndexReductionKernel(
73 dim, src.GetDevice(), index, src, dst,
// The inner lambda captures nothing ([]); it only names scalar_t.
74 [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
75 CUDASumKernel<scalar_t>(src, dst);
82 } // namespace cloudViewer