ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
IndexReductionCPU.cpp
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include <Logging.h>
9 
13 
14 namespace cloudViewer {
15 namespace core {
16 namespace kernel {
17 
18 template <typename func_t>
19 void LaunchIndexReductionKernel(int64_t dim,
20  const Device& device,
21  const Tensor& index,
22  const Tensor& src,
23  Tensor& dst,
24  const func_t& element_kernel) {
25  // index: [N,], src: [N, D], dst: [M, D]
26  // In Indexer, output shape defines the actual primary strides.
27  // However, in IndexAdd_, input dominates the iterations.
28  // So put dst (output) at indexer's input, and src (input) at output.
29  Indexer indexer({dst}, src, DtypePolicy::NONE);
30 
31  // Index is simply a 1D contiguous tensor, with a different stride
32  // behavior to src. So use raw pointer for simplicity.
33  auto index_ptr = index.GetDataPtr<int64_t>();
34 
35  int64_t broadcasting_elems = 1;
36  for (int64_t d = 1; d < src.NumDims(); ++d) {
37  broadcasting_elems *= src.GetShape(d);
38  }
39  auto element_func = [=](int64_t workload_idx) {
40  int reduction_idx = workload_idx / broadcasting_elems;
41  int broadcasting_idx = workload_idx % broadcasting_elems;
42 
43  const int64_t idx = index_ptr[reduction_idx];
44  int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;
45 
46  void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
47  void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
48  // Note input and output is switched here to adapt to the indexer
49  element_kernel(src_ptr, dst_ptr);
50  };
51 
52  // TODO: check in detail
53  // No OpenMP could be faster, otherwise there would be thousands of atomics.
54  for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
55  element_func(d);
56  }
57 }
58 
59 template <typename scalar_t>
60 static CLOUDVIEWER_HOST_DEVICE void CPUSumKernel(const void* src, void* dst) {
61  scalar_t* dst_s_ptr = static_cast<scalar_t*>(dst);
62  const scalar_t* src_s_ptr = static_cast<const scalar_t*>(src);
63  *dst_s_ptr += *src_s_ptr;
64 }
65 
// CPU entry point for the index-add operation: forwards dim, src's device,
// index, src and dst to LaunchIndexReductionKernel, using a per-element
// callback that calls CPUSumKernel<scalar_t>(src, dst), i.e. `*dst += *src`.
//
// NOTE(review): this listing skips a line between the signature and the
// LaunchIndexReductionKernel call (numbering jumps from 69 to 71). `scalar_t`
// is not declared anywhere in the visible lines and the trailing `});` has no
// visible opener. Presumably the missing line opens a dtype-dispatch macro
// (the cross references below this listing mention
// DISPATCH_FLOAT_DTYPE_TO_TEMPLATE and Tensor::GetDtype) that defines
// scalar_t -- confirm against the original file before editing this function.
66 void IndexAddCPU_(int64_t dim,
67  const Tensor& index,
68  const Tensor& src,
69  Tensor& dst) {
71  LaunchIndexReductionKernel(dim, src.GetDevice(), index, src, dst,
// The callback's src/dst parameters are raw element pointers and shadow the
// Tensor parameters of the same name inside the lambda body.
72  [](const void* src, void* dst) {
73  CPUSumKernel<scalar_t>(src, dst);
74  });
75  });
76 }
77 
78 } // namespace kernel
79 } // namespace core
80 } // namespace cloudViewer
Indexer indexer
#define CLOUDVIEWER_HOST_DEVICE
Definition: CUDAUtils.h:44
#define DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(DTYPE,...)
Definition: Dispatch.h:78
int64_t NumDims() const
Definition: Tensor.h:1172
Dtype GetDtype() const
Definition: Tensor.h:1164
SizeVector GetShape() const
Definition: Tensor.h:1127
static CLOUDVIEWER_HOST_DEVICE void CPUSumKernel(const void *src, void *dst)
void LaunchIndexReductionKernel(int64_t dim, const Device &device, const Tensor &index, const Tensor &src, Tensor &dst, const func_t &element_kernel)
void IndexAddCPU_(int64_t dim, const Tensor &index, const Tensor &src, Tensor &dst)
Generic file read and write utility for python interface.