ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ReduceSubarraysSum.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <Helper.h>
11 
12 using namespace cloudViewer::utility;
13 
14 namespace cloudViewer {
15 namespace ml {
16 namespace impl {
17 
/// Kernel for ReduceSubarraysSumCUDA.
///
/// Each thread processes exactly one subarray: the thread with global index
/// \c i sequentially adds up the elements
/// values[row_splits[i]] ... values[row_splits[i + 1] - 1] and writes the
/// result to out_sums[i]. An empty subarray yields T(0).
///
/// The kernel only uses the x dimension of the launch configuration and
/// requires at least \p num_arrays threads in total; surplus threads exit
/// immediately.
///
/// \param values       The linear array with all values.
/// \param values_size  Number of elements of \p values. Not read by the
///                     kernel; kept for interface symmetry with the host
///                     wrapper.
/// \param row_splits   Start/end offsets of the subarrays; entry \c i and
///                     \c i + 1 are read for subarray \c i, so the array
///                     must hold \p num_arrays + 1 entries.
/// \param num_arrays   The number of subarrays.
/// \param out_sums     Output array with \p num_arrays entries.
template <class T>
__global__ void ReduceSubarraysSumCUDAKernel(
        const T* const __restrict__ values,
        const size_t values_size,
        const int64_t* const __restrict__ row_splits,
        const size_t num_arrays,
        T* __restrict__ out_sums) {
    // Build the global index in size_t. blockDim.x and blockIdx.x are 32-bit
    // unsigned values, so their product must be widened before multiplying;
    // otherwise it wraps for large grids and, when narrowed to int, turns
    // negative and is rejected by the bounds check below.
    const size_t i = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
    if (i >= num_arrays) return;

    const size_t begin_idx = row_splits[i];
    const size_t end_idx = row_splits[i + 1];

    T sum = T(0);

    for (size_t j = begin_idx; j < end_idx; ++j) {
        sum += values[j];
    }
    out_sums[i] = sum;
}
39 
/// Computes the sum of every subarray stored in a flat device array.
/// Subarrays without elements produce a sum of 0.
///
/// The work is enqueued on \p stream as a single kernel launch with one
/// thread per subarray; this function itself performs no synchronization.
/// Nothing is launched when the computed grid is empty.
///
/// \param stream       The CUDA stream the kernel is launched on.
/// \param values       The linear array with all values.
/// \param values_size  Number of elements of \p values.
/// \param row_splits   Defines the start and end of each subarray. This is
///                     an exclusive prefix sum with 0 as the first element
///                     and the length of \p values as last element.
///                     The size is \p num_arrays + 1.
/// \param num_arrays   The number of subarrays.
/// \param out_sums     The preallocated output array with size
///                     \p num_arrays.
template <class T>
void ReduceSubarraysSumCUDA(const cudaStream_t& stream,
                            const T* const values,
                            const size_t values_size,
                            const int64_t* const row_splits,
                            const size_t num_arrays,
                            T* out_sums) {
    constexpr int kThreadsPerBlock = 128;

    // One thread per subarray, rounded up to whole blocks.
    dim3 threads(kThreadsPerBlock, 1, 1);
    dim3 blocks(DivUp(num_arrays, threads.x));

    // A zero-sized grid is an invalid launch configuration, so skip it.
    if (blocks.x == 0) {
        return;
    }

    ReduceSubarraysSumCUDAKernel<T><<<blocks, threads, 0, stream>>>(
            values, values_size, row_splits, num_arrays, out_sums);
}
68 
69 } // namespace impl
70 } // namespace ml
71 } // namespace cloudViewer