1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
12 using namespace cloudViewer::utility;
14 namespace cloudViewer {
18 /// Kernel for ReduceSubarraysSumCUDA
///
/// Each thread computes one flat index i from its block and thread
/// coordinates and looks up the half-open element range
/// [row_splits[i], row_splits[i + 1]) for that index. Threads whose index is
/// not below \p num_arrays return immediately.
///
/// NOTE(review): this excerpt is missing lines. The template header, the
/// accumulator, the loop body and the store to \p out_sums are not visible
/// here. Presumably the loop adds values[j] to a running sum that is then
/// written to out_sums[i] -- confirm against the complete file.
20 __global__ void ReduceSubarraysSumCUDAKernel(
21 const T* const __restrict__ values,
22 const size_t values_size,
23 const int64_t* const __restrict__ row_splits,
24 const size_t num_arrays,
25 T* __restrict__ out_sums) {
// Flat 1D global thread index.
// NOTE(review): the right-hand side is evaluated in 32-bit unsigned
// arithmetic and then stored in an int, while num_arrays is a size_t.
// Verify that callers never need more than INT_MAX subarrays.
26 const int i = blockDim.x * blockIdx.x + threadIdx.x;
// Threads past the last subarray have nothing to do.
27 if (i >= num_arrays) return;
// row_splits[i] and row_splits[i + 1] delimit the elements of subarray i.
// The int64_t entries are converted to size_t here, so a negative entry
// would turn into a very large unsigned value.
29 size_t begin_idx = row_splits[i];
30 size_t end_idx = row_splits[i + 1];
// Walk the elements of subarray i. The loop does not execute when
// begin_idx == end_idx, i.e. for an empty subarray.
34 for (size_t j = begin_idx; j < end_idx; ++j) {
40 /// Reduces subarrays in linear memory with the sum operation.
41 /// The sum for empty subarrays is 0.
///
/// Host-side launcher: configures a 1D launch and enqueues
/// ReduceSubarraysSumCUDAKernel<T> on \p stream. No synchronization with the
/// stream is visible in this excerpt.
///
/// \param stream The CUDA stream on which the kernel is launched
43 /// \param values The linear array with all values
44 /// \param values_size Number of elements of \p values
45 /// \param row_splits Defines the start and end of each subarray. This is
46 /// an exclusive prefix sum with 0 as the first element
47 /// and the length of \p values as last element.
48 /// The size is \p num_arrays + 1
49 /// \param num_arrays The number of subarrays
50 /// \param out_sums The preallocated output array with size
/// NOTE(review): the size description above is cut off in this excerpt;
/// presumably the size is \p num_arrays (one sum per subarray) -- confirm.
53 void ReduceSubarraysSumCUDA(const cudaStream_t& stream,
54 const T* const values,
55 const size_t values_size,
56 const int64_t* const row_splits,
57 const size_t num_arrays,
// NOTE(review): the final parameter (out_sums, which is forwarded to the
// kernel below) and the opening brace are missing from this excerpt.
// 128 threads per block, laid out along x only.
59 const int BLOCKSIZE = 128;
60 dim3 block(BLOCKSIZE, 1, 1);
// Number of blocks along x. DivUp is defined outside this excerpt;
// presumably it is a ceiling division so that every subarray index gets a
// thread -- TODO confirm.
61 dim3 grid(DivUp(num_arrays, block.x));
// Launch on the caller's stream with no dynamic shared memory (third launch
// argument is 0). All pointers and sizes are forwarded unchanged.
// NOTE(review): lines are missing around this launch in the excerpt, and no
// launch-error check (e.g. cudaGetLastError) is visible here. Check the
// complete file for how num_arrays == 0 and launch failures are handled.
64 ReduceSubarraysSumCUDAKernel<T><<<grid, block, 0, stream>>>(
65 values, values_size, row_splits, num_arrays, out_sums);
71 } // namespace cloudViewer