1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
12 #include <cub/cub.cuh>
14 #include "ml/impl/misc/MemoryAllocation.h"
16 namespace cloudViewer {
22 /// Kernel for CountNeighborsCUDA
24 __global__ void CountNeighborsCUDAKernel(uint32_t* __restrict__ count,
26 const T* const __restrict__ indices,
27 size_t indices_size) {
28 const int i = blockDim.x * blockIdx.x + threadIdx.x;
29 if (i >= indices_size) return;
32 atomicAdd(&count[idx], 1);
35 /// Counts the number of neighbors per index value.
37 /// \param count Output array for counting.
39 /// \param count_size The size of count. This is the number of queries
40 /// with respect to the inverted neighbors list.
42 /// \param indices Array of indices. This is the nested list of neighbors.
44 /// \param indices_size Size of indices.
47 void CountNeighborsCUDA(const cudaStream_t& stream,
48 uint32_t* __restrict__ count,
50 const T* const __restrict__ indices,
51 size_t indices_size) {
52 using namespace cloudViewer::utility;
54 cudaMemsetAsync(count, 0, sizeof(uint32_t) * count_size, stream);
56 const int BLOCKSIZE = 128;
57 dim3 block(BLOCKSIZE, 1, 1);
59 grid.x = DivUp(indices_size, block.x);
62 CountNeighborsCUDAKernel<T><<<grid, block, 0, stream>>>(
63 count, count_size, indices, indices_size);
67 /// Kernel for FillNeighborsIndexAndAttributesCUDA.
68 template <class TIndex, class TAttr, bool FILL_ATTRIBUTES>
69 __global__ void FillNeighborsIndexAndAttributesCUDAKernel(
70 uint32_t* __restrict__ count,
72 TIndex* __restrict__ out_neighbors_index,
73 TAttr* __restrict__ out_neighbors_attributes,
74 const TIndex* const __restrict__ inp_neighbors_index,
75 const TAttr* const __restrict__ inp_neighbors_attributes,
76 int num_attributes_per_neighbor,
78 const int64_t* const __restrict__ inp_neighbors_row_splits,
79 size_t inp_num_queries,
80 const int64_t* const __restrict__ out_neighbors_row_splits,
81 size_t out_num_queries) {
82 const int i = blockDim.x * blockIdx.x + threadIdx.x;
83 if (i >= inp_num_queries) return;
87 size_t begin_idx = inp_neighbors_row_splits[i];
88 size_t end_idx = inp_neighbors_row_splits[i + 1];
90 for (size_t j = begin_idx; j < end_idx; ++j) {
91 TIndex neighbor_idx = inp_neighbors_index[j];
93 size_t list_offset = out_neighbors_row_splits[neighbor_idx];
94 size_t item_offset = atomicAdd(&count[neighbor_idx], 1);
95 out_neighbors_index[list_offset + item_offset] = query_idx;
97 if (FILL_ATTRIBUTES) {
99 out_neighbors_attributes +
100 num_attributes_per_neighbor * (list_offset + item_offset);
101 for (int attr_i = 0; attr_i < num_attributes_per_neighbor;
104 inp_neighbors_attributes[num_attributes_per_neighbor *
112 /// Fills in the indices and attributes for the inverted neighbors list
114 /// \param count Temporary array for index computation
116 /// \param count_size The size of count. This is the number of queries
117 /// with respect to the inverted neighbors list.
119 /// See InvertNeighborsListCUDA for the meaning of the remaining parameters.
121 template <class TIndex, class TAttr>
122 void FillNeighborsIndexAndAttributesCUDA(
123 const cudaStream_t& stream,
124 uint32_t* __restrict__ count,
126 TIndex* __restrict__ out_neighbors_index,
127 TAttr* __restrict__ out_neighbors_attributes,
128 const TIndex* const __restrict__ inp_neighbors_index,
129 const TAttr* const __restrict__ inp_neighbors_attributes,
130 const int num_attributes_per_neighbor,
132 const int64_t* const __restrict__ inp_neighbors_row_splits,
133 size_t inp_num_queries,
134 const int64_t* const __restrict__ out_neighbors_row_splits,
135 size_t out_num_queries) {
136 using namespace cloudViewer::utility;
138 cudaMemsetAsync(count, 0, sizeof(uint32_t) * count_size, stream);
140 const int BLOCKSIZE = 128;
141 dim3 block(BLOCKSIZE, 1, 1);
143 grid.x = DivUp(inp_num_queries, block.x);
146 if (inp_neighbors_attributes) {
147 FillNeighborsIndexAndAttributesCUDAKernel<TIndex, TAttr, true>
148 <<<grid, block, 0, stream>>>(
149 count, count_size, out_neighbors_index,
150 out_neighbors_attributes, inp_neighbors_index,
151 inp_neighbors_attributes,
152 num_attributes_per_neighbor, index_size,
153 inp_neighbors_row_splits, inp_num_queries,
154 out_neighbors_row_splits, out_num_queries);
156 FillNeighborsIndexAndAttributesCUDAKernel<TIndex, TAttr, false>
157 <<<grid, block, 0, stream>>>(
158 count, count_size, out_neighbors_index,
159 out_neighbors_attributes, inp_neighbors_index,
160 inp_neighbors_attributes,
161 num_attributes_per_neighbor, index_size,
162 inp_neighbors_row_splits, inp_num_queries,
163 out_neighbors_row_splits, out_num_queries);
170 /// Inverts a neighbors list, which is a tuple of the form
171 /// (neighbors_index, neighbors_row_splits, neighbors_attributes).
172 /// neighbors_index is a nested list of indices to the neighbors. Each entry
173 /// defines an edge between two indices (points).
174 /// The neighbors_row_splits defines the start and end of each sublist.
175 /// neighbors_attributes is an optional array of attributes for each entry in
178 /// Example: The neighbors for point cloud A (3 points) in point cloud B
179 /// (2 points) is defined by:
180 /// - neighbors_index [0 1 0 0]
181 /// - neighbors_row_splits [0 2 3 4]
182 /// - optional neighbors_attributes [0.1 0.2 0.3 0.4] (1 scalar attribute)
184 /// The inverted neighbors list is then the neighbors for point cloud B in A
185 /// - neighbors_index [0 1 2 0]
186 /// - neighbors_row_splits [0 3 4]
187 /// - optional neighbors_attributes [0.1 0.3 0.4 0.2]
190 /// All pointer arguments point to device memory unless stated otherwise.
192 /// \param temp Pointer to temporary memory. If nullptr then the required
193 /// size of temporary memory will be written to \p temp_size and no
196 /// \param temp_size The size of the temporary memory in bytes. This is
197 /// used as an output if temp is nullptr
199 /// \param texture_alignment The texture alignment in bytes. This is used
200 /// for allocating segments within the temporary memory.
202 /// \param inp_neighbors_index The nested list of neighbor indices.
204 /// \param inp_neighbors_attributes The array of attributes for each entry
205 /// in \p inp_neighbors_index. This is optional and can be set to null.
207 /// \param num_attributes_per_neighbor The number of scalar attributes for
208 /// each entry in \p inp_neighbors_index.
210 /// \param inp_neighbors_row_splits Defines the start and end of the
211 /// sublists in \p inp_neighbors_index. This is an exclusive prefix sum
212 /// with 0 as the first element and the length of
213 /// \p inp_neighbors_index as last element.
214 /// The size is \p inp_num_queries + 1
216 /// \param inp_num_queries The number of queries.
218 /// \param out_neighbors_index The inverted neighbors_index list with the
219 /// same size as \p inp_neighbors_index .
221 /// \param out_neighbors_attributes The inverted array of attributes with
222 /// the same size as \p inp_neighbors_attributes .
224 /// \param index_size This is the size of \p inp_neighbors_index and
225 /// \p out_neighbors_index, both have the same size.
227 /// \param out_neighbors_row_splits The prefix sum which defines the start
228 /// and end of the sublists in \p out_neighbors_index.
230 /// \param out_num_queries The number of queries with respect to the
231 /// inverted neighbors list.
233 template <class TIndex, class TAttr>
234 void InvertNeighborsListCUDA(const cudaStream_t& stream,
237 int texture_alignment,
238 const TIndex* const inp_neighbors_index,
239 const TAttr* const inp_neighbors_attributes,
240 const int num_attributes_per_neighbor,
241 const int64_t* const inp_neighbors_row_splits,
242 const size_t inp_num_queries,
243 TIndex* out_neighbors_index,
244 TAttr* out_neighbors_attributes,
245 const size_t index_size,
246 int64_t* out_neighbors_row_splits,
247 const size_t out_num_queries) {
248 using namespace cloudViewer::utility;
250 const bool get_temp_size = !temp;
253 temp = (char*)1; // worst case pointer alignment
254 temp_size = std::numeric_limits<int64_t>::max();
257 // Object for allocating memory within the temporary memory
258 MemoryAllocation mem_temp(temp, temp_size, texture_alignment);
260 // Reserve memory for counting the neighbors
261 std::pair<uint32_t*, size_t> tmp_neighbors_count =
262 mem_temp.Alloc<uint32_t>(out_num_queries);
264 if (!get_temp_size) {
265 CountNeighborsCUDA(stream, tmp_neighbors_count.first,
266 tmp_neighbors_count.second, inp_neighbors_index,
270 // compute prefix sum
272 std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);
273 cub::DeviceScan::InclusiveSum(
274 inclusive_scan_temp.first, inclusive_scan_temp.second,
275 tmp_neighbors_count.first, out_neighbors_row_splits + 1,
276 out_num_queries, stream);
278 inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
280 if (!get_temp_size) {
281 // set first element to zero
282 cudaMemsetAsync(out_neighbors_row_splits, 0, sizeof(int64_t),
284 cub::DeviceScan::InclusiveSum(
285 inclusive_scan_temp.first, inclusive_scan_temp.second,
286 tmp_neighbors_count.first, out_neighbors_row_splits + 1,
287 out_num_queries, stream);
290 mem_temp.Free(inclusive_scan_temp);
292 mem_temp.Free(tmp_neighbors_count);
295 // return the memory peak as the required temporary memory size.
296 temp_size = mem_temp.MaxUsed();
300 if (!get_temp_size) {
301 FillNeighborsIndexAndAttributesCUDA(
302 stream, tmp_neighbors_count.first, tmp_neighbors_count.second,
303 out_neighbors_index, out_neighbors_attributes,
304 inp_neighbors_index, inp_neighbors_attributes,
305 num_attributes_per_neighbor, index_size,
306 inp_neighbors_row_splits, inp_num_queries,
307 out_neighbors_row_splits, out_num_queries);
313 } // namespace cloudViewer