ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
InvertNeighborsList.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <Helper.h>
11 
12 #include <cub/cub.cuh>
13 
14 #include "ml/impl/misc/MemoryAllocation.h"
15 
16 namespace cloudViewer {
17 namespace ml {
18 namespace impl {
19 
20 namespace {
21 
22 /// Kernel for CountNeighborsCUDA
23 template <class T>
24 __global__ void CountNeighborsCUDAKernel(uint32_t* __restrict__ count,
25  size_t count_size,
26  const T* const __restrict__ indices,
27  size_t indices_size) {
28  const int i = blockDim.x * blockIdx.x + threadIdx.x;
29  if (i >= indices_size) return;
30 
31  T idx = indices[i];
32  atomicAdd(&count[idx], 1);
33 }
34 
35 /// Counts the number of neighbors per index value.
36 ///
37 /// \param count Output array for counting.
38 ///
39 /// \param count_size The size of count. This is the number of queries
40 /// with respect to the inverted neighbors list.
41 ///
42 /// \param indices Array of indices. This is the nested list of neighbors.
43 ///
44 /// \param indices_size Size of indices.
45 ///
46 template <class T>
47 void CountNeighborsCUDA(const cudaStream_t& stream,
48  uint32_t* __restrict__ count,
49  size_t count_size,
50  const T* const __restrict__ indices,
51  size_t indices_size) {
52  using namespace cloudViewer::utility;
53 
54  cudaMemsetAsync(count, 0, sizeof(uint32_t) * count_size, stream);
55 
56  const int BLOCKSIZE = 128;
57  dim3 block(BLOCKSIZE, 1, 1);
58  dim3 grid(0, 1, 1);
59  grid.x = DivUp(indices_size, block.x);
60 
61  if (grid.x) {
62  CountNeighborsCUDAKernel<T><<<grid, block, 0, stream>>>(
63  count, count_size, indices, indices_size);
64  }
65 }
66 
67 /// Kernel for FillNeighborsIndexAndAttributesCUDA.
68 template <class TIndex, class TAttr, bool FILL_ATTRIBUTES>
69 __global__ void FillNeighborsIndexAndAttributesCUDAKernel(
70  uint32_t* __restrict__ count,
71  size_t count_size,
72  TIndex* __restrict__ out_neighbors_index,
73  TAttr* __restrict__ out_neighbors_attributes,
74  const TIndex* const __restrict__ inp_neighbors_index,
75  const TAttr* const __restrict__ inp_neighbors_attributes,
76  int num_attributes_per_neighbor,
77  size_t indices_size,
78  const int64_t* const __restrict__ inp_neighbors_row_splits,
79  size_t inp_num_queries,
80  const int64_t* const __restrict__ out_neighbors_row_splits,
81  size_t out_num_queries) {
82  const int i = blockDim.x * blockIdx.x + threadIdx.x;
83  if (i >= inp_num_queries) return;
84 
85  TIndex query_idx = i;
86 
87  size_t begin_idx = inp_neighbors_row_splits[i];
88  size_t end_idx = inp_neighbors_row_splits[i + 1];
89 
90  for (size_t j = begin_idx; j < end_idx; ++j) {
91  TIndex neighbor_idx = inp_neighbors_index[j];
92 
93  size_t list_offset = out_neighbors_row_splits[neighbor_idx];
94  size_t item_offset = atomicAdd(&count[neighbor_idx], 1);
95  out_neighbors_index[list_offset + item_offset] = query_idx;
96 
97  if (FILL_ATTRIBUTES) {
98  TAttr* attr_ptr =
99  out_neighbors_attributes +
100  num_attributes_per_neighbor * (list_offset + item_offset);
101  for (int attr_i = 0; attr_i < num_attributes_per_neighbor;
102  ++attr_i) {
103  attr_ptr[attr_i] =
104  inp_neighbors_attributes[num_attributes_per_neighbor *
105  j +
106  attr_i];
107  }
108  }
109  }
110 }
111 
112 /// Fills in the indices and attributes for the inverted neighbors list
113 ///
114 /// \param count Temporary array for index computation
115 ///
116 /// \param count_size The size of count. This is the number of queries
117 /// with respect to the inverted neighbors list.
118 ///
119 /// See InvertNeighborsListCUDA for the meaning of the remaining parameters.
120 ///
121 template <class TIndex, class TAttr>
122 void FillNeighborsIndexAndAttributesCUDA(
123  const cudaStream_t& stream,
124  uint32_t* __restrict__ count,
125  size_t count_size,
126  TIndex* __restrict__ out_neighbors_index,
127  TAttr* __restrict__ out_neighbors_attributes,
128  const TIndex* const __restrict__ inp_neighbors_index,
129  const TAttr* const __restrict__ inp_neighbors_attributes,
130  const int num_attributes_per_neighbor,
131  size_t index_size,
132  const int64_t* const __restrict__ inp_neighbors_row_splits,
133  size_t inp_num_queries,
134  const int64_t* const __restrict__ out_neighbors_row_splits,
135  size_t out_num_queries) {
136  using namespace cloudViewer::utility;
137 
138  cudaMemsetAsync(count, 0, sizeof(uint32_t) * count_size, stream);
139 
140  const int BLOCKSIZE = 128;
141  dim3 block(BLOCKSIZE, 1, 1);
142  dim3 grid(0, 1, 1);
143  grid.x = DivUp(inp_num_queries, block.x);
144 
145  if (grid.x) {
146  if (inp_neighbors_attributes) {
147  FillNeighborsIndexAndAttributesCUDAKernel<TIndex, TAttr, true>
148  <<<grid, block, 0, stream>>>(
149  count, count_size, out_neighbors_index,
150  out_neighbors_attributes, inp_neighbors_index,
151  inp_neighbors_attributes,
152  num_attributes_per_neighbor, index_size,
153  inp_neighbors_row_splits, inp_num_queries,
154  out_neighbors_row_splits, out_num_queries);
155  } else {
156  FillNeighborsIndexAndAttributesCUDAKernel<TIndex, TAttr, false>
157  <<<grid, block, 0, stream>>>(
158  count, count_size, out_neighbors_index,
159  out_neighbors_attributes, inp_neighbors_index,
160  inp_neighbors_attributes,
161  num_attributes_per_neighbor, index_size,
162  inp_neighbors_row_splits, inp_num_queries,
163  out_neighbors_row_splits, out_num_queries);
164  }
165  }
166 }
167 
168 } // namespace
169 
170 /// Inverts a neighbors list, which is a tuple of the form
171 /// (neighbors_index, neighbors_row_splits, neighbors_attributes).
172 /// neighbors_index is a nested list of indices to the neighbors. Each entry
173 /// defines an edge between two indices (points).
174 /// The neighbors_row_splits defines the start and end of each sublist.
175 /// neighbors_attributes is an optional array of attributes for each entry in
176 /// neighbors_index.
177 ///
178 /// Example: The neighbors for point cloud A (3 points) in point cloud B
179 /// (2 points) is defined by:
180 /// - neighbors_index [0 1 0 0]
181 /// - neighbors_row_splits [0 2 3 4]
182 /// - optional neighbors_attributes [0.1 0.2 0.3 0.4] (1 scalar attribute)
183 ///
184 /// The inverted neighbors list is then the neighbors for point cloud B in A
185 /// - neighbors_index [0 1 2 0]
186 /// - neighbors_row_splits [0 3 4]
187 /// - optional neighbors_attributes [0.1 0.3 0.4 0.2]
188 ///
189 ///
190 /// All pointer arguments point to device memory unless stated otherwise.
191 ///
192 /// \param temp Pointer to temporary memory. If nullptr then the required
193 /// size of temporary memory will be written to \p temp_size and no
194 /// work is done.
195 ///
196 /// \param temp_size The size of the temporary memory in bytes. This is
197 /// used as an output if temp is nullptr
198 ///
199 /// \param texture_alignment The texture alignment in bytes. This is used
200 /// for allocating segments within the temporary memory.
201 ///
202 /// \param inp_neighbors_index The nested list of neighbor indices.
203 ///
204 /// \param inp_neighbors_attributes The array of attributes for each entry
205 /// in \p inp_neighbors_index. This is optional and can be set to null.
206 ///
207 /// \param num_attributes_per_neighbor The number of scalar attributes for
208 /// each entry in \p inp_neighbors_index.
209 ///
210 /// \param inp_neighbors_row_splits Defines the start and end of the
211 /// sublists in \p inp_neighbors_index. This is an exclusive prefix sum
212 /// with 0 as the first element and the length of
213 /// \p inp_neighbors_index as last element.
214 /// The size is \p inp_num_queries + 1
215 ///
216 /// \param inp_num_queries The number of queries.
217 ///
218 /// \param out_neighbors_index The inverted neighbors_index list with the
219 /// same size as \p inp_neighbors_index .
220 ///
221 /// \param out_neighbors_attributes The inverted array of attributes with
222 /// the same size as \p inp_neighbors_attributes .
223 ///
224 /// \param index_size This is the size of \p inp_neighbors_index and
225 /// \p out_neighbors_index, both have the same size.
226 ///
227 /// \param out_neighbors_row_splits The prefix sum which defines the start
228 /// and end of the sublists in \p out_neighbors_index.
229 ///
230 /// \param out_num_queries The number of queries with respect to the
231 /// inverted neighbors list.
232 ///
233 template <class TIndex, class TAttr>
234 void InvertNeighborsListCUDA(const cudaStream_t& stream,
235  void* temp,
236  size_t& temp_size,
237  int texture_alignment,
238  const TIndex* const inp_neighbors_index,
239  const TAttr* const inp_neighbors_attributes,
240  const int num_attributes_per_neighbor,
241  const int64_t* const inp_neighbors_row_splits,
242  const size_t inp_num_queries,
243  TIndex* out_neighbors_index,
244  TAttr* out_neighbors_attributes,
245  const size_t index_size,
246  int64_t* out_neighbors_row_splits,
247  const size_t out_num_queries) {
248  using namespace cloudViewer::utility;
249 
250  const bool get_temp_size = !temp;
251 
252  if (get_temp_size) {
253  temp = (char*)1; // worst case pointer alignment
254  temp_size = std::numeric_limits<int64_t>::max();
255  }
256 
257  // Object for allocating memory within the temporary memory
258  MemoryAllocation mem_temp(temp, temp_size, texture_alignment);
259 
260  // Reserve memory for counting the neighbors
261  std::pair<uint32_t*, size_t> tmp_neighbors_count =
262  mem_temp.Alloc<uint32_t>(out_num_queries);
263 
264  if (!get_temp_size) {
265  CountNeighborsCUDA(stream, tmp_neighbors_count.first,
266  tmp_neighbors_count.second, inp_neighbors_index,
267  index_size);
268  }
269 
270  // compute prefix sum
271  {
272  std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);
273  cub::DeviceScan::InclusiveSum(
274  inclusive_scan_temp.first, inclusive_scan_temp.second,
275  tmp_neighbors_count.first, out_neighbors_row_splits + 1,
276  out_num_queries, stream);
277 
278  inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
279 
280  if (!get_temp_size) {
281  // set first element to zero
282  cudaMemsetAsync(out_neighbors_row_splits, 0, sizeof(int64_t),
283  stream);
284  cub::DeviceScan::InclusiveSum(
285  inclusive_scan_temp.first, inclusive_scan_temp.second,
286  tmp_neighbors_count.first, out_neighbors_row_splits + 1,
287  out_num_queries, stream);
288  }
289 
290  mem_temp.Free(inclusive_scan_temp);
291  }
292  mem_temp.Free(tmp_neighbors_count);
293 
294  if (get_temp_size) {
295  // return the memory peak as the required temporary memory size.
296  temp_size = mem_temp.MaxUsed();
297  return;
298  }
299 
300  if (!get_temp_size) {
301  FillNeighborsIndexAndAttributesCUDA(
302  stream, tmp_neighbors_count.first, tmp_neighbors_count.second,
303  out_neighbors_index, out_neighbors_attributes,
304  inp_neighbors_index, inp_neighbors_attributes,
305  num_attributes_per_neighbor, index_size,
306  inp_neighbors_row_splits, inp_num_queries,
307  out_neighbors_row_splits, out_num_queries);
308  }
309 }
310 
311 } // namespace impl
312 } // namespace ml
313 } // namespace cloudViewer