Voxelize.cuh
// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                  -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include <Helper.h>
#include <thrust/sequence.h>

#include <cub/cub.cuh>

#include "ml/impl/misc/MemoryAllocation.h"
#include "utility/MiniVec.h"

namespace cloudViewer {
namespace ml {
namespace impl {

namespace {

using namespace cloudViewer::utility;

template <class T, bool LARGE_ARRAY>
__global__ void IotaCUDAKernel(T* first, int64_t len, T value) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    if (LARGE_ARRAY) {
        const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
        const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
        linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                     y * gridDim.x * blockDim.x + x;
    } else {
        linear_idx = x;
    }
    if (linear_idx < len) {
        T* ptr = first + linear_idx;
        value += linear_idx;
        *ptr = value;
    }
}

/// Iota function for CUDA
template <class T>
void IotaCUDA(const cudaStream_t& stream, T* first, T* last, T value) {
    ptrdiff_t len = last - first;
    if (len) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        if (len > block.x * INT32_MAX) {
            grid.y = std::ceil(std::cbrt(len));
            grid.z = grid.y;
            grid.x = DivUp(len, int64_t(grid.z) * grid.y * block.x);
            IotaCUDAKernel<T, true>
                    <<<grid, block, 0, stream>>>(first, len, value);
        } else {
            grid = dim3(DivUp(len, block.x), 1, 1);
            IotaCUDAKernel<T, false>
                    <<<grid, block, 0, stream>>>(first, len, value);
        }
    }
}
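//
// Usage sketch (illustration only, not part of this header): IotaCUDA fills
// the device range [first, last) with consecutive values starting at 'value'.
// Assuming 'indices' is a device buffer with 'n' int64_t elements:
//
//     IotaCUDA(stream, indices, indices + n, int64_t(0));
//     // indices now holds 0, 1, 2, ..., n-1, which is how VoxelizeCUDA below
//     // initializes the point indices before sorting them by voxel hash.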

__global__ void ComputeBatchIdKernel(int64_t* hashes,
                                     const int64_t num_voxels,
                                     const int64_t batch_hash) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    hashes[linear_idx] /= batch_hash;
}

/// This function computes the batch_id from the hash value.
///
/// \param hashes Input and output array.
/// \param num_voxels Number of valid voxels.
/// \param batch_hash The value used to hash the batch dimension.
///
void ComputeBatchId(const cudaStream_t& stream,
                    int64_t* hashes,
                    const int64_t num_voxels,
                    const int64_t batch_hash) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
        ComputeBatchIdKernel<<<grid, block, 0, stream>>>(hashes, num_voxels,
                                                         batch_hash);
    }
}
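//
// Worked example (values chosen for illustration): in VoxelizeCUDA a point's
// hash is computed as
//
//     hash = voxel_linear_index + batch_id * batch_hash
//
// where batch_hash is the total number of grid cells of one batch element.
// With batch_hash = 120, a hash of 250 therefore decodes to
// batch_id = 250 / 120 = 2 (integer division), which is exactly what
// ComputeBatchIdKernel computes in place.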

__global__ void ComputeVoxelPerBatchKernel(int64_t* num_voxels_per_batch,
                                           int64_t* unique_batches_count,
                                           int64_t* unique_batches,
                                           const int64_t num_batches) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_batches) return;

    int64_t out_idx = unique_batches[linear_idx];
    num_voxels_per_batch[out_idx] = unique_batches_count[linear_idx];
}

/// This function computes the number of voxels per batch element.
///
/// \param num_voxels_per_batch The output array.
/// \param unique_batches_count Counts for each unique batch_id.
/// \param unique_batches Unique batch_id values.
/// \param num_batches Number of non-empty batches (<= batch_size).
///
void ComputeVoxelPerBatch(const cudaStream_t& stream,
                          int64_t* num_voxels_per_batch,
                          int64_t* unique_batches_count,
                          int64_t* unique_batches,
                          const int64_t num_batches) {
    if (num_batches) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_batches));
        grid.z = grid.y;
        grid.x = DivUp(num_batches, int64_t(grid.z) * grid.y * block.x);
        ComputeVoxelPerBatchKernel<<<grid, block, 0, stream>>>(
                num_voxels_per_batch, unique_batches_count, unique_batches,
                num_batches);
    }
}

__global__ void ComputeIndicesBatchesKernel(int64_t* indices_batches,
                                            const int64_t* row_splits,
                                            const int64_t batch_size) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    for (int64_t i = row_splits[linear_idx]; i < row_splits[linear_idx + 1];
         ++i) {
        indices_batches[i] = linear_idx;
    }
}

/// This function computes the mapping from point index to batch_id.
///
/// \param indices_batches The output array.
/// \param row_splits The row_splits for defining batches.
/// \param batch_size The batch size of the given points.
///
void ComputeIndicesBatches(const cudaStream_t& stream,
                           int64_t* indices_batches,
                           const int64_t* row_splits,
                           const int64_t batch_size) {
    if (batch_size) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(batch_size));
        grid.z = grid.y;
        grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);
        ComputeIndicesBatchesKernel<<<grid, block, 0, stream>>>(
                indices_batches, row_splits, batch_size);
    }
}
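//
// Worked example (illustrative values): with row_splits = [0, 3, 5] the two
// batch elements own points 0..2 and 3..4, so the kernel writes
// indices_batches = [0, 0, 0, 1, 1].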

template <class T, int NDIM>
__global__ void ComputeHashKernel(
        int64_t* __restrict__ hashes,
        int64_t num_points,
        const T* const __restrict__ points,
        const int64_t batch_size,
        const int64_t* row_splits,
        const int64_t* indices_batches,
        const cloudViewer::utility::MiniVec<T, NDIM> points_range_min_vec,
        const cloudViewer::utility::MiniVec<T, NDIM> points_range_max_vec,
        const cloudViewer::utility::MiniVec<T, NDIM> inv_voxel_size,
        const cloudViewer::utility::MiniVec<int64_t, NDIM> strides,
        const int64_t batch_hash,
        const int64_t invalid_hash) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_points) return;

    Vec_t point(points + linear_idx * NDIM);

    if ((point >= points_range_min_vec && point <= points_range_max_vec)
                .all()) {
        auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                              .template cast<int64_t>();
        int64_t h = coords.dot(strides);
        h += indices_batches[linear_idx] * batch_hash;  // add hash for batch_id
        hashes[linear_idx] = h;
    } else {
        hashes[linear_idx] = invalid_hash;  // max hash value used as invalid
    }
}

/// This function computes the hash (linear index) for each point.
/// Points outside the range will get a specific hash value.
///
/// \tparam T The floating-point type for the points.
/// \tparam NDIM The number of dimensions, e.g., 3.
///
/// \param hashes The output vector with the hashes/linear indices.
/// \param num_points The number of points.
/// \param points The array with the point coordinates. The shape is
/// [num_points,NDIM] and the storage order is row-major.
/// \param batch_size The batch size of the points.
/// \param row_splits row_splits for defining batches.
/// \param indices_batches Mapping from point index to batch_id.
/// \param points_range_min_vec The minimum range for a point to be valid.
/// \param points_range_max_vec The maximum range for a point to be valid.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths in each
/// dimension.
/// \param strides The strides for computing the linear index.
/// \param batch_hash The value for hashing the batch dimension.
/// \param invalid_hash The value to use for points outside the range.
template <class T, int NDIM>
void ComputeHash(const cudaStream_t& stream,
                 int64_t* hashes,
                 int64_t num_points,
                 const T* const points,
                 const int64_t batch_size,
                 const int64_t* row_splits,
                 const int64_t* indices_batches,
                 const MiniVec<T, NDIM> points_range_min_vec,
                 const MiniVec<T, NDIM> points_range_max_vec,
                 const MiniVec<T, NDIM> inv_voxel_size,
                 const MiniVec<int64_t, NDIM> strides,
                 const int64_t batch_hash,
                 const int64_t invalid_hash) {
    if (num_points) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_points));
        grid.z = grid.y;
        grid.x = DivUp(num_points, int64_t(grid.z) * grid.y * block.x);
        ComputeHashKernel<T, NDIM><<<grid, block, 0, stream>>>(
                hashes, num_points, points, batch_size, row_splits,
                indices_batches, points_range_min_vec, points_range_max_vec,
                inv_voxel_size, strides, batch_hash, invalid_hash);
    }
}
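//
// Worked example (illustrative values, NDIM = 3): with voxel grid extents
// [4, 5, 6] the strides are [1, 4, 20] and batch_hash = 20 * 6 = 120. A point
// that falls into the integer cell coords = [2, 3, 1] of batch element 1 gets
//
//     h = 2*1 + 3*4 + 1*20 + 1*120 = 154
//
// while a point outside [points_range_min, points_range_max] gets
// invalid_hash, which sorts behind all valid hashes.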

template <class T>
__global__ void LimitCountsKernel(T* counts, int64_t num, T limit) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num) return;

    if (counts[linear_idx] > limit) {
        counts[linear_idx] = limit;
    }
}

/// This function performs an element-wise minimum operation.
///
/// \param counts The input and output array.
/// \param num Number of input elements.
/// \param limit The second operand for the minimum operation.
template <class T>
void LimitCounts(const cudaStream_t& stream, T* counts, int64_t num, T limit) {
    if (num) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num));
        grid.z = grid.y;
        grid.x = DivUp(num, int64_t(grid.z) * grid.y * block.x);
        LimitCountsKernel<<<grid, block, 0, stream>>>(counts, num, limit);
    }
}
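//
// Example (illustrative values): LimitCounts(stream, counts, 3, int64_t(3))
// applied to counts = [5, 1, 9] clamps the array in place to [3, 1, 3]; it is
// used below to cap the number of points per voxel and of voxels per batch.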

__global__ void ComputeStartIdxKernel(
        int64_t* start_idx,
        int64_t* points_count,
        const int64_t* num_voxels_prefix_sum,
        const int64_t* unique_hashes_count_prefix_sum,
        const int64_t* out_batch_splits,
        const int64_t batch_size,
        const int64_t max_voxels,
        const int64_t max_points_per_voxel) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    int64_t voxel_idx;
    if (0 == linear_idx) {
        voxel_idx = 0;
    } else {
        voxel_idx = num_voxels_prefix_sum[linear_idx - 1];
    }

    int64_t begin_out = out_batch_splits[linear_idx];
    int64_t end_out = out_batch_splits[linear_idx + 1];

    for (int64_t out_idx = begin_out; out_idx < end_out;
         out_idx++, voxel_idx++) {
        if (voxel_idx == 0) {
            start_idx[out_idx] = 0;
            points_count[out_idx] = min(max_points_per_voxel,
                                        unique_hashes_count_prefix_sum[0]);
        } else {
            start_idx[out_idx] = unique_hashes_count_prefix_sum[voxel_idx - 1];
            points_count[out_idx] =
                    min(max_points_per_voxel,
                        unique_hashes_count_prefix_sum[voxel_idx] -
                                unique_hashes_count_prefix_sum[voxel_idx - 1]);
        }
    }
}

/// Computes the starting index of each voxel.
///
/// \param start_idx The output array for storing the starting index.
/// \param points_count The output array for storing the points count.
/// \param num_voxels_prefix_sum The inclusive prefix sum which gives
/// the index of the starting voxel for each batch.
/// \param unique_hashes_count_prefix_sum Inclusive prefix sum defining
/// where the point indices for each voxel end.
/// \param out_batch_splits Defines the starting and ending voxels for
/// each batch element.
/// \param batch_size The batch size.
/// \param max_voxels Maximum number of voxels per batch.
/// \param max_points_per_voxel Maximum number of points per voxel.
///
void ComputeStartIdx(const cudaStream_t& stream,
                     int64_t* start_idx,
                     int64_t* points_count,
                     const int64_t* num_voxels_prefix_sum,
                     const int64_t* unique_hashes_count_prefix_sum,
                     const int64_t* out_batch_splits,
                     const int64_t batch_size,
                     const int64_t max_voxels,
                     const int64_t max_points_per_voxel) {
    if (batch_size) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(batch_size));
        grid.z = grid.y;
        grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);
        ComputeStartIdxKernel<<<grid, block, 0, stream>>>(
                start_idx, points_count, num_voxels_prefix_sum,
                unique_hashes_count_prefix_sum, out_batch_splits, batch_size,
                max_voxels, max_points_per_voxel);
    }
}
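//
// Worked example (illustrative values): batch_size = 2, max_voxels = 2,
// max_points_per_voxel = 2; batch 0 has 3 voxels with 3, 1 and 2 points,
// batch 1 has 1 voxel with 2 points. Then
//     unique_hashes_count_prefix_sum = [3, 4, 6, 8]
//     num_voxels_prefix_sum          = [3, 4]
//     out_batch_splits               = [0, 2, 3]  (voxel counts capped at 2)
// and the kernel writes start_idx = [0, 3, 6] and points_count = [2, 1, 2];
// the third voxel of batch 0 is dropped because it exceeds max_voxels.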

template <class T, int NDIM>
__global__ void ComputeVoxelCoordsKernel(
        int32_t* __restrict__ voxel_coords,
        const T* const __restrict__ points,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum,
        const MiniVec<T, NDIM> points_range_min_vec,
        const MiniVec<T, NDIM> inv_voxel_size,
        int64_t num_voxels) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t point_idx = point_indices[prefix_sum[linear_idx]];

    Vec_t point(points + point_idx * NDIM);
    auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                          .template cast<int32_t>();

    for (int i = 0; i < NDIM; ++i) {
        voxel_coords[linear_idx * NDIM + i] = coords[i];
    }
}

/// Computes the coordinates for each voxel.
///
/// \param voxel_coords The output array with shape [num_voxels, NDIM].
/// \param points The array with the point coordinates.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum Defines the index of the first point for each voxel
/// in \p point_indices.
/// \param points_range_min_vec The lower bound of the domain to be
/// voxelized.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths for each
/// dimension.
/// \param num_voxels The number of voxels.
template <class T, int NDIM>
void ComputeVoxelCoords(const cudaStream_t& stream,
                        int32_t* voxel_coords,
                        const T* const points,
                        const int64_t* const point_indices,
                        const int64_t* const prefix_sum,
                        const MiniVec<T, NDIM> points_range_min_vec,
                        const MiniVec<T, NDIM> inv_voxel_size,
                        int64_t num_voxels) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
        ComputeVoxelCoordsKernel<<<grid, block, 0, stream>>>(
                voxel_coords, points, point_indices, prefix_sum,
                points_range_min_vec, inv_voxel_size, num_voxels);
    }
}

__global__ void CopyPointIndicesKernel(
        int64_t* __restrict__ out,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum_in,
        const int64_t* const __restrict__ prefix_sum_out,
        const int64_t num_voxels) {
    // TODO data coalescing can be optimized
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t begin_out;
    if (0 == linear_idx) {
        begin_out = 0;
    } else {
        begin_out = prefix_sum_out[linear_idx - 1];
    }
    int64_t end_out = prefix_sum_out[linear_idx];
    int64_t in_idx = prefix_sum_in[linear_idx];
    for (int64_t out_idx = begin_out; out_idx < end_out; ++out_idx, ++in_idx) {
        out[out_idx] = point_indices[in_idx];
    }
}

/// Copies the point indices for each voxel to the output.
///
/// \param out The output array with the point indices for all voxels.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum_in Defines the index of the first point for each
/// voxel in \p point_indices.
/// \param prefix_sum_out Inclusive prefix sum defining where the point
/// indices for each voxel end in \p out.
/// \param num_voxels The number of voxels.
///
void CopyPointIndices(const cudaStream_t& stream,
                      int64_t* out,
                      const int64_t* const point_indices,
                      const int64_t* const prefix_sum_in,
                      const int64_t* const prefix_sum_out,
                      const int64_t num_voxels) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
        CopyPointIndicesKernel<<<grid, block, 0, stream>>>(
                out, point_indices, prefix_sum_in, prefix_sum_out, num_voxels);
    }
}
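//
// Worked example (illustrative values, matching the ComputeStartIdx example
// above): with prefix_sum_in = [0, 3, 6] and prefix_sum_out = [2, 3, 5] the
// kernel copies point_indices[0..1] to out[0..1], point_indices[3] to out[2]
// and point_indices[6..7] to out[3..4], i.e. only the first
// max_points_per_voxel indices of each voxel are kept.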

}  // namespace

/// This function voxelizes a point cloud.
/// The function returns the integer coordinates of the voxels that
/// contain points and a compact list of the indices that associate the
/// voxels to the points.
///
/// All pointer arguments point to device memory unless stated
/// otherwise.
///
/// \tparam T Floating-point data type for the point positions.
///
/// \tparam NDIM The number of dimensions of the points.
///
/// \tparam OUTPUT_ALLOCATOR Type of the output_allocator. See
/// \p output_allocator for more information.
///
/// \param stream The cuda stream for all kernel launches.
///
/// \param temp Pointer to temporary memory. If nullptr then the required
/// size of temporary memory will be written to \p temp_size and no
/// work is done.
///
/// \param temp_size The size of the temporary memory in bytes. This is
/// used as an output if temp is nullptr.
///
/// \param texture_alignment The texture alignment in bytes. This is used
/// for allocating segments within the temporary memory.
///
/// \param num_points The number of points.
///
/// \param points Array with the point positions. The shape is
/// [num_points,NDIM].
///
/// \param batch_size The batch size of the points.
///
/// \param row_splits row_splits for defining batches.
///
/// \param voxel_size The edge lengths of the voxel. The shape is
/// [NDIM]. This pointer points to host memory!
///
/// \param points_range_min The lower bound of the domain to be
/// voxelized. The shape is [NDIM]. This pointer points to host memory!
///
/// \param points_range_max The upper bound of the domain to be
/// voxelized. The shape is [NDIM]. This pointer points to host memory!
///
/// \param max_points_per_voxel This parameter limits the number of
/// points that are recorded for each voxel.
///
/// \param max_voxels This parameter limits the number of voxels that
/// will be generated.
///
/// \param output_allocator An object that implements functions for
/// allocating the output arrays. The object must implement
/// functions AllocVoxelCoords(int32_t** ptr, int64_t rows,
/// int64_t cols), AllocVoxelPointIndices(int64_t** ptr, int64_t
/// size), AllocVoxelPointRowSplits(int64_t** ptr, int64_t
/// size) and AllocVoxelBatchSplits(int64_t** ptr, int64_t size).
/// All functions should allocate memory and return a pointer
/// to that memory in ptr. The arguments size, rows, and cols
/// define the size of the array as the number of elements.
/// All functions must accept zero size arguments. In this case
/// ptr does not need to be set.
///
template <class T, int NDIM, class OUTPUT_ALLOCATOR>
void VoxelizeCUDA(const cudaStream_t& stream,
                  void* temp,
                  size_t& temp_size,
                  int texture_alignment,
                  size_t num_points,
                  const T* const points,
                  const size_t batch_size,
                  const int64_t* const row_splits,
                  const T* const voxel_size,
                  const T* const points_range_min,
                  const T* const points_range_max,
                  const int64_t max_points_per_voxel,
                  const int64_t max_voxels,
                  OUTPUT_ALLOCATOR& output_allocator) {
    using namespace cloudViewer::utility;
    typedef MiniVec<T, NDIM> Vec_t;

    const bool get_temp_size = !temp;

    if (get_temp_size) {
        temp = (char*)1;  // worst case pointer alignment
        temp_size = std::numeric_limits<int64_t>::max();
    }

    MemoryAllocation mem_temp(temp, temp_size, texture_alignment);

    const Vec_t inv_voxel_size = T(1) / Vec_t(voxel_size);
    const Vec_t points_range_min_vec(points_range_min);
    const Vec_t points_range_max_vec(points_range_max);
    MiniVec<int32_t, NDIM> extents =
            ceil((points_range_max_vec - points_range_min_vec) * inv_voxel_size)
                    .template cast<int32_t>();
    MiniVec<int64_t, NDIM> strides;
    for (int i = 0; i < NDIM; ++i) {
        strides[i] = 1;
        for (int j = 0; j < i; ++j) {
            strides[i] *= extents[j];
        }
    }
    const int64_t batch_hash = strides[NDIM - 1] * extents[NDIM - 1];
    const int64_t invalid_hash = batch_hash * batch_size;

    // store batch_id for each point
    std::pair<int64_t*, size_t> indices_batches =
            mem_temp.Alloc<int64_t>(num_points);
    if (!get_temp_size) {
        ComputeIndicesBatches(stream, indices_batches.first, row_splits,
                              batch_size);
    }

    // use double buffers for the sorting
    std::pair<int64_t*, size_t> point_indices =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> point_indices_alt =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes = mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes_alt =
            mem_temp.Alloc<int64_t>(num_points);

    cub::DoubleBuffer<int64_t> point_indices_dbuf(point_indices.first,
                                                  point_indices_alt.first);
    cub::DoubleBuffer<int64_t> hashes_dbuf(hashes.first, hashes_alt.first);

    if (!get_temp_size) {
        IotaCUDA(stream, point_indices.first,
                 point_indices.first + point_indices.second, int64_t(0));
        ComputeHash(stream, hashes.first, num_points, points, batch_size,
                    row_splits, indices_batches.first, points_range_min_vec,
                    points_range_max_vec, inv_voxel_size, strides, batch_hash,
                    invalid_hash);
    }

    {
        // TODO compute end_bit for radix sort
        std::pair<void*, size_t> sort_pairs_temp(nullptr, 0);
        cub::DeviceRadixSort::SortPairs(
                sort_pairs_temp.first, sort_pairs_temp.second, hashes_dbuf,
                point_indices_dbuf, num_points, 0, sizeof(int64_t) * 8, stream);
        sort_pairs_temp = mem_temp.Alloc(sort_pairs_temp.second);
        if (!get_temp_size) {
            cub::DeviceRadixSort::SortPairs(sort_pairs_temp.first,
                                            sort_pairs_temp.second, hashes_dbuf,
                                            point_indices_dbuf, num_points, 0,
                                            sizeof(int64_t) * 8, stream);
        }
        mem_temp.Free(sort_pairs_temp);
    }

    // reuse the alternate buffers
    std::pair<int64_t*, size_t> unique_hashes(hashes_dbuf.Alternate(),
                                              hashes.second);
    std::pair<int64_t*, size_t> unique_hashes_count(
            point_indices_dbuf.Alternate(), point_indices.second);

    // encode unique hashes (voxels) and their counts (points per voxel)
    int64_t num_voxels = 0;
    int64_t last_hash = 0;  // 0 is a valid hash value
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_voxels_mem = mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, hashes_dbuf.Current(),
                unique_hashes.first, unique_hashes_count.first,
                num_voxels_mem.first, num_points, stream);

        encode_temp = mem_temp.Alloc(encode_temp.second);
        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    hashes_dbuf.Current(), unique_hashes.first,
                    unique_hashes_count.first, num_voxels_mem.first, num_points,
                    stream);

            // get the number of voxels
            cudaMemcpyAsync(&num_voxels, num_voxels_mem.first, sizeof(int64_t),
                            cudaMemcpyDeviceToHost, stream);
            // get the last hash value
            cudaMemcpyAsync(&last_hash,
                            hashes_dbuf.Current() + hashes.second - 1,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }
    if (invalid_hash == last_hash) {
        // the last hash is invalid, so we have one voxel less
        --num_voxels;
    }
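    // Worked example (illustrative values): if the sorted hashes are
    // [3, 3, 5, invalid_hash], the run-length encoding produces
    // unique_hashes = [3, 5, invalid_hash], unique_hashes_count = [2, 1, 1]
    // and num_voxels = 3; because the last hash is the invalid marker,
    // num_voxels is reduced to 2 above.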

    // reuse the hashes buffer
    std::pair<int64_t*, size_t> unique_hashes_count_prefix_sum(
            hashes_dbuf.Current(), hashes.second);

    // compute the prefix sum for unique_hashes_count
    // gives starting index of each voxel
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                unique_hashes_count.first, unique_hashes_count_prefix_sum.first,
                unique_hashes_count.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            // We only need the prefix sum for the first num_voxels.
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    unique_hashes_count.first,
                    unique_hashes_count_prefix_sum.first, num_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // Limit the number of output points to max_points_per_voxel by
    // limiting the unique_hashes_count.
    if (!get_temp_size) {
        if (max_points_per_voxel < static_cast<int64_t>(num_points)) {
            LimitCounts(stream, unique_hashes_count.first, num_voxels,
                        max_points_per_voxel);
        }
    }

    // Convert unique_hashes to batch_id (divide by batch_hash)
    int64_t* unique_hashes_batch_id = unique_hashes.first;
    if (!get_temp_size) {
        ComputeBatchId(stream, unique_hashes_batch_id, num_voxels, batch_hash);
    }

    std::pair<int64_t*, size_t> unique_batches =
            mem_temp.Alloc<int64_t>(batch_size);
    std::pair<int64_t*, size_t> unique_batches_count =
            mem_temp.Alloc<int64_t>(batch_size);
    int64_t num_batches = 0;  // stores the number of non-empty batches

    // Convert batch_id to counts (array of num_voxels per batch)
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_batches_mem =
                mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, unique_hashes_batch_id,
                unique_batches.first, unique_batches_count.first,
                num_batches_mem.first, num_voxels, stream);
        encode_temp = mem_temp.Alloc(encode_temp.second);
        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    unique_hashes_batch_id, unique_batches.first,
                    unique_batches_count.first, num_batches_mem.first,
                    num_voxels, stream);

            // get the number of non-empty batches.
            cudaMemcpyAsync(&num_batches, num_batches_mem.first,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }

    // Insert count(0) for empty batches
    std::pair<int64_t*, size_t> num_voxels_per_batch =
            mem_temp.Alloc<int64_t>(batch_size);
    if (!get_temp_size) {
        cudaMemset(num_voxels_per_batch.first, 0, batch_size * sizeof(int64_t));
        ComputeVoxelPerBatch(stream, num_voxels_per_batch.first,
                             unique_batches_count.first, unique_batches.first,
                             num_batches);
    }

    std::pair<int64_t*, size_t> num_voxels_prefix_sum(unique_batches.first,
                                                      batch_size);

    // compute the prefix sum for number of voxels per batch
    // gives starting voxel index for each batch
    // used only when voxel count exceeds max_voxels
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, num_voxels_prefix_sum.first,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            if (num_voxels > max_voxels) {
                cub::DeviceScan::InclusiveSum(
                        inclusive_scan_temp.first, inclusive_scan_temp.second,
                        num_voxels_per_batch.first, num_voxels_prefix_sum.first,
                        num_voxels_per_batch.second, stream);
            }
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // Limit the number of voxels per batch to max_voxels
    if (!get_temp_size) {
        if (num_voxels >= max_voxels)
            LimitCounts(stream, num_voxels_per_batch.first, batch_size,
                        max_voxels);
    }

    // Prefix sum of limited counts to get batch splits.
    int64_t* out_batch_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelBatchSplits(&out_batch_splits,
                                               batch_size + 1);
        cudaMemsetAsync(out_batch_splits, 0, sizeof(int64_t), stream);
    }

    // Prefix sum of counts to get batch splits
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, out_batch_splits + 1,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    num_voxels_per_batch.first, out_batch_splits + 1,
                    batch_size, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // num_valid_voxels excludes voxels exceeding max_voxels
    int64_t num_valid_voxels = num_points;
    if (!get_temp_size) {
        // get the number of valid voxels.
        cudaMemcpyAsync(&num_valid_voxels, out_batch_splits + batch_size,
                        sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
        // wait for the async copies
        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
        }
    }

    // start_idx stores the starting index of each valid voxel.
    // points_count stores the number of valid points in the respective voxel.
    std::pair<int64_t*, size_t> start_idx(indices_batches.first,
                                          num_valid_voxels);
    std::pair<int64_t*, size_t> points_count =
            mem_temp.Alloc<int64_t>(num_valid_voxels);

    if (!get_temp_size) {
        if (num_voxels <= max_voxels) {
            // starting index and points_count will be the same as
            // unique_hashes_count_prefix_sum and unique_hashes_count when the
            // voxel count doesn't exceed max_voxels
            cudaMemsetAsync(start_idx.first, 0, sizeof(int64_t), stream);
            cudaMemcpyAsync(start_idx.first + 1,
                            unique_hashes_count_prefix_sum.first,
                            (num_voxels - 1) * sizeof(int64_t),
                            cudaMemcpyDeviceToDevice, stream);
            mem_temp.Free(points_count);
            points_count.first = unique_hashes_count.first;
        } else {
            ComputeStartIdx(stream, start_idx.first, points_count.first,
                            num_voxels_prefix_sum.first,
                            unique_hashes_count_prefix_sum.first,
                            out_batch_splits, batch_size, max_voxels,
                            max_points_per_voxel);
        }
    }

    int64_t* out_voxel_row_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelPointRowSplits(&out_voxel_row_splits,
                                                  num_valid_voxels + 1);
    }

    if (!get_temp_size) {
        // set first element to 0
        cudaMemsetAsync(out_voxel_row_splits, 0, sizeof(int64_t), stream);
    }
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                points_count.first, out_voxel_row_splits + 1, num_valid_voxels,
                stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    points_count.first, out_voxel_row_splits + 1,
                    num_valid_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    if (get_temp_size) {
        // return the memory peak as the required temporary memory size.
        temp_size = mem_temp.MaxUsed();
        return;
    }

    int32_t* out_voxel_coords = nullptr;
    output_allocator.AllocVoxelCoords(&out_voxel_coords, num_valid_voxels,
                                      NDIM);
    ComputeVoxelCoords(stream, out_voxel_coords, points,
                       point_indices_dbuf.Current(), start_idx.first,
                       points_range_min_vec, inv_voxel_size, num_valid_voxels);

    int64_t num_valid_points = 0;
    {
        cudaMemcpyAsync(&num_valid_points,
                        out_voxel_row_splits + num_valid_voxels,
                        sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
        // wait for the async copies
        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
        }
    }
    int64_t* out_point_indices = nullptr;
    output_allocator.AllocVoxelPointIndices(&out_point_indices,
                                            num_valid_points);
    CopyPointIndices(stream, out_point_indices, point_indices_dbuf.Current(),
                     start_idx.first, out_voxel_row_splits + 1,
                     num_valid_voxels);
}
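
// Usage sketch for VoxelizeCUDA (illustration only, not part of this header).
// It shows the two-pass pattern: a first call with temp == nullptr only
// reports the required temporary memory size, the second call does the work.
// 'ExampleOutputAllocator' and the cudaMalloc-based allocation are assumptions
// made for this sketch; real callers typically allocate through their tensor
// framework instead.
//
//     struct ExampleOutputAllocator {
//         template <class U>
//         void Alloc(U** ptr, int64_t num_elements) {
//             cudaMalloc(reinterpret_cast<void**>(ptr),
//                        std::max<int64_t>(1, num_elements) * sizeof(U));
//         }
//         void AllocVoxelCoords(int32_t** ptr, int64_t rows, int64_t cols) {
//             Alloc(ptr, rows * cols);
//         }
//         void AllocVoxelPointIndices(int64_t** ptr, int64_t size) {
//             Alloc(ptr, size);
//         }
//         void AllocVoxelPointRowSplits(int64_t** ptr, int64_t size) {
//             Alloc(ptr, size);
//         }
//         void AllocVoxelBatchSplits(int64_t** ptr, int64_t size) {
//             Alloc(ptr, size);
//         }
//     };
//
//     // points, row_splits: device arrays; voxel_size, points_range_*: host.
//     ExampleOutputAllocator alloc;
//     size_t temp_size = 0;
//     VoxelizeCUDA<float, 3>(stream, nullptr, temp_size, texture_alignment,
//                            num_points, points, batch_size, row_splits,
//                            voxel_size, points_range_min, points_range_max,
//                            max_points_per_voxel, max_voxels, alloc);
//     void* temp = nullptr;
//     cudaMalloc(&temp, temp_size);
//     VoxelizeCUDA<float, 3>(stream, temp, temp_size, texture_alignment,
//                            num_points, points, batch_size, row_splits,
//                            voxel_size, points_range_min, points_range_max,
//                            max_points_per_voxel, max_voxels, alloc);
//     cudaFree(temp);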

}  // namespace impl
}  // namespace ml
}  // namespace cloudViewer