// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                  -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#pragma once

#include <thrust/sequence.h>

#include <cub/cub.cuh>

#include "ml/impl/misc/MemoryAllocation.h"
#include "utility/MiniVec.h"

namespace cloudViewer {

using namespace cloudViewer::utility;

template <class T, bool LARGE_ARRAY>
__global__ void IotaCUDAKernel(T* first, int64_t len, T value) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    if (LARGE_ARRAY) {
        const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
        const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
        linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                     y * gridDim.x * blockDim.x + x;
    } else {
        linear_idx = x;
    }
    if (linear_idx < len) {
        T* ptr = first + linear_idx;
        value += linear_idx;
        *ptr = value;
    }
}

/// Iota function for CUDA. Writes value, value+1, ... to [first, last).
template <class T>
void IotaCUDA(const cudaStream_t& stream, T* first, T* last, T value) {
    ptrdiff_t len = last - first;
    if (len) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        // cast to int64_t to avoid 32-bit overflow in the threshold
        if (len > int64_t(block.x) * INT32_MAX) {
            grid.y = std::ceil(std::cbrt(len));
            grid.z = grid.y;
            grid.x = DivUp(len, int64_t(grid.z) * grid.y * block.x);
            IotaCUDAKernel<T, true>
                    <<<grid, block, 0, stream>>>(first, len, value);
        } else {
            grid = dim3(DivUp(len, block.x), 1, 1);
            IotaCUDAKernel<T, false>
                    <<<grid, block, 0, stream>>>(first, len, value);
        }
    }
}

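// Worked example (illustrative values): IotaCUDA(stream, ptr, ptr + 4,
// int64_t(10)) fills the device range with {10, 11, 12, 13}.
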
__global__ void ComputeBatchIdKernel(int64_t* hashes,
                                     const int64_t num_voxels,
                                     const int64_t batch_hash) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    hashes[linear_idx] /= batch_hash;
}

/// This function computes the batch_id from the hash value.
///
/// \param hashes Input and output array.
/// \param num_voxels Number of valid voxels.
/// \param batch_hash The value used to hash the batch dimension.
void ComputeBatchId(const cudaStream_t& stream,
                    int64_t* hashes,
                    const int64_t num_voxels,
                    const int64_t batch_hash) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num_voxels));
    grid.z = grid.y;
    grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
    ComputeBatchIdKernel<<<grid, block, 0, stream>>>(hashes, num_voxels,
                                                     batch_hash);
}

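// Worked example (illustrative values): with batch_hash = 1000 the hashes
// [2102, 987, 3001] are converted in place to the batch ids [2, 0, 3].
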
__global__ void ComputeVoxelPerBatchKernel(int64_t* num_voxels_per_batch,
                                           int64_t* unique_batches_count,
                                           int64_t* unique_batches,
                                           const int64_t num_batches) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_batches) return;

    int64_t out_idx = unique_batches[linear_idx];
    num_voxels_per_batch[out_idx] = unique_batches_count[linear_idx];
}

/// This function computes the number of voxels per batch element.
///
/// \param num_voxels_per_batch The output array.
/// \param unique_batches_count Counts for each unique batch_id.
/// \param unique_batches Unique batch_id values.
/// \param num_batches Number of non-empty batches (<= batch_size).
void ComputeVoxelPerBatch(const cudaStream_t& stream,
                          int64_t* num_voxels_per_batch,
                          int64_t* unique_batches_count,
                          int64_t* unique_batches,
                          const int64_t num_batches) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num_batches));
    grid.z = grid.y;
    grid.x = DivUp(num_batches, int64_t(grid.z) * grid.y * block.x);
    ComputeVoxelPerBatchKernel<<<grid, block, 0, stream>>>(
            num_voxels_per_batch, unique_batches_count, unique_batches,
            num_batches);
}

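// Worked example (illustrative values): with batch_size = 4,
// unique_batches = [0, 2, 3] and unique_batches_count = [5, 2, 1], the kernel
// scatters the counts into a dense, zero-initialized array of length
// batch_size: num_voxels_per_batch = [5, 0, 2, 1].
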
__global__ void ComputeIndicesBatchesKernel(int64_t* indices_batches,
                                            const int64_t* row_splits,
                                            const int64_t batch_size) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    for (int64_t i = row_splits[linear_idx]; i < row_splits[linear_idx + 1];
         ++i) {
        indices_batches[i] = linear_idx;
    }
}

/// This function computes the mapping from point index to batch_id.
///
/// \param indices_batches The output array.
/// \param row_splits The row_splits defining the batches.
/// \param batch_size The batch size of the given points.
void ComputeIndicesBatches(const cudaStream_t& stream,
                           int64_t* indices_batches,
                           const int64_t* row_splits,
                           const int64_t batch_size) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(batch_size));
    grid.z = grid.y;
    grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);
    ComputeIndicesBatchesKernel<<<grid, block, 0, stream>>>(
            indices_batches, row_splits, batch_size);
}

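// Worked example (illustrative values): with batch_size = 2 and
// row_splits = [0, 3, 5], the kernel writes indices_batches = [0, 0, 0, 1, 1],
// i.e. points 0..2 belong to batch element 0 and points 3..4 to element 1.
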
template <class T, int NDIM>
__global__ void ComputeHashKernel(
        int64_t* __restrict__ hashes,
        const size_t num_points,
        const T* const __restrict__ points,
        const int64_t batch_size,
        const int64_t* row_splits,
        const int64_t* indices_batches,
        const cloudViewer::utility::MiniVec<T, NDIM> points_range_min_vec,
        const cloudViewer::utility::MiniVec<T, NDIM> points_range_max_vec,
        const cloudViewer::utility::MiniVec<T, NDIM> inv_voxel_size,
        const cloudViewer::utility::MiniVec<int64_t, NDIM> strides,
        const int64_t batch_hash,
        const int64_t invalid_hash) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_points) return;

    Vec_t point(points + linear_idx * NDIM);

    if ((point >= points_range_min_vec && point <= points_range_max_vec)
                .all()) {
        auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                              .template cast<int64_t>();
        int64_t h = coords.dot(strides);
        h += indices_batches[linear_idx] * batch_hash;  // add hash for batch_id
        hashes[linear_idx] = h;
    } else {
        hashes[linear_idx] = invalid_hash;  // max hash value used as invalid
    }
}

/// This function computes the hash (linear index) for each point.
/// Points outside the range will get a specific hash value.
///
/// \tparam T The floating point type for the points.
/// \tparam NDIM The number of dimensions, e.g., 3.
///
/// \param hashes The output vector with the hashes/linear indices.
/// \param num_points The number of points.
/// \param points The array with the point coordinates. The shape is
/// [num_points,NDIM] and the storage order is row-major.
/// \param batch_size The batch size of the points.
/// \param row_splits The row_splits defining the batches.
/// \param indices_batches Mapping from point index to batch_id.
/// \param points_range_min_vec The minimum range for a point to be valid.
/// \param points_range_max_vec The maximum range for a point to be valid.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths in each
/// dimension.
/// \param strides The strides for computing the linear index.
/// \param batch_hash The value used to hash the batch dimension.
/// \param invalid_hash The value to use for points outside the range.
template <class T, int NDIM>
void ComputeHash(const cudaStream_t& stream,
                 int64_t* hashes,
                 size_t num_points,
                 const T* const points,
                 const int64_t batch_size,
                 const int64_t* row_splits,
                 const int64_t* indices_batches,
                 const MiniVec<T, NDIM> points_range_min_vec,
                 const MiniVec<T, NDIM> points_range_max_vec,
                 const MiniVec<T, NDIM> inv_voxel_size,
                 const MiniVec<int64_t, NDIM> strides,
                 const int64_t batch_hash,
                 const int64_t invalid_hash) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num_points));
    grid.z = grid.y;
    grid.x = DivUp(num_points, int64_t(grid.z) * grid.y * block.x);
    ComputeHashKernel<T, NDIM><<<grid, block, 0, stream>>>(
            hashes, num_points, points, batch_size, row_splits,
            indices_batches, points_range_min_vec, points_range_max_vec,
            inv_voxel_size, strides, batch_hash, invalid_hash);
}

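// Worked example (illustrative values, NDIM = 3): with
// points_range_min_vec = (0,0,0), inv_voxel_size = (10,10,10) (voxel edge
// length 0.1), extents (10,10,10), strides = (1,10,100) and batch_hash = 1000,
// a point (0.25, 0.0, 0.11) in batch element 2 gets coords = (2,0,1) and
// hash = 2*1 + 0*10 + 1*100 + 2*1000 = 2102.
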
template <class T>
__global__ void LimitCountsKernel(T* counts, int64_t num, T limit) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num) return;

    if (counts[linear_idx] > limit) {
        counts[linear_idx] = limit;
    }
}

/// This function performs an element-wise minimum operation.
///
/// \param counts The input and output array.
/// \param num Number of input elements.
/// \param limit The second operand for the minimum operation.
template <class T>
void LimitCounts(const cudaStream_t& stream, T* counts, int64_t num, T limit) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num));
    grid.z = grid.y;
    grid.x = DivUp(num, int64_t(grid.z) * grid.y * block.x);
    LimitCountsKernel<<<grid, block, 0, stream>>>(counts, num, limit);
}

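// Worked example (illustrative values): counts = [5, 1, 7] with limit = 3
// becomes [3, 1, 3].
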
__global__ void ComputeStartIdxKernel(
        int64_t* start_idx,
        int64_t* points_count,
        const int64_t* num_voxels_prefix_sum,
        const int64_t* unique_hashes_count_prefix_sum,
        const int64_t* out_batch_splits,
        const int64_t batch_size,
        const int64_t max_voxels,
        const int64_t max_points_per_voxel) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    int64_t voxel_idx;
    if (0 == linear_idx) {
        voxel_idx = 0;
    } else {
        voxel_idx = num_voxels_prefix_sum[linear_idx - 1];
    }

    int64_t begin_out = out_batch_splits[linear_idx];
    int64_t end_out = out_batch_splits[linear_idx + 1];

    for (int64_t out_idx = begin_out; out_idx < end_out;
         out_idx++, voxel_idx++) {
        if (voxel_idx == 0) {
            start_idx[out_idx] = 0;
            points_count[out_idx] = min(max_points_per_voxel,
                                        unique_hashes_count_prefix_sum[0]);
        } else {
            start_idx[out_idx] = unique_hashes_count_prefix_sum[voxel_idx - 1];
            points_count[out_idx] =
                    min(max_points_per_voxel,
                        unique_hashes_count_prefix_sum[voxel_idx] -
                                unique_hashes_count_prefix_sum[voxel_idx - 1]);
        }
    }
}

/// Computes the starting index of each voxel.
///
/// \param start_idx The output array for storing the starting index.
/// \param points_count The output array for storing the points count.
/// \param num_voxels_prefix_sum The inclusive prefix sum which gives
/// the index of the starting voxel for each batch.
/// \param unique_hashes_count_prefix_sum Inclusive prefix sum defining
/// where the point indices for each voxel end.
/// \param out_batch_splits Defines the starting and ending voxels for
/// each batch element.
/// \param batch_size The batch size.
/// \param max_voxels Maximum voxels per batch.
/// \param max_points_per_voxel Maximum points per voxel.
void ComputeStartIdx(const cudaStream_t& stream,
                     int64_t* start_idx,
                     int64_t* points_count,
                     const int64_t* num_voxels_prefix_sum,
                     const int64_t* unique_hashes_count_prefix_sum,
                     const int64_t* out_batch_splits,
                     const int64_t batch_size,
                     const int64_t max_voxels,
                     const int64_t max_points_per_voxel) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(batch_size));
    grid.z = grid.y;
    grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);
    ComputeStartIdxKernel<<<grid, block, 0, stream>>>(
            start_idx, points_count, num_voxels_prefix_sum,
            unique_hashes_count_prefix_sum, out_batch_splits, batch_size,
            max_voxels, max_points_per_voxel);
}

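// Worked example (illustrative values): with batch_size = 1,
// unique_hashes_count_prefix_sum = [4, 6, 9] (three voxels with 4, 2 and 3
// points), out_batch_splits = [0, 3] and max_points_per_voxel = 2, the kernel
// writes start_idx = [0, 4, 6] and points_count = [2, 2, 2].
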
template <class T, int NDIM>
__global__ void ComputeVoxelCoordsKernel(
        int32_t* __restrict__ voxel_coords,
        const T* const __restrict__ points,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum,
        const MiniVec<T, NDIM> points_range_min_vec,
        const MiniVec<T, NDIM> inv_voxel_size,
        int64_t num_voxels) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t point_idx = point_indices[prefix_sum[linear_idx]];

    Vec_t point(points + point_idx * NDIM);
    auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                          .template cast<int32_t>();

    for (int i = 0; i < NDIM; ++i) {
        voxel_coords[linear_idx * NDIM + i] = coords[i];
    }
}

/// Computes the coordinates for each voxel.
///
/// \param voxel_coords The output array with shape [num_voxels, NDIM].
/// \param points The array with the point coordinates.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum Inclusive prefix sum defining where the point indices
/// for each voxel end.
/// \param points_range_min_vec The lower bound of the domain to be
/// voxelized.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths for each
/// dimension.
/// \param num_voxels The number of voxels.
template <class T, int NDIM>
void ComputeVoxelCoords(const cudaStream_t& stream,
                        int32_t* voxel_coords,
                        const T* const points,
                        const int64_t* const point_indices,
                        const int64_t* const prefix_sum,
                        const MiniVec<T, NDIM> points_range_min_vec,
                        const MiniVec<T, NDIM> inv_voxel_size,
                        int64_t num_voxels) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num_voxels));
    grid.z = grid.y;
    grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
    ComputeVoxelCoordsKernel<<<grid, block, 0, stream>>>(
            voxel_coords, points, point_indices, prefix_sum,
            points_range_min_vec, inv_voxel_size, num_voxels);
}

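// Worked example (illustrative values, NDIM = 3): if the first point of a
// voxel is (0.25, 0.0, 0.11), points_range_min_vec = (0,0,0) and
// inv_voxel_size = (10,10,10), the stored integer coordinates are (2, 0, 1),
// i.e. (int)((p - range_min) * inv_voxel_size) per dimension.
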
__global__ void CopyPointIndicesKernel(
        int64_t* __restrict__ out,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum_in,
        const int64_t* const __restrict__ prefix_sum_out,
        const int64_t num_voxels) {
    // TODO data coalescing can be optimized
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t begin_out;
    if (0 == linear_idx) {
        begin_out = 0;
    } else {
        begin_out = prefix_sum_out[linear_idx - 1];
    }
    int64_t end_out = prefix_sum_out[linear_idx];
    int64_t in_idx = prefix_sum_in[linear_idx];
    for (int64_t out_idx = begin_out; out_idx < end_out; ++out_idx, ++in_idx) {
        out[out_idx] = point_indices[in_idx];
    }
}

/// Copies the point indices for each voxel to the output.
///
/// \param out The output array with the point indices for all voxels.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum_in Prefix sum defining where the point indices for
/// each voxel start in \p point_indices.
/// \param prefix_sum_out Inclusive prefix sum defining where the point
/// indices for each voxel end in \p out.
/// \param num_voxels The number of voxels.
void CopyPointIndices(const cudaStream_t& stream,
                      int64_t* out,
                      const int64_t* const point_indices,
                      const int64_t* const prefix_sum_in,
                      const int64_t* const prefix_sum_out,
                      const int64_t num_voxels) {
    const int BLOCKSIZE = 128;
    dim3 block(BLOCKSIZE, 1, 1);
    dim3 grid;
    grid.y = std::ceil(std::cbrt(num_voxels));
    grid.z = grid.y;
    grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);
    CopyPointIndicesKernel<<<grid, block, 0, stream>>>(
            out, point_indices, prefix_sum_in, prefix_sum_out, num_voxels);
}

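// Worked example (illustrative values): with two voxels,
// prefix_sum_in = [0, 4] (voxel 0 starts at index 0, voxel 1 at index 4 in
// point_indices) and prefix_sum_out = [2, 4] (two points kept per voxel), the
// kernel copies point_indices[0..1] to out[0..1] and point_indices[4..5] to
// out[2..3], dropping the clipped points.
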
/// This function voxelizes a point cloud.
/// The function returns the integer coordinates of the voxels that
/// contain points and a compact list of the indices that associate the
/// voxels to the points.
///
/// All pointer arguments point to device memory unless stated otherwise.
///
/// \tparam T Floating-point data type for the point positions.
///
/// \tparam NDIM The number of dimensions of the points.
///
/// \tparam OUTPUT_ALLOCATOR Type of the output_allocator. See
/// \p output_allocator for more information.
///
/// \param stream The cuda stream for all kernel launches.
///
/// \param temp Pointer to temporary memory. If nullptr then the required
/// size of temporary memory will be written to \p temp_size and no
/// work is done.
///
/// \param temp_size The size of the temporary memory in bytes. This is
/// used as an output if temp is nullptr.
///
/// \param texture_alignment The texture alignment in bytes. This is used
/// for allocating segments within the temporary memory.
///
/// \param num_points The number of points.
///
/// \param points Array with the point positions. The shape is
/// [num_points,NDIM].
///
/// \param batch_size The batch size of the points.
///
/// \param row_splits row_splits for defining the batches.
///
/// \param voxel_size The edge lengths of the voxel. The shape is
/// [NDIM]. This pointer points to host memory!
///
/// \param points_range_min The lower bound of the domain to be
/// voxelized.
/// The shape is [NDIM].
/// This pointer points to host memory!
///
/// \param points_range_max The upper bound of the domain to be
/// voxelized.
/// The shape is [NDIM].
/// This pointer points to host memory!
///
/// \param max_points_per_voxel This parameter limits the number of points
/// that are recorded for each voxel.
///
/// \param max_voxels This parameter limits the number of voxels that
/// will be generated.
///
/// \param output_allocator An object that implements functions for
/// allocating the output arrays. The object must implement
/// functions AllocVoxelCoords(int32_t** ptr, int64_t rows,
/// int64_t cols), AllocVoxelPointIndices(int64_t** ptr, int64_t
/// size), AllocVoxelPointRowSplits(int64_t** ptr, int64_t
/// size) and AllocVoxelBatchSplits(int64_t** ptr, int64_t size).
/// All functions should allocate memory and return a pointer
/// to that memory in ptr. The arguments size, rows, and cols
/// define the size of the array as the number of elements.
/// All functions must accept zero size arguments. In this case
/// ptr does not need to be set.
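///
/// Example usage (an illustrative sketch only; "ExampleOutputAllocator", the
/// plain cudaMalloc-based allocation and the texture_alignment value of 512
/// below are assumptions made for this example, not part of this header):
/// \code
/// class ExampleOutputAllocator {
/// public:
///     void AllocVoxelCoords(int32_t** ptr, int64_t rows, int64_t cols) {
///         *ptr = nullptr;
///         if (rows * cols) cudaMalloc(ptr, rows * cols * sizeof(int32_t));
///     }
///     void AllocVoxelPointIndices(int64_t** ptr, int64_t size) {
///         *ptr = nullptr;
///         if (size) cudaMalloc(ptr, size * sizeof(int64_t));
///     }
///     void AllocVoxelPointRowSplits(int64_t** ptr, int64_t size) {
///         *ptr = nullptr;
///         if (size) cudaMalloc(ptr, size * sizeof(int64_t));
///     }
///     void AllocVoxelBatchSplits(int64_t** ptr, int64_t size) {
///         *ptr = nullptr;
///         if (size) cudaMalloc(ptr, size * sizeof(int64_t));
///     }
/// };
///
/// // Two-pass call pattern: query the temporary memory size first, then run.
/// // stream, num_points, points, batch_size, row_splits, voxel_size,
/// // points_range_min, points_range_max, max_points_per_voxel and max_voxels
/// // are assumed to be set up by the caller as documented above.
/// ExampleOutputAllocator output_allocator;
/// size_t temp_size = 0;
/// VoxelizeCUDA<float, 3>(stream, nullptr, temp_size, 512, num_points, points,
///                        batch_size, row_splits, voxel_size, points_range_min,
///                        points_range_max, max_points_per_voxel, max_voxels,
///                        output_allocator);
/// void* temp = nullptr;
/// cudaMalloc(&temp, temp_size);
/// VoxelizeCUDA<float, 3>(stream, temp, temp_size, 512, num_points, points,
///                        batch_size, row_splits, voxel_size, points_range_min,
///                        points_range_max, max_points_per_voxel, max_voxels,
///                        output_allocator);
/// cudaFree(temp);
/// \endcode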
template <class T, int NDIM, class OUTPUT_ALLOCATOR>
void VoxelizeCUDA(const cudaStream_t& stream,
                  void* temp,
                  size_t& temp_size,
                  int texture_alignment,
                  size_t num_points,
                  const T* const points,
                  const size_t batch_size,
                  const int64_t* const row_splits,
                  const T* const voxel_size,
                  const T* const points_range_min,
                  const T* const points_range_max,
                  const int64_t max_points_per_voxel,
                  const int64_t max_voxels,
                  OUTPUT_ALLOCATOR& output_allocator) {
    using namespace cloudViewer::utility;
    typedef MiniVec<T, NDIM> Vec_t;

    const bool get_temp_size = !temp;

    if (get_temp_size) {
        temp = (char*)1;  // worst case pointer alignment
        temp_size = std::numeric_limits<int64_t>::max();
    }

    MemoryAllocation mem_temp(temp, temp_size, texture_alignment);

    const Vec_t inv_voxel_size = T(1) / Vec_t(voxel_size);
    const Vec_t points_range_min_vec(points_range_min);
    const Vec_t points_range_max_vec(points_range_max);
    MiniVec<int32_t, NDIM> extents =
            ceil((points_range_max_vec - points_range_min_vec) * inv_voxel_size)
                    .template cast<int32_t>();
    MiniVec<int64_t, NDIM> strides;
    for (int i = 0; i < NDIM; ++i) {
        strides[i] = 1;
        for (int j = 0; j < i; ++j) {
            strides[i] *= extents[j];
        }
    }
    const int64_t batch_hash = strides[NDIM - 1] * extents[NDIM - 1];
    const int64_t invalid_hash = batch_hash * batch_size;

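    // Worked example (illustrative values, NDIM = 3): for a range of [0,1]^3
    // and a voxel_size of 0.1 per dimension, extents = (10,10,10),
    // strides = (1,10,100) and batch_hash = 1000; with batch_size = 4,
    // invalid_hash = 4000. Every point of batch element b therefore hashes
    // into the half-open range [b*1000, (b+1)*1000).
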
    // store batch_id for each point
    std::pair<int64_t*, size_t> indices_batches =
            mem_temp.Alloc<int64_t>(num_points);
    if (!get_temp_size) {
        ComputeIndicesBatches(stream, indices_batches.first, row_splits,
                              batch_size);
    }

    // use double buffers for the sorting
    std::pair<int64_t*, size_t> point_indices =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> point_indices_alt =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes = mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes_alt =
            mem_temp.Alloc<int64_t>(num_points);

    cub::DoubleBuffer<int64_t> point_indices_dbuf(point_indices.first,
                                                  point_indices_alt.first);
    cub::DoubleBuffer<int64_t> hashes_dbuf(hashes.first, hashes_alt.first);

    if (!get_temp_size) {
        IotaCUDA(stream, point_indices.first,
                 point_indices.first + point_indices.second, int64_t(0));
        ComputeHash(stream, hashes.first, num_points, points, batch_size,
                    row_splits, indices_batches.first, points_range_min_vec,
                    points_range_max_vec, inv_voxel_size, strides, batch_hash,
                    invalid_hash);
    }

    // TODO compute end_bit for radix sort
    std::pair<void*, size_t> sort_pairs_temp(nullptr, 0);
    cub::DeviceRadixSort::SortPairs(
            sort_pairs_temp.first, sort_pairs_temp.second, hashes_dbuf,
            point_indices_dbuf, num_points, 0, sizeof(int64_t) * 8, stream);
    sort_pairs_temp = mem_temp.Alloc(sort_pairs_temp.second);
    if (!get_temp_size) {
        cub::DeviceRadixSort::SortPairs(sort_pairs_temp.first,
                                        sort_pairs_temp.second, hashes_dbuf,
                                        point_indices_dbuf, num_points, 0,
                                        sizeof(int64_t) * 8, stream);
    }
    mem_temp.Free(sort_pairs_temp);

    // reuse the alternate buffers
    std::pair<int64_t*, size_t> unique_hashes(hashes_dbuf.Alternate(),
                                              hashes.second);
    std::pair<int64_t*, size_t> unique_hashes_count(
            point_indices_dbuf.Alternate(), point_indices.second);

    // encode unique hashes (voxels) and their counts (points per voxel)
    int64_t num_voxels = 0;
    int64_t last_hash = 0;  // 0 is a valid hash value
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_voxels_mem =
                mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, hashes_dbuf.Current(),
                unique_hashes.first, unique_hashes_count.first,
                num_voxels_mem.first, num_points, stream);

        encode_temp = mem_temp.Alloc(encode_temp.second);
        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    hashes_dbuf.Current(), unique_hashes.first,
                    unique_hashes_count.first, num_voxels_mem.first, num_points,
                    stream);

            // get the number of voxels
            cudaMemcpyAsync(&num_voxels, num_voxels_mem.first, sizeof(int64_t),
                            cudaMemcpyDeviceToHost, stream);
            // get the last hash value
            cudaMemcpyAsync(&last_hash,
                            hashes_dbuf.Current() + hashes.second - 1,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }

    if (invalid_hash == last_hash) {
        // the last hash is invalid, which means we have one voxel less
        --num_voxels;
    }

    // reuse the hashes buffer
    std::pair<int64_t*, size_t> unique_hashes_count_prefix_sum(
            hashes_dbuf.Current(), hashes.second);

    // compute the prefix sum for unique_hashes_count
    // gives the starting index of each voxel
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                unique_hashes_count.first, unique_hashes_count_prefix_sum.first,
                unique_hashes_count.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            // We only need the prefix sum for the first num_voxels.
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    unique_hashes_count.first,
                    unique_hashes_count_prefix_sum.first, num_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // Limit the number of output points to max_points_per_voxel by
    // limiting the unique_hashes_count.
    if (!get_temp_size) {
        if (max_points_per_voxel < static_cast<int64_t>(num_points)) {
            LimitCounts(stream, unique_hashes_count.first, num_voxels,
                        max_points_per_voxel);
        }
    }

    // Convert unique_hashes to batch_id (divide by batch_hash)
    int64_t* unique_hashes_batch_id = unique_hashes.first;
    if (!get_temp_size) {
        ComputeBatchId(stream, unique_hashes_batch_id, num_voxels, batch_hash);
    }

    std::pair<int64_t*, size_t> unique_batches =
            mem_temp.Alloc<int64_t>(batch_size);
    std::pair<int64_t*, size_t> unique_batches_count =
            mem_temp.Alloc<int64_t>(batch_size);
    int64_t num_batches = 0;  // stores the number of non-empty batches

    // Convert batch_id to counts (array of num_voxels per batch)
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_batches_mem =
                mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, unique_hashes_batch_id,
                unique_batches.first, unique_batches_count.first,
                num_batches_mem.first, num_voxels, stream);
        encode_temp = mem_temp.Alloc(encode_temp.second);
        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    unique_hashes_batch_id, unique_batches.first,
                    unique_batches_count.first, num_batches_mem.first,
                    num_voxels, stream);

            // get the number of non-empty batches
            cudaMemcpyAsync(&num_batches, num_batches_mem.first,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }

    // Insert a count of 0 for empty batches
    std::pair<int64_t*, size_t> num_voxels_per_batch =
            mem_temp.Alloc<int64_t>(batch_size);
    if (!get_temp_size) {
        cudaMemset(num_voxels_per_batch.first, 0, batch_size * sizeof(int64_t));
        ComputeVoxelPerBatch(stream, num_voxels_per_batch.first,
                             unique_batches_count.first, unique_batches.first,
                             num_batches);
    }

    std::pair<int64_t*, size_t> num_voxels_prefix_sum(unique_batches.first,
                                                      batch_size);

    // compute the prefix sum for the number of voxels per batch
    // gives the starting voxel index for each batch
    // used only when the voxel count exceeds max_voxels
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, num_voxels_prefix_sum.first,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            if (num_voxels > max_voxels) {
                cub::DeviceScan::InclusiveSum(
                        inclusive_scan_temp.first, inclusive_scan_temp.second,
                        num_voxels_per_batch.first, num_voxels_prefix_sum.first,
                        num_voxels_per_batch.second, stream);
            }
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // Limit the number of voxels per batch to max_voxels
    if (!get_temp_size) {
        if (num_voxels >= max_voxels)
            LimitCounts(stream, num_voxels_per_batch.first, batch_size,
                        max_voxels);
    }

    // Prefix sum of the limited counts gives the batch splits.
    int64_t* out_batch_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelBatchSplits(&out_batch_splits,
                                               batch_size + 1);
        cudaMemsetAsync(out_batch_splits, 0, sizeof(int64_t), stream);
    }

    // Prefix sum of counts to get the batch splits
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, out_batch_splits + 1,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    num_voxels_per_batch.first, out_batch_splits + 1,
                    num_voxels_per_batch.second, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // num_valid_voxels excludes voxels exceeding max_voxels
    int64_t num_valid_voxels = num_points;
    if (!get_temp_size) {
        // get the number of valid voxels.
        cudaMemcpyAsync(&num_valid_voxels, out_batch_splits + batch_size,
                        sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
        // wait for the async copies
        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
        }
    }

    // start_idx stores the starting index of each valid voxel.
    // points_count stores the number of valid points in the respective voxel.
    std::pair<int64_t*, size_t> start_idx(indices_batches.first,
                                          indices_batches.second);
    std::pair<int64_t*, size_t> points_count =
            mem_temp.Alloc<int64_t>(num_valid_voxels);

    if (!get_temp_size) {
        if (num_voxels <= max_voxels) {
            // starting index and points_count will be the same as
            // unique_hashes_count_prefix_sum and unique_hashes_count when the
            // voxel count doesn't exceed max_voxels
            cudaMemsetAsync(start_idx.first, 0, sizeof(int64_t), stream);
            cudaMemcpyAsync(start_idx.first + 1,
                            unique_hashes_count_prefix_sum.first,
                            (num_voxels - 1) * sizeof(int64_t),
                            cudaMemcpyDeviceToDevice, stream);
            mem_temp.Free(points_count);
            points_count.first = unique_hashes_count.first;
        } else {
            ComputeStartIdx(stream, start_idx.first, points_count.first,
                            num_voxels_prefix_sum.first,
                            unique_hashes_count_prefix_sum.first,
                            out_batch_splits, batch_size, max_voxels,
                            max_points_per_voxel);
        }
    }

    int64_t* out_voxel_row_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelPointRowSplits(&out_voxel_row_splits,
                                                  num_valid_voxels + 1);
    }

    if (!get_temp_size) {
        // set the first element to 0
        cudaMemsetAsync(out_voxel_row_splits, 0, sizeof(int64_t), stream);
    }

    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                points_count.first, out_voxel_row_splits + 1, num_valid_voxels,
                stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);
        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    points_count.first, out_voxel_row_splits + 1,
                    num_valid_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    if (get_temp_size) {
        // return the memory peak as the required temporary memory size.
        temp_size = mem_temp.MaxUsed();
        return;
    }

    int32_t* out_voxel_coords = nullptr;
    output_allocator.AllocVoxelCoords(&out_voxel_coords, num_valid_voxels,
                                      NDIM);
    ComputeVoxelCoords(stream, out_voxel_coords, points,
                       point_indices_dbuf.Current(), start_idx.first,
                       points_range_min_vec, inv_voxel_size, num_valid_voxels);

    int64_t num_valid_points = 0;
    // get the number of valid points
    cudaMemcpyAsync(&num_valid_points,
                    out_voxel_row_splits + num_valid_voxels,
                    sizeof(int64_t), cudaMemcpyDeviceToHost, stream);
    // wait for the async copies
    while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
    }

    int64_t* out_point_indices = nullptr;
    output_allocator.AllocVoxelPointIndices(&out_point_indices,
                                            num_valid_points);
    CopyPointIndices(stream, out_point_indices, point_indices_dbuf.Current(),
                     start_idx.first, out_voxel_row_splits + 1,
                     num_valid_voxels);
}

}  // namespace cloudViewer