1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
10 #include "ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
12 using cloudViewer::utility::DivUp;
14 namespace cloudViewer {
18 /// Kernel for FillColumn
19 template <class TFeat,
23 CoordinateMapping MAPPING,
24 InterpolationMode INTERPOLATION>
25 __global__ void FillColumnKernel(
31 const TReal* const __restrict__ out_positions,
33 const TReal* const __restrict__ inp_positions,
34 const TFeat* const __restrict__ inp_features,
35 const TFeat* const __restrict__ inp_importance,
36 size_t neighbors_index_size,
37 const TIndex* const __restrict__ neighbors_index,
38 const TFeat* const __restrict__ neighbors_importance,
39 const int64_t* const __restrict__ neighbors_row_splits,
40 const TReal* const __restrict__ extents,
41 const TReal* const __restrict__ offsets,
45 bool INDIVIDUAL_EXTENT,
46 bool ISOTROPIC_EXTENT,
48 bool POINT_IMPORTANCE,
49 bool NEIGHBOR_IMPORTANCE) {
50 TIndex out_idx = begin_idx + blockIdx.x;
51 if (out_idx >= end_idx) return;
52 const int NUM_INTERP_VALUES =
53 (INTERPOLATION == InterpolationMode::LINEAR ||
54 INTERPOLATION == InterpolationMode::LINEAR_BORDER
57 TReal interp_weights[NUM_INTERP_VALUES];
58 TIndex interp_indices[NUM_INTERP_VALUES];
60 TReal offset[3] = {offsets[0], offsets[1], offsets[2]};
62 const TIndex col_idx = out_idx - begin_idx;
63 TFeat* out_column = columns + filter_size_x * filter_size_y *
64 filter_size_z * in_channels * col_idx;
65 const int64_t neighbor_start = neighbors_row_splits[out_idx];
66 const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
68 TReal out_pos[3] = {out_positions[out_idx * 3 + 0],
69 out_positions[out_idx * 3 + 1],
70 out_positions[out_idx * 3 + 2]};
73 if (INDIVIDUAL_EXTENT) {
74 if (ISOTROPIC_EXTENT) {
75 inv_extents[0] = TReal(1) / extents[out_idx];
76 inv_extents[1] = inv_extents[0];
77 inv_extents[2] = inv_extents[0];
79 inv_extents[0] = TReal(1) / extents[3 * out_idx + 0];
80 inv_extents[1] = TReal(1) / extents[3 * out_idx + 1];
81 inv_extents[2] = TReal(1) / extents[3 * out_idx + 2];
84 if (ISOTROPIC_EXTENT) {
85 inv_extents[0] = TReal(1) / extents[0];
86 inv_extents[1] = inv_extents[0];
87 inv_extents[2] = inv_extents[0];
89 inv_extents[0] = TReal(1) / extents[0];
90 inv_extents[1] = TReal(1) / extents[1];
91 inv_extents[2] = TReal(1) / extents[2];
95 TReal normalizer = TReal(0);
97 if (NEIGHBOR_IMPORTANCE) {
98 for (int64_t n_idx = neighbor_start + threadIdx.x;
99 n_idx < neighbor_end; n_idx += blockDim.x) {
100 TReal n_importance = neighbors_importance[n_idx];
101 normalizer += n_importance;
103 unsigned int mask = __activemask();
104 for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
105 normalizer += __shfl_down_sync(mask, normalizer, offset);
106 normalizer = __shfl_sync(mask, normalizer, 0);
108 int64_t num_neighbors = neighbor_end - neighbor_start;
109 normalizer = num_neighbors;
113 for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
114 const TIndex inp_idx = neighbors_index[n_idx];
115 const TFeat n_importance =
116 NEIGHBOR_IMPORTANCE ? neighbors_importance[n_idx] : TFeat(1);
119 x = inp_positions[inp_idx * 3 + 0] - out_pos[0];
120 y = inp_positions[inp_idx * 3 + 1] - out_pos[1];
121 z = inp_positions[inp_idx * 3 + 2] - out_pos[2];
123 ComputeFilterCoordinates<ALIGN_CORNERS, MAPPING>(
124 x, y, z, filter_size_x, filter_size_y, filter_size_z,
125 inv_extents[0], inv_extents[1], inv_extents[2], offset[0],
126 offset[1], offset[2]);
127 Interpolate<INTERPOLATION>(interp_weights, interp_indices, x, y, z,
128 filter_size_x, filter_size_y, filter_size_z);
131 TFeat importance = 1;
132 if (POINT_IMPORTANCE) importance = inp_importance[inp_idx];
133 if (NEIGHBOR_IMPORTANCE) importance *= n_importance;
134 if (NORMALIZE && normalizer != 0) importance /= normalizer;
136 for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
137 infeat = importance * inp_features[inp_idx * in_channels + ic];
138 for (int j = 0; j < NUM_INTERP_VALUES; ++j) {
139 TFeat value = interp_weights[j] * infeat;
140 out_column[interp_indices[j] * in_channels + ic] += value;
146 template <class TFeat, class TReal, class TIndex>
147 void FillColumn(const cudaStream_t& stream,
153 const TReal* const __restrict__ out_positions,
155 const TReal* const __restrict__ inp_positions,
156 const TFeat* const __restrict__ inp_features,
157 const TFeat* const __restrict__ inp_importance,
158 size_t neighbors_index_size,
159 const TIndex* const __restrict__ neighbors_index,
160 const TFeat* const __restrict__ neighbors_importance,
161 const int64_t* const __restrict__ neighbors_row_splits,
162 const TReal* const __restrict__ extents,
163 const TReal* const __restrict__ offsets,
164 const std::vector<int>& filter_dims,
165 InterpolationMode interpolation,
166 CoordinateMapping coordinate_mapping,
168 bool individual_extent,
169 bool isotropic_extent,
171 const int filter_size_z = filter_dims[0];
172 const int filter_size_y = filter_dims[1];
173 const int filter_size_x = filter_dims[2];
175 TIndex num_columns = end_idx - begin_idx;
176 int filter_spatial_size = filter_size_x * filter_size_y * filter_size_z;
179 sizeof(TFeat) * filter_spatial_size * in_channels * num_columns,
182 const int BLOCKSIZE = 32;
183 dim3 block(BLOCKSIZE, 1, 1);
185 grid.x = num_columns;
187 #define FN_PARAMETERS \
188 columns, in_channels, begin_idx, end_idx, num_out, out_positions, num_inp, \
189 inp_positions, inp_features, inp_importance, neighbors_index_size, \
190 neighbors_index, neighbors_importance, neighbors_row_splits, \
191 extents, offsets, filter_size_x, filter_size_y, filter_size_z, \
192 individual_extent, isotropic_extent, normalize, \
193 inp_importance != nullptr, neighbors_importance != nullptr
195 #define CALL_TEMPLATE(INTERPOLATION, MAPPING, ALIGN_CORNERS) \
196 if (INTERPOLATION == interpolation && MAPPING == coordinate_mapping && \
197 ALIGN_CORNERS == align_corners) \
198 FillColumnKernel<TFeat, TReal, TIndex, ALIGN_CORNERS, MAPPING, \
200 <<<grid, block, 0, stream>>>(FN_PARAMETERS);
202 #define CALL_TEMPLATE2(INTERPOLATION, MAPPING) \
203 CALL_TEMPLATE(INTERPOLATION, MAPPING, true) \
204 CALL_TEMPLATE(INTERPOLATION, MAPPING, false)
206 #define CALL_TEMPLATE3(INTERPOLATION) \
207 CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::BALL_TO_CUBE_RADIAL) \
208 CALL_TEMPLATE2(INTERPOLATION, \
209 CoordinateMapping::BALL_TO_CUBE_VOLUME_PRESERVING) \
210 CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::IDENTITY)
212 #define CALL_TEMPLATE4 \
213 CALL_TEMPLATE3(InterpolationMode::LINEAR) \
214 CALL_TEMPLATE3(InterpolationMode::LINEAR_BORDER) \
215 CALL_TEMPLATE3(InterpolationMode::NEAREST_NEIGHBOR)
223 #undef CALL_TEMPLATE2
224 #undef CALL_TEMPLATE3
225 #undef CALL_TEMPLATE4
230 template void FillColumn<float, float, int32_t>(
231 const cudaStream_t& stream,
237 const float* const __restrict__ out_positions,
239 const float* const __restrict__ inp_positions,
240 const float* const __restrict__ inp_features,
241 const float* const __restrict__ inp_importance,
242 size_t neighbors_index_size,
243 const int32_t* const __restrict__ neighbors_index,
244 const float* const __restrict__ neighbors_importance,
245 const int64_t* const __restrict__ neighbors_row_splits,
246 const float* const __restrict__ extents,
247 const float* const __restrict__ offsets,
248 const std::vector<int>& filter_dims,
249 InterpolationMode interpolation,
250 CoordinateMapping coordinate_mapping,
252 bool individual_extent,
253 bool isotropic_extent,
256 template <class TFeat,
260 CoordinateMapping MAPPING,
261 InterpolationMode INTERPOLATION>
262 __global__ void FillColumnTransposeKernel(
268 const TReal* const __restrict__ out_positions,
270 const TReal* const __restrict__ inp_positions,
271 const TFeat* const __restrict__ inp_features,
272 size_t neighbors_index_size,
273 const TIndex* const __restrict__ neighbors_index,
274 const TFeat* const __restrict__ inp_neighbors_importance_sum,
275 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
276 const TFeat* const __restrict__ neighbors_importance,
277 const int64_t* const __restrict__ neighbors_row_splits,
278 const TReal* const __restrict__ extents,
279 const TReal* const __restrict__ offsets,
283 bool INDIVIDUAL_EXTENT,
284 bool ISOTROPIC_EXTENT,
286 bool NEIGHBOR_IMPORTANCE) {
287 TIndex out_idx = begin_idx + blockIdx.x;
288 if (out_idx >= end_idx) return;
289 const int NUM_INTERP_VALUES =
290 (INTERPOLATION == InterpolationMode::LINEAR ||
291 INTERPOLATION == InterpolationMode::LINEAR_BORDER
294 TReal interp_weights[NUM_INTERP_VALUES];
295 TIndex interp_indices[NUM_INTERP_VALUES];
297 TReal offset[3] = {offsets[0], offsets[1], offsets[2]};
299 const TIndex col_idx = out_idx - begin_idx;
300 TFeat* out_column = columns + filter_size_x * filter_size_y *
301 filter_size_z * in_channels * col_idx;
302 const int64_t neighbor_start = neighbors_row_splits[out_idx];
303 const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
305 TReal out_pos[3] = {out_positions[out_idx * 3 + 0],
306 out_positions[out_idx * 3 + 1],
307 out_positions[out_idx * 3 + 2]};
309 TReal inv_extents[3];
310 if (INDIVIDUAL_EXTENT == false) {
311 if (ISOTROPIC_EXTENT) {
312 inv_extents[0] = TReal(1) / extents[0];
313 inv_extents[1] = inv_extents[0];
314 inv_extents[2] = inv_extents[0];
316 inv_extents[0] = TReal(1) / extents[0];
317 inv_extents[1] = TReal(1) / extents[1];
318 inv_extents[2] = TReal(1) / extents[2];
322 for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
323 const TIndex inp_idx = neighbors_index[n_idx];
326 x = out_pos[0] - inp_positions[inp_idx * 3 + 0];
327 y = out_pos[1] - inp_positions[inp_idx * 3 + 1];
328 z = out_pos[2] - inp_positions[inp_idx * 3 + 2];
330 if (INDIVIDUAL_EXTENT) {
331 if (ISOTROPIC_EXTENT) {
332 inv_extents[0] = TReal(1) / extents[inp_idx];
333 inv_extents[1] = inv_extents[0];
334 inv_extents[2] = inv_extents[0];
336 inv_extents[0] = TReal(1) / extents[3 * inp_idx + 0];
337 inv_extents[1] = TReal(1) / extents[3 * inp_idx + 1];
338 inv_extents[2] = TReal(1) / extents[3 * inp_idx + 2];
342 TReal num_inp_neighbors_normalizer = 1;
344 if (NEIGHBOR_IMPORTANCE) {
345 if (inp_neighbors_importance_sum[inp_idx] != 0)
346 num_inp_neighbors_normalizer /=
347 inp_neighbors_importance_sum[inp_idx];
349 const int64_t inp_neighbor_start =
350 inp_neighbors_prefix_sum[inp_idx];
351 const int64_t inp_neighbor_end =
352 inp_idx + 1 < num_inp
353 ? inp_neighbors_prefix_sum[inp_idx + 1]
354 : neighbors_index_size;
355 const size_t num_inp_neighbors =
356 inp_neighbor_end - inp_neighbor_start;
357 if (num_inp_neighbors > 0)
358 num_inp_neighbors_normalizer /= num_inp_neighbors;
362 ComputeFilterCoordinates<ALIGN_CORNERS, MAPPING>(
363 x, y, z, filter_size_x, filter_size_y, filter_size_z,
364 inv_extents[0], inv_extents[1], inv_extents[2], offset[0],
365 offset[1], offset[2]);
366 Interpolate<INTERPOLATION>(interp_weights, interp_indices, x, y, z,
367 filter_size_x, filter_size_y, filter_size_z);
370 for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
371 infeat = inp_features[inp_idx * in_channels + ic];
372 if (NEIGHBOR_IMPORTANCE) infeat *= neighbors_importance[n_idx];
373 if (NORMALIZE) infeat *= num_inp_neighbors_normalizer;
374 for (int j = 0; j < NUM_INTERP_VALUES; ++j) {
375 TFeat value = interp_weights[j] * infeat;
376 out_column[interp_indices[j] * in_channels + ic] += value;
382 template <class TFeat, class TReal, class TIndex>
383 void FillColumnTranspose(
384 const cudaStream_t& stream,
390 const TReal* const __restrict__ out_positions,
392 const TReal* const __restrict__ inp_positions,
393 const TFeat* const __restrict__ inp_features,
394 const TFeat* const __restrict__ inp_neighbors_importance_sum,
395 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
396 size_t neighbors_index_size,
397 const TIndex* const __restrict__ neighbors_index,
398 const TFeat* const __restrict__ neighbors_importance,
399 const int64_t* const __restrict__ neighbors_row_splits,
400 const TReal* const __restrict__ extents,
401 const TReal* const __restrict__ offsets,
402 const std::vector<int>& filter_dims,
403 InterpolationMode interpolation,
404 CoordinateMapping coordinate_mapping,
406 bool individual_extent,
407 bool isotropic_extent,
409 const bool has_neighbors_importance = inp_neighbors_importance_sum;
410 const int filter_size_z = filter_dims[0];
411 const int filter_size_y = filter_dims[1];
412 const int filter_size_x = filter_dims[2];
414 TIndex num_columns = end_idx - begin_idx;
415 int filter_spatial_size = filter_size_x * filter_size_y * filter_size_z;
418 sizeof(TFeat) * filter_spatial_size * in_channels * num_columns,
421 const int BLOCKSIZE = 32;
422 dim3 block(BLOCKSIZE, 1, 1);
424 grid.x = num_columns;
426 #define FN_PARAMETERS \
427 columns, in_channels, begin_idx, end_idx, num_out, out_positions, num_inp, \
428 inp_positions, inp_features, neighbors_index_size, \
429 neighbors_index, inp_neighbors_importance_sum, \
430 inp_neighbors_prefix_sum, neighbors_importance, \
431 neighbors_row_splits, extents, offsets, filter_size_x, \
432 filter_size_y, filter_size_z, individual_extent, isotropic_extent, \
433 normalize, has_neighbors_importance
435 #define CALL_TEMPLATE(INTERPOLATION, MAPPING, ALIGN_CORNERS) \
436 if (INTERPOLATION == interpolation && MAPPING == coordinate_mapping && \
437 ALIGN_CORNERS == align_corners) \
438 FillColumnTransposeKernel<TFeat, TReal, TIndex, ALIGN_CORNERS, \
439 MAPPING, INTERPOLATION> \
440 <<<grid, block, 0, stream>>>(FN_PARAMETERS);
442 #define CALL_TEMPLATE2(INTERPOLATION, MAPPING) \
443 CALL_TEMPLATE(INTERPOLATION, MAPPING, true) \
444 CALL_TEMPLATE(INTERPOLATION, MAPPING, false)
446 #define CALL_TEMPLATE3(INTERPOLATION) \
447 CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::BALL_TO_CUBE_RADIAL) \
448 CALL_TEMPLATE2(INTERPOLATION, \
449 CoordinateMapping::BALL_TO_CUBE_VOLUME_PRESERVING) \
450 CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::IDENTITY)
452 #define CALL_TEMPLATE4 \
453 CALL_TEMPLATE3(InterpolationMode::LINEAR) \
454 CALL_TEMPLATE3(InterpolationMode::LINEAR_BORDER) \
455 CALL_TEMPLATE3(InterpolationMode::NEAREST_NEIGHBOR)
463 #undef CALL_TEMPLATE2
464 #undef CALL_TEMPLATE3
465 #undef CALL_TEMPLATE4
470 template void FillColumnTranspose<float, float, int32_t>(
471 const cudaStream_t& stream,
477 const float* const __restrict__ out_positions,
479 const float* const __restrict__ inp_positions,
480 const float* const __restrict__ inp_features,
481 const float* const __restrict__ inp_neighbors_importance_sum,
482 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
483 size_t neighbors_index_size,
484 const int32_t* const __restrict__ neighbors_index,
485 const float* const __restrict__ neighbors_importance,
486 const int64_t* const __restrict__ neighbors_row_splits,
487 const float* const __restrict__ extents,
488 const float* const __restrict__ offsets,
489 const std::vector<int>& filter_dims,
490 InterpolationMode interpolation,
491 CoordinateMapping coordinate_mapping,
493 bool individual_extent,
494 bool isotropic_extent,
498 __global__ void MultiplyColumnsKernel(size_t rows,
500 T* __restrict__ col_major_matrix,
501 const T* const __restrict__ vector) {
502 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
503 if (idx >= rows * cols) return;
505 size_t col = idx / rows;
507 T factor = vector[col];
508 col_major_matrix[idx] *= factor;
512 void MultiplyColumns(const cudaStream_t& stream,
515 T* __restrict__ col_major_matrix,
516 const T* const __restrict__ vector) {
517 const int BLOCKSIZE = 128;
518 dim3 block(BLOCKSIZE, 1, 1);
520 grid.x = DivUp(rows * cols, BLOCKSIZE);
523 MultiplyColumnsKernel<T><<<grid, block, 0, stream>>>(
524 rows, cols, col_major_matrix, vector);
528 template void MultiplyColumns<float>(const cudaStream_t& stream,
531 float* __restrict__ col_major_matrix,
532 const float* const __restrict__ vector);
535 __global__ void MultiplyAndCopyColumnsKernel(
538 T* __restrict__ out_ptr,
539 const T* const __restrict__ col_major_matrix,
540 const T* const __restrict__ vector) {
541 size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
542 if (idx >= rows * cols) return;
544 size_t col = idx / rows;
546 T factor = vector[col];
547 out_ptr[idx] = col_major_matrix[idx] * factor;
551 void MultiplyAndCopyColumns(const cudaStream_t& stream,
554 T* __restrict__ out_ptr,
555 const T* const __restrict__ col_major_matrix,
556 const T* const __restrict__ vector) {
557 const int BLOCKSIZE = 128;
558 dim3 block(BLOCKSIZE, 1, 1);
560 grid.x = DivUp(rows * cols, BLOCKSIZE);
563 MultiplyAndCopyColumnsKernel<T><<<grid, block, 0, stream>>>(
564 rows, cols, out_ptr, col_major_matrix, vector);
568 template void MultiplyAndCopyColumns<float>(
569 const cudaStream_t& stream,
572 float* __restrict__ out_ptr,
573 const float* const __restrict__ col_major_matrix,
574 const float* const __restrict__ vector);
578 } // namespace cloudViewer