1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
10 #include "ml/impl/sparse_conv/SparseConvCUDAKernels.h"
12 using cloudViewer::utility::DivUp;
14 namespace cloudViewer {
/// Kernel for FillColumn.
///
/// Gathers the features of each output point's neighbors into a dense
/// per-output "column" of num_kernel_elements * in_channels values so the
/// sparse convolution can afterwards be evaluated as a matrix product.
///
/// Launch layout established by the code below: one thread block per output
/// point in [begin_idx, end_idx) (blockIdx.x selects the output); threads of
/// the block stride over neighbors / input channels with threadIdx.x.
///
/// NOTE(review): the inline numbering of this listing jumps (e.g. 20 -> 27,
/// 34 -> 36), i.e. several parameter declarations, local declarations
/// ('importance', 'infeat', the NORMALIZE flag) and closing braces are not
/// visible in this view of the file.
19 template <class TReal, class TIndex, class TKernelIndex>
20 __global__ void FillColumnKernel(
27         const TReal* const __restrict__ inp_features,
28         const TReal* const __restrict__ inp_importance,
29         size_t neighbors_index_size,
30         const TIndex* const __restrict__ neighbors_index,
31         const TKernelIndex* const __restrict__ neighbors_kernel_index,
32         const TReal* const __restrict__ neighbors_importance,
33         const int64_t* const __restrict__ neighbors_row_splits,
34         const int num_kernel_elements,
36         bool POINT_IMPORTANCE,
37         bool NEIGHBOR_IMPORTANCE) {
// One output point per block; guard against a grid larger than the range.
38     TIndex out_idx = begin_idx + blockIdx.x;
39     if (out_idx >= end_idx) return;
41     const TIndex col_idx = out_idx - begin_idx;
// Base address of this output's column inside the packed 'columns' buffer.
42     TReal* out_column = columns + num_kernel_elements * in_channels * col_idx;
// CSR-style row splits delimit this output point's neighbor range.
43     const int64_t neighbor_start = neighbors_row_splits[out_idx];
44     const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
// Normalizer used when NORMALIZE is set: sum of neighbor importances if
// importances are given, otherwise (else branch further down) the raw
// neighbor count.
46     TReal normalizer = TReal(0);
48     if (NEIGHBOR_IMPORTANCE) {
// Each thread accumulates a partial sum over its strided share of neighbors.
49         for (int64_t n_idx = neighbor_start + threadIdx.x;
50              n_idx < neighbor_end; n_idx += blockDim.x) {
51             TReal n_importance = neighbors_importance[n_idx];
52             normalizer += n_importance;
// Warp shuffle tree-reduction of the partial sums, then broadcast lane 0's
// total to every lane so all threads hold the same normalizer.
// NOTE(review): __activemask() returns whichever lanes happen to be
// converged at this instant; best practice is an explicit mask
// (0xffffffff when the whole 32-thread block reaches this point) — confirm
// all lanes are active here before relying on this reduction.
54         unsigned int mask = __activemask();
55         for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
56             normalizer += __shfl_down_sync(mask, normalizer, offset);
57         normalizer = __shfl_sync(mask, normalizer, 0);
// Fallback (the else-brace is on a line not visible in this view):
// normalize by the plain neighbor count.
59         int64_t num_neighbors = neighbor_end - neighbor_start;
60         normalizer = num_neighbors;
// Gather phase: scale each neighbor's features and write them into the
// column slot selected by that neighbor's kernel element index.
64     for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
65         const TIndex inp_idx = neighbors_index[n_idx];
66         const TReal n_importance =
67                 NEIGHBOR_IMPORTANCE ? neighbors_importance[n_idx] : TReal(1);
68         int kernel_idx = neighbors_kernel_index[n_idx];
// 'importance' is declared on lines not visible in this view; it combines
// the optional per-point and per-neighbor weights and the normalizer.
72         if (POINT_IMPORTANCE) importance = inp_importance[inp_idx];
73         if (NEIGHBOR_IMPORTANCE) importance *= n_importance;
74         if (NORMALIZE && normalizer != 0) importance /= normalizer;
// Threads stride over input channels; adjacent threads read adjacent
// elements of inp_features (coalesced access pattern).
76         for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
77             infeat = importance * inp_features[inp_idx * in_channels + ic];
78             out_column[kernel_idx * in_channels + ic] = infeat;
/// Host-side launcher for FillColumnKernel.
///
/// Computes the launch configuration (one 32-thread block per output
/// column), clears the column buffer (the memset call spans lines not fully
/// visible in this view) and dispatches the kernel on the given stream.
///
/// NOTE(review): the inline numbering jumps here as well — several
/// parameters, the grid declaration and the dispatch/#undef lines are not
/// visible in this view of the file.
83 template <class TReal, class TIndex, class TKernelIndex>
84 void FillColumn(const cudaStream_t& stream,
91                 const TReal* const __restrict__ inp_features,
92 const TReal* const __restrict__ inp_importance,
93 size_t neighbors_index_size,
94 const TIndex* const __restrict__ neighbors_index,
95 const TKernelIndex* const __restrict__ neighbors_kernel_index,
96 const TReal* const __restrict__ neighbors_importance,
97 const int64_t* const __restrict__ neighbors_row_splits,
98 const int num_kernel_elements,
// Number of output points handled by this launch == number of columns.
100 TIndex num_columns = end_idx - begin_idx;
// Byte size of the packed column buffer; this is an argument to the
// clear/memset call whose first line is not visible in this view.
103 sizeof(TReal) * num_kernel_elements * in_channels * num_columns,
// One warp-sized block per column; threads stride over channels inside the
// kernel.
106 const int BLOCKSIZE = 32;
107 dim3 block(BLOCKSIZE, 1, 1);
109 grid.x = num_columns;
// Argument list forwarded to the kernel. The trailing null-pointer checks
// become the POINT_IMPORTANCE / NEIGHBOR_IMPORTANCE runtime flags.
// (Do not insert comments inside the macro: backslash continuations.)
111 #define FN_PARAMETERS \
112     columns, in_channels, begin_idx, end_idx, num_out, num_inp, inp_features, \
113 inp_importance, neighbors_index_size, neighbors_index, \
114 neighbors_kernel_index, neighbors_importance, \
115 neighbors_row_splits, num_kernel_elements, normalize, \
116 inp_importance != nullptr, neighbors_importance != nullptr
// The third template argument (TKernelIndex) is deduced from the
// neighbors_kernel_index argument at the call site.
118 #define CALL_TEMPLATE \
119 FillColumnKernel<TReal, TIndex><<<grid, block, 0, stream>>>(FN_PARAMETERS);
// Explicit instantiation: float features, int32 neighbor indices,
// int16 kernel-element indices. (Parameter lines are elided in this view.)
130 template void FillColumn<float, int32_t, int16_t>(
131 const cudaStream_t& stream,
138 const float* const __restrict__ inp_features,
139 const float* const __restrict__ inp_importance,
140 size_t neighbors_index_size,
141 const int32_t* const __restrict__ neighbors_index,
142 const int16_t* const __restrict__ neighbors_kernel_index,
143 const float* const __restrict__ neighbors_importance,
144 const int64_t* const __restrict__ neighbors_row_splits,
145 const int num_kernel_elements,
// Explicit instantiation: float features, int32 neighbor indices,
// uint8 kernel-element indices. (Parameter lines are elided in this view.)
148 template void FillColumn<float, int32_t, uint8_t>(
149 const cudaStream_t& stream,
156 const float* const __restrict__ inp_features,
157 const float* const __restrict__ inp_importance,
158 size_t neighbors_index_size,
159 const int32_t* const __restrict__ neighbors_index,
160 const uint8_t* const __restrict__ neighbors_kernel_index,
161 const float* const __restrict__ neighbors_importance,
162 const int64_t* const __restrict__ neighbors_row_splits,
163 const int num_kernel_elements,
/// Kernel for FillColumnTranspose (transposed sparse convolution).
///
/// Like FillColumnKernel, but each neighbor's contribution is *accumulated*
/// (+=) into the column, and normalization is derived from the *input*
/// point's neighbor statistics: its importance sum if importances are
/// given, otherwise its neighbor count from the prefix-sum array.
///
/// Launch layout established below: one thread block per output point in
/// [begin_idx, end_idx); threads stride over channels with threadIdx.x.
///
/// NOTE(review): the inline numbering jumps; some parameters, braces and
/// declarations (e.g. 'infeat', the NORMALIZE flag) are not visible in this
/// view of the file.
166 template <class TReal, class TIndex, class TKernelIndex>
167 __global__ void FillColumnTransposeKernel(
174 const TReal* const __restrict__ inp_features,
175 size_t neighbors_index_size,
176 const TIndex* const __restrict__ neighbors_index,
177 const TKernelIndex* const __restrict__ neighbors_kernel_index,
178 const TReal* const __restrict__ inp_neighbors_importance_sum,
179 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
180 const TReal* const __restrict__ neighbors_importance,
181 const int64_t* const __restrict__ neighbors_row_splits,
182 const int num_kernel_elements,
184 bool NEIGHBOR_IMPORTANCE) {
// One output point per block; guard against a grid larger than the range.
185 TIndex out_idx = begin_idx + blockIdx.x;
186 if (out_idx >= end_idx) return;
188 const TIndex col_idx = out_idx - begin_idx;
// Base address of this output's column in the packed 'columns' buffer.
189 TReal* out_column = columns + num_kernel_elements * in_channels * col_idx;
190 const int64_t neighbor_start = neighbors_row_splits[out_idx];
191 const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
// Serial loop over this output's neighbors; the threads of the block
// parallelize over channels in the inner loop at the bottom.
193 for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
194 const TIndex inp_idx = neighbors_index[n_idx];
195 const int kernel_idx = neighbors_kernel_index[n_idx];
// Reciprocal normalizer for NORMALIZE mode (starts at 1; divided below).
197 TReal num_inp_neighbors_normalizer = 1;
199 if (NEIGHBOR_IMPORTANCE) {
// Divide by the input point's importance sum, guarding against zero.
200 if (inp_neighbors_importance_sum[inp_idx] != 0)
201 num_inp_neighbors_normalizer /=
202 inp_neighbors_importance_sum[inp_idx];
// Fallback branch (its brace is on a line not visible in this view):
// divide by the input point's neighbor count, computed from the prefix-sum
// array; the last point's range ends at neighbors_index_size.
204 const int64_t inp_neighbor_start =
205 inp_neighbors_prefix_sum[inp_idx];
206 const int64_t inp_neighbor_end =
207 inp_idx + 1 < num_inp
208 ? inp_neighbors_prefix_sum[inp_idx + 1]
209 : neighbors_index_size;
210 const size_t num_inp_neighbors =
211 inp_neighbor_end - inp_neighbor_start;
212 if (num_inp_neighbors > 0)
213 num_inp_neighbors_normalizer /= num_inp_neighbors;
// Accumulate the (optionally weighted and normalized) features into the
// column slot for this kernel element; '+=' because several neighbors may
// map to the same kernel_idx. 'infeat' is declared on an elided line.
218 for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
219 infeat = inp_features[inp_idx * in_channels + ic];
220 if (NEIGHBOR_IMPORTANCE) infeat *= neighbors_importance[n_idx];
221 if (NORMALIZE) infeat *= num_inp_neighbors_normalizer;
223 out_column[kernel_idx * in_channels + ic] += infeat;
/// Host-side launcher for FillColumnTransposeKernel.
///
/// Mirrors FillColumn: computes the launch configuration (one 32-thread
/// block per output column), clears the column buffer (the memset call
/// spans lines not fully visible here) and dispatches the kernel on the
/// given stream.
///
/// NOTE(review): the inline numbering jumps — several parameters, the grid
/// declaration and the dispatch/#undef lines are not visible in this view.
228 template <class TReal, class TIndex, class TKernelIndex>
229 void FillColumnTranspose(
230 const cudaStream_t& stream,
237 const TReal* const __restrict__ inp_features,
238 const TReal* const __restrict__ inp_neighbors_importance_sum,
239 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
240 size_t neighbors_index_size,
241 const TIndex* const __restrict__ neighbors_index,
242 const TKernelIndex* const __restrict__ neighbors_kernel_index,
243 const TReal* const __restrict__ neighbors_importance,
244 const int64_t* const __restrict__ neighbors_row_splits,
245 const int num_kernel_elements,
// Pointer-to-bool conversion: the importance-sum array is optional and a
// null pointer disables the importance path inside the kernel.
247 const bool has_neighbors_importance = inp_neighbors_importance_sum;
// Number of output points handled by this launch == number of columns.
249 TIndex num_columns = end_idx - begin_idx;
// Byte size of the packed column buffer; argument to the clear/memset call
// whose first line is not visible in this view.
252 sizeof(TReal) * num_kernel_elements * in_channels * num_columns,
// One warp-sized block per column; threads stride over channels in-kernel.
255 const int BLOCKSIZE = 32;
256 dim3 block(BLOCKSIZE, 1, 1);
258 grid.x = num_columns;
// Argument list forwarded to the kernel. (No comments inside the macro:
// backslash continuations.)
260 #define FN_PARAMETERS \
261 columns, in_channels, begin_idx, end_idx, num_out, num_inp, inp_features, \
262 neighbors_index_size, neighbors_index, neighbors_kernel_index, \
263 inp_neighbors_importance_sum, inp_neighbors_prefix_sum, \
264 neighbors_importance, neighbors_row_splits, num_kernel_elements, \
265 normalize, has_neighbors_importance
// TKernelIndex is deduced from the neighbors_kernel_index argument.
267 #define CALL_TEMPLATE \
268 FillColumnTransposeKernel<TReal, TIndex> \
269 <<<grid, block, 0, stream>>>(FN_PARAMETERS);
// Explicit instantiation: float features, int32 neighbor indices,
// int16 kernel-element indices. (Parameter lines are elided in this view.)
279 template void FillColumnTranspose<float, int32_t, int16_t>(
280 const cudaStream_t& stream,
287 const float* const __restrict__ inp_features,
288 const float* const __restrict__ inp_neighbors_importance_sum,
289 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
290 size_t neighbors_index_size,
291 const int32_t* const __restrict__ neighbors_index,
292 const int16_t* const __restrict__ neighbors_kernel_index,
293 const float* const __restrict__ neighbors_importance,
294 const int64_t* const __restrict__ neighbors_row_splits,
295 const int num_kernel_elements,
// Explicit instantiation: float features, int32 neighbor indices,
// uint8 kernel-element indices. (Parameter lines are elided in this view.)
298 template void FillColumnTranspose<float, int32_t, uint8_t>(
299 const cudaStream_t& stream,
306 const float* const __restrict__ inp_features,
307 const float* const __restrict__ inp_neighbors_importance_sum,
308 const int64_t* const __restrict__ inp_neighbors_prefix_sum,
309 size_t neighbors_index_size,
310 const int32_t* const __restrict__ neighbors_index,
311 const uint8_t* const __restrict__ neighbors_kernel_index,
312 const float* const __restrict__ neighbors_importance,
313 const int64_t* const __restrict__ neighbors_row_splits,
314 const int num_kernel_elements,
319 } // namespace cloudViewer