ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
SparseConvCUDAKernels.cu
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include <Helper.h>
9 
10 #include "ml/impl/sparse_conv/SparseConvCUDAKernels.h"
11 
12 using cloudViewer::utility::DivUp;
13 
14 namespace cloudViewer {
15 namespace ml {
16 namespace impl {
17 
/// Kernel for FillColumn.
///
/// Gathers the (optionally weighted and normalized) input features of each
/// output point's neighbors into its im2col-style column.
///
/// Launch layout (see FillColumn): one block per output column, i.e.
/// grid.x == end_idx - begin_idx, with a 1D block of exactly 32 threads
/// (one full warp). The shuffle reduction below relies on
/// blockDim.x == warpSize and on blockDim.x being a power of two.
template <class TReal, class TIndex, class TKernelIndex>
__global__ void FillColumnKernel(
        TReal* columns,
        int in_channels,
        TIndex begin_idx,
        TIndex end_idx,
        TIndex num_out,
        TIndex num_inp,
        const TReal* const __restrict__ inp_features,
        const TReal* const __restrict__ inp_importance,
        size_t neighbors_index_size,
        const TIndex* const __restrict__ neighbors_index,
        const TKernelIndex* const __restrict__ neighbors_kernel_index,
        const TReal* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool NORMALIZE,
        bool POINT_IMPORTANCE,
        bool NEIGHBOR_IMPORTANCE) {
    TIndex out_idx = begin_idx + blockIdx.x;
    if (out_idx >= end_idx) return;

    // Each output point owns a column of num_kernel_elements*in_channels
    // values inside the columns buffer.
    const TIndex col_idx = out_idx - begin_idx;
    TReal* out_column = columns + num_kernel_elements * in_channels * col_idx;
    const int64_t neighbor_start = neighbors_row_splits[out_idx];
    const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];

    TReal normalizer = TReal(0);
    if (NORMALIZE) {
        if (NEIGHBOR_IMPORTANCE) {
            // Each lane accumulates a partial sum over the neighbor list.
            for (int64_t n_idx = neighbor_start + threadIdx.x;
                 n_idx < neighbor_end; n_idx += blockDim.x) {
                TReal n_importance = neighbors_importance[n_idx];
                normalizer += n_importance;
            }
            // Warp reduction. Control flow is uniform within the block
            // (out_idx depends only on blockIdx.x) and the launcher uses a
            // 32-thread block, so all lanes of the warp participate. Use the
            // explicit full mask rather than __activemask(): under
            // independent thread scheduling (Volta+) __activemask() only
            // reports lanes that happen to be converged, which is not
            // guaranteed to be the intended participant set.
            unsigned int mask = 0xffffffffu;
            for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
                normalizer += __shfl_down_sync(mask, normalizer, offset);
            // Broadcast the total from lane 0 to every lane.
            normalizer = __shfl_sync(mask, normalizer, 0);
        } else {
            // Without per-neighbor weights the normalizer is just the count.
            int64_t num_neighbors = neighbor_end - neighbor_start;
            normalizer = num_neighbors;
        }
    }

    for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
        const TIndex inp_idx = neighbors_index[n_idx];
        const TReal n_importance =
                NEIGHBOR_IMPORTANCE ? neighbors_importance[n_idx] : TReal(1);
        int kernel_idx = neighbors_kernel_index[n_idx];

        TReal infeat = 0;
        // Combine per-point and per-neighbor weights, then normalize.
        TReal importance = 1;
        if (POINT_IMPORTANCE) importance = inp_importance[inp_idx];
        if (NEIGHBOR_IMPORTANCE) importance *= n_importance;
        if (NORMALIZE && normalizer != 0) importance /= normalizer;

        // Threads of the block split the channel dimension; consecutive
        // lanes touch consecutive channels (coalesced loads and stores).
        for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
            infeat = importance * inp_features[inp_idx * in_channels + ic];
            out_column[kernel_idx * in_channels + ic] = infeat;
        }
    }  // for n
}
82 
/// Launches FillColumnKernel for the output range [begin_idx, end_idx).
///
/// First zeroes the destination buffer on the stream so kernel elements
/// without a matching neighbor contribute nothing, then starts one
/// warp-sized (32-thread) block per output column. Passing a null
/// inp_importance or neighbors_importance pointer disables the
/// corresponding weighting inside the kernel.
template <class TReal, class TIndex, class TKernelIndex>
void FillColumn(const cudaStream_t& stream,
                TReal* columns,
                int in_channels,
                TIndex begin_idx,
                TIndex end_idx,
                TIndex num_out,
                TIndex num_inp,
                const TReal* const __restrict__ inp_features,
                const TReal* const __restrict__ inp_importance,
                size_t neighbors_index_size,
                const TIndex* const __restrict__ neighbors_index,
                const TKernelIndex* const __restrict__ neighbors_kernel_index,
                const TReal* const __restrict__ neighbors_importance,
                const int64_t* const __restrict__ neighbors_row_splits,
                const int num_kernel_elements,
                bool normalize) {
    const TIndex num_columns = end_idx - begin_idx;

    // Clear the columns first; the kernel only writes entries for which a
    // neighbor with the matching kernel element exists.
    cudaMemsetAsync(
            columns, 0,
            sizeof(TReal) * num_kernel_elements * in_channels * num_columns,
            stream);

    // One 32-thread block (a single warp) per output column.
    const dim3 block(32, 1, 1);
    dim3 grid(0, 1, 1);
    grid.x = num_columns;

    if (grid.x) {
        FillColumnKernel<TReal, TIndex, TKernelIndex>
                <<<grid, block, 0, stream>>>(
                        columns, in_channels, begin_idx, end_idx, num_out,
                        num_inp, inp_features, inp_importance,
                        neighbors_index_size, neighbors_index,
                        neighbors_kernel_index, neighbors_importance,
                        neighbors_row_splits, num_kernel_elements, normalize,
                        inp_importance != nullptr,
                        neighbors_importance != nullptr);
    }
}
129 
// Explicit template instantiations of FillColumn for the type combinations
// used by the library: float features, int32 neighbor indices, and either
// int16 or uint8 kernel-element indices.
template void FillColumn<float, int32_t, int16_t>(
        const cudaStream_t& stream,
        float* columns,
        int in_channels,
        int32_t begin_idx,
        int32_t end_idx,
        int32_t num_out,
        int32_t num_inp,
        const float* const __restrict__ inp_features,
        const float* const __restrict__ inp_importance,
        size_t neighbors_index_size,
        const int32_t* const __restrict__ neighbors_index,
        const int16_t* const __restrict__ neighbors_kernel_index,
        const float* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool normalize);

template void FillColumn<float, int32_t, uint8_t>(
        const cudaStream_t& stream,
        float* columns,
        int in_channels,
        int32_t begin_idx,
        int32_t end_idx,
        int32_t num_out,
        int32_t num_inp,
        const float* const __restrict__ inp_features,
        const float* const __restrict__ inp_importance,
        size_t neighbors_index_size,
        const int32_t* const __restrict__ neighbors_index,
        const uint8_t* const __restrict__ neighbors_kernel_index,
        const float* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool normalize);
165 
/// Kernel for FillColumnTranspose.
///
/// Builds the columns for the transposed sparse convolution: for each output
/// point it accumulates (+=) the features of its neighbors into the slot of
/// the neighbor's kernel element. Unlike FillColumnKernel, normalization is
/// computed per *input* point, using either the precomputed per-input
/// importance sums or the per-input neighbor counts derived from the prefix
/// sum.
///
/// Launch layout (see FillColumnTranspose): one block per output column,
/// grid.x == end_idx - begin_idx, 1D block; threads split the channel
/// dimension.
template <class TReal, class TIndex, class TKernelIndex>
__global__ void FillColumnTransposeKernel(
        TReal* columns,
        int in_channels,
        TIndex begin_idx,
        TIndex end_idx,
        TIndex num_out,
        TIndex num_inp,
        const TReal* const __restrict__ inp_features,
        size_t neighbors_index_size,
        const TIndex* const __restrict__ neighbors_index,
        const TKernelIndex* const __restrict__ neighbors_kernel_index,
        const TReal* const __restrict__ inp_neighbors_importance_sum,
        const int64_t* const __restrict__ inp_neighbors_prefix_sum,
        const TReal* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool NORMALIZE,
        bool NEIGHBOR_IMPORTANCE) {
    TIndex out_idx = begin_idx + blockIdx.x;
    if (out_idx >= end_idx) return;

    // Each output point owns a column of num_kernel_elements*in_channels
    // values inside the columns buffer.
    const TIndex col_idx = out_idx - begin_idx;
    TReal* out_column = columns + num_kernel_elements * in_channels * col_idx;
    const int64_t neighbor_start = neighbors_row_splits[out_idx];
    const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];

    for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
        const TIndex inp_idx = neighbors_index[n_idx];
        const int kernel_idx = neighbors_kernel_index[n_idx];

        // Reciprocal normalizer for this neighbor's *input* point; stays 1
        // when normalization is disabled or the denominator is zero/empty.
        TReal num_inp_neighbors_normalizer = 1;
        if (NORMALIZE) {
            if (NEIGHBOR_IMPORTANCE) {
                // Normalize by the input point's total neighbor importance.
                if (inp_neighbors_importance_sum[inp_idx] != 0)
                    num_inp_neighbors_normalizer /=
                            inp_neighbors_importance_sum[inp_idx];
            } else {
                // Normalize by the input point's neighbor count, recovered
                // from the prefix sum; the last segment ends at
                // neighbors_index_size.
                const int64_t inp_neighbor_start =
                        inp_neighbors_prefix_sum[inp_idx];
                const int64_t inp_neighbor_end =
                        inp_idx + 1 < num_inp
                                ? inp_neighbors_prefix_sum[inp_idx + 1]
                                : neighbors_index_size;
                const size_t num_inp_neighbors =
                        inp_neighbor_end - inp_neighbor_start;
                if (num_inp_neighbors > 0)
                    num_inp_neighbors_normalizer /= num_inp_neighbors;
            }
        }

        // Threads split the channel dimension. Accumulation is safe: each
        // (channel, block) pair is updated by exactly one thread, serially
        // over the neighbor loop.
        TReal infeat = 0;
        for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
            infeat = inp_features[inp_idx * in_channels + ic];
            if (NEIGHBOR_IMPORTANCE) infeat *= neighbors_importance[n_idx];
            if (NORMALIZE) infeat *= num_inp_neighbors_normalizer;

            out_column[kernel_idx * in_channels + ic] += infeat;
        }
    }  // for n
}
227 
/// Launches FillColumnTransposeKernel for the output range
/// [begin_idx, end_idx).
///
/// Zeroes the destination buffer on the stream (the kernel accumulates with
/// +=), then starts one warp-sized (32-thread) block per output column.
/// A null inp_neighbors_importance_sum pointer disables the per-neighbor
/// importance path inside the kernel.
template <class TReal, class TIndex, class TKernelIndex>
void FillColumnTranspose(
        const cudaStream_t& stream,
        TReal* columns,
        int in_channels,
        TIndex begin_idx,
        TIndex end_idx,
        TIndex num_out,
        TIndex num_inp,
        const TReal* const __restrict__ inp_features,
        const TReal* const __restrict__ inp_neighbors_importance_sum,
        const int64_t* const __restrict__ inp_neighbors_prefix_sum,
        size_t neighbors_index_size,
        const TIndex* const __restrict__ neighbors_index,
        const TKernelIndex* const __restrict__ neighbors_kernel_index,
        const TReal* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool normalize) {
    // Importance weighting is active iff the precomputed sums were provided.
    const bool has_neighbors_importance =
            inp_neighbors_importance_sum != nullptr;

    const TIndex num_columns = end_idx - begin_idx;

    // Clear the columns first; the kernel accumulates into them.
    cudaMemsetAsync(
            columns, 0,
            sizeof(TReal) * num_kernel_elements * in_channels * num_columns,
            stream);

    // One 32-thread block (a single warp) per output column.
    const dim3 block(32, 1, 1);
    dim3 grid(0, 1, 1);
    grid.x = num_columns;

    if (grid.x) {
        FillColumnTransposeKernel<TReal, TIndex, TKernelIndex>
                <<<grid, block, 0, stream>>>(
                        columns, in_channels, begin_idx, end_idx, num_out,
                        num_inp, inp_features, neighbors_index_size,
                        neighbors_index, neighbors_kernel_index,
                        inp_neighbors_importance_sum,
                        inp_neighbors_prefix_sum, neighbors_importance,
                        neighbors_row_splits, num_kernel_elements, normalize,
                        has_neighbors_importance);
    }
}
278 
// Explicit template instantiations of FillColumnTranspose for the type
// combinations used by the library: float features, int32 neighbor indices,
// and either int16 or uint8 kernel-element indices.
template void FillColumnTranspose<float, int32_t, int16_t>(
        const cudaStream_t& stream,
        float* columns,
        int in_channels,
        int32_t begin_idx,
        int32_t end_idx,
        int32_t num_out,
        int32_t num_inp,
        const float* const __restrict__ inp_features,
        const float* const __restrict__ inp_neighbors_importance_sum,
        const int64_t* const __restrict__ inp_neighbors_prefix_sum,
        size_t neighbors_index_size,
        const int32_t* const __restrict__ neighbors_index,
        const int16_t* const __restrict__ neighbors_kernel_index,
        const float* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool normalize);

template void FillColumnTranspose<float, int32_t, uint8_t>(
        const cudaStream_t& stream,
        float* columns,
        int in_channels,
        int32_t begin_idx,
        int32_t end_idx,
        int32_t num_out,
        int32_t num_inp,
        const float* const __restrict__ inp_features,
        const float* const __restrict__ inp_neighbors_importance_sum,
        const int64_t* const __restrict__ inp_neighbors_prefix_sum,
        size_t neighbors_index_size,
        const int32_t* const __restrict__ neighbors_index,
        const uint8_t* const __restrict__ neighbors_kernel_index,
        const float* const __restrict__ neighbors_importance,
        const int64_t* const __restrict__ neighbors_row_splits,
        const int num_kernel_elements,
        bool normalize);
316 
317 } // namespace impl
318 } // namespace ml
319 } // namespace cloudViewer