ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ContinuousConvCUDAKernels.cu
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include <Helper.h>
9 
10 #include "ml/impl/continuous_conv/ContinuousConvCUDAKernels.h"
11 
12 using cloudViewer::utility::DivUp;
13 
14 namespace cloudViewer {
15 namespace ml {
16 namespace impl {
17 
18 /// Kernel for FillColumn
19 template <class TFeat,
20  class TReal,
21  class TIndex,
22  bool ALIGN_CORNERS,
23  CoordinateMapping MAPPING,
24  InterpolationMode INTERPOLATION>
25 __global__ void FillColumnKernel(
26  TFeat* columns,
27  int in_channels,
28  TIndex begin_idx,
29  TIndex end_idx,
30  TIndex num_out,
31  const TReal* const __restrict__ out_positions,
32  TIndex num_inp,
33  const TReal* const __restrict__ inp_positions,
34  const TFeat* const __restrict__ inp_features,
35  const TFeat* const __restrict__ inp_importance,
36  size_t neighbors_index_size,
37  const TIndex* const __restrict__ neighbors_index,
38  const TFeat* const __restrict__ neighbors_importance,
39  const int64_t* const __restrict__ neighbors_row_splits,
40  const TReal* const __restrict__ extents,
41  const TReal* const __restrict__ offsets,
42  int filter_size_x,
43  int filter_size_y,
44  int filter_size_z,
45  bool INDIVIDUAL_EXTENT,
46  bool ISOTROPIC_EXTENT,
47  bool NORMALIZE,
48  bool POINT_IMPORTANCE,
49  bool NEIGHBOR_IMPORTANCE) {
50  TIndex out_idx = begin_idx + blockIdx.x;
51  if (out_idx >= end_idx) return;
52  const int NUM_INTERP_VALUES =
53  (INTERPOLATION == InterpolationMode::LINEAR ||
54  INTERPOLATION == InterpolationMode::LINEAR_BORDER
55  ? 8
56  : 1);
57  TReal interp_weights[NUM_INTERP_VALUES];
58  TIndex interp_indices[NUM_INTERP_VALUES];
59 
60  TReal offset[3] = {offsets[0], offsets[1], offsets[2]};
61 
62  const TIndex col_idx = out_idx - begin_idx;
63  TFeat* out_column = columns + filter_size_x * filter_size_y *
64  filter_size_z * in_channels * col_idx;
65  const int64_t neighbor_start = neighbors_row_splits[out_idx];
66  const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
67 
68  TReal out_pos[3] = {out_positions[out_idx * 3 + 0],
69  out_positions[out_idx * 3 + 1],
70  out_positions[out_idx * 3 + 2]};
71 
72  TReal inv_extents[3];
73  if (INDIVIDUAL_EXTENT) {
74  if (ISOTROPIC_EXTENT) {
75  inv_extents[0] = TReal(1) / extents[out_idx];
76  inv_extents[1] = inv_extents[0];
77  inv_extents[2] = inv_extents[0];
78  } else {
79  inv_extents[0] = TReal(1) / extents[3 * out_idx + 0];
80  inv_extents[1] = TReal(1) / extents[3 * out_idx + 1];
81  inv_extents[2] = TReal(1) / extents[3 * out_idx + 2];
82  }
83  } else {
84  if (ISOTROPIC_EXTENT) {
85  inv_extents[0] = TReal(1) / extents[0];
86  inv_extents[1] = inv_extents[0];
87  inv_extents[2] = inv_extents[0];
88  } else {
89  inv_extents[0] = TReal(1) / extents[0];
90  inv_extents[1] = TReal(1) / extents[1];
91  inv_extents[2] = TReal(1) / extents[2];
92  }
93  }
94 
95  TReal normalizer = TReal(0);
96  if (NORMALIZE) {
97  if (NEIGHBOR_IMPORTANCE) {
98  for (int64_t n_idx = neighbor_start + threadIdx.x;
99  n_idx < neighbor_end; n_idx += blockDim.x) {
100  TReal n_importance = neighbors_importance[n_idx];
101  normalizer += n_importance;
102  }
103  unsigned int mask = __activemask();
104  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
105  normalizer += __shfl_down_sync(mask, normalizer, offset);
106  normalizer = __shfl_sync(mask, normalizer, 0);
107  } else {
108  int64_t num_neighbors = neighbor_end - neighbor_start;
109  normalizer = num_neighbors;
110  }
111  }
112 
113  for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
114  const TIndex inp_idx = neighbors_index[n_idx];
115  const TFeat n_importance =
116  NEIGHBOR_IMPORTANCE ? neighbors_importance[n_idx] : TFeat(1);
117 
118  TReal x, y, z;
119  x = inp_positions[inp_idx * 3 + 0] - out_pos[0];
120  y = inp_positions[inp_idx * 3 + 1] - out_pos[1];
121  z = inp_positions[inp_idx * 3 + 2] - out_pos[2];
122 
123  ComputeFilterCoordinates<ALIGN_CORNERS, MAPPING>(
124  x, y, z, filter_size_x, filter_size_y, filter_size_z,
125  inv_extents[0], inv_extents[1], inv_extents[2], offset[0],
126  offset[1], offset[2]);
127  Interpolate<INTERPOLATION>(interp_weights, interp_indices, x, y, z,
128  filter_size_x, filter_size_y, filter_size_z);
129 
130  TFeat infeat = 0;
131  TFeat importance = 1;
132  if (POINT_IMPORTANCE) importance = inp_importance[inp_idx];
133  if (NEIGHBOR_IMPORTANCE) importance *= n_importance;
134  if (NORMALIZE && normalizer != 0) importance /= normalizer;
135 
136  for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
137  infeat = importance * inp_features[inp_idx * in_channels + ic];
138  for (int j = 0; j < NUM_INTERP_VALUES; ++j) {
139  TFeat value = interp_weights[j] * infeat;
140  out_column[interp_indices[j] * in_channels + ic] += value;
141  }
142  }
143  } // for n
144 }
145 
146 template <class TFeat, class TReal, class TIndex>
147 void FillColumn(const cudaStream_t& stream,
148  TFeat* columns,
149  int in_channels,
150  TIndex begin_idx,
151  TIndex end_idx,
152  TIndex num_out,
153  const TReal* const __restrict__ out_positions,
154  TIndex num_inp,
155  const TReal* const __restrict__ inp_positions,
156  const TFeat* const __restrict__ inp_features,
157  const TFeat* const __restrict__ inp_importance,
158  size_t neighbors_index_size,
159  const TIndex* const __restrict__ neighbors_index,
160  const TFeat* const __restrict__ neighbors_importance,
161  const int64_t* const __restrict__ neighbors_row_splits,
162  const TReal* const __restrict__ extents,
163  const TReal* const __restrict__ offsets,
164  const std::vector<int>& filter_dims,
165  InterpolationMode interpolation,
166  CoordinateMapping coordinate_mapping,
167  bool align_corners,
168  bool individual_extent,
169  bool isotropic_extent,
170  bool normalize) {
171  const int filter_size_z = filter_dims[0];
172  const int filter_size_y = filter_dims[1];
173  const int filter_size_x = filter_dims[2];
174 
175  TIndex num_columns = end_idx - begin_idx;
176  int filter_spatial_size = filter_size_x * filter_size_y * filter_size_z;
177  cudaMemsetAsync(
178  columns, 0,
179  sizeof(TFeat) * filter_spatial_size * in_channels * num_columns,
180  stream);
181 
182  const int BLOCKSIZE = 32;
183  dim3 block(BLOCKSIZE, 1, 1);
184  dim3 grid(0, 1, 1);
185  grid.x = num_columns;
186 
187 #define FN_PARAMETERS \
188  columns, in_channels, begin_idx, end_idx, num_out, out_positions, num_inp, \
189  inp_positions, inp_features, inp_importance, neighbors_index_size, \
190  neighbors_index, neighbors_importance, neighbors_row_splits, \
191  extents, offsets, filter_size_x, filter_size_y, filter_size_z, \
192  individual_extent, isotropic_extent, normalize, \
193  inp_importance != nullptr, neighbors_importance != nullptr
194 
195 #define CALL_TEMPLATE(INTERPOLATION, MAPPING, ALIGN_CORNERS) \
196  if (INTERPOLATION == interpolation && MAPPING == coordinate_mapping && \
197  ALIGN_CORNERS == align_corners) \
198  FillColumnKernel<TFeat, TReal, TIndex, ALIGN_CORNERS, MAPPING, \
199  INTERPOLATION> \
200  <<<grid, block, 0, stream>>>(FN_PARAMETERS);
201 
202 #define CALL_TEMPLATE2(INTERPOLATION, MAPPING) \
203  CALL_TEMPLATE(INTERPOLATION, MAPPING, true) \
204  CALL_TEMPLATE(INTERPOLATION, MAPPING, false)
205 
206 #define CALL_TEMPLATE3(INTERPOLATION) \
207  CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::BALL_TO_CUBE_RADIAL) \
208  CALL_TEMPLATE2(INTERPOLATION, \
209  CoordinateMapping::BALL_TO_CUBE_VOLUME_PRESERVING) \
210  CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::IDENTITY)
211 
212 #define CALL_TEMPLATE4 \
213  CALL_TEMPLATE3(InterpolationMode::LINEAR) \
214  CALL_TEMPLATE3(InterpolationMode::LINEAR_BORDER) \
215  CALL_TEMPLATE3(InterpolationMode::NEAREST_NEIGHBOR)
216 
217  if (grid.x) {
218  CALL_TEMPLATE4
219  /*CHECK_CUDA_ERROR*/
220  }
221 
222 #undef CALL_TEMPLATE
223 #undef CALL_TEMPLATE2
224 #undef CALL_TEMPLATE3
225 #undef CALL_TEMPLATE4
226 
227 #undef FN_PARAMETERS
228 }
229 
230 template void FillColumn<float, float, int32_t>(
231  const cudaStream_t& stream,
232  float* columns,
233  int in_channels,
234  int32_t begin_idx,
235  int32_t end_idx,
236  int32_t num_out,
237  const float* const __restrict__ out_positions,
238  int32_t num_inp,
239  const float* const __restrict__ inp_positions,
240  const float* const __restrict__ inp_features,
241  const float* const __restrict__ inp_importance,
242  size_t neighbors_index_size,
243  const int32_t* const __restrict__ neighbors_index,
244  const float* const __restrict__ neighbors_importance,
245  const int64_t* const __restrict__ neighbors_row_splits,
246  const float* const __restrict__ extents,
247  const float* const __restrict__ offsets,
248  const std::vector<int>& filter_dims,
249  InterpolationMode interpolation,
250  CoordinateMapping coordinate_mapping,
251  bool align_corners,
252  bool individual_extent,
253  bool isotropic_extent,
254  bool normalize);
255 
256 template <class TFeat,
257  class TReal,
258  class TIndex,
259  bool ALIGN_CORNERS,
260  CoordinateMapping MAPPING,
261  InterpolationMode INTERPOLATION>
262 __global__ void FillColumnTransposeKernel(
263  TFeat* columns,
264  int in_channels,
265  TIndex begin_idx,
266  TIndex end_idx,
267  TIndex num_out,
268  const TReal* const __restrict__ out_positions,
269  TIndex num_inp,
270  const TReal* const __restrict__ inp_positions,
271  const TFeat* const __restrict__ inp_features,
272  size_t neighbors_index_size,
273  const TIndex* const __restrict__ neighbors_index,
274  const TFeat* const __restrict__ inp_neighbors_importance_sum,
275  const int64_t* const __restrict__ inp_neighbors_prefix_sum,
276  const TFeat* const __restrict__ neighbors_importance,
277  const int64_t* const __restrict__ neighbors_row_splits,
278  const TReal* const __restrict__ extents,
279  const TReal* const __restrict__ offsets,
280  int filter_size_x,
281  int filter_size_y,
282  int filter_size_z,
283  bool INDIVIDUAL_EXTENT,
284  bool ISOTROPIC_EXTENT,
285  bool NORMALIZE,
286  bool NEIGHBOR_IMPORTANCE) {
287  TIndex out_idx = begin_idx + blockIdx.x;
288  if (out_idx >= end_idx) return;
289  const int NUM_INTERP_VALUES =
290  (INTERPOLATION == InterpolationMode::LINEAR ||
291  INTERPOLATION == InterpolationMode::LINEAR_BORDER
292  ? 8
293  : 1);
294  TReal interp_weights[NUM_INTERP_VALUES];
295  TIndex interp_indices[NUM_INTERP_VALUES];
296 
297  TReal offset[3] = {offsets[0], offsets[1], offsets[2]};
298 
299  const TIndex col_idx = out_idx - begin_idx;
300  TFeat* out_column = columns + filter_size_x * filter_size_y *
301  filter_size_z * in_channels * col_idx;
302  const int64_t neighbor_start = neighbors_row_splits[out_idx];
303  const int64_t neighbor_end = neighbors_row_splits[out_idx + 1];
304 
305  TReal out_pos[3] = {out_positions[out_idx * 3 + 0],
306  out_positions[out_idx * 3 + 1],
307  out_positions[out_idx * 3 + 2]};
308 
309  TReal inv_extents[3];
310  if (INDIVIDUAL_EXTENT == false) {
311  if (ISOTROPIC_EXTENT) {
312  inv_extents[0] = TReal(1) / extents[0];
313  inv_extents[1] = inv_extents[0];
314  inv_extents[2] = inv_extents[0];
315  } else {
316  inv_extents[0] = TReal(1) / extents[0];
317  inv_extents[1] = TReal(1) / extents[1];
318  inv_extents[2] = TReal(1) / extents[2];
319  }
320  }
321 
322  for (int64_t n_idx = neighbor_start; n_idx < neighbor_end; ++n_idx) {
323  const TIndex inp_idx = neighbors_index[n_idx];
324 
325  TReal x, y, z;
326  x = out_pos[0] - inp_positions[inp_idx * 3 + 0];
327  y = out_pos[1] - inp_positions[inp_idx * 3 + 1];
328  z = out_pos[2] - inp_positions[inp_idx * 3 + 2];
329 
330  if (INDIVIDUAL_EXTENT) {
331  if (ISOTROPIC_EXTENT) {
332  inv_extents[0] = TReal(1) / extents[inp_idx];
333  inv_extents[1] = inv_extents[0];
334  inv_extents[2] = inv_extents[0];
335  } else {
336  inv_extents[0] = TReal(1) / extents[3 * inp_idx + 0];
337  inv_extents[1] = TReal(1) / extents[3 * inp_idx + 1];
338  inv_extents[2] = TReal(1) / extents[3 * inp_idx + 2];
339  }
340  }
341 
342  TReal num_inp_neighbors_normalizer = 1;
343  if (NORMALIZE) {
344  if (NEIGHBOR_IMPORTANCE) {
345  if (inp_neighbors_importance_sum[inp_idx] != 0)
346  num_inp_neighbors_normalizer /=
347  inp_neighbors_importance_sum[inp_idx];
348  } else {
349  const int64_t inp_neighbor_start =
350  inp_neighbors_prefix_sum[inp_idx];
351  const int64_t inp_neighbor_end =
352  inp_idx + 1 < num_inp
353  ? inp_neighbors_prefix_sum[inp_idx + 1]
354  : neighbors_index_size;
355  const size_t num_inp_neighbors =
356  inp_neighbor_end - inp_neighbor_start;
357  if (num_inp_neighbors > 0)
358  num_inp_neighbors_normalizer /= num_inp_neighbors;
359  }
360  }
361 
362  ComputeFilterCoordinates<ALIGN_CORNERS, MAPPING>(
363  x, y, z, filter_size_x, filter_size_y, filter_size_z,
364  inv_extents[0], inv_extents[1], inv_extents[2], offset[0],
365  offset[1], offset[2]);
366  Interpolate<INTERPOLATION>(interp_weights, interp_indices, x, y, z,
367  filter_size_x, filter_size_y, filter_size_z);
368 
369  TFeat infeat = 0;
370  for (int ic = threadIdx.x; ic < in_channels; ic += blockDim.x) {
371  infeat = inp_features[inp_idx * in_channels + ic];
372  if (NEIGHBOR_IMPORTANCE) infeat *= neighbors_importance[n_idx];
373  if (NORMALIZE) infeat *= num_inp_neighbors_normalizer;
374  for (int j = 0; j < NUM_INTERP_VALUES; ++j) {
375  TFeat value = interp_weights[j] * infeat;
376  out_column[interp_indices[j] * in_channels + ic] += value;
377  }
378  }
379  } // for n
380 }
381 
382 template <class TFeat, class TReal, class TIndex>
383 void FillColumnTranspose(
384  const cudaStream_t& stream,
385  TFeat* columns,
386  int in_channels,
387  TIndex begin_idx,
388  TIndex end_idx,
389  TIndex num_out,
390  const TReal* const __restrict__ out_positions,
391  TIndex num_inp,
392  const TReal* const __restrict__ inp_positions,
393  const TFeat* const __restrict__ inp_features,
394  const TFeat* const __restrict__ inp_neighbors_importance_sum,
395  const int64_t* const __restrict__ inp_neighbors_prefix_sum,
396  size_t neighbors_index_size,
397  const TIndex* const __restrict__ neighbors_index,
398  const TFeat* const __restrict__ neighbors_importance,
399  const int64_t* const __restrict__ neighbors_row_splits,
400  const TReal* const __restrict__ extents,
401  const TReal* const __restrict__ offsets,
402  const std::vector<int>& filter_dims,
403  InterpolationMode interpolation,
404  CoordinateMapping coordinate_mapping,
405  bool align_corners,
406  bool individual_extent,
407  bool isotropic_extent,
408  bool normalize) {
409  const bool has_neighbors_importance = inp_neighbors_importance_sum;
410  const int filter_size_z = filter_dims[0];
411  const int filter_size_y = filter_dims[1];
412  const int filter_size_x = filter_dims[2];
413 
414  TIndex num_columns = end_idx - begin_idx;
415  int filter_spatial_size = filter_size_x * filter_size_y * filter_size_z;
416  cudaMemsetAsync(
417  columns, 0,
418  sizeof(TFeat) * filter_spatial_size * in_channels * num_columns,
419  stream);
420 
421  const int BLOCKSIZE = 32;
422  dim3 block(BLOCKSIZE, 1, 1);
423  dim3 grid(0, 1, 1);
424  grid.x = num_columns;
425 
426 #define FN_PARAMETERS \
427  columns, in_channels, begin_idx, end_idx, num_out, out_positions, num_inp, \
428  inp_positions, inp_features, neighbors_index_size, \
429  neighbors_index, inp_neighbors_importance_sum, \
430  inp_neighbors_prefix_sum, neighbors_importance, \
431  neighbors_row_splits, extents, offsets, filter_size_x, \
432  filter_size_y, filter_size_z, individual_extent, isotropic_extent, \
433  normalize, has_neighbors_importance
434 
435 #define CALL_TEMPLATE(INTERPOLATION, MAPPING, ALIGN_CORNERS) \
436  if (INTERPOLATION == interpolation && MAPPING == coordinate_mapping && \
437  ALIGN_CORNERS == align_corners) \
438  FillColumnTransposeKernel<TFeat, TReal, TIndex, ALIGN_CORNERS, \
439  MAPPING, INTERPOLATION> \
440  <<<grid, block, 0, stream>>>(FN_PARAMETERS);
441 
442 #define CALL_TEMPLATE2(INTERPOLATION, MAPPING) \
443  CALL_TEMPLATE(INTERPOLATION, MAPPING, true) \
444  CALL_TEMPLATE(INTERPOLATION, MAPPING, false)
445 
446 #define CALL_TEMPLATE3(INTERPOLATION) \
447  CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::BALL_TO_CUBE_RADIAL) \
448  CALL_TEMPLATE2(INTERPOLATION, \
449  CoordinateMapping::BALL_TO_CUBE_VOLUME_PRESERVING) \
450  CALL_TEMPLATE2(INTERPOLATION, CoordinateMapping::IDENTITY)
451 
452 #define CALL_TEMPLATE4 \
453  CALL_TEMPLATE3(InterpolationMode::LINEAR) \
454  CALL_TEMPLATE3(InterpolationMode::LINEAR_BORDER) \
455  CALL_TEMPLATE3(InterpolationMode::NEAREST_NEIGHBOR)
456 
457  if (grid.x) {
458  CALL_TEMPLATE4
459  /*CHECK_CUDA_ERROR*/
460  }
461 
462 #undef CALL_TEMPLATE
463 #undef CALL_TEMPLATE2
464 #undef CALL_TEMPLATE3
465 #undef CALL_TEMPLATE4
466 
467 #undef FN_PARAMETERS
468 }
469 
470 template void FillColumnTranspose<float, float, int32_t>(
471  const cudaStream_t& stream,
472  float* columns,
473  int in_channels,
474  int32_t begin_idx,
475  int32_t end_idx,
476  int32_t num_out,
477  const float* const __restrict__ out_positions,
478  int32_t num_inp,
479  const float* const __restrict__ inp_positions,
480  const float* const __restrict__ inp_features,
481  const float* const __restrict__ inp_neighbors_importance_sum,
482  const int64_t* const __restrict__ inp_neighbors_prefix_sum,
483  size_t neighbors_index_size,
484  const int32_t* const __restrict__ neighbors_index,
485  const float* const __restrict__ neighbors_importance,
486  const int64_t* const __restrict__ neighbors_row_splits,
487  const float* const __restrict__ extents,
488  const float* const __restrict__ offsets,
489  const std::vector<int>& filter_dims,
490  InterpolationMode interpolation,
491  CoordinateMapping coordinate_mapping,
492  bool align_corners,
493  bool individual_extent,
494  bool isotropic_extent,
495  bool normalize);
496 
497 template <class T>
498 __global__ void MultiplyColumnsKernel(size_t rows,
499  size_t cols,
500  T* __restrict__ col_major_matrix,
501  const T* const __restrict__ vector) {
502  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
503  if (idx >= rows * cols) return;
504 
505  size_t col = idx / rows;
506 
507  T factor = vector[col];
508  col_major_matrix[idx] *= factor;
509 }
510 
511 template <class T>
512 void MultiplyColumns(const cudaStream_t& stream,
513  size_t rows,
514  size_t cols,
515  T* __restrict__ col_major_matrix,
516  const T* const __restrict__ vector) {
517  const int BLOCKSIZE = 128;
518  dim3 block(BLOCKSIZE, 1, 1);
519  dim3 grid(0, 1, 1);
520  grid.x = DivUp(rows * cols, BLOCKSIZE);
521 
522  if (grid.x) {
523  MultiplyColumnsKernel<T><<<grid, block, 0, stream>>>(
524  rows, cols, col_major_matrix, vector);
525  }
526 }
527 
528 template void MultiplyColumns<float>(const cudaStream_t& stream,
529  size_t rows,
530  size_t cols,
531  float* __restrict__ col_major_matrix,
532  const float* const __restrict__ vector);
533 
534 template <class T>
535 __global__ void MultiplyAndCopyColumnsKernel(
536  size_t rows,
537  size_t cols,
538  T* __restrict__ out_ptr,
539  const T* const __restrict__ col_major_matrix,
540  const T* const __restrict__ vector) {
541  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
542  if (idx >= rows * cols) return;
543 
544  size_t col = idx / rows;
545 
546  T factor = vector[col];
547  out_ptr[idx] = col_major_matrix[idx] * factor;
548 }
549 
550 template <class T>
551 void MultiplyAndCopyColumns(const cudaStream_t& stream,
552  size_t rows,
553  size_t cols,
554  T* __restrict__ out_ptr,
555  const T* const __restrict__ col_major_matrix,
556  const T* const __restrict__ vector) {
557  const int BLOCKSIZE = 128;
558  dim3 block(BLOCKSIZE, 1, 1);
559  dim3 grid(0, 1, 1);
560  grid.x = DivUp(rows * cols, BLOCKSIZE);
561 
562  if (grid.x) {
563  MultiplyAndCopyColumnsKernel<T><<<grid, block, 0, stream>>>(
564  rows, cols, out_ptr, col_major_matrix, vector);
565  }
566 }
567 
568 template void MultiplyAndCopyColumns<float>(
569  const cudaStream_t& stream,
570  size_t rows,
571  size_t cols,
572  float* __restrict__ out_ptr,
573  const float* const __restrict__ col_major_matrix,
574  const float* const __restrict__ vector);
575 
576 } // namespace impl
577 } // namespace ml
578 } // namespace cloudViewer