ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
BlockSelect.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include "core/Tensor.h"
11 #include "core/nns/kernel/Select.cuh"
12 
13 namespace cloudViewer {
14 namespace core {
15 
16 template <typename K,
17  typename IndexType,
18  bool Dir,
19  int NumWarpQ,
20  int NumThreadQ,
21  int ThreadsPerBlock>
22 __global__ void blockSelect(K* in,
23  K* outK,
24  IndexType* outV,
25  K initK,
26  IndexType initV,
27  int k,
28  int dim,
29  int num_points) {
30  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
31 
32  __shared__ K smemK[kNumWarps * NumWarpQ];
33  __shared__ IndexType smemV[kNumWarps * NumWarpQ];
34 
35  BlockSelect<K, IndexType, Dir, NumWarpQ, NumThreadQ, ThreadsPerBlock> heap(
36  initK, initV, smemK, smemV, k);
37 
38  // Grid is exactly sized to rows available
39  int row = blockIdx.x;
40 
41  int i = threadIdx.x;
42  K* inStart = in + dim * row + i;
43 
44  // Whole warps must participate in the selection
45  int limit = (dim / kWarpSize) * kWarpSize;
46 
47  for (; i < limit; i += ThreadsPerBlock) {
48  heap.add(*inStart, (IndexType)i);
49  inStart += ThreadsPerBlock;
50  }
51 
52  // Handle last remainder fraction of a warp of elements
53  if (i < dim) {
54  heap.addThreadQ(*inStart, (IndexType)i);
55  }
56 
57  heap.reduce();
58 
59  for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
60  *(outK + row * dim + i) = smemK[i];
61  *(outV + row * dim + i) = smemV[i];
62  }
63 }
64 
65 template <typename K,
66  typename IndexType,
67  bool Dir,
68  int NumWarpQ,
69  int NumThreadQ,
70  int ThreadsPerBlock>
71 __global__ void blockSelectPair(K* inK,
72  IndexType* inV,
73  K* outK,
74  IndexType* outV,
75  K initK,
76  IndexType initV,
77  int k,
78  int dim,
79  int num_points) {
80  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
81 
82  __shared__ K smemK[kNumWarps * NumWarpQ];
83  __shared__ IndexType smemV[kNumWarps * NumWarpQ];
84 
85  BlockSelect<K, IndexType, Dir, NumWarpQ, NumThreadQ, ThreadsPerBlock> heap(
86  initK, initV, smemK, smemV, k);
87 
88  // Grid is exactly sized to rows available
89  int row = blockIdx.x;
90 
91  int i = threadIdx.x;
92  K* inKStart = &inK[row * dim + i];
93  IndexType* inVStart = &inV[row * dim + i];
94 
95  // Whole warps must participate in the selection
96  int limit = (dim / kWarpSize) * kWarpSize;
97 
98  for (; i < limit; i += ThreadsPerBlock) {
99  heap.add(*inKStart, *inVStart);
100  inKStart += ThreadsPerBlock;
101  inVStart += ThreadsPerBlock;
102  }
103 
104  // Handle last remainder fraction of a warp of elements
105  if (i < dim) {
106  heap.addThreadQ(*inKStart, *inVStart);
107  }
108 
109  heap.reduce();
110 
111  for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
112  outK[row * k + i] = smemK[i];
113  outV[row * k + i] = smemV[i];
114  }
115 }
116 
117 void runBlockSelectPair(cudaStream_t stream,
118  float* inK,
119  int32_t* inV,
120  float* outK,
121  int32_t* outV,
122  bool dir,
123  int k,
124  int dim,
125  int num_points);
126 
127 void runBlockSelectPair(cudaStream_t stream,
128  float* inK,
129  int64_t* inV,
130  float* outK,
131  int64_t* outV,
132  bool dir,
133  int k,
134  int dim,
135  int num_points);
136 
137 void runBlockSelectPair(cudaStream_t stream,
138  double* inK,
139  int32_t* inV,
140  double* outK,
141  int32_t* outV,
142  bool dir,
143  int k,
144  int dim,
145  int num_points);
146 
147 void runBlockSelectPair(cudaStream_t stream,
148  double* inK,
149  int64_t* inV,
150  double* outK,
151  int64_t* outV,
152  bool dir,
153  int k,
154  int dim,
155  int num_points);
156 
157 } // namespace core
158 } // namespace cloudViewer