ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
L2Select.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <Helper.h>
11 
12 #include "cloudViewer/core/CUDAUtils.h"
13 #include "core/Tensor.h"
14 #include "core/nns/kernel/Limits.cuh"
15 #include "core/nns/kernel/Pair.cuh"
16 #include "core/nns/kernel/Reduction.cuh"
17 #include "core/nns/kernel/ReductionOps.cuh"
18 #include "core/nns/kernel/Select.cuh"
19 
20 namespace cloudViewer {
21 namespace core {
22 namespace nns {
23 
24 // L2 + select kernel for k == 1, implements re-use of ||c||^2
25 template <typename T, typename TIndex, int kRowsPerBlock, int kBlockSize>
26 __global__ void l2SelectMin1(T* productDistances,
27  T* centroidDistances,
28  T* outDistances,
29  TIndex* outIndices,
30  int num_points,
31  int dim) {
32  // Each block handles kRowsPerBlock rows of the distances (results)
33  Pair<T, int> threadMin[kRowsPerBlock];
34  __shared__ Pair<T, int> blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)];
35 
36  T distance[kRowsPerBlock];
37 
38 #pragma unroll
39  for (int i = 0; i < kRowsPerBlock; ++i) {
40  threadMin[i].k = Limits<T>::getMax();
41  threadMin[i].v = -1;
42  }
43 
44  // blockIdx.x: which chunk of rows we are responsible for updating
45  int rowStart = blockIdx.x * kRowsPerBlock;
46 
47  // FIXME: if we have exact multiples, don't need this
48  bool endRow = (blockIdx.x == gridDim.x - 1);
49 
50  if (endRow) {
51  if (num_points % kRowsPerBlock == 0) {
52  endRow = false;
53  }
54  }
55 
56  if (endRow) {
57  for (int row = rowStart; row < num_points; ++row) {
58  for (int col = threadIdx.x; col < dim; col += blockDim.x) {
59  distance[0] = centroidDistances[col] +
60  productDistances[row + dim + col];
61 
62  if (distance[0] < threadMin[0].k) {
63  threadMin[0].k = distance[0];
64  threadMin[0].v = col;
65  }
66  }
67 
68  // Reduce within the block
69  threadMin[0] = blockReduceAll<Pair<T, int>, Min<Pair<T, int>>,
70  false, false>(
71  threadMin[0], Min<Pair<T, int>>(), blockMin);
72 
73  if (threadIdx.x == 0) {
74  outDistances[row + 0] = threadMin[0].k;
75  outIndices[row + 0] = threadMin[0].v;
76  }
77 
78  // so we can use the shared memory again
79  __syncthreads();
80 
81  threadMin[0].k = Limits<T>::getMax();
82  threadMin[0].v = -1;
83  }
84  } else {
85  for (int col = threadIdx.x; col < dim; col += blockDim.x) {
86  T centroidDistance = centroidDistances[col];
87 
88 #pragma unroll
89  for (int row = 0; row < kRowsPerBlock; ++row) {
90  distance[row] = productDistances[(rowStart + row) * dim + col];
91  }
92 
93 #pragma unroll
94  for (int row = 0; row < kRowsPerBlock; ++row) {
95  distance[row] = distance[row] + centroidDistance;
96  }
97 
98 #pragma unroll
99  for (int row = 0; row < kRowsPerBlock; ++row) {
100  if (distance[row] < threadMin[row].k) {
101  threadMin[row].k = distance[row];
102  threadMin[row].v = col;
103  }
104  }
105  }
106 
107  // Reduce within the block
108  blockReduceAll<kRowsPerBlock, Pair<T, int>, Min<Pair<T, int>>, false,
109  false>(threadMin, Min<Pair<T, int>>(), blockMin);
110 
111  if (threadIdx.x == 0) {
112 #pragma unroll
113  for (int row = 0; row < kRowsPerBlock; ++row) {
114  outDistances[rowStart + row + 0] = threadMin[row].k;
115  outIndices[rowStart + row + 0] = threadMin[row].v;
116  }
117  }
118  }
119 }
120 
121 // L2 + select kernel for k > 1, no re-use of ||c||^2
122 template <typename T,
123  typename TIndex,
124  int NumWarpQ,
125  int NumThreadQ,
126  int ThreadsPerBlock>
127 __global__ void l2SelectMinK(T* productDistances,
128  T* centroidDistances,
129  T* outDistances,
130  TIndex* outIndices,
131  int k,
132  int dim,
133  int num_cols,
134  int tile_cols,
135  T initK) {
136  // Each block handles a single row of the distances (results)
137  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
138 
139  __shared__ T smemK[kNumWarps * NumWarpQ];
140  __shared__ int smemV[kNumWarps * NumWarpQ];
141 
142  BlockSelect<T, int, false, NumWarpQ, NumThreadQ, ThreadsPerBlock> heap(
143  initK, -1, smemK, smemV, k);
144 
145  int row = blockIdx.x;
146 
147  // Whole warps must participate in the selection
148  // int limit = utils::roundDown(dim, kWarpSize);
149  int limit = (dim / kWarpSize) * kWarpSize;
150  int i = threadIdx.x;
151 
152  for (; i < limit; i += blockDim.x) {
153  T v = centroidDistances[i] + productDistances[row * tile_cols + i];
154  heap.add(v, i);
155  }
156 
157  if (i < dim) {
158  T v = centroidDistances[i] + productDistances[row * tile_cols + i];
159  heap.addThreadQ(v, i);
160  }
161 
162  heap.reduce();
163  for (int i = threadIdx.x; i < k; i += blockDim.x) {
164  outDistances[row * k * num_cols + i] = smemK[i];
165  outIndices[row * k * num_cols + i] = smemV[i];
166  }
167 }
168 
169 template <typename T, typename TIndex>
170 void runL2SelectMin(const cudaStream_t stream,
171  Tensor& productDistances,
172  Tensor& centroidDistances,
173  Tensor& outDistances,
174  Tensor& outIndices,
175  int k,
176  int num_cols,
177  int tile_cols) {
178  CLOUDVIEWER_ASSERT(productDistances.GetShape(0) ==
179  outDistances.GetShape(0));
180  CLOUDVIEWER_ASSERT(productDistances.GetShape(0) == outIndices.GetShape(0));
181  CLOUDVIEWER_ASSERT(centroidDistances.GetShape(0) ==
182  productDistances.GetShape(1));
183  CLOUDVIEWER_ASSERT(outDistances.GetShape(1) == k);
184  CLOUDVIEWER_ASSERT(outIndices.GetShape(1) == k);
185  CLOUDVIEWER_ASSERT(k <= GPU_MAX_SELECTION_K);
186 
187  if (k == 1) {
188  constexpr int kThreadsPerBlock = 256;
189  constexpr int kRowsPerBlock = 8;
190 
191  auto block = dim3(kThreadsPerBlock);
192  auto grid =
193  dim3(utility::DivUp(outDistances.GetShape(0), kRowsPerBlock));
194 
195  l2SelectMin1<T, TIndex, kRowsPerBlock, kThreadsPerBlock>
196  <<<grid, block, 0, stream>>>(productDistances.GetDataPtr<T>(),
197  centroidDistances.GetDataPtr<T>(),
198  outDistances.GetDataPtr<T>(),
199  outIndices.GetDataPtr<TIndex>(),
200  (int)productDistances.GetShape(0),
201  (int)productDistances.GetShape(1));
202  } else {
203  auto grid = dim3(outDistances.GetShape(0));
204 
205 #define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \
206  do { \
207  l2SelectMinK<T, TIndex, NUM_WARP_Q, NUM_THREAD_Q, BLOCK> \
208  <<<grid, BLOCK, 0, stream>>>( \
209  productDistances.GetDataPtr<T>(), \
210  centroidDistances.GetDataPtr<T>(), \
211  outDistances.GetDataPtr<T>(), \
212  outIndices.GetDataPtr<TIndex>(), k, \
213  productDistances.GetShape(1), num_cols, tile_cols, \
214  Limits<T>::getMax()); \
215  } while (0)
216 
217  // block size 128 for everything <= 1024
218  if (k <= 32) {
219  RUN_L2_SELECT(128, 32, 2);
220  } else if (k <= 64) {
221  RUN_L2_SELECT(128, 64, 3);
222  } else if (k <= 128) {
223  RUN_L2_SELECT(128, 128, 3);
224  } else if (k <= 256) {
225  RUN_L2_SELECT(128, 256, 4);
226  } else if (k <= 512) {
227  RUN_L2_SELECT(128, 512, 8);
228  } else if (k <= 1024) {
229  RUN_L2_SELECT(128, 1024, 8);
230 
231 #if GPU_MAX_SELECTION_K >= 2048
232  } else if (k <= 2048) {
233  // smaller block for less shared memory
234  RUN_L2_SELECT(64, 2048, 8);
235 #endif
236 
237  } else {
238  CLOUDVIEWER_ASSERT(false);
239  }
240  }
241 
242  // CUDA_TEST_ERROR();
243 }
244 
245 } // namespace nns
246 } // namespace core
247 } // namespace cloudViewer