ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
DistancesUtils.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
#include <cuda.h>

#include <algorithm>
#include <cmath>
#include <limits>

#include "cloudViewer/core/CUDAUtils.h"
15 
16 namespace cloudViewer {
17 namespace core {
18 
// Inner-product "distance": accumulates the dot product of two vectors.
// A larger inner product means a better match, so the search direction is
// maximize and the sentinel worst value is the most negative float.
struct IPDistance {
    __host__ __device__ IPDistance() : dist(0) {}

    static constexpr bool kDirection = true;  // maximize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = -std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running dot product.
    __host__ __device__ void handle(float a, float b) {
        dist += a * b;
    }

    // Final accumulated similarity value.
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator (e.g. one computed over another chunk
    // of the vector dimensions).
    __host__ __device__ void combine(const IPDistance& v) {
        dist += v.dist;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ IPDistance zero() const { return IPDistance(); }

    float dist;
};
36 
// L1 (Manhattan) distance: accumulates sum |a_i - b_i| over the dimensions.
struct L1Distance {
    __host__ __device__ L1Distance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running absolute-difference sum.
    __host__ __device__ void handle(float a, float b) {
        dist += fabsf(a - b);
    }

    // Final accumulated distance.
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const L1Distance& v) {
        dist += v.dist;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ L1Distance zero() const { return L1Distance(); }

    float dist;
};
54 
// Squared L2 (Euclidean) distance: accumulates sum (a_i - b_i)^2.
// Note that reduce() returns the SQUARED distance — no sqrt is taken,
// which preserves the ordering for nearest-neighbor comparisons.
struct L2Distance {
    __host__ __device__ L2Distance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running squared-difference sum.
    __host__ __device__ void handle(float a, float b) {
        float diff = a - b;
        dist += diff * diff;
    }

    // Final accumulated (squared) distance.
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const L2Distance& v) {
        dist += v.dist;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ L2Distance zero() const { return L2Distance(); }

    float dist;
};
75 
// Minkowski (Lp) distance: accumulates sum |a_i - b_i|^p. The final 1/p-th
// root is not taken — like squared L2, the raw sum preserves the ordering
// for nearest-neighbor comparisons. Defaults to p = 2.
struct LpDistance {
    __host__ __device__ LpDistance() : p(2), dist(0) {}

    __host__ __device__ LpDistance(float arg) : p(arg), dist(0) {}

    // Copy operations are written out explicitly rather than defaulted —
    // presumably so nvcc emits them with __host__ __device__ linkage; keep.
    __host__ __device__ LpDistance(const LpDistance& v)
            : p(v.p), dist(v.dist) {}

    __host__ __device__ LpDistance& operator=(const LpDistance& v) {
        p = v.p;
        dist = v.dist;
        return *this;
    }

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running |diff|^p sum.
    __host__ __device__ void handle(float a, float b) {
        dist += powf(fabsf(a - b), p);
    }

    // Final accumulated value (without the 1/p root; see struct comment).
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const LpDistance& v) {
        dist += v.dist;
    }

    // A fresh accumulator carrying the same exponent p.
    __host__ __device__ LpDistance zero() const { return LpDistance(p); }

    float p;
    float dist;
};
107 
// L-infinity (Chebyshev) distance: tracks max |a_i - b_i| over dimensions.
struct LinfDistance {
    __host__ __device__ LinfDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running maximum.
    __host__ __device__ void handle(float a, float b) {
        dist = fmaxf(dist, fabsf(a - b));
    }

    // Final accumulated distance.
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator (max of maxima).
    __host__ __device__ void combine(const LinfDistance& v) {
        dist = fmaxf(dist, v.dist);
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ LinfDistance zero() const { return LinfDistance(); }

    float dist;
};
129 
// Canberra distance: accumulates sum |a_i - b_i| / (|a_i| + |b_i|).
struct CanberraDistance {
    __host__ __device__ CanberraDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into the running sum. When both a and b are
    // zero the term is 0/0; by the standard Canberra convention (matching
    // scipy.spatial.distance.canberra) such a term contributes 0 — without
    // this guard a single all-zero coordinate pair turns the entire
    // accumulated distance into NaN.
    __host__ __device__ void handle(float a, float b) {
        float denom = fabsf(a) + fabsf(b);
        if (denom > 0) {
            dist += fabsf(a - b) / denom;
        }
    }

    // Final accumulated distance.
    __host__ __device__ float reduce() { return dist; }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const CanberraDistance& v) {
        dist += v.dist;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ CanberraDistance zero() const {
        return CanberraDistance();
    }

    float dist;
};
154 
// Bray-Curtis dissimilarity: sum |a_i - b_i| / sum |a_i + b_i|, accumulated
// as separate numerator and denominator so partial results can be combined.
struct BrayCurtisDistance {
    __host__ __device__ BrayCurtisDistance() : numerator(0), denominator(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one coordinate pair into both running sums.
    __host__ __device__ void handle(float a, float b) {
        numerator += fabsf(a - b);
        denominator += fabsf(a + b);
    }

    // Final dissimilarity. A zero denominator (e.g. two all-zero vectors,
    // which are identical) yields 0 rather than the NaN that 0/0 would
    // produce and which would poison downstream min-reductions.
    __host__ __device__ float reduce() {
        return denominator > 0 ? numerator / denominator : 0.0f;
    }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const BrayCurtisDistance& v) {
        numerator += v.numerator;
        denominator += v.denominator;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ BrayCurtisDistance zero() const {
        return BrayCurtisDistance();
    }

    float numerator;
    float denominator;
};
181 
// Jensen-Shannon divergence between two distributions, accumulated per
// component as KL(a || m) + KL(b || m) with m = (a + b) / 2, then halved
// in reduce().
struct JensenShannonDistance {
    __host__ __device__ JensenShannonDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    // Fold one component pair into the running sum. Uses logf (not log):
    // the double-precision log silently promoted every term to double,
    // which is costly on consumer GPUs and inconsistent with the float
    // accumulator. Zero-probability components follow the standard
    // convention 0 * log(0 / x) == 0; without the guards, a == 0 produces
    // -0 * logf(inf) == NaN and poisons the whole distance.
    __host__ __device__ void handle(float a, float b) {
        float m = 0.5f * (a + b);

        // -a * logf(m / a) == a * logf(a / m) == KL contribution of a.
        float kl1 = (a > 0) ? -a * logf(m / a) : 0.0f;
        float kl2 = (b > 0) ? -b * logf(m / b) : 0.0f;

        dist += kl1 + kl2;
    }

    // Final divergence (the 1/2 factor of the JS definition applied once).
    __host__ __device__ float reduce() { return 0.5f * dist; }

    // Merge a partial accumulator.
    __host__ __device__ void combine(const JensenShannonDistance& v) {
        dist += v.dist;
    }

    // A fresh, empty accumulator of the same kind.
    __host__ __device__ JensenShannonDistance zero() const {
        return JensenShannonDistance();
    }

    float dist;
};
213 
// For each chunk of k indices, increment the index by chunk * increment.
//
// Expected launch layout: gridDim.x = number of k-wide chunks per row,
// gridDim.y = number of rows, with dim == gridDim.x * k (the caller asserts
// this). `indices` is a row-major [gridDim.y, dim] array; each block's
// threads stride over its k-element chunk, adding blockIdx.x * increment
// to every index in that chunk. No bounds guard beyond i < k is needed
// because the chunk decomposition is exact by precondition.
template <typename T>
__global__ void incrementIndex(T* indices, int k, int dim, int increment) {
    for (int i = threadIdx.x; i < k; i += blockDim.x) {
        indices[blockIdx.y * dim + blockIdx.x * k + i] +=
                blockIdx.x * increment;
    }
}
222 
// Used to update result indices in distance computation where the number of
// centroids is high, and is tiled.
//
// `indices` is a 2D tensor of shape [numQueries, numTiles * k]; each k-wide
// chunk along dim 1 is offset by its chunk number * increment, converting
// tile-local result indices into global centroid indices. Runs
// asynchronously on `stream`; the caller owns synchronization.
// Precondition: indices.GetShape(1) is an exact multiple of k (asserted).
// NOTE(review): no cudaGetLastError() after the launch — a bad launch
// config would only surface at the next synchronizing call; consider
// adding a post-launch check.
template <typename T>
void runIncrementIndex(const cudaStream_t stream,
                       Tensor& indices,
                       int k,
                       int increment) {
    // One block per (chunk, row) pair; up to 512 threads stride the chunk.
    dim3 grid(indices.GetShape(1) / k, indices.GetShape(0));
    int block = std::min(k, 512);

    // should be exact
    CLOUDVIEWER_ASSERT(grid.x * k == indices.GetShape(1));

    incrementIndex<<<grid, block, 0, stream>>>(indices.GetDataPtr<T>(), k,
                                               indices.GetShape(1), increment);
}
239 
// Chooses the (tileRows, tileCols) tiling for the tiled distance
// computation. If the inner size (dim) of the vectors is small, a larger
// query tile (1024 rows) is preferred.
inline void chooseTileSize(int numQueries,
                           int numCentroids,
                           int dim,
                           int elementSize,
                           int& tileRows,
                           int& tileCols) {
    // The matrix multiplication should be large enough to be efficient, but
    // if it is too large we lose efficiency versus double-streaming. Each
    // tile size here defines 1/2 of the memory use due to double streaming.
    // Temporary-memory availability is ignored here: it is adjusted
    // independently by the user and can thus meet these requirements (or
    // not). Budget by device size: <= 4 GB GPUs get 512 MB, <= 8 GB GPUs
    // get 768 MB, larger GPUs get 1 GB.
    const size_t kGB = ((size_t)1024) * 1024 * 1024;
    auto totalMem = GetCUDACurrentTotalMemSize();

    int targetUsage;
    if (totalMem <= 4 * kGB) {
        targetUsage = 512 * 1024 * 1024;
    } else if (totalMem <= 8 * kGB) {
        targetUsage = 768 * 1024 * 1024;
    } else {
        targetUsage = 1024 * 1024 * 1024;
    }

    // Convert the byte budget into elements, split across the two streamed
    // buffers.
    targetUsage /= 2 * elementSize;

    // 512 rows is a batch-size sweet spot for float32; when the reduction
    // dim of the matrix multiplication is small (<= 32), a taller 1024-row
    // tile is more efficient.
    int preferredTileRows = (dim <= 32) ? 1024 : 512;

    tileRows = std::min(preferredTileRows, numQueries);

    // tileCols takes whatever remains of the element budget.
    tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
}
283 
284 } // namespace core
285 } // namespace cloudViewer