// ----------------------------------------------------------------------------
// -                     CloudViewer: www.cloudViewer.org                      -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#pragma once

#include <algorithm>
#include <limits>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/core/Tensor.h"

namespace cloudViewer {
// Inner-product "distance": accumulates the dot product; larger is better.
struct IPDistance {
    __host__ __device__ IPDistance() : dist(0) {}

    static constexpr bool kDirection = true;  // maximize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = -std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) { dist += a * b; }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const IPDistance& v) { dist += v.dist; }
    __host__ __device__ IPDistance zero() const { return IPDistance(); }

    float dist;
};
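// Illustrative sketch (not part of this header's API): each distance functor
// in this file follows the same accumulation protocol. A kernel would
// typically do something like the following per (query, centroid) pair, where
// `query`, `centroid`, and `dim` are hypothetical names:
//
//     IPDistance op;                      // or L2Distance, L1Distance, ...
//     for (int d = 0; d < dim; ++d) {
//         op.handle(query[d], centroid[d]);
//     }
//     float score = op.reduce();          // final per-pair value
//
// Partial accumulators produced by different threads can be merged with
// combine(), and zero() yields a fresh accumulator with the same parameters
// (relevant for LpDistance, which carries the exponent p).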
// L1 (Manhattan) distance: sum of absolute differences (minimize).
struct L1Distance {
    __host__ __device__ L1Distance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) { dist += fabsf(a - b); }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const L1Distance& v) { dist += v.dist; }
    __host__ __device__ L1Distance zero() const { return L1Distance(); }

    float dist;
};
// Squared L2 (Euclidean) distance: sum of squared differences (minimize).
struct L2Distance {
    __host__ __device__ L2Distance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        float diff = a - b;
        dist += diff * diff;
    }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const L2Distance& v) { dist += v.dist; }
    __host__ __device__ L2Distance zero() const { return L2Distance(); }

    float dist;
};
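// Usage note (not enforced by this struct): reduce() returns the *squared*
// Euclidean distance. A caller that needs the true L2 metric would apply
// sqrtf() to the reduced value, e.g.
//
//     float l2 = sqrtf(op.reduce());   // op: an L2Distance accumulator
//
// Keeping the squared form avoids one sqrt per candidate when only the
// ordering of neighbors matters.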
// General Minkowski (Lp) distance: sum of |a - b|^p (minimize). The exponent
// p is carried by the accumulator itself.
struct LpDistance {
    __host__ __device__ LpDistance() : p(2), dist(0) {}
    __host__ __device__ LpDistance(float arg) : p(arg), dist(0) {}
    __host__ __device__ LpDistance(const LpDistance& v)
            : p(v.p), dist(v.dist) {}

    __host__ __device__ LpDistance& operator=(const LpDistance& v) {
        p = v.p;
        dist = v.dist;
        return *this;
    }

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        dist += powf(fabsf(a - b), p);
    }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const LpDistance& v) { dist += v.dist; }
    __host__ __device__ LpDistance zero() const { return LpDistance(p); }

    float p;
    float dist;
};
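// Usage note: as with L2Distance, reduce() returns the raw sum of p-th powers
// rather than its p-th root. If the true Lp metric is required, a caller would
// apply the root to the reduced value, e.g.
//
//     LpDistance op(3.0f);                       // p = 3
//     /* ... handle() over all dimensions ... */
//     float lp = powf(op.reduce(), 1.0f / 3.0f);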
// L-infinity (Chebyshev) distance: maximum absolute difference (minimize).
struct LinfDistance {
    __host__ __device__ LinfDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        dist = fmaxf(dist, fabsf(a - b));
    }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const LinfDistance& v) {
        dist = fmaxf(dist, v.dist);
    }
    __host__ __device__ LinfDistance zero() const { return LinfDistance(); }

    float dist;
};
// Canberra distance: sum of |a - b| / (|a| + |b|) over all components
// (minimize).
struct CanberraDistance {
    __host__ __device__ CanberraDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        float denom = fabsf(a) + fabsf(b);
        // By convention a component with a == b == 0 contributes 0; guard the
        // division so it does not produce NaN.
        dist += denom == 0 ? 0 : fabsf(a - b) / denom;
    }
    __host__ __device__ float reduce() { return dist; }
    __host__ __device__ void combine(const CanberraDistance& v) {
        dist += v.dist;
    }
    __host__ __device__ CanberraDistance zero() const {
        return CanberraDistance();
    }

    float dist;
};
// Bray-Curtis dissimilarity: sum(|a - b|) / sum(|a + b|) (minimize).
struct BrayCurtisDistance {
    __host__ __device__ BrayCurtisDistance() : numerator(0), denominator(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        numerator += fabsf(a - b);
        denominator += fabsf(a + b);
    }
    __host__ __device__ float reduce() { return (numerator / denominator); }
    __host__ __device__ void combine(const BrayCurtisDistance& v) {
        numerator += v.numerator;
        denominator += v.denominator;
    }
    __host__ __device__ BrayCurtisDistance zero() const {
        return BrayCurtisDistance();
    }

    float numerator;
    float denominator;
};
// Jensen-Shannon divergence between two vectors interpreted as distributions
// (minimize).
struct JensenShannonDistance {
    __host__ __device__ JensenShannonDistance() : dist(0) {}

    static constexpr bool kDirection = false;  // minimize
    static constexpr float kIdentityData = 0;
    static constexpr float kMaxDistance = std::numeric_limits<float>::max();

    __host__ __device__ void handle(float a, float b) {
        float m = 0.5f * (a + b);
        float x = m / a;
        float y = m / b;

        float kl1 = -a * log(x);
        float kl2 = -b * log(y);

        dist += kl1 + kl2;
    }
    __host__ __device__ float reduce() { return 0.5f * dist; }
    __host__ __device__ void combine(const JensenShannonDistance& v) {
        dist += v.dist;
    }
    __host__ __device__ JensenShannonDistance zero() const {
        return JensenShannonDistance();
    }

    float dist;
};
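// For reference, with m_i = (a_i + b_i) / 2 the accumulated quantity is
//     sum_i [ a_i * log(a_i / m_i) + b_i * log(b_i / m_i) ],
// so reduce() yields JS(a, b) = 0.5 * KL(a || m) + 0.5 * KL(b || m) in natural
// log units. This is the JS *divergence*; the JS metric would be its square
// root, which a caller could take after reduce() if needed.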
// For each chunk of k indices, increment the index by chunk * increment.
template <typename T>
__global__ void incrementIndex(T* indices, int k, int dim, int increment) {
    for (int i = threadIdx.x; i < k; i += blockDim.x) {
        indices[blockIdx.y * dim + blockIdx.x * k + i] +=
                blockIdx.x * increment;
    }
}
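// Worked example (illustrative): with k = 2, increment = 1000, and a row laid
// out as [chunk0_k0, chunk0_k1, chunk1_k0, chunk1_k1, ...], the kernel leaves
// the first chunk untouched, adds 1000 to the second chunk, 2000 to the third,
// and so on, turning chunk-local candidate indices back into global centroid
// indices. The launch grid is expected to be (num_chunks, num_rows).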
// Used to update result indices in distance computation where the number of
// centroids is high and the computation is tiled.
template <typename T>
void runIncrementIndex(const cudaStream_t stream,
                       core::Tensor& indices,
                       int k,
                       int increment) {
    dim3 grid(indices.GetShape(1) / k, indices.GetShape(0));
    int block = std::min(k, 512);

    // The second dimension must be an exact multiple of k.
    CLOUDVIEWER_ASSERT(grid.x * k == indices.GetShape(1));

    incrementIndex<<<grid, block, 0, stream>>>(indices.GetDataPtr<T>(), k,
                                               indices.GetShape(1), increment);
}
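// Illustrative call site (names and shapes are hypothetical): if each query
// tile was matched against centroid ranges of width tileCols and the per-tile
// top-k indices were concatenated into an Int32 tensor of shape
// {numQueries, numTiles * k}, the tile-local indices could be rebased to
// global centroid ids with:
//
//     runIncrementIndex<int32_t>(stream, indices, k, tileCols);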
// If the inner size (dim) of the vectors is small, we want a larger query tile
// size, as there is not much work per query vector.
inline void chooseTileSize(int numQueries,
                           int numCentroids,
                           int dim,
                           size_t elementSize,
                           int& tileRows,
                           int& tileCols) {
    // The matrix multiplication should be large enough to be efficient, but if
    // it is too large, we seem to lose efficiency as opposed to
    // double-streaming. Each tile size here defines 1/2 of the memory use due
    // to double streaming. We ignore available temporary memory, as that is
    // adjusted independently by the user and can thus meet these requirements
    // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs,
    // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage.
    auto totalMem = GetCUDACurrentTotalMemSize();

    int targetUsage = 0;

    if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) {
        targetUsage = 512 * 1024 * 1024;
    } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) {
        targetUsage = 768 * 1024 * 1024;
    } else {
        targetUsage = 1024 * 1024 * 1024;
    }

    targetUsage /= 2 * elementSize;

    // 512 seems to be a batch size sweet spot for float32.
    // If we are on float16, increase to 512.
    // If the k size (vec dim) of the matrix multiplication is small (<= 32),
    // increase to 1024.
    int preferredTileRows = 512;
    if (dim <= 32) {
        preferredTileRows = 1024;
    }

    tileRows = std::min(preferredTileRows, numQueries);

    // tileCols is the remainder size.
    tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
}
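// Worked example (illustrative numbers): on a GPU with more than 8 GB of
// memory and float32 data (elementSize = 4), targetUsage starts at 1 GiB and
// becomes 1073741824 / (2 * 4) = 134217728 elements. With dim > 32 and
// numQueries = 100000, tileRows = min(512, 100000) = 512 and
// tileCols = min(134217728 / 512, numCentroids) = min(262144, numCentroids).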
}  // namespace cloudViewer