ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ParallelFor.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <Logging.h>
11 #include <Parallel.h>
12 
13 #include <cstdint>
14 #include <type_traits>
15 
16 #include "cloudViewer/Macro.h"
20 
21 #ifdef __CUDACC__
22 #include <cuda.h>
23 #include <cuda_runtime.h>
24 
26 #endif
27 
28 namespace cloudViewer {
29 namespace core {
30 
31 #ifdef __CUDACC__
32 
// Launch configuration used by ParallelForCUDA_ when starting
// ElementWiseKernel_:
//  - CLOUDVIEWER_PARFOR_BLOCK is passed as the block dimension of the launch
//    (threads per block).
//  - CLOUDVIEWER_PARFOR_THREAD is the kernel's thread_size template argument,
//    i.e. the maximum number of indices a single thread processes.
// Their product is the number of indices covered by one block.
33 static constexpr int64_t CLOUDVIEWER_PARFOR_BLOCK = 128;
34 static constexpr int64_t CLOUDVIEWER_PARFOR_THREAD = 4;
35 
37 template <int64_t block_size, int64_t thread_size, typename func_t>
38 __global__ void ElementWiseKernel_(int64_t n, func_t f) {
39  int64_t items_per_block = block_size * thread_size;
40  int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
41 #pragma unroll
42  for (int64_t i = 0; i < thread_size; ++i) {
43  if (idx < n) {
44  f(idx);
45  idx += block_size;
46  }
47  }
48 }
49 
51 template <typename func_t>
52 void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
53  if (device.GetType() != Device::DeviceType::CUDA) {
54  utility::LogError("ParallelFor for CUDA cannot run on device {}.",
55  device.ToString());
56  }
57  if (n == 0) {
58  return;
59  }
60 
61  CUDAScopedDevice scoped_device(device);
62  int64_t items_per_block =
63  CLOUDVIEWER_PARFOR_BLOCK * CLOUDVIEWER_PARFOR_THREAD;
64  int64_t grid_size = (n + items_per_block - 1) / items_per_block;
65 
66  ElementWiseKernel_<CLOUDVIEWER_PARFOR_BLOCK, CLOUDVIEWER_PARFOR_THREAD>
67  <<<grid_size, CLOUDVIEWER_PARFOR_BLOCK, 0,
68  core::cuda::GetStream()>>>(n, func);
69  CLOUDVIEWER_GET_LAST_CUDA_ERROR("ParallelFor failed.");
70 }
71 
72 #else
73 
75 template <typename func_t>
76 void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
77  if (!device.IsCPU()) {
78  utility::LogError("ParallelFor for CPU cannot run on device {}.",
79  device.ToString());
80  }
81  if (n == 0) {
82  return;
83  }
84 
85 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
86  for (int64_t i = 0; i < n; ++i) {
87  func(i);
88  }
89 }
90 
91 #endif
92 
110 template <typename func_t>
111 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
112 #ifdef __CUDACC__
113  ParallelForCUDA_(device, n, func);
114 #else
115  ParallelForCPU_(device, n, func);
116 #endif
117 }
118 
165 template <typename vec_func_t, typename func_t>
166 void ParallelFor(const Device& device,
167  int64_t n,
168  const func_t& func,
169  const vec_func_t& vec_func) {
170 #ifdef BUILD_ISPC_MODULE
171 
172 #ifdef __CUDACC__
173  ParallelForCUDA_(device, n, func);
174 #else
175  int num_threads = utility::EstimateMaxThreads();
176  ParallelForCPU_(device, num_threads, [&](int64_t i) {
177  int64_t start = n * i / num_threads;
178  int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
179  vec_func(start, end);
180  });
181 #endif
182 
183 #else
184 
185 #ifdef __CUDACC__
186  ParallelForCUDA_(device, n, func);
187 #else
188  ParallelForCPU_(device, n, func);
189 #endif
190 
191 #endif
192 }
193 
// CLOUDVIEWER_CALL_ISPC_KERNEL_ has two definitions, selected by whether
// BUILD_ISPC_MODULE is defined.
194 #ifdef BUILD_ISPC_MODULE
195 
196 // Internal helper macro.
// ISPC-enabled variant: brings namespace ispc into scope and calls
// ISPCKernel(start, end, <remaining arguments>). The expansion consists of two
// statements, so it must be used where a statement sequence is allowed.
197 #define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
198  using namespace ispc; \
199  ISPCKernel(start, end, __VA_ARGS__);
200 
201 #else
202 
203 // Internal helper macro.
// ISPC-disabled variant: ignores start, end and the remaining arguments and
// reports the kernel name through utility::LogError instead of calling it.
204 #define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
205  utility::LogError( \
206  "ISPC module disabled. Unable to call vectorized kernel {}", \
207  CLOUDVIEWER_STRINGIFY(ISPCKernel));
208 
209 #endif
210 
// Internal helper macro. Expands to a by-reference-capturing lambda with the
// signature (T, int64_t start, int64_t end). The first parameter is unnamed
// and only serves to select this lambda by argument type. The body forwards
// start, end and the remaining macro arguments to
// CLOUDVIEWER_CALL_ISPC_KERNEL_ for the kernel whose name is built from
// ISPCKernel, "_" and T via CLOUDVIEWER_CONCAT (presumably token pasting,
// e.g. Kernel_float -- confirm against the definition of CLOUDVIEWER_CONCAT).
212 #define CLOUDVIEWER_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
213  [&](T, int64_t start, int64_t end) { \
214  CLOUDVIEWER_CALL_ISPC_KERNEL_( \
215  CLOUDVIEWER_CONCAT(ISPCKernel, CLOUDVIEWER_CONCAT(_, T)), \
216  start, end, __VA_ARGS__); \
217  }
218 
// Expands to a by-reference-capturing lambda with the signature
// (int64_t start, int64_t end), i.e. the shape expected for the vec_func
// argument of the four-argument ParallelFor. The lambda forwards start, end
// and the remaining macro arguments to CLOUDVIEWER_CALL_ISPC_KERNEL_ for
// ISPCKernel. Because the lambda parameters are named start and end, the
// same names appearing in the extra arguments refer to those parameters.
228 #define CLOUDVIEWER_VECTORIZED(ISPCKernel, ...) \
229  [&](int64_t start, int64_t end) { \
230  CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
231  }
232 
// Type-dispatching counterpart of CLOUDVIEWER_VECTORIZED. Expands to a
// by-reference-capturing lambda with the signature
// (int64_t start, int64_t end) that:
//  1. statically asserts that T is an arithmetic type;
//  2. passes to utility::Overload (presumably combining them into a single
//     overload set -- confirm against its definition) one
//     CLOUDVIEWER_OVERLOADED_LAMBDA_ per listed type (bool, the 8/16/32/64-bit
//     signed and unsigned integers, float, double), plus a generic fallback
//     lambda that logs the unsupported type name (from typeid) and the kernel
//     name through utility::LogError;
//  3. immediately invokes the result with (T{}, start, end), so that the type
//     of the first argument selects the lambda that runs.
246 #define CLOUDVIEWER_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
247  [&](int64_t start, int64_t end) { \
248  static_assert(std::is_arithmetic<T>::value, \
249  "Data type is not an arithmetic type"); \
250  utility::Overload( \
251  CLOUDVIEWER_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
252  CLOUDVIEWER_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, \
253  __VA_ARGS__), \
254  CLOUDVIEWER_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, \
255  __VA_ARGS__), \
256  CLOUDVIEWER_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, \
257  __VA_ARGS__), \
258  CLOUDVIEWER_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, \
259  __VA_ARGS__), \
260  CLOUDVIEWER_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, \
261  __VA_ARGS__), \
262  CLOUDVIEWER_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, \
263  __VA_ARGS__), \
264  CLOUDVIEWER_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, \
265  __VA_ARGS__), \
266  CLOUDVIEWER_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, \
267  __VA_ARGS__), \
268  CLOUDVIEWER_OVERLOADED_LAMBDA_(float, ISPCKernel, \
269  __VA_ARGS__), \
270  CLOUDVIEWER_OVERLOADED_LAMBDA_(double, ISPCKernel, \
271  __VA_ARGS__), \
272  [&](auto&& generic, int64_t start, int64_t end) { \
273  utility::LogError( \
274  "Unsupported data type {} for calling " \
275  "vectorized kernel {}", \
276  typeid(generic).name(), \
277  CLOUDVIEWER_STRINGIFY(ISPCKernel)); \
278  })(T{}, start, end); \
279  }
280 
281 } // namespace core
282 } // namespace cloudViewer
Common CUDA utilities.
#define CLOUDVIEWER_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:89
#define LogError(...)
Definition: Logging.h:60
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:111
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:76
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31
Generic file read and write utility for python interface.