14 #include <type_traits>
23 #include <cuda_runtime.h>
// Default CUDA launch configuration for the ParallelFor helpers below:
// CLOUDVIEWER_PARFOR_BLOCK  — threads per block (used as the block dimension
//                             in the kernel launch).
// CLOUDVIEWER_PARFOR_THREAD — work items processed by each thread, so one
//                             block covers BLOCK * THREAD consecutive indices.
static constexpr int64_t CLOUDVIEWER_PARFOR_BLOCK = 128;
static constexpr int64_t CLOUDVIEWER_PARFOR_THREAD = 4;
// CUDA kernel that applies `f(i)` element-wise over indices in [0, n).
// Each block owns a contiguous range of block_size * thread_size indices;
// each thread iterates thread_size times over its share of that range.
// NOTE(review): the loop body and the `idx < n` bounds check are not visible
// in this fragment — presumably `if (idx < n) { f(idx); idx += block_size; }`;
// confirm against the full header.
template <int64_t block_size, int64_t thread_size, typename func_t>
__global__ void ElementWiseKernel_(int64_t n, func_t f) {
    // Number of work items covered by one block.
    int64_t items_per_block = block_size * thread_size;
    // First index handled by this thread within the block's item range.
    int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
    // Each thread processes up to thread_size items.
    for (int64_t i = 0; i < thread_size; ++i) {
// Launches ElementWiseKernel_ on the given CUDA device's stream to run
// `func(i)` for every i in [0, n). The launch is asynchronous with respect
// to the host (non-default stream from core::cuda::GetStream()).
// NOTE(review): the closing brace and any post-launch error check (e.g.
// CLOUDVIEWER_GET_LAST_CUDA_ERROR, declared elsewhere in this header) are
// not visible in this fragment.
template <typename func_t>
void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
    // RAII guard making `device` current for the launch — presumably
    // restores the previous device on scope exit; confirm.
    CUDAScopedDevice scoped_device(device);
    int64_t items_per_block =
            CLOUDVIEWER_PARFOR_BLOCK * CLOUDVIEWER_PARFOR_THREAD;
    // Ceiling division so a partial tail block is still launched.
    int64_t grid_size = (n + items_per_block - 1) / items_per_block;
    ElementWiseKernel_<CLOUDVIEWER_PARFOR_BLOCK, CLOUDVIEWER_PARFOR_THREAD>
            <<<grid_size, CLOUDVIEWER_PARFOR_BLOCK, 0,
               core::cuda::GetStream()>>>(n, func);
// CPU path of ParallelFor: runs `func(i)` for i in [0, n) using OpenMP.
// NOTE(review): the signature line is missing from this fragment —
// presumably `void ParallelForCPU_(const Device& device, int64_t n,
// const func_t& func)` per the declaration index at the end of this file;
// the body of the device-type guard is also not visible.
template <typename func_t>
    // Guard: this overload is only valid for CPU devices.
    if (!device.IsCPU()) {
// Parallel loop using up to utility::EstimateMaxThreads() OpenMP threads.
#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
    for (int64_t i = 0; i < n; ++i) {
// ParallelFor dispatcher fragment: for CUDA devices it forwards to the CUDA
// backend. NOTE(review): the signature, the device-type check, and the CPU
// branch are missing from this fragment — confirm against the full header.
template <typename func_t>
    ParallelForCUDA_(device, n, func);
// Vectorized ParallelFor overload fragment: alongside the scalar `func` it
// takes a `vec_func(start, end)` that processes a contiguous index range
// (the ISPC-vectorized CPU path). On CUDA devices only the scalar functor
// is used.
// NOTE(review): the signature and several statements are missing from this
// fragment; `i` and `num_threads` are presumably bound by an enclosing
// OpenMP parallel region — confirm against the full header.
template <typename vec_func_t, typename func_t>
        const vec_func_t& vec_func) {
#ifdef BUILD_ISPC_MODULE
    // CUDA path: vec_func is ignored; the scalar functor runs per element.
    ParallelForCUDA_(device, n, func);
        // Split [0, n) into per-thread contiguous chunks for vec_func.
        int64_t start = n * i / num_threads;
        // Clamp the chunk end to n (guards rounding at the last thread).
        int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
        vec_func(start, end);
    // Non-ISPC build: fall back to the scalar CUDA path.
    ParallelForCUDA_(device, n, func);
#ifdef BUILD_ISPC_MODULE

// Invokes the named ISPC kernel over [start, end) with the given extra
// arguments. Available only when the ISPC module is built.
#define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
    using namespace ispc; \
    ISPCKernel(start, end, __VA_ARGS__);

// Fallback when the ISPC module is disabled: reports an error instead of
// calling a kernel. NOTE(review): the `#else` line and the error-logging
// call that consumes this format string are missing from this fragment.
#define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            CLOUDVIEWER_STRINGIFY(ISPCKernel));
// Produces one overload of the type-dispatch lambda set: for element type T
// it calls the type-suffixed ISPC kernel `ISPCKernel_T` over [start, end).
// The first (unnamed) T parameter exists only for overload selection.
// NOTE(review): the lambda's closing `}` line is missing from this fragment.
#define CLOUDVIEWER_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
    [&](T, int64_t start, int64_t end) { \
        CLOUDVIEWER_CALL_ISPC_KERNEL_( \
                CLOUDVIEWER_CONCAT(ISPCKernel, CLOUDVIEWER_CONCAT(_, T)), \
                start, end, __VA_ARGS__); \
// Wraps an ISPC kernel into a `vec_func(start, end)` lambda suitable for the
// vectorized ParallelFor overload above.
// NOTE(review): the lambda's closing `}` line is missing from this fragment.
#define CLOUDVIEWER_VECTORIZED(ISPCKernel, ...) \
    [&](int64_t start, int64_t end) { \
        CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
// Type-templated variant of CLOUDVIEWER_VECTORIZED: builds an overload set
// (via CLOUDVIEWER_OVERLOADED_LAMBDA_) that dispatches to the ISPC kernel
// whose suffix matches the arithmetic element type T (bool, u/int8..64,
// float, double); unsupported types fall through to the generic lambda,
// which reports an error with the type name and kernel name. Only compiles
// for arithmetic T (enforced by the static_assert).
// NOTE(review): several continuation lines — the overload-set combiner and
// the `__VA_ARGS__),` tails of the per-type lines — are missing from this
// fragment; confirm against the full header.
#define CLOUDVIEWER_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
    [&](int64_t start, int64_t end) { \
        static_assert(std::is_arithmetic<T>::value, \
                      "Data type is not an arithmetic type"); \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(float, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(double, ISPCKernel, \
            [&](auto&& generic, int64_t start, int64_t end) { \
                        "Unsupported data type {} for calling " \
                        "vectorized kernel {}", \
                        typeid(generic).name(), \
                        CLOUDVIEWER_STRINGIFY(ISPCKernel)); \
            })(T{}, start, end); \
#define CLOUDVIEWER_GET_LAST_CUDA_ERROR(message)
bool IsCPU() const
Returns true if and only if the device type is CPU.
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
void ParallelFor(const Device &device, int64_t n, const func_t &func)
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Generic file read and write utility for the Python interface.