14 #include <type_traits>
23 #include <cuda_runtime.h>
// Default CUDA launch configuration for the ParallelFor helpers below:
// CLOUDVIEWER_PARFOR_BLOCK  — threads per block (used as the block dimension
//                             in the kernel launch).
// CLOUDVIEWER_PARFOR_THREAD — work items processed by each thread, so one
//                             block covers BLOCK * THREAD consecutive indices.
static constexpr int64_t CLOUDVIEWER_PARFOR_BLOCK = 128;
static constexpr int64_t CLOUDVIEWER_PARFOR_THREAD = 4;
// CUDA kernel that applies `f(i)` element-wise over indices in [0, n).
// Each block owns a contiguous range of block_size * thread_size indices;
// each thread iterates thread_size times over its share of that range.
// NOTE(review): the loop body and the `idx < n` bounds check are not visible
// in this fragment — presumably `if (idx < n) { f(idx); idx += block_size; }`;
// confirm against the full header.
template <int64_t block_size, int64_t thread_size, typename func_t>
__global__ void ElementWiseKernel_(int64_t n, func_t f) {
    // Number of work items covered by one block.
    int64_t items_per_block = block_size * thread_size;
    // First index handled by this thread within the block's item range.
    int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
    // Each thread processes up to thread_size items.
    for (int64_t i = 0; i < thread_size; ++i) {
// Launches ElementWiseKernel_ on the given CUDA device's stream to run
// `func(i)` for every i in [0, n). The launch is asynchronous with respect
// to the host (non-default stream from core::cuda::GetStream()).
// NOTE(review): the closing brace and any post-launch error check (e.g.
// CLOUDVIEWER_GET_LAST_CUDA_ERROR, declared elsewhere in this header) are
// not visible in this fragment.
template <typename func_t>
void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
    // RAII guard making `device` current for the launch — presumably
    // restores the previous device on scope exit; confirm.
    CUDAScopedDevice scoped_device(device);
    int64_t items_per_block =
            CLOUDVIEWER_PARFOR_BLOCK * CLOUDVIEWER_PARFOR_THREAD;
    // Ceiling division so a partial tail block is still launched.
    int64_t grid_size = (n + items_per_block - 1) / items_per_block;
    ElementWiseKernel_<CLOUDVIEWER_PARFOR_BLOCK, CLOUDVIEWER_PARFOR_THREAD>
            <<<grid_size, CLOUDVIEWER_PARFOR_BLOCK, 0,
               core::cuda::GetStream()>>>(n, func);
// CPU path of ParallelFor: runs `func(i)` for i in [0, n) using OpenMP.
// NOTE(review): the signature line is missing from this fragment —
// presumably `void ParallelForCPU_(const Device& device, int64_t n,
// const func_t& func)` per the declaration index at the end of this file;
// the body of the device-type guard is also not visible.
template <typename func_t>
    // Guard: this overload is only valid for CPU devices.
    if (!device.IsCPU()) {
// Parallel loop using up to utility::EstimateMaxThreads() OpenMP threads.
#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
    for (int64_t i = 0; i < n; ++i) {
// ParallelFor dispatcher fragment: for CUDA devices it forwards to the CUDA
// backend. NOTE(review): the signature, the device-type check, and the CPU
// branch are missing from this fragment — confirm against the full header.
template <typename func_t>
    ParallelForCUDA_(device, n, func);
// Vectorized ParallelFor overload fragment: alongside the scalar `func` it
// takes a `vec_func(start, end)` that processes a contiguous index range
// (the ISPC-vectorized CPU path). On CUDA devices only the scalar functor
// is used.
// NOTE(review): the signature and several statements are missing from this
// fragment; `i` and `num_threads` are presumably bound by an enclosing
// OpenMP parallel region — confirm against the full header.
template <typename vec_func_t, typename func_t>
        const vec_func_t& vec_func) {
#ifdef BUILD_ISPC_MODULE
    // CUDA path: vec_func is ignored; the scalar functor runs per element.
    ParallelForCUDA_(device, n, func);
        // Split [0, n) into per-thread contiguous chunks for vec_func.
        int64_t start = n * i / num_threads;
        // Clamp the chunk end to n (guards rounding at the last thread).
        int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
        vec_func(start, end);
    // Non-ISPC build: fall back to the scalar CUDA path.
    ParallelForCUDA_(device, n, func);
#ifdef BUILD_ISPC_MODULE

// Invokes the named ISPC kernel over [start, end) with the given extra
// arguments. Available only when the ISPC module is built.
#define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
    using namespace ispc; \
    ISPCKernel(start, end, __VA_ARGS__);

// Fallback when the ISPC module is disabled: reports an error instead of
// calling a kernel. NOTE(review): the `#else` line and the error-logging
// call that consumes this format string are missing from this fragment.
#define CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            CLOUDVIEWER_STRINGIFY(ISPCKernel));
// Produces one overload of the type-dispatch lambda set: for element type T
// it calls the type-suffixed ISPC kernel `ISPCKernel_T` over [start, end).
// The first (unnamed) T parameter exists only for overload selection.
// NOTE(review): the lambda's closing `}` line is missing from this fragment.
#define CLOUDVIEWER_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
    [&](T, int64_t start, int64_t end) { \
        CLOUDVIEWER_CALL_ISPC_KERNEL_( \
                CLOUDVIEWER_CONCAT(ISPCKernel, CLOUDVIEWER_CONCAT(_, T)), \
                start, end, __VA_ARGS__); \
// Wraps an ISPC kernel into a `vec_func(start, end)` lambda suitable for the
// vectorized ParallelFor overload above.
// NOTE(review): the lambda's closing `}` line is missing from this fragment.
#define CLOUDVIEWER_VECTORIZED(ISPCKernel, ...) \
    [&](int64_t start, int64_t end) { \
        CLOUDVIEWER_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
// Type-templated variant of CLOUDVIEWER_VECTORIZED: builds an overload set
// (via CLOUDVIEWER_OVERLOADED_LAMBDA_) that dispatches to the ISPC kernel
// whose suffix matches the arithmetic element type T (bool, u/int8..64,
// float, double); unsupported types fall through to the generic lambda,
// which reports an error with the type name and kernel name. Only compiles
// for arithmetic T (enforced by the static_assert).
// NOTE(review): several continuation lines — the overload-set combiner and
// the `__VA_ARGS__),` tails of the per-type lines — are missing from this
// fragment; confirm against the full header.
#define CLOUDVIEWER_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
    [&](int64_t start, int64_t end) { \
        static_assert(std::is_arithmetic<T>::value, \
                      "Data type is not an arithmetic type"); \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(float, ISPCKernel, \
            CLOUDVIEWER_OVERLOADED_LAMBDA_(double, ISPCKernel, \
            [&](auto&& generic, int64_t start, int64_t end) { \
                        "Unsupported data type {} for calling " \
                        "vectorized kernel {}", \
                        typeid(generic).name(), \
                        CLOUDVIEWER_STRINGIFY(ISPCKernel)); \
            })(T{}, start, end); \
#define CLOUDVIEWER_GET_LAST_CUDA_ERROR(message)
bool IsCPU() const
Returns true if and only if the device type is CPU.
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
void ParallelFor(const Device &device, int64_t n, const func_t &func)
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Generic file read and write utility for the Python interface.