// ----------------------------------------------------------------------------
// -                     CloudViewer: www.cloudViewer.org                      -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "cloudViewer/core/CUDAUtils.h"
#include "core/Dispatch.h"
#include "core/Indexer.h"
#include "core/ParallelFor.h"
#include "core/Tensor.h"
#include "core/kernel/UnaryEW.h"

namespace cloudViewer {
namespace core {
namespace kernel {

// Cannot be a static function since on Windows a function enclosing
// __host__ __device__ lambda function must have external linkage.
template <typename func_t>
void LaunchUnaryEWKernel(const Device& device,
                         const Indexer& indexer,
                         const func_t& element_kernel) {
    CLOUDVIEWER_ASSERT_HOST_DEVICE_LAMBDA(func_t);
    auto element_func = [=] CLOUDVIEWER_HOST_DEVICE(int64_t i) {
        element_kernel(indexer.GetInputPtr(0, i), indexer.GetOutputPtr(i));
    };
    core::ParallelFor(device, indexer.NumWorkloads(), element_func);
    CLOUDVIEWER_GET_LAST_CUDA_ERROR("LaunchUnaryEWKernel failed.");
}

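// Each element kernel below operates on a single element: it reads the source
// value through a type-erased `const void*` and writes the result through a
// type-erased `void*`, so it can be plugged into LaunchUnaryEWKernel via the
// Indexer without knowing the tensor layout.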
template <typename src_t, typename dst_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACopyElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<dst_t*>(dst) =
            static_cast<dst_t>(*static_cast<const src_t*>(src));
}

static CLOUDVIEWER_HOST_DEVICE void CUDACopyObjectElementKernel(
        const void* src, void* dst, int64_t object_byte_size) {
    const char* src_bytes = static_cast<const char*>(src);
    char* dst_bytes = static_cast<char*>(dst);
    for (int i = 0; i < object_byte_size; ++i) {
        dst_bytes[i] = src_bytes[i];
    }
}

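// Most of the scalar math kernels below promote the input to double, apply the
// math function, and cast the result back to scalar_t, so one definition
// covers both Float32 and Float64. The IsNan/IsInf/IsFinite predicates cast to
// float and write a bool instead.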
template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDASqrtElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            sqrt(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDASinElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            sin(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACosElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            cos(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDANegElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = -*static_cast<const scalar_t*>(src);
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAExpElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            exp(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAAbsElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            abs(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsNanElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<bool*>(dst) =
            isnan(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsInfElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<bool*>(dst) =
            isinf(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsFiniteElementKernel(const void* src,
                                                              void* dst) {
    *static_cast<bool*>(dst) =
            isfinite(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAFloorElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            floor(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACeilElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            ceil(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDARoundElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            round(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDATruncElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            trunc(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename src_t, typename dst_t>
static CLOUDVIEWER_HOST_DEVICE void CUDALogicalNotElementKernel(const void* src,
                                                                void* dst) {
    *static_cast<dst_t*>(dst) = static_cast<dst_t>(
            !static_cast<bool>(*static_cast<const src_t*>(src)));
}

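// CopyCUDA picks the cheapest available path: a raw MemoryManager::Memcpy when
// both tensors are contiguous with matching shape and dtype, a ParallelFor
// fill when broadcasting a single element, an Indexer-based element-wise
// kernel for same-device strided or dtype-converting copies, and a contiguous
// staging copy for the remaining cross-device cases.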
void CopyCUDA(const Tensor& src, Tensor& dst) {
    // It has been checked that
    // - src and dst have the same dtype
    // - at least one of src or dst is a CUDA device
    SizeVector shape = src.GetShape();
    Dtype src_dtype = src.GetDtype();
    Dtype dst_dtype = dst.GetDtype();

    Device src_device = src.GetDevice();
    Device dst_device = dst.GetDevice();

    if (src_device.GetType() == Device::DeviceType::CUDA &&
        dst_device.GetType() == Device::DeviceType::CUDA) {
        if (src.IsContiguous() && dst.IsContiguous() &&
            src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) {
            // MemoryManager handles p2p and non-p2p device copy.
            MemoryManager::Memcpy(dst.GetDataPtr(), dst_device,
                                  src.GetDataPtr(), src_device,
                                  src_dtype.ByteSize() * shape.NumElements());
        } else if (dst.NumElements() > 1 && dst.IsContiguous() &&
                   src.NumElements() == 1 && !src_dtype.IsObject()) {
            // Broadcast the single source element to fill the destination.
            int64_t num_elements = dst.NumElements();

            DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
                scalar_t scalar_element = src.To(dst_dtype).Item<scalar_t>();
                scalar_t* dst_ptr = static_cast<scalar_t*>(dst.GetDataPtr());
                ParallelFor(src_device, num_elements,
                            [=] CLOUDVIEWER_HOST_DEVICE(int64_t workload_idx) {
                                dst_ptr[workload_idx] = scalar_element;
                            });
            });
        } else if (src_device == dst_device) {
            // For a more optimized version, one can check if P2P from src to
            // dst is enabled, then put synchronization with streams on both
            // src and dst to wait for the copy kernel to complete.
            Indexer indexer({src}, dst, DtypePolicy::NONE);
            if (src.GetDtype().IsObject()) {
                int64_t object_byte_size = src.GetDtype().ByteSize();
                LaunchUnaryEWKernel(src_device, indexer,
                                    [=] CLOUDVIEWER_HOST_DEVICE(const void* src,
                                                                void* dst) {
                                        CUDACopyObjectElementKernel(
                                                src, dst, object_byte_size);
                                    });
            } else {
                DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
                    using src_t = scalar_t;
                    DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
                        using dst_t = scalar_t;
                        LaunchUnaryEWKernel(
                                src_device, indexer,
                                // Need to wrap as extended CUDA lambda function
                                [] CLOUDVIEWER_HOST_DEVICE(const void* src,
                                                           void* dst) {
                                    CUDACopyElementKernel<src_t, dst_t>(src,
                                                                        dst);
                                });
                    });
                });
            }
        } else {
            dst.CopyFrom(src.Contiguous().To(dst_device));
        }
    } else if (src_device.GetType() == Device::DeviceType::CPU &&
                       dst_device.GetType() == Device::DeviceType::CUDA ||
               src_device.GetType() == Device::DeviceType::CUDA &&
                       dst_device.GetType() == Device::DeviceType::CPU) {
        Tensor src_conti = src.Contiguous();  // No-op if already contiguous.
        if (dst.IsContiguous() && src.GetShape() == dst.GetShape() &&
            src_dtype == dst_dtype) {
            MemoryManager::Memcpy(dst.GetDataPtr(), dst_device,
                                  src_conti.GetDataPtr(), src_conti.GetDevice(),
                                  src_dtype.ByteSize() * shape.NumElements());
        } else {
            dst.CopyFrom(src.Contiguous().To(dst_device));
        }
    } else {
        utility::LogError("Wrong device type {} -> {}", src_device.ToString(),
                          dst_device.ToString());
    }
}

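// UnaryEWCUDA dispatches on the op code: LogicalNot is handled first because
// its output dtype may differ from the input, the IsNan/IsInf/IsFinite
// predicates always write Bool outputs, and the remaining ops share an
// ALL_SAME-dtype Indexer and a single switch over the op code.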
void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) {
    // src and dst have been changed to have the same shape, dtype and device.
    Dtype src_dtype = src.GetDtype();
    Dtype dst_dtype = dst.GetDtype();
    Device src_device = src.GetDevice();

    auto assert_dtype_is_float = [](Dtype dtype) -> void {
        if (dtype != core::Float32 && dtype != core::Float64) {
            utility::LogError(
                    "Only supports Float32 and Float64, but {} is used.",
                    dtype.ToString());
        }
    };
    if (op_code == UnaryEWOpCode::LogicalNot) {
        DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
            if (dst_dtype == src_dtype) {
                Indexer indexer({src}, dst, DtypePolicy::ALL_SAME);
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDALogicalNotElementKernel<scalar_t, scalar_t>(
                                    src, dst);
                        });
            } else if (dst_dtype == core::Bool) {
                Indexer indexer({src}, dst,
                                DtypePolicy::INPUT_SAME_OUTPUT_BOOL);
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDALogicalNotElementKernel<scalar_t, bool>(src,
                                                                        dst);
                        });
            } else {
                utility::LogError(
                        "Boolean op's output type must be boolean or the "
                        "same type as the input.");
            }
        });
    } else if (op_code == UnaryEWOpCode::IsNan ||
               op_code == UnaryEWOpCode::IsInf ||
               op_code == UnaryEWOpCode::IsFinite) {
        assert_dtype_is_float(src_dtype);
        Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL);
        DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
            if (op_code == UnaryEWOpCode::IsNan) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsNanElementKernel<scalar_t>(src, dst);
                        });
            } else if (op_code == UnaryEWOpCode::IsInf) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsInfElementKernel<scalar_t>(src, dst);
                        });
            } else if (op_code == UnaryEWOpCode::IsFinite) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsFiniteElementKernel<scalar_t>(src, dst);
                        });
            }
        });
    } else {
        Indexer indexer({src}, dst, DtypePolicy::ALL_SAME);
        DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
            switch (op_code) {
                case UnaryEWOpCode::Sqrt:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDASqrtElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Sin:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDASinElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Cos:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDACosElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Neg:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDANegElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Exp:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAExpElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Abs:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAAbsElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Floor:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAFloorElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Ceil:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDACeilElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Round:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDARoundElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Trunc:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDATruncElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                default:
                    utility::LogError("Unimplemented op_code for UnaryEWCUDA");
                    break;
            }
        });
    }
}

}  // namespace kernel
}  // namespace core
}  // namespace cloudViewer