ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
UnaryEWCUDA.cu
// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                   -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "cloudViewer/core/CUDAUtils.h"
#include "core/Dispatch.h"
#include "core/Indexer.h"
#include "core/ParallelFor.h"
#include "core/Tensor.h"
#include "core/kernel/UnaryEW.h"

namespace cloudViewer {
namespace core {
namespace kernel {

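// Launches `element_kernel` over every workload of `indexer` on `device`:
// each workload index i is mapped to the i-th input/output element pointers,
// processed in parallel via core::ParallelFor, and followed by a CUDA error
// check.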
// Cannot be a static function since on Windows a function enclosing a
// __host__ __device__ lambda function must have external linkage.
template <typename func_t>
void LaunchUnaryEWKernel(const Device& device,
                         const Indexer& indexer,
                         const func_t& element_kernel) {
    CLOUDVIEWER_ASSERT_HOST_DEVICE_LAMBDA(func_t);
    auto element_func = [=] CLOUDVIEWER_HOST_DEVICE(int64_t i) {
        element_kernel(indexer.GetInputPtr(0, i), indexer.GetOutputPtr(i));
    };
    core::ParallelFor(device, indexer.NumWorkloads(), element_func);
    CLOUDVIEWER_GET_LAST_CUDA_ERROR("LaunchUnaryEWKernel failed.");
}

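// The element kernels below operate on type-erased void pointers so that a
// single launch path can be dispatched by runtime dtype.
// CUDACopyElementKernel converts between scalar types;
// CUDACopyObjectElementKernel copies object dtypes byte by byte.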
template <typename src_t, typename dst_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACopyElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<dst_t*>(dst) =
            static_cast<dst_t>(*static_cast<const src_t*>(src));
}

static CLOUDVIEWER_HOST_DEVICE void CUDACopyObjectElementKernel(
        const void* src, void* dst, int64_t object_byte_size) {
    const char* src_bytes = static_cast<const char*>(src);
    char* dst_bytes = static_cast<char*>(dst);
    for (int i = 0; i < object_byte_size; ++i) {
        dst_bytes[i] = src_bytes[i];
    }
}

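// The math kernels promote the input to double, apply the corresponding
// device math function (sqrt, sin, cos, exp, abs, floor, ceil, round, trunc),
// and cast the result back to the scalar type. The IsNan/IsInf/IsFinite
// kernels classify the input (cast to float) and write a bool result.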
template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDASqrtElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            sqrt(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDASinElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            sin(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACosElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            cos(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDANegElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = -*static_cast<const scalar_t*>(src);
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAExpElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            exp(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAAbsElementKernel(const void* src,
                                                         void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            abs(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsNanElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<bool*>(dst) =
            isnan(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsInfElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<bool*>(dst) =
            isinf(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAIsFiniteElementKernel(const void* src,
                                                              void* dst) {
    *static_cast<bool*>(dst) =
            isfinite(static_cast<float>(*static_cast<const scalar_t*>(src)));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDAFloorElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            floor(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDACeilElementKernel(const void* src,
                                                          void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            ceil(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDARoundElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            round(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename scalar_t>
static CLOUDVIEWER_HOST_DEVICE void CUDATruncElementKernel(const void* src,
                                                           void* dst) {
    *static_cast<scalar_t*>(dst) = static_cast<scalar_t>(
            trunc(static_cast<double>(*static_cast<const scalar_t*>(src))));
}

template <typename src_t, typename dst_t>
static CLOUDVIEWER_HOST_DEVICE void CUDALogicalNotElementKernel(const void* src,
                                                                void* dst) {
    *static_cast<dst_t*>(dst) = static_cast<dst_t>(
            !static_cast<bool>(*static_cast<const src_t*>(src)));
}

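// Copies `src` into `dst` where at least one of the two tensors lives on a
// CUDA device. Depending on layout and dtype this uses (1) a direct
// MemoryManager::Memcpy for contiguous same-shape, same-dtype tensors, (2) a
// parallel scalar fill when broadcasting a single element, (3) an
// element-wise copy kernel for same-device strided or dtype-converting
// copies, or (4) a fallback through a contiguous temporary moved to the
// destination device.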
void CopyCUDA(const Tensor& src, Tensor& dst) {
    // It has been checked that
    // - src and dst have the same dtype
    // - at least one of src or dst is a CUDA device
    SizeVector shape = src.GetShape();
    Dtype src_dtype = src.GetDtype();
    Dtype dst_dtype = dst.GetDtype();

    Device src_device = src.GetDevice();
    Device dst_device = dst.GetDevice();

    if (src_device.GetType() == Device::DeviceType::CUDA &&
        dst_device.GetType() == Device::DeviceType::CUDA) {
        if (src.IsContiguous() && dst.IsContiguous() &&
            src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) {
            // MemoryManager handles p2p and non-p2p device copy.
            MemoryManager::Memcpy(dst.GetDataPtr(), dst_device,
                                  src.GetDataPtr(), src_device,
                                  src_dtype.ByteSize() * shape.NumElements());
        } else if (dst.NumElements() > 1 && dst.IsContiguous() &&
                   src.NumElements() == 1 && !src_dtype.IsObject()) {
            int64_t num_elements = dst.NumElements();

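            // Broadcast fill: write the single source value, converted to the
            // destination dtype, into every element of the contiguous
            // destination.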
            DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
                scalar_t scalar_element = src.To(dst_dtype).Item<scalar_t>();
                scalar_t* dst_ptr = static_cast<scalar_t*>(dst.GetDataPtr());
                ParallelFor(src_device, num_elements,
                            [=] CLOUDVIEWER_HOST_DEVICE(int64_t workload_idx) {
                                dst_ptr[workload_idx] = scalar_element;
                            });
            });
        } else if (src_device == dst_device) {
            // For a more optimized version, one can check if P2P from src to
            // dst is enabled, then put synchronization with streams on both
            // src and dst to wait for the copy kernel to complete.
            Indexer indexer({src}, dst, DtypePolicy::NONE);
            if (src.GetDtype().IsObject()) {
                int64_t object_byte_size = src.GetDtype().ByteSize();
                LaunchUnaryEWKernel(src_device, indexer,
                                    [=] CLOUDVIEWER_HOST_DEVICE(const void* src,
                                                                void* dst) {
                                        CUDACopyObjectElementKernel(
                                                src, dst, object_byte_size);
                                    });

            } else {
                DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
                    using src_t = scalar_t;
                    DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
                        using dst_t = scalar_t;
                        LaunchUnaryEWKernel(
                                src_device, indexer,
                                // Needs to be wrapped as an extended CUDA
                                // lambda function.
                                [] CLOUDVIEWER_HOST_DEVICE(const void* src,
                                                           void* dst) {
                                    CUDACopyElementKernel<src_t, dst_t>(src,
                                                                        dst);
                                });
                    });
                });
            }
        } else {
            dst.CopyFrom(src.Contiguous().To(dst_device));
        }
    } else if ((src_device.GetType() == Device::DeviceType::CPU &&
                dst_device.GetType() == Device::DeviceType::CUDA) ||
               (src_device.GetType() == Device::DeviceType::CUDA &&
                dst_device.GetType() == Device::DeviceType::CPU)) {
        Tensor src_conti = src.Contiguous();  // No-op if already contiguous.
        if (dst.IsContiguous() && src.GetShape() == dst.GetShape() &&
            src_dtype == dst_dtype) {
            MemoryManager::Memcpy(dst.GetDataPtr(), dst_device,
                                  src_conti.GetDataPtr(), src_conti.GetDevice(),
                                  src_dtype.ByteSize() * shape.NumElements());
        } else {
            dst.CopyFrom(src.Contiguous().To(dst_device));
        }
    } else {
        utility::LogError("Wrong device type {} -> {}", src_device.ToString(),
                          dst_device.ToString());
    }
}

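// Applies the unary element-wise op `op_code` to `src`, writing the result
// into `dst` on the source tensor's CUDA device.
//
// A minimal usage sketch (assuming the usual Tensor front end of this
// library; illustrative only, not part of this file):
//
//     core::Tensor t = core::Tensor::Init<float>({1.0f, 4.0f, 9.0f},
//                                                core::Device("CUDA:0"));
//     core::Tensor s = t.Sqrt();  // eventually dispatches here with
//                                 // UnaryEWOpCode::Sqrt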
void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) {
    // src and dst have been changed to have the same shape, dtype, device.
    Dtype src_dtype = src.GetDtype();
    Dtype dst_dtype = dst.GetDtype();
    Device src_device = src.GetDevice();

    auto assert_dtype_is_float = [](Dtype dtype) -> void {
        if (dtype != core::Float32 && dtype != core::Float64) {
            utility::LogError(
                    "Only supports Float32 and Float64, but {} is used.",
                    dtype.ToString());
        }
    };

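    // Dispatch: LogicalNot accepts any dtype and writes either the same dtype
    // or Bool; IsNan/IsInf/IsFinite require Float32/Float64 input and write
    // Bool; all remaining ops use an ALL_SAME dtype policy, with the
    // floating-point-only ops (Sqrt, Sin, Cos, Exp) additionally asserting a
    // float dtype.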
    if (op_code == UnaryEWOpCode::LogicalNot) {
        DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
            if (dst_dtype == src_dtype) {
                Indexer indexer({src}, dst, DtypePolicy::ALL_SAME);
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDALogicalNotElementKernel<scalar_t, scalar_t>(
                                    src, dst);
                        });
            } else if (dst_dtype == core::Bool) {
                Indexer indexer({src}, dst,
                                DtypePolicy::INPUT_SAME_OUTPUT_BOOL);
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDALogicalNotElementKernel<scalar_t, bool>(src,
                                                                        dst);
                        });
            } else {
                utility::LogError(
                        "Boolean op's output type must be boolean or the "
                        "same type as the input.");
            }
        });
    } else if (op_code == UnaryEWOpCode::IsNan ||
               op_code == UnaryEWOpCode::IsInf ||
               op_code == UnaryEWOpCode::IsFinite) {
        assert_dtype_is_float(src_dtype);
        Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL);
        DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
            if (op_code == UnaryEWOpCode::IsNan) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsNanElementKernel<scalar_t>(src, dst);
                        });
            } else if (op_code == UnaryEWOpCode::IsInf) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsInfElementKernel<scalar_t>(src, dst);
                        });
            } else if (op_code == UnaryEWOpCode::IsFinite) {
                LaunchUnaryEWKernel(
                        src_device, indexer,
                        [] CLOUDVIEWER_HOST_DEVICE(const void* src, void* dst) {
                            CUDAIsFiniteElementKernel<scalar_t>(src, dst);
                        });
            }
        });
    } else {
        Indexer indexer({src}, dst, DtypePolicy::ALL_SAME);
        DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
            switch (op_code) {
                case UnaryEWOpCode::Sqrt:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDASqrtElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Sin:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDASinElementKernel<scalar_t>(src,
                                                                           dst);
                                        });
                    break;
                case UnaryEWOpCode::Cos:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDACosElementKernel<scalar_t>(src,
                                                                           dst);
                                        });
                    break;
                case UnaryEWOpCode::Neg:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDANegElementKernel<scalar_t>(src,
                                                                           dst);
                                        });
                    break;
                case UnaryEWOpCode::Exp:
                    assert_dtype_is_float(src_dtype);
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAExpElementKernel<scalar_t>(src,
                                                                           dst);
                                        });
                    break;
                case UnaryEWOpCode::Abs:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAAbsElementKernel<scalar_t>(src,
                                                                           dst);
                                        });
                    break;
                case UnaryEWOpCode::Floor:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDAFloorElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Ceil:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDACeilElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Round:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDARoundElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                case UnaryEWOpCode::Trunc:
                    LaunchUnaryEWKernel(src_device, indexer,
                                        [] CLOUDVIEWER_HOST_DEVICE(
                                                const void* src, void* dst) {
                                            CUDATruncElementKernel<scalar_t>(
                                                    src, dst);
                                        });
                    break;
                default:
                    utility::LogError("Unimplemented op_code for UnaryEWCUDA");
                    break;
            }
        });
    }
}

}  // namespace kernel
}  // namespace core
}  // namespace cloudViewer