ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
UnaryEWSYCL.cpp
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include <Logging.h>
9 
10 #include <cmath>
11 #include <cstring>
12 
14 #include "cloudViewer/core/Dtype.h"
22 
23 namespace cloudViewer {
24 namespace core {
25 namespace kernel {
26 
27 namespace {
28 
// Base functor for all element-wise SYCL kernels in this file. It holds
// the Indexer that maps a flat workload index to input/output element
// pointers. Derived kernels inherit the constructor and shadow
// operator() (non-virtually) with the actual per-element operation.
struct UnaryElementKernel {
    UnaryElementKernel(Indexer indexer_) : indexer(indexer_) {}
    // No-op placeholder; each derived kernel provides its own operator().
    void operator()(int64_t i) {}

protected:
    Indexer indexer;
};
36 
37 template <typename src_t, typename dst_t>
38 struct CopyElementKernel : public UnaryElementKernel {
39  using UnaryElementKernel::UnaryElementKernel;
40  void operator()(int64_t i) {
41  const src_t* src = indexer.GetInputPtr<src_t>(0, i);
42  dst_t* dst = indexer.GetOutputPtr<dst_t>(i);
43  *dst = static_cast<dst_t>(*src);
44  }
45 };
46 
// Math functors. For integral (and double) inputs the value is widened
// to double, run through elem_op, and narrowed back to the source type
// (C++11 <cmath>-style semantics); the float specialization calls
// elem_op directly with no casting. Comments must stay OUTSIDE the
// macro body: backslash line-splicing happens before comments are
// stripped, so a '//' inside would swallow the continuation.
#define UNARY_ELEMENT_KERNEL(name, elem_op)                                 \
    template <typename src_t>                                               \
    struct name##ElementKernel : public UnaryElementKernel {                \
        using UnaryElementKernel::UnaryElementKernel;                       \
        void operator()(int64_t i) {                                        \
            const src_t* src = indexer.GetInputPtr<src_t>(0, i);            \
            src_t* dst = indexer.GetOutputPtr<src_t>(i);                    \
            *dst = static_cast<src_t>(elem_op(static_cast<double>(*src)));  \
        }                                                                   \
    };                                                                      \
    template <>                                                             \
    struct name##ElementKernel<float> : public UnaryElementKernel {         \
        using UnaryElementKernel::UnaryElementKernel;                       \
        void operator()(int64_t i) {                                        \
            const float* src = indexer.GetInputPtr<float>(0, i);            \
            float* dst = indexer.GetOutputPtr<float>(i);                    \
            *dst = elem_op(*src);                                           \
        }                                                                   \
    }
68 
// Instantiate the math kernels: each invocation defines
// <Name>ElementKernel<src_t> plus its float specialization.
UNARY_ELEMENT_KERNEL(Sqrt, sycl::sqrt);
UNARY_ELEMENT_KERNEL(Sin, sycl::sin);
UNARY_ELEMENT_KERNEL(Cos, sycl::cos);
UNARY_ELEMENT_KERNEL(Exp, sycl::exp);
// TODO: Use sycl::abs for integers (no casting)
// NOTE(review): the extracted listing appears to omit the Abs, Floor and
// Ceil instantiations here (original lines 74-76), which the switch in
// UnaryEWSYCL() references -- confirm against the original file.
UNARY_ELEMENT_KERNEL(Round, sycl::round);
UNARY_ELEMENT_KERNEL(Trunc, sycl::trunc);
#undef UNARY_ELEMENT_KERNEL
80 
81 // No special treatment for unsigned types - we use the SYCL runtime
82 // default
83 template <typename scalar_t>
84 struct NegElementKernel : public UnaryElementKernel {
85  using UnaryElementKernel::UnaryElementKernel;
86  void operator()(int64_t i) {
87  const scalar_t* src = indexer.GetInputPtr<scalar_t>(0, i);
88  scalar_t* dst = indexer.GetOutputPtr<scalar_t>(i);
89  *dst = -*src;
90  }
91 };
92 
// Floating-point classification functors; the output is always bool.
// Integral inputs are widened to double before the check (C++11
// semantics); the float specialization calls elem_op directly.
// As above, comments must stay outside the '\\'-continued macro body.
#define UNARY_ELEMENT_KERNEL(name, elem_op)                          \
    template <typename src_t>                                        \
    struct name##ElementKernel : public UnaryElementKernel {         \
        using UnaryElementKernel::UnaryElementKernel;                \
        void operator()(int64_t i) {                                 \
            const src_t* src = indexer.GetInputPtr<src_t>(0, i);     \
            bool* dst = indexer.GetOutputPtr<bool>(i);               \
            *dst = elem_op(static_cast<double>(*src));               \
        }                                                            \
    };                                                               \
    template <>                                                      \
    struct name##ElementKernel<float> : public UnaryElementKernel {  \
        using UnaryElementKernel::UnaryElementKernel;                \
        void operator()(int64_t i) {                                 \
            const float* src = indexer.GetInputPtr<float>(0, i);     \
            bool* dst = indexer.GetOutputPtr<bool>(i);               \
            *dst = elem_op(*src);                                    \
        }                                                            \
    }
114 
// Instantiate the classification kernels (bool output).
UNARY_ELEMENT_KERNEL(IsNan, sycl::isnan);
UNARY_ELEMENT_KERNEL(IsInf, sycl::isinf);
UNARY_ELEMENT_KERNEL(IsFinite, sycl::isfinite);
#undef UNARY_ELEMENT_KERNEL
119 
120 template <typename src_t, typename dst_t /* == bool or src_t */>
121 struct LogicalNotElementKernel : public UnaryElementKernel {
122  using UnaryElementKernel::UnaryElementKernel;
123  void operator()(int64_t i) {
124  const src_t* src = indexer.GetInputPtr<src_t>(0, i);
125  dst_t* dst = indexer.GetOutputPtr<dst_t>(i);
126  *dst = static_cast<dst_t>(!static_cast<bool>(*src));
127  }
128 };
129 } // namespace
130 
131 void CopySYCL(const Tensor& src, Tensor& dst) {
132  // src and dst have been checked to have the same shape
133  // at least one of src and dst is SYCL and the other is SYCL or CPU
134  SizeVector shape = src.GetShape();
135  Dtype src_dtype = src.GetDtype(), dst_dtype = dst.GetDtype();
136  Device src_device = src.GetDevice(), dst_device = dst.GetDevice();
137  Device device_with_queue = dst.IsSYCL() ? dst.GetDevice() : src.GetDevice();
138  sycl::queue queue =
139  sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue);
140 
141  if (src_device.IsSYCL() && dst_device.IsSYCL()) {
142  if (src.IsContiguous() && dst.IsContiguous() &&
143  src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) {
145  src.GetDataPtr(), src.GetDevice(),
146  src_dtype.ByteSize() * shape.NumElements());
147  } else if (dst.NumElements() > 1 && dst.IsContiguous() &&
148  src.NumElements() == 1 && !src_dtype.IsObject()) {
149  int64_t num_elements = dst.NumElements();
150  DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
151  scalar_t scalar_element = src.To(dst_dtype).Item<scalar_t>();
152  scalar_t* dst_ptr = dst.GetDataPtr<scalar_t>();
153  queue.fill(dst_ptr, scalar_element, num_elements)
154  .wait_and_throw();
155  });
156  } else if (src_device == dst_device) { // non-contiguous or broadcast
157  // on same SYCL device
158  Indexer indexer({src}, dst, DtypePolicy::NONE);
159  if (src.GetDtype().IsObject()) {
160  // TODO: This is likely very slow. Coalesce into less memcpy
161  // calls.
162  int64_t object_byte_size = src.GetDtype().ByteSize();
163  for (int64_t i = 0; i < indexer.NumWorkloads(); ++i) {
164  const void* src_ptr = indexer.GetInputPtr(0, i);
165  void* dst_ptr = indexer.GetOutputPtr(i);
166  queue.memcpy(dst_ptr, src_ptr, object_byte_size);
167  }
168  queue.wait_and_throw();
169  } else {
170  DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
171  using src_t = scalar_t;
172  DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() {
173  using dst_t = scalar_t;
174  ParallelForSYCL<CopyElementKernel<src_t, dst_t>>(
175  device_with_queue, indexer);
176  });
177  });
178  }
179  } else {
180  dst.CopyFrom(src.Contiguous().To(dst_device));
181  }
182  } else if (src_device.IsCPU() && dst_device.IsSYCL() ||
183  src_device.IsSYCL() && dst_device.IsCPU()) {
184  Tensor src_conti = src.Contiguous(); // No op if already contiguous
185  if (dst.IsContiguous() && src.GetShape() == dst.GetShape() &&
186  src_dtype == dst_dtype) {
187  MemoryManager::Memcpy(dst.GetDataPtr(), dst_device,
188  src_conti.GetDataPtr(), src_conti.GetDevice(),
189  src_dtype.ByteSize() * shape.NumElements());
190  } else {
191  dst.CopyFrom(src.Contiguous().To(dst_device));
192  }
193  } else {
194  utility::LogError("Wrong device type {} -> {}", src_device.ToString(),
195  dst_device.ToString());
196  }
197 }
198 
199 void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) {
200  // src and dst have been changed to have the same shape, device
201  Dtype src_dtype = src.GetDtype();
202  Dtype dst_dtype = dst.GetDtype();
203  Device device = src.GetDevice(); // == dst.GetDevice()
204 
205  if (op_code == UnaryEWOpCode::LogicalNot) {
206  if (dst_dtype == src_dtype) {
208  DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
209  ParallelForSYCL<LogicalNotElementKernel<scalar_t, scalar_t>>(
210  device, indexer);
211  });
212  } else if (dst_dtype == Bool) {
214  DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() {
215  ParallelForSYCL<LogicalNotElementKernel<scalar_t, bool>>(
216  device, indexer);
217  });
218  } else {
220  "Boolean op's output type must be boolean or the "
221  "same type as the input.");
222  }
223  } else if (op_code == UnaryEWOpCode::IsNan ||
224  op_code == UnaryEWOpCode::IsInf ||
225  op_code == UnaryEWOpCode::IsFinite) {
227  DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
228  if (op_code == UnaryEWOpCode::IsNan) {
229  ParallelForSYCL<IsNanElementKernel<scalar_t>>(device, indexer);
230  } else if (op_code == UnaryEWOpCode::IsInf) {
231  ParallelForSYCL<IsInfElementKernel<scalar_t>>(device, indexer);
232  } else if (op_code == UnaryEWOpCode::IsFinite) {
233  ParallelForSYCL<IsFiniteElementKernel<scalar_t>>(device,
234  indexer);
235  }
236  });
237  } else {
239  DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() {
240  switch (op_code) {
241  case UnaryEWOpCode::Sqrt:
242  ParallelForSYCL<SqrtElementKernel<scalar_t>>(device,
243  indexer);
244  break;
245  case UnaryEWOpCode::Sin:
246  ParallelForSYCL<SinElementKernel<scalar_t>>(device,
247  indexer);
248  break;
249  case UnaryEWOpCode::Cos:
250  ParallelForSYCL<CosElementKernel<scalar_t>>(device,
251  indexer);
252  break;
253  case UnaryEWOpCode::Neg:
254  ParallelForSYCL<NegElementKernel<scalar_t>>(device,
255  indexer);
256  break;
257  case UnaryEWOpCode::Exp:
258  ParallelForSYCL<ExpElementKernel<scalar_t>>(device,
259  indexer);
260  break;
261  case UnaryEWOpCode::Abs:
262  ParallelForSYCL<AbsElementKernel<scalar_t>>(device,
263  indexer);
264  break;
266  ParallelForSYCL<FloorElementKernel<scalar_t>>(device,
267  indexer);
268  break;
269  case UnaryEWOpCode::Ceil:
270  ParallelForSYCL<CeilElementKernel<scalar_t>>(device,
271  indexer);
272  break;
274  ParallelForSYCL<RoundElementKernel<scalar_t>>(device,
275  indexer);
276  break;
278  ParallelForSYCL<TruncElementKernel<scalar_t>>(device,
279  indexer);
280  break;
281  default:
282  utility::LogError("Unimplemented op_code for UnaryEWSYCL");
283  break;
284  }
285  });
286  }
287 }
288 
289 } // namespace kernel
290 } // namespace core
291 } // namespace cloudViewer
#define DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(DTYPE,...)
Definition: Dispatch.h:68
#define DISPATCH_DTYPE_TO_TEMPLATE(DTYPE,...)
Definition: Dispatch.h:31
Indexer indexer
Definition: UnaryEWSYCL.cpp:34
#define UNARY_ELEMENT_KERNEL(name, elem_op)
Definition: UnaryEWSYCL.cpp:95
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:89
bool IsSYCL() const
Returns true iff device type is SYCL GPU.
Definition: Device.h:52
bool IsObject() const
Definition: Dtype.h:63
int64_t ByteSize() const
Definition: Dtype.h:59
static void Memcpy(void *dst_ptr, const Device &dst_device, const void *src_ptr, const Device &src_device, size_t num_bytes)
Tensor Contiguous() const
Definition: Tensor.cpp:772
bool IsContiguous() const
Definition: Tensor.h:1036
void CopyFrom(const Tensor &other)
Copy Tensor values to current tensor from the source tensor.
Definition: Tensor.cpp:770
Dtype GetDtype() const
Definition: Tensor.h:1164
int64_t NumElements() const
Definition: Tensor.h:1170
Device GetDevice() const override
Definition: Tensor.cpp:1435
SizeVector GetShape() const
Definition: Tensor.h:1127
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:739
static SYCLContext & GetInstance()
Get singleton instance.
Definition: SYCLContext.cpp:25
sycl::queue GetDefaultQueue(const Device &device)
Get the default SYCL queue given a CloudViewer device.
Definition: SYCLContext.cpp:43
#define LogError(...)
Definition: Logging.h:60
__host__ __device__ float2 fabs(float2 v)
Definition: cutil_math.h:1254
void UnaryEWSYCL(const Tensor &src, Tensor &dst, UnaryEWOpCode op_code)
void CopySYCL(const Tensor &src, Tensor &dst)
MiniVec< float, N > floor(const MiniVec< float, N > &a)
Definition: MiniVec.h:75
MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:89
Generic file read and write utility for python interface.
unsigned Bool
Definition: sqlite3.c:20710