// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                   -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include <Logging.h>
#include <Parallel.h>

#include <limits>

#include <Dispatch.h>
#include <Indexer.h>
#include <Reduction.h>
#include <SYCLContext.h>
#include <Tensor.h>

namespace cloudViewer {
namespace core {
namespace kernel {

namespace {

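// Pair reductions used by ArgMin/ArgMax: operator() keeps the (index, value)
// pair whose value wins the comparison; basic_reduction is the plain SYCL
// value reduction used for the group-level combine.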
template <typename scalar_t>
struct ArgMinReduction {
    using basic_reduction = sycl::minimum<scalar_t>;
    std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
                                            scalar_t a_val,
                                            int64_t b_idx,
                                            scalar_t b_val) const {
        return a_val < b_val ? std::make_pair(a_idx, a_val)
                             : std::make_pair(b_idx, b_val);
    }
};

template <typename scalar_t>
struct ArgMaxReduction {
    using basic_reduction = sycl::maximum<scalar_t>;
    std::pair<int64_t, scalar_t> operator()(int64_t a_idx,
                                            scalar_t a_val,
                                            int64_t b_idx,
                                            scalar_t b_val) const {
        return a_val > b_val ? std::make_pair(a_idx, a_val)
                             : std::make_pair(b_idx, b_val);
    }
};

// TODO: This launches one kernel per output element, which can be inefficient
// in cases where the reduction dim is small but the non-reduced dim is large.
// Unit tests for a large number of outputs are disabled.
// Speed this up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
    auto device_props =
            sy::SYCLContext::GetInstance().GetDeviceProperties(device);
    auto queue = device_props.queue;
    auto work_group_size = device_props.max_work_group_size;
    size_t log2elements_per_group = 13;
    auto elements_per_group = (1 << log2elements_per_group);  // 8192
    size_t log2workitems_per_group = 8;
    auto workitems_per_group = (1 << log2workitems_per_group);  // 256
    auto elements_per_work_item =
            elements_per_group / workitems_per_group;  // 32 (= max SIMD size)
    auto mask = ~(~0 << log2workitems_per_group);
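    // Each chunk of workitems_per_group consecutive work items reduces a
    // contiguous block of elements_per_group elements; every work item visits
    // elements_per_work_item values strided by workitems_per_group, so
    // neighbouring work items touch neighbouring addresses. mask
    // (= workitems_per_group - 1) extracts a work item's lane within its block.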
    ReductionOp red_op;

    for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements();
         output_idx++) {
        // sub_indexer.NumWorkloads() == ipo.
        // sub_indexer's workload_idx is indexer's ipo_idx.
        Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx);
        auto num_elements = scalar_out_indexer.NumWorkloads();
        auto num_work_groups = num_elements / elements_per_group;
        if (num_elements > elements_per_group * num_work_groups)
            ++num_work_groups;
        // ensure each work group has work_group_size work items
        auto num_work_items = num_work_groups * work_group_size;

        auto red_cg = [&](auto& cgh) {
            auto output = scalar_out_indexer.GetOutputPtr<scalar_t>(0);
            // Setting this still doesn't initialize to identity -
            // output buffer must be initialized separately.
            auto sycl_reducer = sycl::reduction(
                    output, identity, red_op,
                    {sycl::property::reduction::initialize_to_identity()});
            cgh.parallel_for(
                    sycl::nd_range<1>{num_work_items, work_group_size},
                    sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) {
                        auto glob_id = item.get_global_id(0);
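                        // offset = block start + lane: the upper bits of
                        // glob_id select the block, the low
                        // log2workitems_per_group bits select the lane.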
                        auto offset = ((glob_id >> log2workitems_per_group)
                                       << log2elements_per_group) +
                                      (glob_id & mask);
                        auto item_out = identity;
                        for (size_t i = 0; i < elements_per_work_item; i++) {
                            size_t idx =
                                    (i << log2workitems_per_group) + offset;
                            if (idx >= num_elements) break;
                            auto val =
                                    *scalar_out_indexer.GetInputPtr<scalar_t>(
                                            0, idx);
                            item_out = red_op(item_out, val);
                        }
                        red_arg.combine(item_out);
                    });
        };

        auto e = queue.submit(red_cg);
    }
    queue.wait_and_throw();
}

// Based on the oneAPI GPU optimization guide code sample (blocked access to
// input data + SYCL builtin reduction ops for the final reduction).
// TODO: This launches one kernel per output element, which can be inefficient
// in cases where the reduction dim is small but the non-reduced dim is large.
// Speed this up by launching one kernel for the entire reduction.
template <class ReductionOp, typename scalar_t>
void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
    auto device_props =
            sy::SYCLContext::GetInstance().GetDeviceProperties(device);
    auto queue = device_props.queue;
    auto work_group_size = device_props.max_work_group_size;
    size_t log2elements_per_group = 13;
    auto elements_per_group = (1 << log2elements_per_group);  // 8192
    size_t log2workitems_per_group = 8;
    auto workitems_per_group = (1 << log2workitems_per_group);  // 256
    auto elements_per_work_item =
            elements_per_group / workitems_per_group;  // 32 (= max SIMD size)
    auto mask = ~(~0 << log2workitems_per_group);
    ReductionOp red_op;

    // atomic flag. Must be 4 bytes.
    sycl::buffer<int32_t, 1> output_in_use{indexer.NumOutputElements()};
    auto e_fill = queue.submit([&](auto& cgh) {
        auto acc_output_in_use =
                output_in_use.get_access<sycl::access_mode::write>(cgh);
        cgh.fill(acc_output_in_use, 0);
    });

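    // Each output element's flag is used below as a spinlock that serializes
    // the combine of per-work-group results into that output.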
    for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements();
         output_idx++) {
        // sub_indexer.NumWorkloads() == ipo.
        // sub_indexer's workload_idx is indexer's ipo_idx.
        Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx);
        auto num_elements = scalar_out_indexer.NumWorkloads();
        auto num_work_groups = num_elements / elements_per_group;
        if (num_elements > elements_per_group * num_work_groups)
            ++num_work_groups;
        // ensure each work group has work_group_size work items
        auto num_work_items = num_work_groups * work_group_size;

        sycl::buffer<int32_t, 1> this_output_in_use{output_in_use, output_idx,
                                                    1};
        auto arg_red_cg = [&](auto& cgh) {
            auto acc_in_use =
                    this_output_in_use
                            .get_access<sycl::access_mode::read_write>(cgh);
            cgh.parallel_for(
                    sycl::nd_range<1>{num_work_items, work_group_size},
                    [=](sycl::nd_item<1> item) {
                        auto& out_idx =
                                *scalar_out_indexer.GetOutputPtr<int64_t>(0, 0);
                        auto& out_val =
                                *scalar_out_indexer.GetOutputPtr<scalar_t>(1,
                                                                           0);
                        auto glob_id = item.get_global_id(0);
                        auto this_group = item.get_group();
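                        // Same blocked access pattern as SYCLReductionEngine,
                        // but the winning index is tracked alongside the value.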
                        auto offset = ((glob_id >> log2workitems_per_group)
                                       << log2elements_per_group) +
                                      (glob_id & mask);
                        int64_t it_idx = 0;
                        scalar_t it_val = identity;
                        for (size_t i = 0; i < elements_per_work_item; i++) {
                            size_t idx =
                                    (i << log2workitems_per_group) + offset;
                            if (idx >= num_elements) break;
                            auto val =
                                    *scalar_out_indexer.GetInputPtr<scalar_t>(
                                            0, idx);
                            std::tie(it_idx, it_val) =
                                    red_op(it_idx, it_val, idx, val);
                        }
                        auto group_out_val = sycl::reduce_over_group(
                                this_group, it_val, identity,
                                typename ReductionOp::basic_reduction());
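                        // reduce_over_group combines only the values; the work
                        // item(s) holding the winning value publish the
                        // matching index below.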
                        // Atomic (serial) reduction over all groups. SYCL does
                        // not have a barrier over groups. Work item(s) with the
                        // min / max value update the output (non-deterministic).
                        if (it_val == group_out_val) {
                            // TODO: Look for a better alternative to a spinlock
                            // mutex.
                            auto in_use = sycl::atomic_ref<
                                    int32_t, sycl::memory_order::acq_rel,
                                    sycl::memory_scope::device>(acc_in_use[0]);
                            while (in_use.exchange(1) == 1) {
                            }
                            std::tie(out_idx, out_val) = red_op(
                                    out_idx, out_val, it_idx, group_out_val);
                            in_use.store(0);
                        }
                    });
        };

        auto e = queue.submit(arg_red_cg);
    }
    queue.wait_and_throw();
}
}  // namespace

void ReductionSYCL(const Tensor& src,
                   Tensor& dst,
                   const SizeVector& dims,
                   bool keepdim,
                   ReductionOpCode op_code) {
    Device device = src.GetDevice();
    if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) {
        Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims);
        DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
            scalar_t identity;
            switch (op_code) {
                case ReductionOpCode::Sum:
                    dst.Fill(0);
                    SYCLReductionEngine<sycl::plus<scalar_t>, scalar_t>(
                            device, indexer, 0);
                    break;
                case ReductionOpCode::Prod:
                    dst.Fill(1);
                    SYCLReductionEngine<sycl::multiplies<scalar_t>, scalar_t>(
                            device, indexer, 1);
                    break;
                case ReductionOpCode::Min:
                    if (indexer.NumWorkloads() == 0) {
                        utility::LogError(
                                "Zero-size Tensor does not support Min.");
                    } else {
                        identity = std::numeric_limits<scalar_t>::max();
                        dst.Fill(identity);
                        SYCLReductionEngine<sycl::minimum<scalar_t>, scalar_t>(
                                device, indexer, identity);
                    }
                    break;
                case ReductionOpCode::Max:
                    if (indexer.NumWorkloads() == 0) {
                        utility::LogError(
                                "Zero-size Tensor does not support Max.");
                    } else {
                        identity = std::numeric_limits<scalar_t>::lowest();
                        dst.Fill(identity);
                        SYCLReductionEngine<sycl::maximum<scalar_t>, scalar_t>(
                                device, indexer, identity);
                    }
                    break;
                default:
                    utility::LogError("Unsupported op code.");
                    break;
            }
        });
    } else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) {
        if (dst.GetDtype() != core::Int64) {
            utility::LogError("Arg-reduction must have int64 output dtype.");
        }
        // Accumulation buffer to store temporary min/max values.
        Tensor dst_acc(dst.GetShape(), src.GetDtype(), src.GetDevice());
        Indexer indexer({src}, {dst, dst_acc}, DtypePolicy::INPUT_SAME, dims);
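        // Indexer output 0 holds the int64 indices (dst); output 1 holds the
        // running min/max values (dst_acc).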
        DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
            scalar_t identity;
            switch (op_code) {
                case ReductionOpCode::ArgMin:
                    identity = std::numeric_limits<scalar_t>::max();
                    dst_acc.Fill(identity);
                    SYCLArgReductionEngine<ArgMinReduction<scalar_t>, scalar_t>(
                            device, indexer, identity);
                    break;
                case ReductionOpCode::ArgMax:
                    identity = std::numeric_limits<scalar_t>::lowest();
                    dst_acc.Fill(identity);
                    SYCLArgReductionEngine<ArgMaxReduction<scalar_t>, scalar_t>(
                            device, indexer, identity);
                    break;
                default:
                    utility::LogError("Unsupported op code.");
                    break;
            }
        });
    } else if (s_boolean_reduce_ops.find(op_code) !=
               s_boolean_reduce_ops.end()) {
        if (src.GetDtype() != core::Bool) {
            utility::LogError(
                    "Boolean reduction only supports boolean input tensor.");
        }
        if (dst.GetDtype() != core::Bool) {
            utility::LogError(
                    "Boolean reduction only supports boolean output tensor.");
        }
        Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims);
        switch (op_code) {
            case ReductionOpCode::All:
                // Identity == true. A 0-sized tensor returns true.
                dst.Fill(true);
                SYCLReductionEngine<sycl::logical_and<bool>, bool>(
                        device, indexer, true);
                break;
            case ReductionOpCode::Any:
                // Identity == false. A 0-sized tensor returns false.
                dst.Fill(false);
                SYCLReductionEngine<sycl::logical_or<bool>, bool>(
                        device, indexer, false);
                break;
            default:
                utility::LogError("Unsupported op code.");
                break;
        }
    } else {
        utility::LogError("Unsupported op code.");
    }
}

}  // namespace kernel
}  // namespace core
}  // namespace cloudViewer