ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
Indexer.h
// ----------------------------------------------------------------------------
// -                        CloudViewer: www.cloudViewer.org                   -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include <Logging.h>
#include <MiniVec.h>

#include <sstream>

#include "cloudViewer/core/CUDAUtils.h"
#include "cloudViewer/core/Dtype.h"
#include "cloudViewer/core/ShapeUtil.h"
#include "cloudViewer/core/SizeVector.h"
#include "cloudViewer/core/Tensor.h"

// The generated "Indexer_ispc.h" header will not be available outside the
// library. Therefore, forward declare all exported ISPC classes.
#ifdef BUILD_ISPC_MODULE
namespace ispc {
struct TensorRef;
struct Indexer;
}  // namespace ispc
#endif

namespace cloudViewer {
namespace core {

class Indexer;

class IndexerIterator;

// Maximum number of dimensions of TensorRef.
static constexpr int64_t MAX_DIMS = 5;

// Maximum number of inputs of an op.
// MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing.
static constexpr int64_t MAX_INPUTS = 5;

// Maximum number of outputs of an op. This number can be increased when
// necessary.
static constexpr int64_t MAX_OUTPUTS = 2;

template <int NARGS, typename index_t = uint32_t>
struct OffsetCalculator {
    OffsetCalculator(int dims,
                     const int64_t* sizes,
                     const int64_t* const* strides)
        : dims_(dims) {
        if (dims_ > MAX_DIMS) {
            utility::LogError("tensor has too many (>{}) dims_", MAX_DIMS);
        }

        for (int i = 0; i < MAX_DIMS; ++i) {
            if (i < dims_) {
                sizes_[i] = sizes[i];
            } else {
                sizes_[i] = 1;
            }
            for (int arg = 0; arg < NARGS; arg++) {
                strides_[i][arg] = i < dims_ ? strides[arg][i] : 0;
            }
        }
    }

    CLOUDVIEWER_HOST_DEVICE utility::MiniVec<index_t, NARGS> get(
            index_t linear_idx) const {
        utility::MiniVec<index_t, NARGS> offsets;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
        for (int arg = 0; arg < NARGS; arg++) {
            offsets[arg] = 0;
        }

#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
        for (int dim = 0; dim < MAX_DIMS; ++dim) {
            if (dim == dims_) {
                break;
            }
            index_t mod = linear_idx % sizes_[dim];
            linear_idx = linear_idx / sizes_[dim];

#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
            for (int arg = 0; arg < NARGS; arg++) {
                offsets[arg] += mod * strides_[dim][arg];
            }
        }
        return offsets;
    }

    int dims_;
    index_t sizes_[MAX_DIMS];
    index_t strides_[MAX_DIMS][NARGS];
};
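
// Illustrative sketch (not part of the original header): get() peels off one
// dimension of the linear index per iteration (dimension 0 varies fastest)
// and accumulates per-argument offsets from the pre-transposed strides_.
// Assuming a single operand (NARGS == 1) with sizes {3, 2} and contiguous
// strides {1, 3}:
//
//   int64_t sizes[2] = {3, 2};
//   int64_t arg0_strides[2] = {1, 3};
//   const int64_t* strides[1] = {arg0_strides};
//   OffsetCalculator<1> calc(2, sizes, strides);
//   auto offs = calc.get(4);  // 4 % 3 == 1 for dim 0, then 1 % 2 == 1 for dim 1
//   // offs[0] == 1 * 1 + 1 * 3 == 4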

/// A minimalistic class that references a Tensor.
struct TensorRef {
    // The default copy constructor works on __device__ as well so we don't
    // define it explicitly. shape_[MAX_DIMS] and byte_strides_[MAX_DIMS] will
    // be copied fully.
    TensorRef() : data_ptr_(nullptr), ndims_(0), dtype_byte_size_(0) {}

    TensorRef(const Tensor& t) {
        if (t.NumDims() > MAX_DIMS) {
            utility::LogError("Tensor has too many dimensions {} > {}.",
                              t.NumDims(), MAX_DIMS);
        }
        data_ptr_ = const_cast<void*>(t.GetDataPtr());
        ndims_ = t.NumDims();
        dtype_byte_size_ = t.GetDtype().ByteSize();
        for (int64_t i = 0; i < ndims_; ++i) {
            shape_[i] = t.GetShape(i);
            byte_strides_[i] = t.GetStride(i) * dtype_byte_size_;
        }
    }

    /// Permute (dimension shuffle) the reference to a Tensor.
    void Permute(const SizeVector& dims) {
        // Check that dims is a permutation of [0, 1, 2, ..., n-1].
        if (static_cast<int64_t>(dims.size()) != ndims_) {
            utility::LogError("Number of dimensions mismatch {} != {}.",
                              dims.size(), ndims_);
        }
        std::vector<bool> seen_dims(ndims_, false);
        for (const int64_t& dim : dims) {
            seen_dims[dim] = true;
        }
        if (!std::all_of(seen_dims.begin(), seen_dims.end(),
                         [](bool seen) { return seen; })) {
            utility::LogError(
                    "Permute dims must be a permutation from 0 to {}.",
                    dims.size() - 1);
        }

        // Map to new shape and strides.
        SizeVector new_shape(ndims_);
        SizeVector new_byte_strides(ndims_);
        for (int64_t i = 0; i < ndims_; ++i) {
            int64_t old_dim = shape_util::WrapDim(dims[i], ndims_);
            new_shape[i] = shape_[old_dim];
            new_byte_strides[i] = byte_strides_[old_dim];
        }
        for (int64_t i = 0; i < ndims_; ++i) {
            shape_[i] = new_shape[i];
            byte_strides_[i] = new_byte_strides[i];
        }
    }

    /// Returns true if the underlying memory buffer is contiguous.
    inline bool IsContiguous() const {
        SizeVector shape(ndims_);
        SizeVector strides(ndims_);
        for (int64_t i = 0; i < ndims_; ++i) {
            shape[i] = shape_[i];
            strides[i] = byte_strides_[i] / dtype_byte_size_;
        }
        return shape_util::DefaultStrides(shape) == strides;
    }

    bool operator==(const TensorRef& other) const {
        bool rc = true;
        rc = rc && (data_ptr_ == other.data_ptr_);
        rc = rc && (ndims_ == other.ndims_);
        rc = rc && (dtype_byte_size_ == other.dtype_byte_size_);
        for (int64_t i = 0; i < ndims_; ++i) {
            rc = rc && (shape_[i] == other.shape_[i]);
            rc = rc && (byte_strides_[i] == other.byte_strides_[i]);
        }
        return rc;
    }

    bool operator!=(const TensorRef& other) const { return !(*this == other); }

#ifdef BUILD_ISPC_MODULE
    ispc::TensorRef ToISPC() const;
#endif

    void* data_ptr_;
    int64_t ndims_ = 0;
    int64_t dtype_byte_size_ = 0;
    int64_t shape_[MAX_DIMS];
    int64_t byte_strides_[MAX_DIMS];
};
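
// Illustrative sketch (not part of the original header): Permute() swaps
// shape_ and byte_strides_ in tandem and never touches the underlying buffer.
// Assuming `t` is a contiguous 2 x 3 Float32 tensor:
//
//   TensorRef ref(t);     // shape_ = {2, 3}, byte_strides_ = {12, 4}
//   ref.Permute({1, 0});  // shape_ = {3, 2}, byte_strides_ = {4, 12}
//   // ref.IsContiguous() is now false; ref.data_ptr_ is unchanged.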

enum class DtypePolicy {
    NONE,        // Do not check. Expects the kernel to handle the conversion.
                 // E.g. in Copy kernel with type casting.
    ALL_SAME,    // All inputs and outputs have the same dtype.
    INPUT_SAME,  // All inputs have the same dtype.
    INPUT_SAME_OUTPUT_BOOL  // All inputs have the same dtype. Outputs
                            // have bool dtype.
};
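
// For instance (illustrative, not from the original header), a comparison
// kernel writing a Bool mask from two same-dtype inputs would be constructed
// with DtypePolicy::INPUT_SAME_OUTPUT_BOOL, while a Copy kernel that casts
// dtypes itself would use DtypePolicy::NONE:
//
//   Indexer cmp({lhs, rhs}, mask, DtypePolicy::INPUT_SAME_OUTPUT_BOOL);
//   Indexer cast_copy({src}, dst, DtypePolicy::NONE);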

class TensorIterator {
public:
    TensorIterator(const Tensor& tensor)
        : input_(TensorRef(tensor)), ndims_(tensor.NumDims()) {}

    CLOUDVIEWER_HOST_DEVICE int64_t NumWorkloads() const {
        int64_t num_workloads = 1;
        for (int64_t i = 0; i < ndims_; ++i) {
            num_workloads *= input_.shape_[i];
        }
        return num_workloads;
    }

    CLOUDVIEWER_HOST_DEVICE void* GetPtr(int64_t workload_idx) const {
        if (workload_idx < 0 || workload_idx >= NumWorkloads()) {
            return nullptr;
        }
        int64_t offset = 0;
        workload_idx = workload_idx * input_.dtype_byte_size_;
        for (int64_t i = 0; i < ndims_; ++i) {
            offset += workload_idx / input_.byte_strides_[i] *
                      input_.byte_strides_[i];
            workload_idx = workload_idx % input_.byte_strides_[i];
        }
        return static_cast<void*>(static_cast<char*>(input_.data_ptr_) +
                                  offset);
    }

protected:
    TensorRef input_;
    int64_t ndims_;
};
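
// Illustrative sketch (assumption, not part of the original header): a host
// loop that visits every element of a Float32 tensor through TensorIterator.
//
//   TensorIterator it(t);
//   for (int64_t w = 0; w < it.NumWorkloads(); ++w) {
//       float* v = static_cast<float*>(it.GetPtr(w));
//       // ... read or write *v ...
//   }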

class Indexer {
public:
    Indexer() {}
    Indexer(const Indexer&) = default;
    Indexer& operator=(const Indexer&) = default;

    Indexer(const std::vector<Tensor>& input_tensors,
            const Tensor& output_tensor,
            DtypePolicy dtype_policy = DtypePolicy::ALL_SAME,
            const SizeVector& reduction_dims = {});

    Indexer(const std::vector<Tensor>& input_tensors,
            const std::vector<Tensor>& output_tensors,
            DtypePolicy dtype_policy = DtypePolicy::ALL_SAME,
            const SizeVector& reduction_dims = {});

    /// Returns true iff the maximum offsets in bytes are smaller than
    /// 2^31 - 1.
    bool CanUse32BitIndexing() const;

    IndexerIterator SplitTo32BitIndexing() const;

    std::unique_ptr<Indexer> SplitLargestDim();

    Indexer GetPerOutputIndexer(int64_t output_idx) const;

    bool ShouldAccumulate() const { return accumulate_; }

    bool IsFinalOutput() const { return final_output_; }

    void ShrinkDim(int64_t dim, int64_t start, int64_t size);

    /// Returns the number of reduction dimensions.
    int64_t NumReductionDims() const;

    /// Returns the number of dimensions of the Indexer.
    int64_t NumDims() const { return ndims_; }

    const int64_t* GetPrimaryShape() const { return primary_shape_; }
    int64_t* GetPrimaryShape() { return primary_shape_; }

    const int64_t* GetPrimaryStrides() const { return primary_strides_; }

    int64_t NumWorkloads() const;

    /// Returns the number of output elements.
    int64_t NumOutputElements() const;

    /// Number of input Tensors.
    int64_t NumInputs() const { return num_inputs_; }

    /// Number of output Tensors.
    int64_t NumOutputs() const { return num_outputs_; }

    /// Returns input TensorRef.
    TensorRef& GetInput(int64_t i) {
        if (i >= num_inputs_ || i < 0) {
            utility::LogError("0 <= i < {} required, however, i = {}.",
                              num_inputs_, i);
        }
        return inputs_[i];
    }
    const TensorRef& GetInput(int64_t i) const {
        if (i >= num_inputs_ || i < 0) {
            utility::LogError("0 <= i < {} required, however, i = {}.",
                              num_inputs_, i);
        }
        return inputs_[i];
    }

    /// Returns output TensorRef.
    TensorRef& GetOutput(int64_t i) {
        if (i >= num_outputs_ || i < 0) {
            utility::LogError("0 <= i < {} required, however, i = {}.",
                              num_outputs_, i);
        }
        return outputs_[i];
    }
    const TensorRef& GetOutput(int64_t i) const {
        if (i >= num_outputs_ || i < 0) {
            utility::LogError("0 <= i < {} required, however, i = {}.",
                              num_outputs_, i);
        }
        return outputs_[i];
    }

    TensorRef& GetOutput() {
        if (num_outputs_ > 1) {
            utility::LogError("num_outputs_ == {} > 1, use GetOutput(i)",
                              num_outputs_);
        }
        return GetOutput(0);
    }
    const TensorRef& GetOutput() const {
        if (num_outputs_ > 1) {
            utility::LogError("num_outputs_ == {} > 1, use GetOutput(i)",
                              num_outputs_);
        }
        return GetOutput(0);
    }

    /// Returns true if the dim-th dimension is reduced.
    bool IsReductionDim(int64_t dim) const {
        // All outputs have the same shape and reduction dims. Even if they
        // don't have the same initial strides, the reduced strides are always
        // set to 0. Thus it is okay to use outputs_[0].
        return outputs_[0].byte_strides_[dim] == 0 && primary_shape_[dim] > 1;
    }

    CLOUDVIEWER_HOST_DEVICE char* GetInputPtr(int64_t input_idx,
                                              int64_t workload_idx) const {
        if (input_idx < 0 || input_idx >= num_inputs_) {
            return nullptr;
        }
        return GetWorkloadDataPtr(inputs_[input_idx],
                                  inputs_contiguous_[input_idx], workload_idx);
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetInputPtr(int64_t input_idx,
                                           int64_t workload_idx) const {
        if (input_idx < 0 || input_idx >= num_inputs_) {
            return nullptr;
        }
        return GetWorkloadDataPtr<T>(inputs_[input_idx],
                                     inputs_contiguous_[input_idx],
                                     workload_idx);
    }

    CLOUDVIEWER_HOST_DEVICE char* GetOutputPtr(int64_t workload_idx) const {
        return GetWorkloadDataPtr(outputs_[0], outputs_contiguous_[0],
                                  workload_idx);
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetOutputPtr(int64_t workload_idx) const {
        return GetWorkloadDataPtr<T>(outputs_[0], outputs_contiguous_[0],
                                     workload_idx);
    }

    CLOUDVIEWER_HOST_DEVICE char* GetOutputPtr(int64_t output_idx,
                                               int64_t workload_idx) const {
        return GetWorkloadDataPtr(outputs_[output_idx],
                                  outputs_contiguous_[output_idx],
                                  workload_idx);
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetOutputPtr(int64_t output_idx,
                                            int64_t workload_idx) const {
        return GetWorkloadDataPtr<T>(outputs_[output_idx],
                                     outputs_contiguous_[output_idx],
                                     workload_idx);
    }

#ifdef BUILD_ISPC_MODULE
    ispc::Indexer ToISPC() const;
#endif

protected:
    void CoalesceDimensions();

    // Permute reduction dimensions to front.
    // TODO: Sort the dimensions based on strides in ascending order to
    // improve thread coalescing.
    void ReorderDimensions(const SizeVector& reduction_dims);

    /// Update primary_strides_ based on primary_shape_.
    void UpdatePrimaryStrides();

    /// Update inputs_contiguous_ and outputs_contiguous_.
    void UpdateContiguousFlags();

    static void BroadcastRestride(TensorRef& src,
                                  int64_t dst_ndims,
                                  const int64_t* dst_shape);

    static void ReductionRestride(TensorRef& dst,
                                  int64_t src_ndims,
                                  const int64_t* src_shape,
                                  const SizeVector& reduction_dims);

    CLOUDVIEWER_HOST_DEVICE char* GetWorkloadDataPtr(
            const TensorRef& tr,
            bool tr_contiguous,
            int64_t workload_idx) const {
        // For 0-sized input reduction op, the output Tensor
        // workload_idx == 1 > NumWorkloads() == 0.
        if (workload_idx < 0) {
            return nullptr;
        }
        if (tr_contiguous) {
            return static_cast<char*>(tr.data_ptr_) +
                   workload_idx * tr.dtype_byte_size_;
        } else {
            int64_t offset = 0;
            for (int64_t i = 0; i < ndims_; ++i) {
                offset += workload_idx / primary_strides_[i] *
                          tr.byte_strides_[i];
                workload_idx = workload_idx % primary_strides_[i];
            }
            return static_cast<char*>(tr.data_ptr_) + offset;
        }
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetWorkloadDataPtr(const TensorRef& tr,
                                                  bool tr_contiguous,
                                                  int64_t workload_idx) const {
        // For 0-sized input reduction op, the output Tensor
        // workload_idx == 1 > NumWorkloads() == 0.
        if (workload_idx < 0) {
            return nullptr;
        }
        if (tr_contiguous) {
            return static_cast<T*>(tr.data_ptr_) + workload_idx;
        } else {
            int64_t offset = 0;
            for (int64_t i = 0; i < ndims_; ++i) {
                offset += workload_idx / primary_strides_[i] *
                          tr.byte_strides_[i];
                workload_idx = workload_idx % primary_strides_[i];
            }
            return static_cast<T*>(static_cast<void*>(
                    static_cast<char*>(tr.data_ptr_) + offset));
        }
    }

    /// Number of input and output Tensors.
    int64_t num_inputs_ = 0;
    int64_t num_outputs_ = 0;

    /// Array of input TensorRefs.
    TensorRef inputs_[MAX_INPUTS];

    /// Array of output TensorRefs.
    TensorRef outputs_[MAX_OUTPUTS];

    /// Array of contiguous flags for all input TensorRefs.
    bool inputs_contiguous_[MAX_INPUTS];

    /// Array of contiguous flags for all output TensorRefs.
    bool outputs_contiguous_[MAX_OUTPUTS];

    int64_t primary_shape_[MAX_DIMS];

    int64_t primary_strides_[MAX_DIMS];

    /// Indexer's global number of dimensions.
    int64_t ndims_ = 0;

    bool final_output_ = true;

    bool accumulate_ = false;
};
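
// Illustrative sketch (assumption, not part of the original header): a CPU
// elementwise kernel typically drives an Indexer built from two inputs and
// one output of the same broadcast shape, fetching typed pointers per
// workload:
//
//   Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME);
//   for (int64_t w = 0; w < indexer.NumWorkloads(); ++w) {
//       const float* a = indexer.GetInputPtr<float>(0, w);
//       const float* b = indexer.GetInputPtr<float>(1, w);
//       *indexer.GetOutputPtr<float>(w) = *a + *b;
//   }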

class IndexerIterator {
public:
    struct Iterator {
        Iterator() {}
        Iterator(const Indexer& indexer);
        Iterator(Iterator&& other) = default;

        Indexer& operator*() const;
        Iterator& operator++();
        bool operator==(const Iterator& other) const;
        bool operator!=(const Iterator& other) const;

        std::vector<std::unique_ptr<Indexer>> vec_;
    };

    IndexerIterator(const Indexer& indexer);

    Iterator begin() const;
    Iterator end() const;

private:
    const Indexer& indexer_;
};
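
// Illustrative sketch (assumption, not part of the original header): CUDA
// kernels generally require offsets that fit in 32 bits, so an Indexer that
// is too large is split into sub-indexers before launching:
//
//   for (Indexer& sub : indexer.SplitTo32BitIndexing()) {
//       // `sub` satisfies sub.CanUse32BitIndexing(); launch the kernel on it.
//   }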

}  // namespace core
}  // namespace cloudViewer