#ifdef BUILD_ISPC_MODULE
// ...
#endif
class IndexerIterator;
template <int NARGS, typename index_t = uint32_t>
struct OffsetCalculator {
    OffsetCalculator(int dims,
                     const int64_t* sizes,
                     const int64_t* const* strides)
        : dims_(dims) {
        // ... copies sizes into sizes_ and, per dimension and argument,
        for (int arg = 0; arg < NARGS; arg++) {
            // ... copies strides into strides_[dim][arg]
        }
    }

    CLOUDVIEWER_HOST_DEVICE utility::MiniVec<index_t, NARGS> get(
            index_t linear_idx) const {
        utility::MiniVec<index_t, NARGS> offsets;
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
        for (int arg = 0; arg < NARGS; arg++) {
            offsets[arg] = 0;
        }
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
        for (int dim = 0; dim < MAX_DIMS; ++dim) {
            if (dim == dims_) {
                break;
            }
            index_t mod = linear_idx % sizes_[dim];
            linear_idx = linear_idx / sizes_[dim];
#if defined(__CUDA_ARCH__)
#pragma unroll
#endif
            for (int arg = 0; arg < NARGS; arg++) {
                offsets[arg] += mod * strides_[dim][arg];
            }
        }
        return offsets;
    }

    int dims_;
    index_t sizes_[MAX_DIMS];
    index_t strides_[MAX_DIMS][NARGS];
};
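The divide/modulo loop in get() peels one coordinate off the flat index per dimension and accumulates that coordinate times the per-argument stride. The following standalone sketch is not part of the header; it replays the same arithmetic for a single argument with concrete sizes and strides (the ordering with sizes[0] as the fastest dimension is an assumption made only for this illustration), so the resulting offsets can be checked by hand.

#include <cstdint>
#include <cstdio>

// Hypothetical helper (not CloudViewer API): map a flat index to a strided
// element offset, treating sizes[0] as the fastest-varying dimension, which
// mirrors the loop order in OffsetCalculator::get().
static int64_t FlatIndexToOffset(int64_t linear_idx,
                                 int dims,
                                 const int64_t* sizes,
                                 const int64_t* strides) {
    int64_t offset = 0;
    for (int dim = 0; dim < dims; ++dim) {
        const int64_t mod = linear_idx % sizes[dim];  // coordinate in this dim
        linear_idx = linear_idx / sizes[dim];         // index in remaining dims
        offset += mod * strides[dim];                 // accumulate contribution
    }
    return offset;
}

int main() {
    // A 3x2 view with element strides {2, 1}, e.g. a transposed view of a
    // row-major 2x3 buffer. Flat indices 0..5 map to offsets 0,2,4,1,3,5.
    const int64_t sizes[2] = {3, 2};
    const int64_t strides[2] = {2, 1};
    for (int64_t idx = 0; idx < 6; ++idx) {
        std::printf("workload %lld -> offset %lld\n", (long long)idx,
                    (long long)FlatIndexToOffset(idx, 2, sizes, strides));
    }
    return 0;
}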
    TensorRef(const Tensor& t) {
        // ...
        for (int64_t i = 0; i < ndims_; ++i) {
            // ... copy per-dimension shape and byte strides from t
        }
    }

    /// Permute (dimension shuffle) the reference to a Tensor.
    void Permute(const SizeVector& dims) {
        if (static_cast<int64_t>(dims.size()) != ndims_) {
            // ... error: number of dimensions mismatch
        }

        std::vector<bool> seen_dims(ndims_, false);
        for (const int64_t& dim : dims) {
            seen_dims[dim] = true;
        }
        if (!std::all_of(seen_dims.begin(), seen_dims.end(),
                         [](bool seen) { return seen; })) {
            utility::LogError(
                    "Permute dims must be a permutation from 0 to {}.",
                    /* ... */);
        }

        int64_t new_shape[MAX_DIMS];
        int64_t new_byte_strides[MAX_DIMS];
        for (int64_t i = 0; i < ndims_; ++i) {
            int64_t old_dim = WrapDim(dims[i], ndims_, false);  // wrap negatives
            new_shape[i] = shape_[old_dim];
            new_byte_strides[i] = byte_strides_[old_dim];
        }
        for (int64_t i = 0; i < ndims_; ++i) {
            shape_[i] = new_shape[i];
            byte_strides_[i] = new_byte_strides[i];
        }
    }

    /// Returns True if the underlying memory buffer is contiguous.
    bool IsContiguous() const {
        // ... compare byte_strides_ against the default (contiguous) strides
        for (int64_t i = 0; i < ndims_; ++i) {
            // ...
        }
    }

    bool operator==(const TensorRef& other) const {
        // ... data pointer, ndims_, and dtype size must match, then per dim:
        for (int64_t i = 0; i < ndims_; ++i) {
            // ... shape_[i] and byte_strides_[i] must match
        }
    }
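Permute only shuffles the recorded shape and byte strides; the data pointer and the underlying buffer are untouched. A minimal standalone sketch of that reordering (plain arrays rather than the TensorRef class; a 2x3x4 float tensor is assumed):

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
    // A reference to a 2x3x4 float tensor: sizes and byte strides per dim.
    std::array<int64_t, 3> shape = {2, 3, 4};
    std::array<int64_t, 3> byte_strides = {48, 16, 4};  // row-major, 4-byte floats

    // Permutation (2, 0, 1): new dim i takes over old dim dims[i].
    const std::array<int64_t, 3> dims = {2, 0, 1};
    std::array<int64_t, 3> new_shape{}, new_strides{};
    for (std::size_t i = 0; i < dims.size(); ++i) {
        new_shape[i] = shape[dims[i]];
        new_strides[i] = byte_strides[dims[i]];
    }
    shape = new_shape;
    byte_strides = new_strides;

    // Result: shape {4, 2, 3}, byte strides {4, 48, 16}; the data pointer
    // is unchanged, only the indexing metadata is shuffled.
    for (std::size_t i = 0; i < shape.size(); ++i) {
        std::printf("dim %zu: size %lld, byte stride %lld\n", i,
                    (long long)shape[i], (long long)byte_strides[i]);
    }
    return 0;
}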
#ifdef BUILD_ISPC_MODULE
    ispc::TensorRef ToISPC() const;
#endif
    CLOUDVIEWER_HOST_DEVICE int64_t NumWorkloads() const {
        int64_t num_workloads = 1;
        for (int64_t i = 0; i < ndims_; ++i) {
            num_workloads *= input_.shape_[i];
        }
        return num_workloads;
    }

    CLOUDVIEWER_HOST_DEVICE void* GetPtr(int64_t workload_idx) const {
        if (workload_idx < 0 || workload_idx >= NumWorkloads()) {
            return nullptr;
        }
        // ... decompose workload_idx into a byte offset via byte strides:
        for (int64_t i = 0; i < ndims_; ++i) {
            // ...
        }
    }
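A hedged usage sketch of TensorIterator: sum every element of a tensor by walking NumWorkloads() flat indices and letting GetPtr() resolve the strided address. The float element type, the cloudViewer::core namespace, and the include path are assumptions made for illustration.

#include <cstdint>

#include "core/Indexer.h"  // assumed include path

// Sums all elements of a float tensor through the TensorIterator interface.
double SumAllElements(const cloudViewer::core::Tensor& tensor) {
    cloudViewer::core::TensorIterator iter(tensor);
    double sum = 0.0;
    for (int64_t w = 0; w < iter.NumWorkloads(); ++w) {
        // GetPtr() converts the flat workload index into a strided address;
        // it returns nullptr only for out-of-range indices.
        const float* ptr = static_cast<const float*>(iter.GetPtr(w));
        sum += static_cast<double>(*ptr);
    }
    return sum;
}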
    Indexer(const std::vector<Tensor>& input_tensors,
            const Tensor& output_tensor,
            /* ... */);

    Indexer(const std::vector<Tensor>& input_tensors,
            const std::vector<Tensor>& output_tensors,
            /* ... */);
    /// Get input Tensor data pointer based on workload_idx.
    CLOUDVIEWER_HOST_DEVICE char* GetInputPtr(int64_t input_idx,
                                              int64_t workload_idx) const {
        // ...
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetInputPtr(int64_t input_idx,
                                           int64_t workload_idx) const {
        // ...
        return GetWorkloadDataPtr<T>(inputs_[input_idx],
                                     inputs_contiguous_[input_idx],
                                     workload_idx);
    }

    /// Get output Tensor data pointer based on workload_idx.
    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetOutputPtr(int64_t workload_idx) const {
        // ...
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetOutputPtr(int64_t output_idx,
                                            int64_t workload_idx) const {
        return GetWorkloadDataPtr<T>(outputs_[output_idx],
                                     outputs_contiguous_[output_idx],
                                     workload_idx);
    }

#ifdef BUILD_ISPC_MODULE
    ispc::Indexer ToISPC() const;
#endif
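The typed accessors above are what element-wise kernels consume: one flat workload index selects the matching element of every input and output, regardless of broadcasting or non-contiguous strides. Below is a hedged sketch of a serial CPU "negate" kernel; the namespace, include path, float dtype, and the assumption that the remaining Indexer constructor parameters have usable defaults are all illustrative, and real launchers run this loop in parallel or on the GPU.

#include <cstdint>

#include "core/Indexer.h"  // assumed include path

// Computes dst = -src element-wise using the Indexer to resolve addresses.
void NegateKernel(const cloudViewer::core::Tensor& src,
                  const cloudViewer::core::Tensor& dst) {
    cloudViewer::core::Indexer indexer({src}, dst);
    for (int64_t w = 0; w < indexer.NumWorkloads(); ++w) {
        const float* in = indexer.GetInputPtr<float>(0, w);  // input #0
        float* out = indexer.GetOutputPtr<float>(w);         // sole output
        *out = -*in;
    }
}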
    static void BroadcastRestride(TensorRef& src, int64_t dst_ndims, const int64_t* dst_shape);
    static void ReductionRestride(TensorRef& dst, int64_t src_ndims, const int64_t* src_shape, const SizeVector& reduction_dims);
    CLOUDVIEWER_HOST_DEVICE char* GetWorkloadDataPtr(const TensorRef& tr,
                                                     bool tr_contiguous,
                                                     int64_t workload_idx) const {
        if (workload_idx < 0) {
            return nullptr;
        }
        if (tr_contiguous) {
            return static_cast<char*>(tr.data_ptr_) +
                   workload_idx * tr.dtype_byte_size_;
        } else {
            int64_t offset = 0;
            for (int64_t i = 0; i < ndims_; ++i) {
                offset += workload_idx / primary_strides_[i] *
                          tr.byte_strides_[i];
                workload_idx = workload_idx % primary_strides_[i];
            }
            return static_cast<char*>(tr.data_ptr_) + offset;
        }
    }

    template <typename T>
    CLOUDVIEWER_HOST_DEVICE T* GetWorkloadDataPtr(const TensorRef& tr,
                                                  bool tr_contiguous,
                                                  int64_t workload_idx) const {
        if (workload_idx < 0) {
            return nullptr;
        }
        if (tr_contiguous) {
            return static_cast<T*>(tr.data_ptr_) + workload_idx;
        } else {
            int64_t offset = 0;
            for (int64_t i = 0; i < ndims_; ++i) {
                offset += workload_idx / primary_strides_[i] *
                          tr.byte_strides_[i];
                workload_idx = workload_idx % primary_strides_[i];
            }
            return static_cast<T*>(static_cast<void*>(
                    static_cast<char*>(tr.data_ptr_) + offset));
        }
    }
    std::vector<std::unique_ptr<Indexer>> vec_;
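vec_ stores the sub-Indexers produced while splitting: SplitTo32BitIndexing() keeps halving the largest dimension until every piece satisfies CanUse32BitIndexing(), so kernels can do their offset arithmetic in 32 bits. A hedged consumption sketch follows; it assumes IndexerIterator supports range-for (its Iterator type dereferences to Indexer&) and uses a placeholder ProcessChunk for the per-chunk kernel launch.

#include "core/Indexer.h"  // assumed include path

// Placeholder for the actual kernel launch on one 32-bit-indexable chunk.
void ProcessChunk(const cloudViewer::core::Indexer& chunk);

void ProcessWith32BitIndexing(const cloudViewer::core::Indexer& indexer) {
    if (indexer.CanUse32BitIndexing()) {
        ProcessChunk(indexer);  // already fits 32-bit offsets
        return;
    }
    // Each sub-Indexer covers a slice of the workload whose maximum byte
    // offsets stay below 2^31 - 1.
    for (const cloudViewer::core::Indexer& sub : indexer.SplitTo32BitIndexing()) {
        ProcessChunk(sub);
    }
}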
#define CLOUDVIEWER_HOST_DEVICE
IndexerIterator(const Indexer &indexer)
void ReorderDimensions(const SizeVector &reduction_dims)
CLOUDVIEWER_HOST_DEVICE T * GetOutputPtr(int64_t workload_idx) const
std::unique_ptr< Indexer > SplitLargestDim()
const TensorRef & GetInput(int64_t i) const
CLOUDVIEWER_HOST_DEVICE T * GetWorkloadDataPtr(const TensorRef &tr, bool tr_contiguous, int64_t workload_idx) const
int64_t ndims_
Indexer's global number of dimensions.
bool outputs_contiguous_[MAX_OUTPUTS]
Array of contiguous flags for all output TensorRefs.
const int64_t * GetPrimaryShape() const
CLOUDVIEWER_HOST_DEVICE T * GetOutputPtr(int64_t output_idx, int64_t workload_idx) const
int64_t NumDims() const
Returns number of dimensions of the Indexer.
bool ShouldAccumulate() const
const int64_t * GetPrimaryStrides() const
const TensorRef & GetOutput(int64_t i) const
CLOUDVIEWER_HOST_DEVICE char * GetOutputPtr(int64_t output_idx, int64_t workload_idx) const
int64_t num_inputs_
Number of input Tensors.
void ShrinkDim(int64_t dim, int64_t start, int64_t size)
bool IsFinalOutput() const
int64_t primary_shape_[MAX_DIMS]
CLOUDVIEWER_HOST_DEVICE char * GetInputPtr(int64_t input_idx, int64_t workload_idx) const
void UpdateContiguousFlags()
Update inputs_contiguous_ and outputs_contiguous_.
bool IsReductionDim(int64_t dim) const
Returns true if the dim-th dimension is reduced.
TensorRef inputs_[MAX_INPUTS]
Array of input TensorRefs.
TensorRef outputs_[MAX_OUTPUTS]
Array of output TensorRefs.
const TensorRef & GetOutput() const
int64_t NumInputs() const
Number of input Tensors.
Indexer(const Indexer &)=default
int64_t NumReductionDims() const
Returns the number of reduction dimensions.
bool CanUse32BitIndexing() const
Returns true iff the maximum offsets in bytes are smaller than 2^31 - 1.
static void ReductionRestride(TensorRef &dst, int64_t src_ndims, const int64_t *src_shape, const SizeVector &reduction_dims)
CLOUDVIEWER_HOST_DEVICE char * GetOutputPtr(int64_t workload_idx) const
Indexer GetPerOutputIndexer(int64_t output_idx) const
TensorRef & GetInput(int64_t i)
Returns input TensorRef.
int64_t NumWorkloads() const
int64_t NumOutputElements() const
Returns the number of output elements.
int64_t NumOutputs() const
Number of output Tensors.
static void BroadcastRestride(TensorRef &src, int64_t dst_ndims, const int64_t *dst_shape)
bool inputs_contiguous_[MAX_INPUTS]
Array of contiguous flags for all input TensorRefs.
TensorRef & GetOutput(int64_t i)
Returns output TensorRef.
int64_t * GetPrimaryShape()
CLOUDVIEWER_HOST_DEVICE char * GetWorkloadDataPtr(const TensorRef &tr, bool tr_contiguous, int64_t workload_idx) const
int64_t primary_strides_[MAX_DIMS]
CLOUDVIEWER_HOST_DEVICE T * GetInputPtr(int64_t input_idx, int64_t workload_idx) const
IndexerIterator SplitTo32BitIndexing() const
void UpdatePrimaryStrides()
Update primary_strides_ based on primary_shape_.
Indexer & operator=(const Indexer &)=default
void CoalesceDimensions()
CLOUDVIEWER_HOST_DEVICE int64_t NumWorkloads() const
TensorIterator(const Tensor &tensor)
CLOUDVIEWER_HOST_DEVICE void * GetPtr(int64_t workload_idx) const
int64_t GetStride(int64_t dim) const
SizeVector GetShape() const
int64_t WrapDim(int64_t dim, int64_t max_dim, bool inclusive)
Wrap around negative dim.
SizeVector DefaultStrides(const SizeVector &shape)
Compute default strides for a shape when a tensor is contiguous.
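DefaultStrides gives the row-major strides of a contiguous tensor: the last dimension varies fastest and each stride is the product of all later sizes (in elements). A standalone arithmetic sketch, not a call into the library:

#include <cstdint>
#include <vector>

// Row-major strides: strides[i] = shape[i+1] * shape[i+2] * ... * 1.
// For shape {2, 3, 4} this returns {12, 4, 1}.
std::vector<int64_t> DefaultStridesSketch(const std::vector<int64_t>& shape) {
    std::vector<int64_t> strides(shape.size());
    int64_t running = 1;
    for (std::size_t i = shape.size(); i-- > 0;) {
        strides[i] = running;
        running *= shape[i];
    }
    return strides;
}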
static constexpr int64_t MAX_OUTPUTS
static constexpr int64_t MAX_DIMS
static constexpr int64_t MAX_INPUTS
bool operator==(const Iterator &other) const
Indexer & operator*() const
Iterator(Iterator &&other)=default
bool operator!=(const Iterator &other) const
std::vector< std::unique_ptr< Indexer > > vec_
OffsetCalculator(int dims, const int64_t *sizes, const int64_t *const *strides)
CLOUDVIEWER_HOST_DEVICE utility::MiniVec< index_t, NARGS > get(index_t linear_idx) const
index_t strides_[MAX_DIMS][NARGS]
A minimalistic class that references a Tensor.
int64_t byte_strides_[MAX_DIMS]
TensorRef(const Tensor &t)
bool IsContiguous() const
Returns True if the underlying memory buffer is contiguous.
bool operator==(const TensorRef &other) const
void Permute(const SizeVector &dims)
Permute (dimension shuffle) the reference to a Tensor.
bool operator!=(const TensorRef &other) const
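BroadcastRestride and ReductionRestride both work by rewriting a TensorRef's strides rather than moving data: a dimension that should repeat (a broadcast input) or collapse (a reduced output) gets a byte stride of 0, so every workload along it resolves to the same element. A standalone sketch of the broadcasting half of that idea (plain arrays and a float32 source are assumptions for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    // Source: shape {1, 3}, float32, contiguous -> byte strides {12, 4}.
    // Broadcast target: shape {4, 3}.
    const int64_t src_shape[2] = {1, 3};
    const int64_t dst_shape[2] = {4, 3};
    int64_t src_byte_strides[2] = {12, 4};

    // Where the source size is 1 but the destination size is larger, zero
    // the stride so the single source element is reused along that dim.
    for (int i = 0; i < 2; ++i) {
        if (src_shape[i] == 1 && dst_shape[i] != 1) {
            src_byte_strides[i] = 0;
        }
    }

    // Element (i, j) of the broadcast view now lives at byte offset
    // i * 0 + j * 4: every row re-reads the one source row.
    std::printf("byte strides after restride: {%lld, %lld}\n",
                (long long)src_byte_strides[0], (long long)src_byte_strides[1]);
    return 0;
}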