#ifdef BUILD_ISPC_MODULE
#include "Indexer_ispc.h"
#endif
#ifdef BUILD_ISPC_MODULE
ispc::TensorRef TensorRef::ToISPC() const {
    ispc::TensorRef ispc_tensor_ref;
    // ...
    ispc_tensor_ref.ndims_ = ndims_;
    // ...
    for (int64_t i = 0; i < ndims_; ++i) {
        ispc_tensor_ref.shape_[i] = shape_[i];
        // ...
    }
    return ispc_tensor_ref;
}
#endif
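// Usage sketch (illustrative, not from this file): the conversion is a
// plain field-by-field mirror into the ISPC-side struct, so ISPC kernels
// can address the same buffer. Assumes a populated TensorRef named `ref`.
//
//     #ifdef BUILD_ISPC_MODULE
//     ispc::TensorRef ispc_ref = ref.ToISPC();  // no allocation, no copy of data
//     #endif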
// ... (single-output constructor overload)
                 const Tensor& output_tensor,
// ...

// ... (multi-output constructor)
                 const std::vector<Tensor>& output_tensors,
// ...
    num_inputs_ = static_cast<int64_t>(input_tensors.size());
    num_outputs_ = static_cast<int64_t>(output_tensors.size());

    if (num_inputs_ > MAX_INPUTS) {
        utility::LogError(
                "Indexer cannot have more than {} inputs, but got {}.",
                MAX_INPUTS, num_inputs_);
    }
    if (num_outputs_ > MAX_OUTPUTS) {
        utility::LogError(
                "Indexer cannot have more than {} outputs, but got {}.",
                MAX_OUTPUTS, num_outputs_);
    }
    // First dtype-policy branch: all inputs and outputs must share the
    // first input's dtype.
    const Dtype ref_dtype = input_tensors[0].GetDtype();
    for (const auto& input_tensor : input_tensors) {
        if (input_tensor.GetDtype() != ref_dtype) {
            utility::LogError("Dtype mismatch {} != {}.",
                              input_tensor.GetDtype().ToString(),
                              ref_dtype.ToString());
        }
    }
    for (const auto& output_tensor : output_tensors) {
        if (output_tensor.GetDtype() != ref_dtype) {
            utility::LogError("Dtype mismatch {} != {}.",
                              output_tensor.GetDtype().ToString(),
                              ref_dtype.ToString());
        }
    }
    // Second dtype-policy branch: only the inputs must share a dtype.
    const Dtype ref_dtype = input_tensors[0].GetDtype();
    for (const auto& input_tensor : input_tensors) {
        if (input_tensor.GetDtype() != ref_dtype) {
            utility::LogError("Dtype mismatch {} != {}.",
                              input_tensor.GetDtype().ToString(),
                              ref_dtype.ToString());
        }
    }
    // Third dtype-policy branch: inputs must again share the first
    // input's dtype; outputs are checked against the dtype this policy
    // prescribes.
    const Dtype ref_dtype = input_tensors[0].GetDtype();
    for (const auto& input_tensor : input_tensors) {
        if (input_tensor.GetDtype() != ref_dtype) {
            utility::LogError("Dtype mismatch {} != {}.",
                              input_tensor.GetDtype().ToString(),
                              ref_dtype.ToString());
        }
    }
    for (const auto& output_tensor : output_tensors) {
        // ... (output dtype check; the error reports
        //      output_tensor.GetDtype().ToString())
    }
    // All output shapes must be identical before broadcasting.
    SizeVector ref_output_shape = output_tensors[0].GetShape();
    for (const auto& output_tensor : output_tensors) {
        if (output_tensor.GetShape() != ref_output_shape) {
            utility::LogError(
                    "For broadcast, all output shapes must be the same, "
                    "but {} != {}.",
                    output_tensor.GetShape(), ref_output_shape);
        }
    }
    if (reduction_dims.size() > 0) {
        if (num_inputs_ != 1) {
            utility::LogError(
                    "Internal error: reduction op can only have 1 input.");
        }
        // ...
        // In reduction mode, every output must have the keepdim-reduced shape.
        for (int64_t i = 0; i < num_outputs_; ++i) {
            if (shape_util::ReductionShape(input_tensors[0].GetShape(),
                                           reduction_dims, true) !=
                output_tensors[i].GetShape()) {
                utility::LogError(
                        "Reduction dimensions mismatch, input's shape {}, "
                        "reduction dims {}, output's shape {}.",
                        input_tensors[0].GetShape(), reduction_dims,
                        output_tensors[i].GetShape());
            }
        }
    }
    // ... (constructor continues: global shape and stride setup)
    for (int64_t i = 0; i < ndims_; ++i) {
        // ...
    }
    // ...
    for (int64_t i = 0; i < ndims_; ++i) {
        // ...
    }
    // ...
}
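// Usage sketch (illustrative; assumes Tensors a, b, dst on the same
// device, and that DtypePolicy::ALL_SAME names the all-inputs-and-outputs
// policy checked above; the enum value itself is an assumption, not shown
// in this listing):
//
//     Indexer indexer({a, b}, dst, DtypePolicy::ALL_SAME);
//     // Reduction form: sum over dim 1 into a keepdim output.
//     Indexer red({src}, dst_keepdim, DtypePolicy::ALL_SAME, {1});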
bool Indexer::CanUse32BitIndexing() const {
    const int64_t max_value = std::numeric_limits<int32_t>::max();  // 2^31 - 1
    for (int64_t i = 0; i < num_inputs_; i++) {
        int64_t max_offset = 1;
        for (int dim = 0; dim < ndims_; dim++) {
            // ... (add (extent - 1) * byte stride of this input at dim)
        }
        if (max_offset > max_value) {
            return false;
        }
    }
    for (int64_t i = 0; i < num_outputs_; i++) {
        int64_t max_offset = 1;
        for (int dim = 0; dim < ndims_; dim++) {
            // ... (same accumulation for this output)
        }
        if (max_offset > max_value) {
            return false;
        }
    }
    return true;
}
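// Worked example (illustrative sketch, not part of the original file):
// the largest byte offset an operand touches, computed exactly as above,
// for a contiguous float32 tensor of shape {2, 3}.
static int64_t ExampleMaxByteOffset() {
    const int64_t shape[2] = {2, 3};
    const int64_t byte_strides[2] = {12, 4};  // contiguous float32
    int64_t max_offset = 1;
    for (int dim = 0; dim < 2; dim++) {
        max_offset += (shape[dim] - 1) * byte_strides[dim];
    }
    return max_offset;  // 1 + 12 + 8 == 21, far below 2^31 - 1
}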
std::unique_ptr<Indexer> Indexer::SplitLargestDim() {
    // Find the dimension spanning the most bytes across all operands.
    int64_t max_extent = -1;
    int64_t dim_to_split = -1;
    for (int64_t dim = ndims_ - 1; dim >= 0; dim--) {
        // ... (`size` is the primary extent of dim; for each input:)
        int64_t extent = (size - 1) * inputs_[i].byte_strides_[dim];
        if (extent > max_extent) {
            max_extent = extent;
            dim_to_split = dim;
        }
        // ... (and for each output:)
        int64_t extent = (size - 1) * outputs_[i].byte_strides_[dim];
        if (extent > max_extent) {
            max_extent = extent;
            dim_to_split = dim;
        }
        // ...
    }
    if (max_extent < 0) {
        utility::LogError(
                "Internal error: max_extent must be >= 0, but got {}.",
                max_extent);
    }
    if (!(dim_to_split >= 0 && dim_to_split < ndims_)) {
        utility::LogError(
                "Internal error: 0 <= dim_to_split < {} required, but got {}.",
                ndims_, dim_to_split);
    }
    const int64_t size = primary_shape_[dim_to_split];
    if (size < 2) {
        utility::LogError(
                "Internal error: cannot split dimension size {}, must be >= 2.",
                size);
    }
    // Split the dim roughly in half: the copy takes the front part, this
    // Indexer keeps the rest.
    const int64_t copy_size = size / 2;
    const int64_t this_size = size - copy_size;
    std::unique_ptr<Indexer> copy(new Indexer(*this));
    const bool overlaps = IsReductionDim(dim_to_split);
    copy->ShrinkDim(dim_to_split, 0, copy_size);
    copy->final_output_ &= !overlaps;
    this->ShrinkDim(dim_to_split, copy_size, this_size);
    return copy;
}
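// Example of the split semantics (illustrative sketch): if dim_to_split
// has extent 6, then copy_size == 3 and this_size == 3, so
// copy->ShrinkDim(dim_to_split, 0, 3) covers the front half while
// this->ShrinkDim(dim_to_split, 3, 3) keeps the back half; together the
// two Indexers tile the original iteration space.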
Indexer Indexer::GetPerOutputIndexer(int64_t output_idx) const {
    // Decompose output_idx into per-dimension indices using the default
    // (contiguous) strides of the output shape.
    int64_t output_shape[MAX_DIMS] = {0};
    int64_t output_default_strides[MAX_DIMS] = {0};
    int64_t offset_indices[MAX_DIMS] = {0};

    for (int64_t i = 0; i < ndims_; ++i) {
        // ... (output_shape[i] is 1 on reduction dims, else the primary extent)
    }
    int64_t stride = 1;
    for (int64_t i = ndims_ - 1; i >= 0; --i) {
        output_default_strides[i] = stride;
        stride *= output_shape[i];
    }
    for (int64_t i = 0; i < ndims_; ++i) {
        offset_indices[i] = output_idx / output_default_strides[i];
        output_idx = output_idx % output_default_strides[i];
    }

    Indexer sub_indexer = *this;
    for (int64_t dim = 0; dim < sub_indexer.ndims_; ++dim) {
        for (int64_t i = 0; i < sub_indexer.num_inputs_; ++i) {
            // ... (advance the input's data pointer by
            //      offset_indices[dim] * its byte stride at dim)
        }
        for (int64_t i = 0; i < sub_indexer.num_outputs_; ++i) {
            // ... (same pointer adjustment for each output)
        }
        // ...
    }
    // ...
    return sub_indexer;
}
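// Worked example (illustrative sketch): decomposing a flat output index
// with the default-stride scheme above. For output_shape {2, 3, 4} the
// default strides are {12, 4, 1}; index 17 maps to element (1, 1, 1)
// because 17 = 1 * 12 + 1 * 4 + 1 * 1.
static int64_t ExampleDecomposeIndex() {
    const int64_t strides[3] = {12, 4, 1};  // default strides of shape {2, 3, 4}
    int64_t output_idx = 17;
    int64_t indices[3] = {0, 0, 0};
    for (int i = 0; i < 3; ++i) {
        indices[i] = output_idx / strides[i];
        output_idx = output_idx % strides[i];
    }
    return indices[0] * 100 + indices[1] * 10 + indices[2];  // 111, i.e. (1, 1, 1)
}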
void Indexer::ShrinkDim(int64_t dim, int64_t start, int64_t size) {
    if (!(dim >= 0 && dim < ndims_)) {
        // ... (internal error: dim out of bounds)
    }
    // ... (each operand's data pointer advances by
    //      inputs_[i].byte_strides_[dim] * start, resp.
    //      outputs_[i].byte_strides_[dim] * start; the dim's extent becomes size)
}
int64_t Indexer::NumReductionDims() const {
    // A dim is reduced iff the first output's byte stride there is 0.
    int64_t count = 0;
    for (int64_t dim = 0; dim < ndims_; dim++) {
        if (outputs_[0].byte_strides_[dim] == 0) count++;
    }
    return count;
}
int64_t Indexer::NumWorkloads() const {
    int64_t num_workloads = 1;
    for (int64_t i = 0; i < ndims_; ++i) {
        num_workloads *= primary_shape_[i];
    }
    return num_workloads;
}

int64_t Indexer::NumOutputElements() const {
    int64_t num_output_elements = 1;
    for (int64_t i = 0; i < ndims_; ++i) {
        // ... (multiply in only the non-reduced extents)
    }
    return num_output_elements;
}
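// Example (illustrative): for primary_shape_ {2, 3, 4}, NumWorkloads()
// is 2 * 3 * 4 == 24; if the last dim is reduced (output byte stride 0
// there), NumOutputElements() is 2 * 3 == 6.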
void Indexer::CoalesceDimensions() {
    auto can_coalesce = [&](int64_t dim0, int64_t dim1) {
        // ... (`shape0`/`shape1` are the extents of dim0 and dim1)
        if (shape0 == 1 || shape1 == 1) {
            return true;
        }
        // ... (else every operand's strides must chain across the pair)
    };
    // Replace each operand's stride at dim0 with its stride at dim1.
    auto replace_stride = [&](int64_t dim0, int64_t dim1) { /* ... */ };

    int64_t prev_dim = 0;
    for (int64_t dim = 1; dim < ndims_; dim++) {
        if (can_coalesce(prev_dim, dim)) {
            // ... (fold dim's extent into prev_dim)
            replace_stride(prev_dim, dim);
        } else {
            prev_dim++;
            if (prev_dim != dim) {
                replace_stride(prev_dim, dim);
            }
        }
    }
    // ... (ndims_ shrinks accordingly)
}
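// Worked example (illustrative, in element counts rather than bytes): a
// contiguous shape {2, 3} with strides {3, 1} walks memory exactly like
// shape {6} with stride {1}, since shape[1] * stride[1] == 3 == stride[0],
// so the pair coalesces into one dimension. A transposed view with
// strides {1, 2} fails that test and keeps both dims.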
void Indexer::ReorderDimensions(const SizeVector& reduction_dims) {
    // ... (`permute` holds one index per dimension; start from the
    //      reversed identity permutation)
    std::iota(permute.rbegin(), permute.rend(), 0);

    // Pairwise comparator: negative keeps dim0 before dim1, positive
    // swaps them, zero is a tie.
    auto ShouldSwap = [&](size_t dim0, size_t dim1) {
        // ... (for each output's byte strides:)
        if (stride0 == 0 && stride1 != 0) {
            return -1;
        } else if (stride1 == 0 && stride0 != 0) {
            return 1;
        } else if (stride0 != 0 && stride1 != 0) {
            if (stride0 <= stride1) {
                return -1;
            }
            // ...
        }
        // ... (then for each input's byte strides:)
        if (stride0 == 0 || stride1 == 0) {
            continue;
        } else if (stride0 <= stride1) {
            return -1;
        }
        // ...
        return 0;
    };

    // Insertion sort over the permutation using the comparator above.
    for (int i = 1; i < ndims_; i++) {
        int dim1 = i;
        for (int dim0 = i - 1; dim0 >= 0; dim0--) {
            int comparison = ShouldSwap(permute[dim0], permute[dim1]);
            if (comparison > 0) {
                std::swap(permute[dim0], permute[dim1]);
                dim1 = dim0;
            } else if (comparison < 0) {
                break;
            }
        }
    }
    // ... (apply `permute` to the primary shape and to all operands)
}

void Indexer::UpdatePrimaryStrides() {
    int64_t stride = 1;
    for (int64_t i = ndims_ - 1; i >= 0; --i) {
        primary_strides_[i] = stride;
        stride *= primary_shape_[i];
    }
}
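// Worked example (illustrative) of the reverse accumulation above: for
// primary_shape_ {2, 3, 4} the loop produces primary_strides_ {12, 4, 1};
// the stride starts at 1 for the innermost dim and multiplies by each
// extent moving outward.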
void Indexer::BroadcastRestride(TensorRef& src,
                                int64_t dst_ndims,
                                const int64_t* dst_shape) {
    int64_t src_ndims = src.ndims_;
    // ...
    int64_t ndims_omitted = dst_ndims - src_ndims;
    for (int64_t i = src_ndims - 1; i >= 0; --i) {
        // ... (right-align the existing dims)
    }
    for (int64_t i = 0; i < ndims_omitted; ++i) {
        // ... (prepended dims get shape 1 and byte stride 0)
    }
    // ...
    // A dim broadcast from extent 1 must not advance the source pointer.
    for (int64_t i = 0; i < dst_ndims; ++i) {
        if (src.shape_[i] == 1 && dst_shape[i] != 1) {
            src.byte_strides_[i] = 0;
        }
    }
}
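// Worked example (illustrative): broadcasting a float32 src of shape {3}
// (byte strides {4}) to dst_shape {2, 3} right-aligns src to shape
// {1, 3} with strides {0, 4}; dim 0 then has src extent 1 but dst extent
// 2, so its stride stays 0 and both dst rows re-read the same 3 floats.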
void Indexer::ReductionRestride(TensorRef& dst,
                                int64_t src_ndims,
                                const int64_t* src_shape,
                                const SizeVector& reduction_dims) {
    if (dst.ndims_ != src_ndims) {
        // ... (internal error: rank mismatch)
    }
    // A reduced dim (dst extent 1, src extent > 1) gets byte stride 0 so
    // every reduced element maps to the same output element.
    for (int64_t i = 0; i < dst.ndims_; ++i) {
        if (dst.shape_[i] == 1 && src_shape[i] != 1) {
            dst.byte_strides_[i] = 0;
        }
    }
}
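// Worked example (illustrative): reducing a {2, 3} src over dim 1 with
// keepdim gives a dst of shape {2, 1}; the loop above zeroes
// dst.byte_strides_[1], so all three elements of a source row land in
// the same output slot.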
#ifdef BUILD_ISPC_MODULE
ispc::Indexer Indexer::ToISPC() const {
    ispc::Indexer ispc_indexer;
    // ...
    for (int64_t i = 0; i < NumInputs(); ++i) {
        ispc_indexer.inputs_[i] = GetInput(i).ToISPC();
    }
    // ... (outputs are mirrored the same way:)
    ispc_indexer.outputs_[i] = GetOutput(i).ToISPC();
    // ...
    for (int64_t i = 0; i < NumDims(); ++i) {
        // ... (copy the primary shape and strides)
    }
    // ...
    ispc_indexer.ndims_ = NumDims();
    // ...
    return ispc_indexer;
}
#endif
IndexerIterator::Iterator::Iterator(const Indexer& indexer) {
    // ... (the stack is seeded with a copy of `indexer`, then:)
    vec_.emplace_back(nullptr);
    // ...
}

IndexerIterator::Iterator& IndexerIterator::Iterator::operator++() {
    // ... Split the largest dim until the top Indexer fits 32-bit indexing.
    while (!vec_.empty() && !vec_.back()->CanUse32BitIndexing()) {
        // ... (`indexer` refers to *vec_.back())
        vec_.emplace_back(indexer.SplitLargestDim());
    }
    return *this;
}

bool IndexerIterator::Iterator::operator==(const Iterator& other) const {
    return this == &other || (vec_.empty() && other.vec_.empty());
}
bool IndexerIterator::Iterator::operator!=(const Iterator& other) const {
    return !(*this == other);
}
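// Usage sketch (illustrative; LaunchKernel is a hypothetical caller, not
// part of this API): the iterator protocol above lets callers range-for
// over sub-indexers, each of which satisfies CanUse32BitIndexing().
//
//     for (Indexer& sub : indexer.SplitTo32BitIndexing()) {
//         LaunchKernel(sub);  // hypothetical
//     }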
Declarations referenced in this listing:

Indexer members:
std::string ToString() const
int64_t NumDims() const
Returns the number of dimensions of the Indexer.
int64_t ndims_
Indexer's global number of dimensions.
int64_t primary_shape_[MAX_DIMS]
int64_t primary_strides_[MAX_DIMS]
const int64_t * GetPrimaryShape() const
const int64_t * GetPrimaryStrides() const
void UpdatePrimaryStrides()
Update primary_strides_ based on primary_shape_.
void UpdateContiguousFlags()
Update inputs_contiguous_ and outputs_contiguous_.
bool inputs_contiguous_[MAX_INPUTS]
Array of contiguous flags for all input TensorRefs.
bool outputs_contiguous_[MAX_OUTPUTS]
Array of contiguous flags for all output TensorRefs.
int64_t num_inputs_, num_outputs_
Number of input and output Tensors.
TensorRef inputs_[MAX_INPUTS]
Array of input TensorRefs.
TensorRef outputs_[MAX_OUTPUTS]
Array of output TensorRefs.
TensorRef & GetInput(int64_t i)
Returns input TensorRef.
int64_t NumInputs() const
Number of input Tensors.
int64_t NumOutputs() const
Number of output Tensors.
int64_t NumWorkloads() const
int64_t NumOutputElements() const
Returns the number of output elements.
bool IsReductionDim(int64_t dim) const
Returns true if the dim-th dimension is reduced.
int64_t NumReductionDims() const
Returns the number of reduction dimensions.
void ReorderDimensions(const SizeVector &reduction_dims)
void CoalesceDimensions()
void ShrinkDim(int64_t dim, int64_t start, int64_t size)
std::unique_ptr< Indexer > SplitLargestDim()
bool CanUse32BitIndexing() const
Returns true iff the maximum offsets in bytes are smaller than 2^31 - 1.
IndexerIterator SplitTo32BitIndexing() const
Indexer GetPerOutputIndexer(int64_t output_idx) const
static void BroadcastRestride(TensorRef &src, int64_t dst_ndims, const int64_t *dst_shape)
static void ReductionRestride(TensorRef &dst, int64_t src_ndims, const int64_t *src_shape, const SizeVector &reduction_dims)
static constexpr int64_t MAX_INPUTS
static constexpr int64_t MAX_OUTPUTS
static constexpr int64_t MAX_DIMS

IndexerIterator:
IndexerIterator(const Indexer &indexer)

Related helpers:
SizeVector ReductionShape(const SizeVector &src_shape, const SizeVector &dims, bool keepdim)
Returns the shape after reduction.
IndexerIterator::Iterator members:
Indexer & operator*() const
bool operator==(const Iterator &other) const
bool operator!=(const Iterator &other) const
std::vector< std::unique_ptr< Indexer > > vec_

TensorRef:
A minimalistic class that references a Tensor.
int64_t byte_strides_[MAX_DIMS]
bool IsContiguous() const
Returns true if the underlying memory buffer is contiguous.
void Permute(const SizeVector &dims)
Permute (dimension shuffle) the reference to a Tensor.