8 #include <benchmark/benchmark.h>
22 int64_t large_dim = (1ULL << 27) + 10;
27 for (
auto _ : state) {
34 ->Unit(benchmark::kMillisecond);
36 #ifdef BUILD_CUDA_MODULE
38 ->Unit(benchmark::kMillisecond);
Tensor Sum(const SizeVector &dims, bool keepdim=false) const
BENCHMARK_CAPTURE(BinaryEW, Add__CPU_Int8__100, 100, BinaryOpCode::Add, Int8, Device("CPU:0")) -> Unit(benchmark::kMillisecond)
void Reduction(benchmark::State &state, const Device &device)
Generic file read and write utility for python interface.