10 #include <unordered_map>
30 dtype = dtype_original;
37 if (A_shape.
size() != 2) {
40 if (B_shape.
size() != 1 && B_shape.
size() != 2) {
42 "Tensor B must be 1D (vector) or 2D (matrix), but got {}D.",
45 if (A_shape[1] != B_shape[0]) {
47 A_shape[1], B_shape[0]);
51 int64_t m = A_shape[0];
52 int64_t k = A_shape[1];
53 int64_t n = B_shape.
size() == 2 ? B_shape[1] : 1;
55 if (m == 0 || k == 0 || n == 0) {
57 "Tensor shapes should not contain dimensions with zero.");
69 #ifdef BUILD_SYCL_MODULE
70 MatmulSYCL(B_data, A_data, C_data, n, k, m, dtype, device);
74 }
else if (device.
IsCUDA()) {
75 #ifdef BUILD_CUDA_MODULE
77 MatmulCUDA(B_data, A_data, C_data, n, k, m, dtype, device);
82 MatmulCPU(B_data, A_data, C_data, n, k, m, dtype);
85 output = output.
To(dtype_original);
#define AssertTensorDevice(tensor,...)
#define AssertTensorDtype(tensor,...)
When CUDA is not enabled, this is a dummy class.
bool IsCUDA() const
Returns true iff device type is CUDA.
bool IsSYCL() const
Returns true iff device type is SYCL GPU.
std::string ToString() const
Tensor Contiguous() const
Device GetDevice() const override
static Tensor Empty(const SizeVector &shape, Dtype dtype, const Device &device=Device("CPU:0"))
Create a tensor with uninitialized values.
SizeVector GetShape() const
Tensor To(Dtype dtype, bool copy=false) const
void MatmulCUDA(void *A_data, void *B_data, void *C_data, int64_t m, int64_t k, int64_t n, Dtype dtype, const Device &device)
void MatmulSYCL(void *A_data, void *B_data, void *C_data, int64_t m, int64_t k, int64_t n, Dtype dtype, const Device &device)
void Matmul(const Tensor &A, const Tensor &B, Tensor &output)
Computes matrix multiplication C = AB.
void MatmulCPU(void *A_data, void *B_data, void *C_data, int64_t m, int64_t k, int64_t n, Dtype dtype)
Generic file read and write utility for python interface.