10 #include <cuda_runtime.h>
11 #include <curand_kernel.h>
35 __host__ __device__
const T*
GetPtr()
const;
43 __device__ T
Get(
const size_t row,
45 const size_t slice = 0)
const;
50 __device__ T&
GetRef(
const size_t row,
const size_t col);
51 __device__ T&
GetRef(
const size_t row,
55 __device__
void Set(
const size_t row,
const size_t col,
const T value);
56 __device__
void Set(
const size_t row,
85 void Write(
const std::string&
path,
const size_t slice);
113 template <
typename T>
114 __global__
void FillWithScalarKernel(
GpuMat<T> output,
const T value) {
115 const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
116 const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
118 for (
size_t slice = 0; slice < output.
GetDepth(); ++slice) {
119 output.
Set(row, col, slice, value);
124 template <
typename T>
125 __global__
void FillWithVectorKernel(
const T* values, GpuMat<T> output) {
126 const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
127 const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
128 if (row < output.GetHeight() && col < output.GetWidth()) {
129 for (
size_t slice = 0; slice < output.GetDepth(); ++slice) {
130 output.Set(row, col, slice, values[slice]);
135 template <
typename T>
136 __global__
void FillWithRandomNumbersKernel(GpuMat<T> output,
137 GpuMat<curandState> random_state,
140 const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
141 const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
142 if (row < output.GetHeight() && col < output.GetWidth()) {
143 curandState local_state = random_state.Get(row, col);
144 for (
size_t slice = 0; slice < output.GetDepth(); ++slice) {
145 const T random_value =
146 curand_uniform(&local_state) * (max_value - min_value) +
148 output.Set(row, col, slice, random_value);
150 random_state.Set(row, col, local_state);
156 template <
typename T>
171 template <
typename T>
174 array_ptr_ =
nullptr;
181 template <
typename T>
186 template <
typename T>
191 template <
typename T>
196 template <
typename T>
201 template <
typename T>
206 template <
typename T>
211 template <
typename T>
214 const size_t slice)
const {
215 return *((T*)((
char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
218 template <
typename T>
222 for (
size_t slice = 0; slice < depth_; ++slice) {
223 values[slice] = Get(row, col, slice);
227 template <
typename T>
229 return GetRef(row, col, 0);
232 template <
typename T>
235 const size_t slice) {
236 return *((T*)((
char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
239 template <
typename T>
243 Set(row, col, 0, value);
246 template <
typename T>
251 *((T*)((
char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col) = value;
254 template <
typename T>
258 for (
size_t slice = 0; slice < depth_; ++slice) {
259 Set(row, col, slice, values[slice]);
263 template <
typename T>
265 internal::FillWithScalarKernel<T><<<gridSize_, blockSize_>>>(*
this, value);
269 template <
typename T>
272 CUDA_SAFE_CALL(cudaMalloc((
void**)&values_device, depth_ *
sizeof(T)));
273 CUDA_SAFE_CALL(cudaMemcpy(values_device, values, depth_ *
sizeof(T),
274 cudaMemcpyHostToDevice));
275 internal::FillWithVectorKernel<T>
276 <<<gridSize_, blockSize_>>>(values_device, *
this);
281 template <
typename T>
284 const GpuMat<curandState> random_state) {
285 internal::FillWithRandomNumbersKernel<T><<<gridSize_, blockSize_>>>(
286 *
this, random_state, min_value, max_value);
290 template <
typename T>
293 pitch, width_ *
sizeof(T), height_ * depth_,
294 cudaMemcpyHostToDevice));
297 template <
typename T>
300 (
size_t)pitch_, width_ *
sizeof(T),
301 height_ * depth_, cudaMemcpyDeviceToHost));
304 template <
typename T>
306 Mat<T> mat(width_, height_, depth_);
307 CopyToHost(mat.GetPtr(), mat.GetWidth() *
sizeof(T));
311 template <
typename T>
313 for (
size_t slice = 0; slice < depth_; ++slice) {
314 CudaTranspose(array_ptr_ + slice * pitch_ /
sizeof(T) * GetHeight(),
315 output->GetPtr() + slice * output->pitch_ /
sizeof(T) *
317 width_, height_, pitch_, output->pitch_);
322 template <
typename T>
324 for (
size_t slice = 0; slice < depth_; ++slice) {
326 array_ptr_ + slice * pitch_ /
sizeof(T) * GetHeight(),
327 output->GetPtr() + slice * output->pitch_ /
sizeof(T) *
329 width_, height_, pitch_, output->pitch_);
334 template <
typename T>
336 for (
size_t slice = 0; slice < depth_; ++slice) {
337 CudaRotate((T*)((
char*)array_ptr_ + slice * pitch_ * GetHeight()),
338 (T*)((
char*)output->GetPtr() +
339 slice * output->pitch_ * output->GetHeight()),
340 width_, height_, pitch_, output->pitch_);
349 template <
typename T>
351 std::fstream text_file(
path, std::ios::in | std::ios::binary);
352 CHECK(text_file.is_open()) <<
path;
358 text_file >>
width >> unused_char >>
height >> unused_char >> depth >>
360 std::streampos pos = text_file.tellg();
363 std::fstream binary_file(
path, std::ios::in | std::ios::binary);
364 binary_file.seekg(pos);
366 std::vector<T> source(width_ * height_ * depth_);
367 ReadBinaryLittleEndian<T>(&binary_file, &source);
370 CopyToDevice(source.data(), width_ *
sizeof(T));
373 template <
typename T>
375 std::vector<T> dest(width_ * height_ * depth_);
376 CopyToHost(dest.data(), width_ *
sizeof(T));
378 std::fstream text_file(
path, std::ios::out);
379 text_file << width_ <<
"&" << height_ <<
"&" << depth_ <<
"&";
382 std::fstream binary_file(
path,
383 std::ios::out | std::ios::binary | std::ios::app);
384 WriteBinaryLittleEndian<T>(&binary_file, dest);
388 template <
typename T>
390 std::vector<T> dest(width_ * height_);
392 (
void*)dest.data(), width_ *
sizeof(T),
393 (
void*)(array_ptr_ + slice * height_ * pitch_ /
sizeof(T)), pitch_,
394 width_ *
sizeof(T), height_, cudaMemcpyDeviceToHost));
396 std::fstream text_file(
path, std::ios::out);
397 text_file << width_ <<
"&" << height_ <<
"&" << 1 <<
"&";
400 std::fstream binary_file(
path,
401 std::ios::out | std::ios::binary | std::ios::app);
402 WriteBinaryLittleEndian<T>(&binary_file, dest);
406 template <
typename T>
408 blockSize_.x = kBlockDimX;
409 blockSize_.y = kBlockDimY;
412 gridSize_.x = (width_ - 1) / kBlockDimX + 1;
413 gridSize_.y = (height_ - 1) / kBlockDimY + 1;
void CopyToDevice(const T *data, const size_t pitch)
__device__ T & GetRef(const size_t row, const size_t col, const size_t slice)
__device__ void GetSlice(const size_t row, const size_t col, T *values) const
__host__ __device__ size_t GetWidth() const
void FlipHorizontal(GpuMat< T > *output)
void FillWithRandomNumbers(const T min_value, const T max_value, GpuMat< curandState > random_state)
void Transpose(GpuMat< T > *output)
std::shared_ptr< T > array_
__host__ __device__ size_t GetDepth() const
static const size_t kBlockDimY
Mat< T > CopyToMat() const
__host__ __device__ const T * GetPtr() const
GpuMat(const size_t width, const size_t height, const size_t depth=1)
void Rotate(GpuMat< T > *output)
__device__ T Get(const size_t row, const size_t col, const size_t slice=0) const
void FillWithVector(const T *values)
void Write(const std::string &path, const size_t slice)
__host__ __device__ size_t GetHeight() const
void FillWithScalar(const T value)
void Read(const std::string &path)
__device__ void Set(const size_t row, const size_t col, const T value)
void Write(const std::string &path)
__device__ void Set(const size_t row, const size_t col, const size_t slice, const T value)
__host__ __device__ T * GetPtr()
__device__ T & GetRef(const size_t row, const size_t col)
__device__ void SetSlice(const size_t row, const size_t col, const T *values)
__host__ __device__ size_t GetPitch() const
void CopyToHost(T *data, const size_t pitch) const
static const size_t kBlockDimX
#define CUDA_SYNC_AND_CHECK()
#define CUDA_SAFE_CALL(error)
static const std::string path
void CudaTranspose(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)
void CudaRotate(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)
void CudaFlipHorizontal(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)