ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
gpu_mat.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
#include <cuda_runtime.h>
#include <curand_kernel.h>

#include <fstream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>

#include "mvs/cuda_flip.h"
#include "mvs/cuda_rotate.h"
#include "mvs/cuda_transpose.h"
#include "mvs/mat.h"
#include "util/cuda.h"
#include "util/cudacc.h"
#include "util/endian.h"
25 
26 namespace colmap {
27 namespace mvs {
28 
// 2D/3D array of type T stored in pitched CUDA device memory. The depth
// dimension is laid out as `depth` consecutive height x width slices, so
// element (row, col, slice) lives at byte offset
//   pitch * (slice * height + row) + col * sizeof(T).
// Ownership of the allocation is shared via a shared_ptr with a cudaFree
// deleter, so objects can be copied (e.g. passed by value to kernels).
template <typename T>
class GpuMat {
 public:
  // Allocates width x height x depth elements with cudaMallocPitch.
  GpuMat(const size_t width, const size_t height, const size_t depth = 1);
  // NOTE(review): this declaration was dropped by the doc extraction
  // (original line 33); reconstructed from the out-of-line definition below.
  ~GpuMat();

  __host__ __device__ const T* GetPtr() const;
  __host__ __device__ T* GetPtr();

  __host__ __device__ size_t GetPitch() const;
  __host__ __device__ size_t GetWidth() const;
  __host__ __device__ size_t GetHeight() const;
  __host__ __device__ size_t GetDepth() const;

  // Element access (device only); `slice` indexes the third dimension.
  __device__ T Get(const size_t row,
                   const size_t col,
                   const size_t slice = 0) const;
  // Copies the depth values stored at (row, col) into `values`.
  __device__ void GetSlice(const size_t row,
                           const size_t col,
                           T* values) const;

  __device__ T& GetRef(const size_t row, const size_t col);
  __device__ T& GetRef(const size_t row,
                       const size_t col,
                       const size_t slice);

  __device__ void Set(const size_t row, const size_t col, const T value);
  __device__ void Set(const size_t row,
                      const size_t col,
                      const size_t slice,
                      const T value);
  // Writes the depth values from `values` at (row, col), one per slice.
  __device__ void SetSlice(const size_t row,
                           const size_t col,
                           const T* values);

  // Fills every element of every slice with the scalar `value`.
  void FillWithScalar(const T value);
  // Fills each (row, col) with the depth-element host vector `values`.
  void FillWithVector(const T* values);
  // Fills with uniform random values in [min_value, max_value) using the
  // given per-pixel random states.
  void FillWithRandomNumbers(const T min_value,
                             const T max_value,
                             GpuMat<curandState> random_state);

  // Host <-> device copies; `pitch` is the host-side row stride in bytes.
  void CopyToDevice(const T* data, const size_t pitch);
  void CopyToHost(T* data, const size_t pitch) const;
  Mat<T> CopyToMat() const;

  // Transpose array by swapping x and y coordinates.
  void Transpose(GpuMat<T>* output);

  // Flip array along vertical axis.
  void FlipHorizontal(GpuMat<T>* output);

  // Rotate array in counter-clockwise direction.
  void Rotate(GpuMat<T>* output);

  // File I/O: ASCII "W&H&D&" header followed by little-endian binary data.
  void Read(const std::string& path);
  void Write(const std::string& path);
  void Write(const std::string& path, const size_t slice);

 protected:
  // NOTE(review): declaration dropped by the doc extraction (original line
  // 88); reconstructed from its definition at the end of this file.
  // Initializes blockSize_/gridSize_ from the matrix dimensions.
  void ComputeCudaConfig();

  const static size_t kBlockDimX = 32;
  const static size_t kBlockDimY = 16;

  // Owns the device allocation (deleter: cudaFree).
  std::shared_ptr<T> array_;
  // NOTE(review): member dropped by the doc extraction (original line 94);
  // the raw device pointer cached from the allocation, used by accessors.
  T* array_ptr_;

  size_t pitch_;   // row stride in bytes, as returned by cudaMallocPitch
  size_t width_;
  size_t height_;
  size_t depth_;

  // NOTE(review): member dropped by the doc extraction (original line 101).
  dim3 blockSize_;  // kernel launch block dimensions
  dim3 gridSize_;   // kernel launch grid dimensions
};
104 
106 // Implementation
108 
109 #ifdef __CUDACC__
110 
111 namespace internal {
112 
template <typename T>
// Kernel: each in-bounds (y, x) thread writes `value` into all slices of
// `output`. The grid may overhang the matrix edges, hence the guard.
__global__ void FillWithScalarKernel(GpuMat<T> output, const T value) {
  const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
  if (y >= output.GetHeight() || x >= output.GetWidth()) {
    return;
  }
  for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
    output.Set(y, x, slice, value);
  }
}
123 
template <typename T>
// Kernel: each in-bounds (y, x) thread copies the depth-element device
// vector `values` into the corresponding column of `output`.
__global__ void FillWithVectorKernel(const T* values, GpuMat<T> output) {
  const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
  if (y >= output.GetHeight() || x >= output.GetWidth()) {
    return;
  }
  for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
    output.Set(y, x, slice, values[slice]);
  }
}
134 
template <typename T>
// Kernel: fills each element of `output` with a uniform random value in
// [min_value, max_value), using one curandState per (row, col).
__global__ void FillWithRandomNumbersKernel(GpuMat<T> output,
                                            GpuMat<curandState> random_state,
                                            const T min_value,
                                            const T max_value) {
  const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < output.GetHeight() && col < output.GetWidth()) {
    // Work on a local copy of the RNG state to avoid a global-memory
    // round trip per generated value.
    curandState local_state = random_state.Get(row, col);
    for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
      const T random_value =
          curand_uniform(&local_state) * (max_value - min_value) +
          min_value;
      output.Set(row, col, slice, random_value);
    }
    // Persist the advanced state so a subsequent fill continues the
    // sequence instead of repeating it.
    random_state.Set(row, col, local_state);
  }
}
153 
154 } // namespace internal
155 
template <typename T>
// Allocates a pitched device buffer of width x (height * depth) elements;
// depth slices are stored as extra rows of the same pitch.
GpuMat<T>::GpuMat(const size_t width, const size_t height, const size_t depth)
    : array_(nullptr),
      array_ptr_(nullptr),
      width_(width),
      height_(height),
      depth_(depth) {
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&array_ptr_, &pitch_,
                                 width_ * sizeof(T), height_ * depth_));

  // Tie the raw allocation to shared ownership; cudaFree runs when the
  // last copy of this GpuMat is destroyed.
  array_ = std::shared_ptr<T>(array_ptr_, cudaFree);

  // NOTE(review): restored call lost in extraction (original line 168);
  // without it blockSize_/gridSize_ are uninitialized for Fill* launches.
  ComputeCudaConfig();
}
170 
template <typename T>
// NOTE(review): the signature line was lost in extraction (original line
// 172); reconstructed as the destructor matching the body below — confirm
// against the original header.
GpuMat<T>::~GpuMat() {
  // Drop this object's reference; the allocation is freed (cudaFree) only
  // when no other copies share it.
  array_.reset();
  array_ptr_ = nullptr;
  pitch_ = 0;
  width_ = 0;
  height_ = 0;
  depth_ = 0;
}
180 
template <typename T>
// Read-only raw pointer to the device allocation.
__host__ __device__ const T* GpuMat<T>::GetPtr() const {
  return array_ptr_;
}

template <typename T>
// Mutable raw pointer to the device allocation.
__host__ __device__ T* GpuMat<T>::GetPtr() {
  return array_ptr_;
}

template <typename T>
// Row stride in bytes, as chosen by cudaMallocPitch.
__host__ __device__ size_t GpuMat<T>::GetPitch() const {
  return pitch_;
}

template <typename T>
// Number of elements per row.
__host__ __device__ size_t GpuMat<T>::GetWidth() const {
  return width_;
}

template <typename T>
// Number of rows per slice.
__host__ __device__ size_t GpuMat<T>::GetHeight() const {
  return height_;
}

template <typename T>
// Number of slices (third dimension).
__host__ __device__ size_t GpuMat<T>::GetDepth() const {
  return depth_;
}
210 
template <typename T>
// Reads element (row, col, slice) from pitched device memory.
__device__ T GpuMat<T>::Get(const size_t row,
                            const size_t col,
                            const size_t slice) const {
  // Stored row index is slice * height_ + row; each stored row is pitch_
  // bytes wide, and col then indexes elements within the row.
  return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
}
217 
template <typename T>
// Copies the depth_ values stored at (row, col) into `values`
// (values must hold at least depth_ elements).
__device__ void GpuMat<T>::GetSlice(const size_t row,
                                    const size_t col,
                                    T* values) const {
  for (size_t slice = 0; slice < depth_; ++slice) {
    values[slice] = Get(row, col, slice);
  }
}
226 
template <typename T>
// Mutable reference to element (row, col) in the first slice.
__device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col) {
  return GetRef(row, col, 0);
}
231 
template <typename T>
// Mutable reference to element (row, col, slice); same pitched addressing
// as Get().
__device__ T& GpuMat<T>::GetRef(const size_t row,
                                const size_t col,
                                const size_t slice) {
  return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
}
238 
template <typename T>
// Writes `value` at (row, col) in the first slice.
__device__ void GpuMat<T>::Set(const size_t row,
                               const size_t col,
                               const T value) {
  Set(row, col, 0, value);
}
245 
template <typename T>
// Writes `value` at (row, col, slice) using pitched addressing
// (see Get() for the layout).
__device__ void GpuMat<T>::Set(const size_t row,
                               const size_t col,
                               const size_t slice,
                               const T value) {
  *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col) = value;
}
253 
template <typename T>
// Writes the depth_ values from `values` at (row, col), one per slice
// (values must hold at least depth_ elements).
__device__ void GpuMat<T>::SetSlice(const size_t row,
                                    const size_t col,
                                    const T* values) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    Set(row, col, slice, values[slice]);
  }
}
262 
template <typename T>
// Sets every element of every slice to the scalar `value`.
void GpuMat<T>::FillWithScalar(const T value) {
  internal::FillWithScalarKernel<T><<<gridSize_, blockSize_>>>(*this, value);
  // NOTE(review): restored error/sync check lost in extraction (original
  // line 266); macro defined in util/cudacc.h.
  CUDA_SYNC_AND_CHECK();
}
268 
template <typename T>
// Sets each (row, col) to the depth_-element vector `values` (host memory).
void GpuMat<T>::FillWithVector(const T* values) {
  T* values_device;
  CUDA_SAFE_CALL(cudaMalloc((void**)&values_device, depth_ * sizeof(T)));
  CUDA_SAFE_CALL(cudaMemcpy(values_device, values, depth_ * sizeof(T),
                            cudaMemcpyHostToDevice));
  internal::FillWithVectorKernel<T>
      <<<gridSize_, blockSize_>>>(values_device, *this);
  // NOTE(review): restored sync lost in extraction (original line 277);
  // the kernel must finish before the temporary device buffer is freed.
  CUDA_SYNC_AND_CHECK();
  CUDA_SAFE_CALL(cudaFree(values_device));
}
280 
template <typename T>
// Fills the array with uniform random values in [min_value, max_value),
// advancing the provided per-pixel curand states.
void GpuMat<T>::FillWithRandomNumbers(const T min_value,
                                      const T max_value,
                                      const GpuMat<curandState> random_state) {
  internal::FillWithRandomNumbersKernel<T><<<gridSize_, blockSize_>>>(
      *this, random_state, min_value, max_value);
  // NOTE(review): restored error/sync check lost in extraction (original
  // line 287).
  CUDA_SYNC_AND_CHECK();
}
289 
template <typename T>
// Uploads a host buffer with row stride `pitch` (in bytes) into the
// pitched device allocation; copies all height_ * depth_ stored rows.
void GpuMat<T>::CopyToDevice(const T* data, const size_t pitch) {
  const size_t row_bytes = width_ * sizeof(T);
  CUDA_SAFE_CALL(cudaMemcpy2D(array_ptr_, pitch_, data, pitch, row_bytes,
                              height_ * depth_, cudaMemcpyHostToDevice));
}
296 
template <typename T>
// Downloads the pitched device allocation into a host buffer with row
// stride `pitch` (in bytes); copies all height_ * depth_ stored rows.
void GpuMat<T>::CopyToHost(T* data, const size_t pitch) const {
  const size_t row_bytes = width_ * sizeof(T);
  CUDA_SAFE_CALL(cudaMemcpy2D(data, pitch, array_ptr_, pitch_, row_bytes,
                              height_ * depth_, cudaMemcpyDeviceToHost));
}
303 
template <typename T>
// Downloads the full array into a tightly-packed host-side Mat of the
// same dimensions.
Mat<T> GpuMat<T>::CopyToMat() const {
  Mat<T> result(width_, height_, depth_);
  // Host rows are tightly packed, so the host pitch is the row byte count.
  CopyToHost(result.GetPtr(), result.GetWidth() * sizeof(T));
  return result;
}
310 
template <typename T>
// Transpose array by swapping x and y coordinates, slice by slice.
void GpuMat<T>::Transpose(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    // Advance both pointers by whole pitched slices; pitch_ / sizeof(T) is
    // the element stride per row (assumes pitch_ % sizeof(T) == 0).
    CudaTranspose(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
                  output->GetPtr() + slice * output->pitch_ / sizeof(T) *
                                         output->GetHeight(),
                  width_, height_, pitch_, output->pitch_);
  }
  // NOTE(review): restored error/sync check lost in extraction (original
  // line 319).
  CUDA_SYNC_AND_CHECK();
}
321 
template <typename T>
// Flip array along vertical axis (mirror columns), slice by slice.
void GpuMat<T>::FlipHorizontal(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    // NOTE(review): the call name was lost in extraction (original line
    // 325); reconstructed from CudaFlipHorizontal declared in
    // mvs/cuda_flip.h (included above).
    CudaFlipHorizontal(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
                       output->GetPtr() + slice * output->pitch_ / sizeof(T) *
                                              output->GetHeight(),
                       width_, height_, pitch_, output->pitch_);
  }
  // NOTE(review): restored error/sync check lost in extraction (original
  // line 331).
  CUDA_SYNC_AND_CHECK();
}
333 
template <typename T>
// Rotate array in counter-clockwise direction, slice by slice.
void GpuMat<T>::Rotate(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    // Slice offsets computed in bytes here (pitch_ * height_ per slice).
    CudaRotate((T*)((char*)array_ptr_ + slice * pitch_ * GetHeight()),
               (T*)((char*)output->GetPtr() +
                    slice * output->pitch_ * output->GetHeight()),
               width_, height_, pitch_, output->pitch_);
  }
  // NOTE(review): restored error/sync check lost in extraction (original
  // line 342).
  CUDA_SYNC_AND_CHECK();
  // This is equivalent to the following code:
  //   GpuMat<T> flipped_array(width_, height_, GetDepth());
  //   FlipHorizontal(&flipped_array);
  //   flipped_array.Transpose(output);
}
348 
template <typename T>
// Reads the array from a file written by Write(): an ASCII "W&H&D&" header
// followed by little-endian binary element data.
void GpuMat<T>::Read(const std::string& path) {
  std::fstream text_file(path, std::ios::in | std::ios::binary);
  CHECK(text_file.is_open()) << path;

  size_t width;
  size_t height;
  size_t depth;
  char unused_char;
  text_file >> width >> unused_char >> height >> unused_char >> depth >>
      unused_char;
  const std::streampos pos = text_file.tellg();
  text_file.close();

  // The file's dimensions must match this allocation. Previously they were
  // parsed but never validated, silently corrupting data on mismatch.
  CHECK(width == width_) << path;
  CHECK(height == height_) << path;
  CHECK(depth == depth_) << path;

  // Reopen and seek past the header to read the binary payload.
  std::fstream binary_file(path, std::ios::in | std::ios::binary);
  binary_file.seekg(pos);

  std::vector<T> source(width_ * height_ * depth_);
  ReadBinaryLittleEndian<T>(&binary_file, &source);
  binary_file.close();

  // Host data is tightly packed, so its pitch is the row byte count.
  CopyToDevice(source.data(), width_ * sizeof(T));
}
372 
template <typename T>
// Writes the array to `path`: an ASCII "W&H&D&" header, then the elements
// as little-endian binary data appended to the same file.
void GpuMat<T>::Write(const std::string& path) {
  std::vector<T> host_data(width_ * height_ * depth_);
  CopyToHost(host_data.data(), width_ * sizeof(T));

  {
    // Scoped so the header stream is flushed and closed before appending.
    std::fstream text_file(path, std::ios::out);
    text_file << width_ << "&" << height_ << "&" << depth_ << "&";
  }

  std::fstream binary_file(path,
                           std::ios::out | std::ios::binary | std::ios::app);
  WriteBinaryLittleEndian<T>(&binary_file, host_data);
  binary_file.close();
}
387 
template <typename T>
// Writes a single slice to `path` in the same format as Write(), with the
// header depth written as 1.
void GpuMat<T>::Write(const std::string& path, const size_t slice) {
  std::vector<T> dest(width_ * height_);
  // Download only this slice: the source pointer is advanced by
  // slice * height_ pitched rows (assumes pitch_ % sizeof(T) == 0).
  CUDA_SAFE_CALL(cudaMemcpy2D(
      (void*)dest.data(), width_ * sizeof(T),
      (void*)(array_ptr_ + slice * height_ * pitch_ / sizeof(T)), pitch_,
      width_ * sizeof(T), height_, cudaMemcpyDeviceToHost));

  // ASCII header first, then the binary payload appended below.
  std::fstream text_file(path, std::ios::out);
  text_file << width_ << "&" << height_ << "&" << 1 << "&";
  text_file.close();

  std::fstream binary_file(path,
                           std::ios::out | std::ios::binary | std::ios::app);
  WriteBinaryLittleEndian<T>(&binary_file, dest);
  binary_file.close();
}
405 
template <typename T>
// NOTE(review): the signature line was lost in extraction (original line
// 407); reconstructed from the blockSize_/gridSize_ members it initializes.
// Computes the kernel launch configuration covering the full matrix.
void GpuMat<T>::ComputeCudaConfig() {
  blockSize_.x = kBlockDimX;
  blockSize_.y = kBlockDimY;
  blockSize_.z = 1;

  // Ceil-division so partial tiles at the right/bottom edges are covered;
  // kernels bounds-check against width/height.
  gridSize_.x = (width_ - 1) / kBlockDimX + 1;
  gridSize_.y = (height_ - 1) / kBlockDimY + 1;
  gridSize_.z = 1;
}
416 
417 #endif // __CUDACC__
418 
419 } // namespace mvs
420 } // namespace colmap
int width
int height
void CopyToDevice(const T *data, const size_t pitch)
__device__ T & GetRef(const size_t row, const size_t col, const size_t slice)
__device__ void GetSlice(const size_t row, const size_t col, T *values) const
__host__ __device__ size_t GetWidth() const
void FlipHorizontal(GpuMat< T > *output)
void FillWithRandomNumbers(const T min_value, const T max_value, GpuMat< curandState > random_state)
void Transpose(GpuMat< T > *output)
std::shared_ptr< T > array_
Definition: gpu_mat.h:93
__host__ __device__ size_t GetDepth() const
static const size_t kBlockDimY
Definition: gpu_mat.h:91
Mat< T > CopyToMat() const
__host__ __device__ const T * GetPtr() const
GpuMat(const size_t width, const size_t height, const size_t depth=1)
void Rotate(GpuMat< T > *output)
__device__ T Get(const size_t row, const size_t col, const size_t slice=0) const
void FillWithVector(const T *values)
void Write(const std::string &path, const size_t slice)
__host__ __device__ size_t GetHeight() const
void FillWithScalar(const T value)
void Read(const std::string &path)
__device__ void Set(const size_t row, const size_t col, const T value)
void Write(const std::string &path)
__device__ void Set(const size_t row, const size_t col, const size_t slice, const T value)
__host__ __device__ T * GetPtr()
__device__ T & GetRef(const size_t row, const size_t col)
__device__ void SetSlice(const size_t row, const size_t col, const T *values)
__host__ __device__ size_t GetPitch() const
void CopyToHost(T *data, const size_t pitch) const
static const size_t kBlockDimX
Definition: gpu_mat.h:90
#define CUDA_SYNC_AND_CHECK()
Definition: cudacc.h:16
#define CUDA_SAFE_CALL(error)
Definition: cudacc.h:14
GraphType data
Definition: graph_cut.cc:138
static const std::string path
Definition: PointCloud.cpp:59
void CudaTranspose(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)
void CudaRotate(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)
void CudaFlipHorizontal(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)