ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
cuda_flip.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cuda_runtime.h>
11 
12 namespace colmap {
13 namespace mvs {
14 
// Mirrors a pitched 2D array left-to-right: element (y, x) of `output`
// receives element (y, width - 1 - x) of `input`.
//
//   input         Source array; passed directly to a CUDA kernel, so it must
//                 be device-accessible memory.
//   output        Destination array; same requirement as `input`.
//   width         Number of elements of type T in each row.
//   height        Number of rows.
//   pitch_input   Distance in bytes between the starts of consecutive rows
//                 of `input`.
//   pitch_output  Distance in bytes between the starts of consecutive rows
//                 of `output`.
//
// The definition is only available when compiling with a CUDA compiler
// (__CUDACC__ defined).
template <typename T>
void CudaFlipHorizontal(const T* input, T* output, const int width,
                        const int height, const int pitch_input,
                        const int pitch_output);
23 
25 // Implementation
27 
28 #ifdef __CUDACC__
29 
30 // TILE_DIM_FLIP must be divisible by BLOCK_ROWS_FLIP. Do not change these values.
31 #define TILE_DIM_FLIP 32
32 #define BLOCK_ROWS_FLIP 8
33 
34 namespace internal {
35 
// Device kernel that mirrors a pitched 2D array left-to-right:
// output(y, x) = input(y, width - 1 - x).
//
// Expected launch configuration:
//   blockDim = (TILE_DIM_FLIP, BLOCK_ROWS_FLIP, 1)
//   gridDim  = (ceil(width / TILE_DIM_FLIP), ceil(height / TILE_DIM_FLIP))
// Each block processes one TILE_DIM_FLIP x TILE_DIM_FLIP tile; every thread
// handles TILE_DIM_FLIP / BLOCK_ROWS_FLIP rows of it, spaced BLOCK_ROWS_FLIP
// rows apart. The tile is staged in statically sized shared memory.
//
// Parameters:
//   output_data   Destination array.
//   input_data    Source array.
//   width         Number of elements per row; must be >= 1.
//   height        Number of rows; must be >= 1.
//   input_pitch   Distance in bytes between consecutive rows of input_data.
//   output_pitch  Distance in bytes between consecutive rows of output_data.
template <typename T>
__global__ void CudaFlipHorizontalKernel(T* output_data,
                                         const T* input_data,
                                         const int width,
                                         const int height,
                                         const int input_pitch,
                                         const int output_pitch) {
  const int x_index = blockIdx.x * TILE_DIM_FLIP + threadIdx.x;
  const int y_index = blockIdx.y * TILE_DIM_FLIP + threadIdx.y;

  __shared__ T tile[TILE_DIM_FLIP][TILE_DIM_FLIP + 1];

  // Threads that fall outside the image are clamped onto the last valid
  // column/row covered by this block, so they redundantly load an in-range
  // element instead of reading out of bounds.
  const int tile_x = min(threadIdx.x, width - 1 - blockIdx.x * TILE_DIM_FLIP);
  const int tile_y =
      min(threadIdx.y, height - 1 - blockIdx.y * TILE_DIM_FLIP);
  const int x = min(x_index, width - 1);

  for (int i = 0; i < TILE_DIM_FLIP; i += BLOCK_ROWS_FLIP) {
    // Source row for this step, clamped to the last image row.
    const int y = min(y_index + i, height - 1);
    // 64-bit byte offset so that row * pitch cannot overflow int.
    const char* input_row = reinterpret_cast<const char*>(input_data) +
                            static_cast<long long>(y) * input_pitch;
    tile[tile_y + i][tile_x] = reinterpret_cast<const T*>(input_row)[x];
  }

  // Every thread of the block reaches this barrier; all loads into the tile
  // are complete before any thread reads from it.
  __syncthreads();

  // Mirrored destination column. It is negative exactly for the threads
  // whose source column lies beyond the image width; those threads must not
  // write, otherwise they would store before the start of the output row.
  const int flipped_x = width - 1 - x_index;
  if (flipped_x >= 0) {
    for (int i = 0; i < TILE_DIM_FLIP; i += BLOCK_ROWS_FLIP) {
      if (y_index + i < height) {
        char* output_row =
            reinterpret_cast<char*>(output_data) +
            static_cast<long long>(y_index + i) * output_pitch;
        reinterpret_cast<T*>(output_row)[flipped_x] =
            tile[threadIdx.y + i][threadIdx.x];
      }
    }
  }
}
72 
73 } // namespace internal
74 
// Host-side launcher for the horizontal flip: writes the left-to-right
// mirror of `input` into `output`.
//
//   input         Source array; handed directly to the kernel, so it must be
//                 device-accessible memory.
//   output        Destination array; same requirement as `input`.
//   width         Number of elements per row.
//   height        Number of rows.
//   pitch_input   Distance in bytes between consecutive rows of `input`.
//   pitch_output  Distance in bytes between consecutive rows of `output`.
//
// If `width` or `height` is not positive there is nothing to flip and the
// function returns without launching anything.
//
// The kernel is launched on the default stream and this function does not
// synchronize or query the launch status; callers that need either must do
// so themselves.
template <typename T>
void CudaFlipHorizontal(const T* input,
                        T* output,
                        const int width,
                        const int height,
                        const int pitch_input,
                        const int pitch_output) {
  // An empty extent would still yield a 1-block grid below ((0 - 1) / 32 + 1
  // == 1), and the kernel would then clamp its indices to -1. Negative
  // extents would produce an invalid grid size. Bail out for both.
  if (width <= 0 || height <= 0) {
    return;
  }

  // One block per TILE_DIM_FLIP x TILE_DIM_FLIP tile; each block has
  // BLOCK_ROWS_FLIP rows of threads that step through the tile.
  const dim3 block_dim(TILE_DIM_FLIP, BLOCK_ROWS_FLIP, 1);
  dim3 grid_dim;
  grid_dim.x = (width - 1) / TILE_DIM_FLIP + 1;   // ceil(width / tile)
  grid_dim.y = (height - 1) / TILE_DIM_FLIP + 1;  // ceil(height / tile)

  internal::CudaFlipHorizontalKernel<<<grid_dim, block_dim>>>(
      output, input, width, height, pitch_input, pitch_output);
}
90 
91 #undef TILE_DIM_FLIP
92 #undef BLOCK_ROWS_FLIP
93 
94 #endif // __CUDACC__
95 
96 } // namespace mvs
97 } // namespace colmap
int width
int height
normal_z y
normal_z x
void CudaFlipHorizontal(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)