ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
cuda_transpose.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cuda_runtime.h>
11 
12 namespace colmap {
13 namespace mvs {
14 
15 // Transpose the input matrix.
16 template <typename T>
17 void CudaTranspose(const T* input,
18  T* output,
19  const int width,
20  const int height,
21  const int pitch_input,
22  const int pitch_output);
23 
25 // Implementation
27 
28 #ifdef __CUDACC__
29 
30 // TILE_DIM_TRANSPOSE must divide by BLOCK_ROWS. Do not change these values.
31 #define TILE_DIM_TRANSPOSE 32
32 #define BLOCK_ROWS_TRANSPOSE 8
33 
34 namespace internal {
35 
36 template <typename T>
37 __global__ void CudaTransposeKernel(T* output_data,
38  const T* input_data,
39  const int width,
40  const int height,
41  const int input_pitch,
42  const int output_pitch) {
43  int x_index = blockIdx.x * TILE_DIM_TRANSPOSE + threadIdx.x;
44  int y_index = blockIdx.y * TILE_DIM_TRANSPOSE + threadIdx.y;
45 
46  __shared__ T tile[TILE_DIM_TRANSPOSE][TILE_DIM_TRANSPOSE + 1];
47  const int tile_x =
48  min(threadIdx.x, width - 1 - blockIdx.x * TILE_DIM_TRANSPOSE);
49  const int tile_y =
50  min(threadIdx.y, height - 1 - blockIdx.y * TILE_DIM_TRANSPOSE);
51 
52  for (int i = 0; i < TILE_DIM_TRANSPOSE; i += BLOCK_ROWS_TRANSPOSE) {
53  const int x = min(x_index, width - 1);
54  const int y = min(y_index, height - i - 1);
55  tile[tile_y + i][tile_x] =
56  *((T*)((char*)input_data + y * input_pitch + i * input_pitch) +
57  x);
58  }
59 
60  __syncthreads();
61 
62  x_index = blockIdx.y * TILE_DIM_TRANSPOSE + threadIdx.x;
63  if (x_index < height) {
64  y_index = blockIdx.x * TILE_DIM_TRANSPOSE + threadIdx.y;
65  for (int i = 0; i < TILE_DIM_TRANSPOSE; i += BLOCK_ROWS_TRANSPOSE) {
66  if (y_index + i < width) {
67  *((T*)((char*)output_data + y_index * output_pitch +
68  i * output_pitch) +
69  x_index) = tile[threadIdx.x][threadIdx.y + i];
70  }
71  }
72  }
73 }
74 
75 } // namespace internal
76 
77 template <typename T>
78 void CudaTranspose(const T* input,
79  T* output,
80  const int width,
81  const int height,
82  const int pitch_input,
83  const int pitch_output) {
84  dim3 block_dim(TILE_DIM_TRANSPOSE, BLOCK_ROWS_TRANSPOSE, 1);
85  dim3 grid_dim;
86  grid_dim.x = (width - 1) / TILE_DIM_TRANSPOSE + 1;
87  grid_dim.y = (height - 1) / TILE_DIM_TRANSPOSE + 1;
88 
89  internal::CudaTransposeKernel<<<grid_dim, block_dim>>>(
90  output, input, width, height, pitch_input, pitch_output);
91 }
92 
93 #undef TILE_DIM_TRANSPOSE
94 #undef BLOCK_ROWS_TRANSPOSE
95 
96 #endif // __CUDACC__
97 
98 } // namespace mvs
99 } // namespace colmap
int width
int height
normal_z y
normal_z x
void CudaTranspose(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)