10 #include <cuda_runtime.h>
21 const int pitch_input,
22 const int pitch_output);
31 #define TILE_DIM_TRANSPOSE 32
32 #define BLOCK_ROWS_TRANSPOSE 8
37 __global__
void CudaTransposeKernel(T* output_data,
41 const int input_pitch,
42 const int output_pitch) {
43 int x_index = blockIdx.x * TILE_DIM_TRANSPOSE + threadIdx.x;
44 int y_index = blockIdx.y * TILE_DIM_TRANSPOSE + threadIdx.y;
46 __shared__ T tile[TILE_DIM_TRANSPOSE][TILE_DIM_TRANSPOSE + 1];
48 min(threadIdx.x,
width - 1 - blockIdx.x * TILE_DIM_TRANSPOSE);
50 min(threadIdx.y,
height - 1 - blockIdx.y * TILE_DIM_TRANSPOSE);
52 for (
int i = 0; i < TILE_DIM_TRANSPOSE; i += BLOCK_ROWS_TRANSPOSE) {
53 const int x = min(x_index,
width - 1);
54 const int y = min(y_index,
height - i - 1);
55 tile[tile_y + i][tile_x] =
56 *((T*)((
char*)input_data +
y * input_pitch + i * input_pitch) +
62 x_index = blockIdx.y * TILE_DIM_TRANSPOSE + threadIdx.x;
64 y_index = blockIdx.x * TILE_DIM_TRANSPOSE + threadIdx.y;
65 for (
int i = 0; i < TILE_DIM_TRANSPOSE; i += BLOCK_ROWS_TRANSPOSE) {
66 if (y_index + i <
width) {
67 *((T*)((
char*)output_data + y_index * output_pitch +
69 x_index) = tile[threadIdx.x][threadIdx.y + i];
82 const int pitch_input,
83 const int pitch_output) {
84 dim3 block_dim(TILE_DIM_TRANSPOSE, BLOCK_ROWS_TRANSPOSE, 1);
86 grid_dim.x = (
width - 1) / TILE_DIM_TRANSPOSE + 1;
87 grid_dim.y = (
height - 1) / TILE_DIM_TRANSPOSE + 1;
89 internal::CudaTransposeKernel<<<grid_dim, block_dim>>>(
90 output, input,
width,
height, pitch_input, pitch_output);
93 #undef TILE_DIM_TRANSPOSE
94 #undef BLOCK_ROWS_TRANSPOSE
void CudaTranspose(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)