10 #include <cuda_runtime.h>
21 const int pitch_input,
22 const int pitch_output);
// Number of rows and columns of the shared-memory tile (the tile is declared
// with one extra column). Also the per-block step used to derive the x/y
// indices, the divisor for the grid size, and the x extent of the launch block.
31 #define TILE_DIM_FLIP 32
// Step of the per-thread row loops in the kernel and the y extent of the
// launch block.
32 #define BLOCK_ROWS_FLIP 8
// Kernel that stages elements of a pitched input image in a shared-memory tile
// and then stores them to the pitched output at the mirrored column
// (width - 1 - source column).
// input_pitch / output_pitch are applied as byte offsets on char* row pointers.
// NOTE(review): several lines of this kernel are not visible in this excerpt
// (remaining parameters, the column term of the load, part of the store
// address); the comments below describe only what is visible.
37 __global__
void CudaFlipHorizontalKernel(T* output_data,
41 const int input_pitch,
42 const int output_pitch) {
// Column and first row handled by this thread, before any clamping.
43 int x_index = blockIdx.x * TILE_DIM_FLIP + threadIdx.x;
44 const int y_index = blockIdx.y * TILE_DIM_FLIP + threadIdx.y;
// Staging tile with one extra column per row (presumably padding against
// shared-memory bank conflicts -- confirm).
46 __shared__ T tile[TILE_DIM_FLIP][TILE_DIM_FLIP + 1];
// Tile coordinates are clamped to the last image column/row that falls inside
// this block, so threads of edge blocks past the image reuse the edge slot.
47 const int tile_x = min(threadIdx.x,
width - 1 - blockIdx.x * TILE_DIM_FLIP);
49 min(threadIdx.y,
height - 1 - blockIdx.y * TILE_DIM_FLIP);
// Load phase: each thread fills tile rows tile_y, tile_y + BLOCK_ROWS_FLIP, ...
// x is clamped to the last column; y is clamped so that row (y + i) never
// exceeds height - 1.
// NOTE(review): no barrier is visible in this excerpt between the tile writes
// here and the tile reads below; confirm __syncthreads() is present in the
// omitted lines.
51 for (
int i = 0; i < TILE_DIM_FLIP; i += BLOCK_ROWS_FLIP) {
52 const int x = min(x_index,
width - 1);
53 const int y = min(y_index,
height - i - 1);
54 tile[tile_y + i][tile_x] =
55 *((T*)((
char*)input_data +
y * input_pitch + i * input_pitch) +
// Store phase: recompute x_index as the mirrored destination column.
// NOTE(review): for threads with blockIdx.x * TILE_DIM_FLIP + threadIdx.x >=
// width this value ends up negative once stored in the int, and the
// `x_index < width` test below does not reject negative values -- confirm
// whether a lower-bound (x_index >= 0) check is required to avoid writing
// before the start of the row.
61 x_index =
width - 1 - (blockIdx.x * TILE_DIM_FLIP + threadIdx.x);
62 if (x_index <
width) {
63 for (
int i = 0; i < TILE_DIM_FLIP; i += BLOCK_ROWS_FLIP) {
// Skip rows that fall below the bottom edge of the image.
64 if (y_index + i <
height) {
// Each thread reads the tile at its unclamped (threadIdx.y + i, threadIdx.x)
// position and writes it at the mirrored column of the output row.
65 *((T*)((
char*)output_data + y_index * output_pitch +
67 x_index) = tile[threadIdx.y + i][threadIdx.x];
80 const int pitch_input,
81 const int pitch_output) {
// Thread block: TILE_DIM_FLIP threads in x by BLOCK_ROWS_FLIP threads in y.
// The kernel's row loops step by BLOCK_ROWS_FLIP up to TILE_DIM_FLIP rows, so
// one block handles a TILE_DIM_FLIP x TILE_DIM_FLIP tile.
82 dim3 block_dim(TILE_DIM_FLIP, BLOCK_ROWS_FLIP, 1);
// Grid: number of TILE_DIM_FLIP-sized tiles needed to cover width and height,
// rounded up.
84 grid_dim.x = (
width - 1) / TILE_DIM_FLIP + 1;
85 grid_dim.y = (
height - 1) / TILE_DIM_FLIP + 1;
// Launch with no shared-memory size or stream argument (default stream).
// The kernel takes the output pointer first, then the input pointer.
// NOTE(review): no launch-error check (e.g. cudaGetLastError) is visible in
// this excerpt -- confirm whether one follows in the omitted lines.
87 internal::CudaFlipHorizontalKernel<<<grid_dim, block_dim>>>(
88 output, input,
width,
height, pitch_input, pitch_output);
92 #undef BLOCK_ROWS_FLIP
void CudaFlipHorizontal(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)