10 #include <cuda_runtime.h>
21 const int pitch_input,
22 const int pitch_output);
// Number of threads per block along x used when launching the rotate kernel;
// also the divisor for the number of blocks along x.
30 #define TILE_DIM_ROTATE 32
// Rotates a pitched 2D buffer by a quarter turn: the element at input (x, y)
// is written to output (y, width - 1 - x), so input columns become output rows
// (counter-clockwise when y grows downward).
//
// Each thread handles one element, addressed by its 2D global thread index.
// input_pitch / output_pitch are row strides in bytes: they are added to a
// char* before the row pointer is cast back to T*.
//
// NOTE(review): `input_data` and `width` are used below but are declared on
// lines not visible in this excerpt. Likewise no bounds check on
// input_x / input_y is visible here -- confirm one exists, since the launch
// grid is rounded up along x.
35 __global__
void CudaRotateKernel(T* output_data,
39 const int input_pitch,
40 const int output_pitch) {
// Global coordinates of the input element owned by this thread.
41 int input_x = blockDim.x * blockIdx.x + threadIdx.x;
42 int input_y = blockDim.y * blockIdx.y + threadIdx.y;
// Destination coordinates: the input row index becomes the output column, and
// the mirrored input column index becomes the output row.
48 int output_x = input_y;
49 int output_y =
width - 1 - input_x;
// Pitched addressing: step to the row by a byte offset, then index by element.
// Threads adjacent in x read neighbouring input elements but write to
// different output rows.
51 *((T*)((
char*)output_data + output_y * output_pitch) + output_x) =
52 *((T*)((
char*)input_data + input_y * input_pitch) + input_x);
62 const int pitch_input,
63 const int pitch_output) {
// Each block is TILE_DIM_ROTATE x 1 x 1 threads, i.e. it covers
// TILE_DIM_ROTATE consecutive x positions at a single y.
64 dim3 block_dim(TILE_DIM_ROTATE, 1, 1);
// Ceiling division: enough blocks along x to cover `width` columns
// (for width >= 1).
// NOTE(review): the declaration of grid_dim and its y extent are not visible
// in this excerpt -- confirm grid_dim.y spans `height`.
66 grid_dim.x = (
width - 1) / TILE_DIM_ROTATE + 1;
// Launched without dynamic shared memory or an explicit stream, so the kernel
// runs on the default stream. Note the argument order: output first, then input.
// NOTE(review): no launch-error check (e.g. cudaGetLastError()) is visible
// after this call in the excerpt.
69 internal::CudaRotateKernel<<<grid_dim, block_dim>>>(
70 output, input,
width,
height, pitch_input, pitch_output);
// Remove the tile-size macro so it is no longer defined past this point.
73 #undef TILE_DIM_ROTATE
void CudaRotate(const T *input, T *output, const int width, const int height, const int pitch_input, const int pitch_output)