1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
12 namespace cloudViewer {
15 // defines to simplify the SASS assembly structure file/line in the profiler
16 #if CUDA_VERSION >= 9000
17 #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \
18 __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH)
20 #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl(VAL, SRC_LANE, WIDTH)
24 inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
25 #if CUDA_VERSION >= 9000
26 return __shfl_sync(0xffffffff, val, srcLane, width);
28 return __shfl(val, srcLane, width);
32 // CUDA SDK does not provide specializations for T*
34 inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
35 static_assert(sizeof(T*) == sizeof(long long), "pointer size");
36 long long v = (long long)val;
38 return (T*)shfl(v, srcLane, width);
42 inline __device__ T shfl_up(const T val,
44 int width = kWarpSize) {
45 #if CUDA_VERSION >= 9000
46 return __shfl_up_sync(0xffffffff, val, delta, width);
48 return __shfl_up(val, delta, width);
52 // CUDA SDK does not provide specializations for T*
54 inline __device__ T* shfl_up(T* const val,
56 int width = kWarpSize) {
57 static_assert(sizeof(T*) == sizeof(long long), "pointer size");
58 long long v = (long long)val;
60 return (T*)shfl_up(v, delta, width);
64 inline __device__ T shfl_down(const T val,
66 int width = kWarpSize) {
67 #if CUDA_VERSION >= 9000
68 return __shfl_down_sync(0xffffffff, val, delta, width);
70 return __shfl_down(val, delta, width);
74 // CUDA SDK does not provide specializations for T*
76 inline __device__ T* shfl_down(T* const val,
78 int width = kWarpSize) {
79 static_assert(sizeof(T*) == sizeof(long long), "pointer size");
80 long long v = (long long)val;
81 return (T*)shfl_down(v, delta, width);
85 inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) {
86 #if CUDA_VERSION >= 9000
87 return __shfl_xor_sync(0xffffffff, val, laneMask, width);
89 return __shfl_xor(val, laneMask, width);
93 // CUDA SDK does not provide specializations for T*
95 inline __device__ T* shfl_xor(T* const val,
97 int width = kWarpSize) {
98 static_assert(sizeof(T*) == sizeof(long long), "pointer size");
99 long long v = (long long)val;
100 return (T*)shfl_xor(v, laneMask, width);
103 // CUDA 9.0+ has half shuffle
104 #if CUDA_VERSION < 9000
105 inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
106 unsigned int vu = v.x;
107 vu = __shfl(vu, srcLane, width);
110 h.x = (unsigned short)vu;
114 inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
115 unsigned int vu = v.x;
116 vu = __shfl_xor(vu, laneMask, width);
119 h.x = (unsigned short)vu;
122 #endif // CUDA_VERSION
125 } // namespace cloudViewer