ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
WarpShuffle.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cuda.h>
11 
12 namespace cloudViewer {
13 namespace core {
14 
15 // defines to simplify the SASS assembly structure file/line in the profiler
16 #if CUDA_VERSION >= 9000
17 #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \
18  __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH)
19 #else
20 #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl(VAL, SRC_LANE, WIDTH)
21 #endif
22 
23 template <typename T>
24 inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) {
25 #if CUDA_VERSION >= 9000
26  return __shfl_sync(0xffffffff, val, srcLane, width);
27 #else
28  return __shfl(val, srcLane, width);
29 #endif
30 }
31 
32 // CUDA SDK does not provide specializations for T*
33 template <typename T>
34 inline __device__ T* shfl(T* const val, int srcLane, int width = kWarpSize) {
35  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
36  long long v = (long long)val;
37 
38  return (T*)shfl(v, srcLane, width);
39 }
40 
41 template <typename T>
42 inline __device__ T shfl_up(const T val,
43  unsigned int delta,
44  int width = kWarpSize) {
45 #if CUDA_VERSION >= 9000
46  return __shfl_up_sync(0xffffffff, val, delta, width);
47 #else
48  return __shfl_up(val, delta, width);
49 #endif
50 }
51 
52 // CUDA SDK does not provide specializations for T*
53 template <typename T>
54 inline __device__ T* shfl_up(T* const val,
55  unsigned int delta,
56  int width = kWarpSize) {
57  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
58  long long v = (long long)val;
59 
60  return (T*)shfl_up(v, delta, width);
61 }
62 
63 template <typename T>
64 inline __device__ T shfl_down(const T val,
65  unsigned int delta,
66  int width = kWarpSize) {
67 #if CUDA_VERSION >= 9000
68  return __shfl_down_sync(0xffffffff, val, delta, width);
69 #else
70  return __shfl_down(val, delta, width);
71 #endif
72 }
73 
74 // CUDA SDK does not provide specializations for T*
75 template <typename T>
76 inline __device__ T* shfl_down(T* const val,
77  unsigned int delta,
78  int width = kWarpSize) {
79  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
80  long long v = (long long)val;
81  return (T*)shfl_down(v, delta, width);
82 }
83 
84 template <typename T>
85 inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) {
86 #if CUDA_VERSION >= 9000
87  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
88 #else
89  return __shfl_xor(val, laneMask, width);
90 #endif
91 }
92 
93 // CUDA SDK does not provide specializations for T*
94 template <typename T>
95 inline __device__ T* shfl_xor(T* const val,
96  int laneMask,
97  int width = kWarpSize) {
98  static_assert(sizeof(T*) == sizeof(long long), "pointer size");
99  long long v = (long long)val;
100  return (T*)shfl_xor(v, laneMask, width);
101 }
102 
103 // CUDA 9.0+ has half shuffle
104 #if CUDA_VERSION < 9000
105 inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) {
106  unsigned int vu = v.x;
107  vu = __shfl(vu, srcLane, width);
108 
109  half h;
110  h.x = (unsigned short)vu;
111  return h;
112 }
113 
114 inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) {
115  unsigned int vu = v.x;
116  vu = __shfl_xor(vu, laneMask, width);
117 
118  half h;
119  h.x = (unsigned short)vu;
120  return h;
121 }
122 #endif // CUDA_VERSION
123 
124 } // namespace core
125 } // namespace cloudViewer