ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
Reduction.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <Helper.h>
11 #include <cuda.h>
12 
13 #include "core/nns/kernel/BlockMerge.cuh"
14 #include "core/nns/kernel/PtxUtils.cuh"
15 #include "core/nns/kernel/ReductionOps.cuh"
16 
17 namespace cloudViewer {
18 namespace core {
19 
20 template <typename T, typename Op, int ReduceWidth = kWarpSize>
21 __device__ inline T warpReduceAll(T val, Op op) {
22 #pragma unroll
23  for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) {
24  val = op(val, shfl_xor(val, mask));
25  }
26 
27  return val;
28 }
29 
30 /// Sums a register value across all warp threads
31 template <typename T, int ReduceWidth = kWarpSize>
32 __device__ inline T warpReduceAllSum(T val) {
33  return warpReduceAll<T, Sum<T>, ReduceWidth>(val, Sum<T>());
34 }
35 
36 /// Performs a block-wide reduction
37 template <typename T, typename Op, bool BroadcastAll, bool KillWARDependency>
38 __device__ inline T blockReduceAll(T val, Op op, T* smem) {
39  int laneId = getLaneId();
40  int warpId = threadIdx.x / kWarpSize;
41 
42  val = warpReduceAll<T, Op>(val, op);
43  if (laneId == 0) {
44  smem[warpId] = val;
45  }
46  __syncthreads();
47 
48  if (warpId == 0) {
49  val = laneId < divUp(blockDim.x, kWarpSize) ? smem[laneId]
50  : op.identity();
51  val = warpReduceAll<T, Op>(val, op);
52 
53  if (BroadcastAll) {
54  __threadfence_block();
55 
56  if (laneId == 0) {
57  smem[0] = val;
58  }
59  }
60  }
61 
62  if (BroadcastAll) {
63  __syncthreads();
64  val = smem[0];
65  }
66 
67  if (KillWARDependency) {
68  __syncthreads();
69  }
70 
71  return val;
72 }
73 
74 /// Performs a block-wide reduction of multiple values simultaneously
75 template <int Num,
76  typename T,
77  typename Op,
78  bool BroadcastAll,
79  bool KillWARDependency>
80 __device__ inline void blockReduceAll(T val[Num], Op op, T* smem) {
81  int laneId = getLaneId();
82  int warpId = threadIdx.x / kWarpSize;
83 
84 #pragma unroll
85  for (int i = 0; i < Num; ++i) {
86  val[i] = warpReduceAll<T, Op>(val[i], op);
87  }
88 
89  if (laneId == 0) {
90 #pragma unroll
91  for (int i = 0; i < Num; ++i) {
92  smem[warpId * Num + i] = val[i];
93  }
94  }
95 
96  __syncthreads();
97 
98  if (warpId == 0) {
99 #pragma unroll
100  for (int i = 0; i < Num; ++i) {
101  val[i] = laneId < divUp(blockDim.x, kWarpSize)
102  ? smem[laneId * Num + i]
103  : op.identity();
104  val[i] = warpReduceAll<T, Op>(val[i], op);
105  }
106 
107  if (BroadcastAll) {
108  __threadfence_block();
109 
110  if (laneId == 0) {
111 #pragma unroll
112  for (int i = 0; i < Num; ++i) {
113  smem[i] = val[i];
114  }
115  }
116  }
117  }
118 
119  if (BroadcastAll) {
120  __syncthreads();
121 #pragma unroll
122  for (int i = 0; i < Num; ++i) {
123  val[i] = smem[i];
124  }
125  }
126 
127  if (KillWARDependency) {
128  __syncthreads();
129  }
130 }
131 
132 /// Sums a register value across the entire block
133 template <typename T, bool BroadcastAll, bool KillWARDependency>
134 __device__ inline T blockReduceAllSum(T val, T* smem) {
135  return blockReduceAll<T, Sum<T>, BroadcastAll, KillWARDependency>(
136  val, Sum<T>(), smem);
137 }
138 
139 template <int Num, typename T, bool BroadcastAll, bool KillWARDependency>
140 __device__ inline void blockReduceAllSum(T vals[Num], T* smem) {
141  return blockReduceAll<Num, T, Sum<T>, BroadcastAll, KillWARDependency>(
142  vals, Sum<T>(), smem);
143 }
144 
145 } // namespace core
146 } // namespace cloudViewer