ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
PtxUtils.cuh
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <cuda.h>
11 
12 namespace cloudViewer {
13 namespace core {
14 
// Macro variants of the bit-field extraction helpers. Because they are macros,
// the asm statement is expanded directly at the use site, which keeps the
// file/line structure of the generated SASS simple when viewed in the profiler.
//
// NOTE: each expansion already ends with ';'.

// Stores into OUT the LEN-bit field of the 32-bit value VAL that starts at bit
// POS (PTX `bfe.u32`).
#define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \
    asm("bfe.u32 %0, %1, %2, %3;"            \
        : "=r"(OUT)                          \
        : "r"(VAL), "r"(POS), "r"(LEN));

// Stores into OUT the LEN-bit field of the 64-bit value VAL that starts at bit
// POS (PTX `bfe.u64`). OUT and VAL are bound to 64-bit registers, POS and LEN
// to 32-bit registers.
#define GET_BITFIELD_U64(OUT, VAL, POS, LEN) \
    asm("bfe.u64 %0, %1, %2, %3;"            \
        : "=l"(OUT)                          \
        : "l"(VAL), "r"(POS), "r"(LEN));
21 
// Extracts an unsigned bit field from a 32-bit word via the PTX `bfe.u32`
// instruction.
//   val - word to read the field from
//   pos - index of the least-significant bit of the field
//   len - width of the field in bits
// Returns the field right-aligned in the result (upper bits are zero).
__device__ __forceinline__ unsigned int getBitfield(unsigned int val,
                                                    int pos,
                                                    int len) {
    unsigned int field;
    asm("bfe.u32 %0, %1, %2, %3;"
        : "=r"(field)
        : "r"(val), "r"(pos), "r"(len));
    return field;
}
29 
// 64-bit overload: extracts an unsigned bit field from a 64-bit word via the
// PTX `bfe.u64` instruction.
//   val - word to read the field from
//   pos - index of the least-significant bit of the field
//   len - width of the field in bits
// Returns the field right-aligned in the result (upper bits are zero).
__device__ __forceinline__ uint64_t getBitfield(uint64_t val,
                                                int pos,
                                                int len) {
    uint64_t field;
    // Value and result live in 64-bit registers ("l"); position and length
    // are passed in 32-bit registers ("r").
    asm("bfe.u64 %0, %1, %2, %3;"
        : "=l"(field)
        : "l"(val), "r"(pos), "r"(len));
    return field;
}
37 
// Inserts a bit field into a 32-bit word via the PTX `bfi.b32` instruction.
//   val      - word that receives the field
//   toInsert - source of the field; its low `len` bits are the ones inserted
//   pos      - index in `val` of the least-significant bit to overwrite
//   len      - width of the field in bits
// Returns a copy of `val` with bits [pos, pos + len) replaced; `val` itself is
// passed by value and is not modified.
__device__ __forceinline__ unsigned int setBitfield(unsigned int val,
                                                    unsigned int toInsert,
                                                    int pos,
                                                    int len) {
    unsigned int merged;
    // bfi operand order is (field source, base word, position, length), so
    // `toInsert` comes before `val` here.
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(merged)
        : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return merged;
}
48 
// Returns the calling thread's lane index within its warp, read from the PTX
// special register %laneid.
__device__ __forceinline__ int getLaneId() {
    int lane;
    asm("mov.u32 %0, %%laneid;"
        : "=r"(lane));
    return lane;
}
54 
// Returns the PTX special register %lanemask_lt: a 32-bit mask with a bit set
// for every lane whose index is strictly less than the calling thread's lane.
__device__ __forceinline__ unsigned getLaneMaskLt() {
    unsigned lowerLanes;
    asm("mov.u32 %0, %%lanemask_lt;"
        : "=r"(lowerLanes));
    return lowerLanes;
}
60 
// Returns the PTX special register %lanemask_le: a 32-bit mask with a bit set
// for every lane whose index is less than or equal to the calling thread's
// lane.
__device__ __forceinline__ unsigned getLaneMaskLe() {
    unsigned lowerOrSameLanes;
    asm("mov.u32 %0, %%lanemask_le;"
        : "=r"(lowerOrSameLanes));
    return lowerOrSameLanes;
}
66 
// Returns the PTX special register %lanemask_gt: a 32-bit mask with a bit set
// for every lane whose index is strictly greater than the calling thread's
// lane.
__device__ __forceinline__ unsigned getLaneMaskGt() {
    unsigned higherLanes;
    asm("mov.u32 %0, %%lanemask_gt;"
        : "=r"(higherLanes));
    return higherLanes;
}
72 
// Returns the PTX special register %lanemask_ge: a 32-bit mask with a bit set
// for every lane whose index is greater than or equal to the calling thread's
// lane.
__device__ __forceinline__ unsigned getLaneMaskGe() {
    unsigned higherOrSameLanes;
    asm("mov.u32 %0, %%lanemask_ge;"
        : "=r"(higherOrSameLanes));
    return higherOrSameLanes;
}
78 
// Issues the PTX `bar.sync` instruction on a named barrier.
//   name       - barrier identifier passed as the first operand
//   numThreads - thread count passed as the second operand
// The asm is volatile and declares a "memory" clobber, so the compiler will
// neither remove it nor move memory accesses across it.
// NOTE(review): valid ranges for `name` and `numThreads` are defined by the PTX
// ISA (barrier resource index, warp-size multiple) and are not checked here.
__device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
    asm volatile("bar.sync %0, %1;"
                 : /* no outputs */
                 : "r"(name), "r"(numThreads)
                 : "memory");
}
82 
// Issues the PTX `bar.arrive` instruction on a named barrier.
//   name       - barrier identifier passed as the first operand
//   numThreads - thread count passed as the second operand
// The asm is volatile and declares a "memory" clobber, so the compiler will
// neither remove it nor move memory accesses across it.
// NOTE(review): valid ranges for `name` and `numThreads` are defined by the PTX
// ISA (barrier resource index, warp-size multiple) and are not checked here.
__device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
    asm volatile("bar.arrive %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
}
89 
90 } // namespace core
91 } // namespace cloudViewer