24 #include <thrust/device_vector.h>
54 memory_block_index_(0),
55 super_block_index_(0) {}
58 const buf_index_t& next_slab_ptr,
const uint32_t& lane_id) {
59 return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
62 const uint32_t super_block_idx,
const uint32_t bitmap_idx) {
68 __device__
void Init(uint32_t& tid, uint32_t& lane_id) {
70 createMemBlockIndex(tid >> 5);
73 memory_block_bitmap_ =
88 uint32_t read_bitmap = memory_block_bitmap_;
95 empty_lane = __ffs(~memory_block_bitmap_) - 1;
99 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
101 read_bitmap = memory_block_bitmap_;
104 uint32_t src_lane = __ffs(free_lane) - 1;
105 if (src_lane == lane_id) {
106 read_bitmap = atomicCAS(
110 memory_block_bitmap_,
111 memory_block_bitmap_ | (1 << empty_lane));
112 if (read_bitmap == memory_block_bitmap_) {
114 memory_block_bitmap_ |= (1 << empty_lane);
121 memory_block_bitmap_ = read_bitmap;
128 return allocated_result;
138 (getMemUnitIndex(ptr) >> 5),
139 ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
143 __device__ __host__ __forceinline__ uint32_t
147 __device__ __host__ __forceinline__ uint32_t
156 __device__ __host__ __forceinline__ uint32_t
158 return address & 0x3FF;
162 return getMemUnitIndex(address) *
kWarpSize;
166 __device__
void createMemBlockIndex(uint32_t global_warp_id) {
168 memory_block_index_ = (
hash_coef_ * global_warp_id) >>
173 __device__
void updateMemBlockIndex(uint32_t global_warp_id) {
175 super_block_index_++;
177 (super_block_index_ ==
kSuperBlocks) ? 0 : super_block_index_;
178 memory_block_index_ = (
hash_coef_ * (global_warp_id + num_attempts_)) >>
181 memory_block_bitmap_ =
189 getMemBlockAddress(address_ptr_index) +
190 getMemUnitIndex(address_ptr_index) *
kWarpSize;
193 __host__ __device__
void print_address(
buf_index_t address_ptr_index) {
194 printf(
"Super block Index: %d, Memory block index: %d, Memory unit "
197 getSuperBlockIndex(address_ptr_index),
198 getMemBlockIndex(address_ptr_index),
199 getMemUnitIndex(address_ptr_index));
210 uint32_t num_attempts_;
211 uint32_t memory_block_index_;
212 uint32_t memory_block_bitmap_;
213 uint32_t super_block_index_;
217 uint32_t* slabs_per_superblock);
253 thrust::device_vector<uint32_t> slabs_per_superblock(
kSuperBlocks);
254 thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
259 int num_cuda_blocks =
262 core::cuda::GetStream()>>>(
263 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
267 std::vector<int>
result(num_super_blocks);
268 thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
#define CLOUDVIEWER_CUDA_CHECK(err)
static void * Malloc(size_t byte_size, const Device &device)
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
uint32_t * super_blocks_
A pointer to each super-block.
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const buf_index_t &next_slab_ptr, const uint32_t &lane_id)
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
__device__ void FreeUntouched(buf_index_t ptr)
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
SlabNodeManagerImpl impl_
std::vector< int > CountSlabsPerSuperblock()
SlabNodeManager(const Device &device)
buf_index_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
buf_index_t kv_pair_ptrs[kWarpSize - 1]
static constexpr uint32_t kBlocksPerSuperBlock
static constexpr uint32_t kUIntsPerSuperBlock
static constexpr uint32_t kUIntsPerBlock
static constexpr uint32_t kSyncLanesMask
static constexpr uint32_t kSlabsPerBlock
static constexpr uint32_t kWarpSize
static constexpr uint32_t kBlockMaskBits
static constexpr uint32_t kSlabMaskBits
static constexpr uint32_t kSuperBlocks
static constexpr uint32_t kNotFoundFlag
static constexpr uint32_t kSuperBlockMaskBits
static constexpr uint32_t kBitmapsPerSuperBlock
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)
static constexpr uint32_t kThreadsPerBlock
static constexpr uint32_t kBlocksPerSuperBlockInBits
Generic file read and write utility for python interface.