ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
SlabHashBackend.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <memory>
11 
16 
17 namespace cloudViewer {
18 namespace core {
19 template <typename Key, typename Hash, typename Eq>
21 public:
22  SlabHashBackend(int64_t init_capacity,
23  int64_t key_dsize,
24  const std::vector<int64_t>& value_dsizes,
25  const Device& device);
26 
28 
29  void Reserve(int64_t capacity) override;
30 
31  void Insert(const void* input_keys,
32  const std::vector<const void*>& input_values_soa,
33  buf_index_t* output_buf_indices,
34  bool* output_masks,
35  int64_t count) override;
36 
37  void Find(const void* input_keys,
38  buf_index_t* output_buf_indices,
39  bool* output_masks,
40  int64_t count) override;
41 
42  void Erase(const void* input_keys,
43  bool* output_masks,
44  int64_t count) override;
45 
46  int64_t GetActiveIndices(buf_index_t* output_indices) override;
47  void Clear() override;
48 
49  int64_t Size() const override;
50  int64_t GetBucketCount() const override;
51  std::vector<int64_t> BucketSizes() const override;
52  float LoadFactor() const override;
53 
55 
56  void Allocate(int64_t capacity) override;
57  void Free() override;
58 
59 protected:
63 
65  std::shared_ptr<SlabNodeManager> node_mgr_;
66 
67  int64_t bucket_count_;
68 };
69 
70 template <typename Key, typename Hash, typename Eq>
72  int64_t init_capacity,
73  int64_t key_dsize,
74  const std::vector<int64_t>& value_dsizes,
75  const Device& device)
76  : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
77  CUDAScopedDevice scoped_device(this->device_);
78  Allocate(init_capacity);
79 }
80 
81 template <typename Key, typename Hash, typename Eq>
83  CUDAScopedDevice scoped_device(this->device_);
84  Free();
85 }
86 
87 template <typename Key, typename Hash, typename Eq>
89  CUDAScopedDevice scoped_device(this->device_);
90 }
91 
92 template <typename Key, typename Hash, typename Eq>
93 void SlabHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
94  buf_index_t* output_buf_indices,
95  bool* output_masks,
96  int64_t count) {
97  CUDAScopedDevice scoped_device(this->device_);
98  if (count == 0) return;
99 
100  CLOUDVIEWER_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
102  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
103 
104  const int64_t num_blocks =
106  FindKernel<<<num_blocks, kThreadsPerBlock, 0, core::cuda::GetStream()>>>(
107  impl_, input_keys, output_buf_indices, output_masks, count);
109  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
110 }
111 
112 template <typename Key, typename Hash, typename Eq>
113 void SlabHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
114  bool* output_masks,
115  int64_t count) {
116  CUDAScopedDevice scoped_device(this->device_);
117  if (count == 0) return;
118 
119  CLOUDVIEWER_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
121  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
122  auto buf_indices = static_cast<buf_index_t*>(
123  MemoryManager::Malloc(sizeof(buf_index_t) * count, this->device_));
124 
125  const int64_t num_blocks =
127  EraseKernelPass0<<<num_blocks, kThreadsPerBlock, 0,
128  core::cuda::GetStream()>>>(
129  impl_, input_keys, buf_indices, output_masks, count);
130  EraseKernelPass1<<<num_blocks, kThreadsPerBlock, 0,
131  core::cuda::GetStream()>>>(impl_, buf_indices,
132  output_masks, count);
134  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
135 
136  MemoryManager::Free(buf_indices, this->device_);
137 }
138 
139 template <typename Key, typename Hash, typename Eq>
141  buf_index_t* output_buf_indices) {
142  CUDAScopedDevice scoped_device(this->device_);
143  uint32_t* count = static_cast<uint32_t*>(
144  MemoryManager::Malloc(sizeof(uint32_t), this->device_));
145  CLOUDVIEWER_CUDA_CHECK(cudaMemset(count, 0, sizeof(uint32_t)));
146 
148  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
149 
150  const int64_t num_blocks =
151  (impl_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
153  GetActiveIndicesKernel<<<num_blocks, kThreadsPerBlock, 0,
154  core::cuda::GetStream()>>>(
155  impl_, output_buf_indices, count);
157  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
158 
159  uint32_t ret;
160  MemoryManager::MemcpyToHost(&ret, count, this->device_, sizeof(uint32_t));
161  MemoryManager::Free(count, this->device_);
162 
163  return static_cast<int64_t>(ret);
164 }
165 
166 template <typename Key, typename Hash, typename Eq>
168  CUDAScopedDevice scoped_device(this->device_);
169  // Clear the heap
170  this->buffer_->ResetHeap();
171 
172  // Clear the linked list heads
173  CLOUDVIEWER_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
174  sizeof(Slab) * this->bucket_count_));
176  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
177 
178  // Clear the linked list nodes
179  node_mgr_->Reset();
180 }
181 
182 template <typename Key, typename Hash, typename Eq>
184  CUDAScopedDevice scoped_device(this->device_);
185  return this->buffer_->GetHeapTopIndex();
186 }
187 
188 template <typename Key, typename Hash, typename Eq>
190  CUDAScopedDevice scoped_device(this->device_);
191  return bucket_count_;
192 }
193 
194 template <typename Key, typename Hash, typename Eq>
195 std::vector<int64_t> SlabHashBackend<Key, Hash, Eq>::BucketSizes() const {
196  CUDAScopedDevice scoped_device(this->device_);
197  thrust::device_vector<int64_t> elems_per_bucket(impl_.bucket_count_);
198  thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
199 
200  const int64_t num_blocks =
201  (impl_.buffer_accessor_.capacity_ + kThreadsPerBlock - 1) /
204  core::cuda::GetStream()>>>(
205  impl_, thrust::raw_pointer_cast(elems_per_bucket.data()));
207  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
208 
209  std::vector<int64_t> result(impl_.bucket_count_);
210  thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
211  result.begin());
212  return result;
213 }
214 
215 template <typename Key, typename Hash, typename Eq>
217  CUDAScopedDevice scoped_device(this->device_);
218  return float(Size()) / float(this->bucket_count_);
219 }
220 
221 template <typename Key, typename Hash, typename Eq>
223  const void* input_keys,
224  const std::vector<const void*>& input_values_soa,
225  buf_index_t* output_buf_indices,
226  bool* output_masks,
227  int64_t count) {
228  CUDAScopedDevice scoped_device(this->device_);
229  if (count == 0) return;
230 
233  int prev_heap_top = this->buffer_->GetHeapTopIndex();
234  *thrust::device_ptr<int>(impl_.buffer_accessor_.heap_top_) =
235  prev_heap_top + count;
236 
237  const int64_t num_blocks =
239  InsertKernelPass0<<<num_blocks, kThreadsPerBlock, 0,
240  core::cuda::GetStream()>>>(
241  impl_, input_keys, output_buf_indices, prev_heap_top, count);
242  InsertKernelPass1<<<num_blocks, kThreadsPerBlock, 0,
243  core::cuda::GetStream()>>>(
244  impl_, input_keys, output_buf_indices, output_masks, count);
245 
246  thrust::device_vector<const void*> input_values_soa_device(
247  input_values_soa.begin(), input_values_soa.end());
248 
249  int64_t n_values = input_values_soa.size();
250  const void* const* ptr_input_values_soa =
251  thrust::raw_pointer_cast(input_values_soa_device.data());
252  DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
253  impl_.buffer_accessor_.common_block_size_, [&]() {
254  InsertKernelPass2<Key, Hash, Eq, block_t>
255  <<<num_blocks, kThreadsPerBlock, 0,
256  core::cuda::GetStream()>>>(
257  impl_, ptr_input_values_soa, output_buf_indices,
258  output_masks, count, n_values);
259  });
261  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
262 }
263 
264 template <typename Key, typename Hash, typename Eq>
266  CUDAScopedDevice scoped_device(this->device_);
267  this->bucket_count_ = capacity * 2;
268  this->capacity_ = capacity;
269 
270  // Allocate buffer for key values.
271  this->buffer_ = std::make_shared<HashBackendBuffer>(
272  this->capacity_, this->key_dsize_, this->value_dsizes_,
273  this->device_);
274  buffer_accessor_.Setup(*this->buffer_);
275 
276  // Allocate buffer for linked list nodes.
277  node_mgr_ = std::make_shared<SlabNodeManager>(this->device_);
278 
279  // Allocate linked list heads.
280  impl_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
281  sizeof(Slab) * this->bucket_count_, this->device_));
282  CLOUDVIEWER_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
283  sizeof(Slab) * this->bucket_count_));
285  CLOUDVIEWER_CUDA_CHECK(cudaGetLastError());
286 
287  impl_.Setup(this->bucket_count_, node_mgr_->impl_, buffer_accessor_);
288 }
289 
290 template <typename Key, typename Hash, typename Eq>
292  CUDAScopedDevice scoped_device(this->device_);
293  buffer_accessor_.Shutdown(this->device_);
294  MemoryManager::Free(impl_.bucket_list_head_, this->device_);
295 }
296 } // namespace core
297 } // namespace cloudViewer
Common CUDA utilities.
#define CLOUDVIEWER_CUDA_CHECK(err)
Definition: CUDAUtils.h:47
int count
core::Tensor result
Definition: VtkUtils.cpp:76
bool copy
Definition: VtkUtils.cpp:74
When CUDA is not enabled, this is a dummy class.
Definition: CUDAUtils.h:214
static void MemcpyToHost(void *host_ptr, const void *src_ptr, const Device &src_device, size_t num_bytes)
Same as Memcpy, but with host (CPU:0) as default dst_device.
static void * Malloc(size_t byte_size, const Device &device)
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
void Reserve(int64_t capacity) override
void Insert(const void *input_keys, const std::vector< const void * > &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
SlabHashBackendImpl< Key, Hash, Eq > GetImpl()
void Clear() override
Clear stored map without reallocating memory.
void Allocate(int64_t capacity) override
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
std::shared_ptr< SlabNodeManager > node_mgr_
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
CUDAHashBackendBufferAccessor buffer_accessor_
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
SlabHashBackendImpl< Key, Hash, Eq > impl_
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
SlabHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
__global__ void EraseKernelPass0(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
void Free(benchmark::State &state, int size, const Device &device, const MemoryManagerBackend &backend)
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, buf_index_t *output_buf_indices, uint32_t *output_count)
__global__ void EraseKernelPass1(SlabHashBackendImpl< Key, Hash, Eq > impl, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
__global__ void CountElemsPerBucketKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, int64_t *bucket_elem_counts)
static constexpr uint32_t kWarpSize
Definition: SlabMacros.h:31
__global__ void FindKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
__global__ void InsertKernelPass1(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
__global__ void InsertKernelPass0(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, int heap_counter_prev, int64_t count)
Kernels.
static constexpr uint32_t kThreadsPerBlock
Definition: SlabMacros.h:44
Generic file read and write utility for python interface.