MemoryManagerCached.cpp
// ----------------------------------------------------------------------------
// -                     CloudViewer: www.cloudViewer.org                     -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2024 www.cloudViewer.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include <Logging.h>

#include <algorithm>
#include <cstdlib>
#include <limits>
#include <memory>
#include <mutex>
#include <set>
#include <unordered_map>
#include <vector>


#ifdef BUILD_CUDA_MODULE
#endif

namespace cloudViewer {
namespace core {

// This implementation is inspired by PyTorch's CUDA memory manager.
// Reference: https://git.io/JUqUA

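// Comparators used as strict weak orderings for the std::set containers below:
// free blocks are ordered by size to enable best-fit lookups, while the
// virtual blocks of a real block are ordered by address to enable merging of
// neighboring blocks.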
template <typename Block>
struct SizeOrder {
    bool operator()(const std::shared_ptr<Block>& lhs,
                    const std::shared_ptr<Block>& rhs) const {
        if (lhs->byte_size_ != rhs->byte_size_) {
            return lhs->byte_size_ < rhs->byte_size_;
        }
        return lhs->ptr_ < rhs->ptr_;
    }
};

template <typename Block>
struct PointerOrder {
    bool operator()(const std::shared_ptr<Block>& lhs,
                    const std::shared_ptr<Block>& rhs) const {
        if (lhs->ptr_ != rhs->ptr_) {
            return lhs->ptr_ < rhs->ptr_;
        }
        return lhs->byte_size_ < rhs->byte_size_;
    }
};

struct RealBlock;

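// A virtual block is a sub-range of a real block that is handed out to the
// user. It refers back to its owning real block through a weak pointer.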
struct VirtualBlock {
    VirtualBlock(void* ptr,
                 size_t byte_size,
                 const std::weak_ptr<RealBlock>& r_block)
        : ptr_(ptr), byte_size_(byte_size), r_block_(r_block) {}

    void* ptr_ = nullptr;
    size_t byte_size_ = 0;

    std::weak_ptr<RealBlock> r_block_;
};

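// A real block is a single allocation obtained from the underlying device
// memory manager. It owns the virtual blocks that partition it.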
struct RealBlock {
    RealBlock(void* ptr, size_t byte_size) : ptr_(ptr), byte_size_(byte_size) {}

    void* ptr_ = nullptr;
    size_t byte_size_ = 0;

    std::shared_ptr<MemoryManagerDevice> device_mm_;
    std::set<std::shared_ptr<VirtualBlock>, PointerOrder<VirtualBlock>>
            v_blocks_;
};
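
// Per-device cache that tracks real blocks and their virtual sub-blocks.
// All public member functions are synchronized via a recursive mutex.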
77 
78 class MemoryCache {
79 public:
80  MemoryCache() = default;
81  MemoryCache(const MemoryCache&) = delete;
82  MemoryCache& operator=(const MemoryCache&) = delete;
83 
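    /// Rounds byte_size up to the next multiple of alignment.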
    static size_t AlignByteSize(size_t byte_size, size_t alignment = 8) {
        return ((byte_size + alignment - 1) / alignment) * alignment;
    }

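    /// Returns a pointer to a cached block of at least byte_size bytes, or
    /// nullptr if no suitable free block is available in the cache.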
    void* Malloc(size_t byte_size) {
        std::lock_guard<std::recursive_mutex> lock(mutex_);

        auto free_block = ExtractFreeBlock(byte_size);

        if (free_block != nullptr) {
            size_t remaining_size = free_block->byte_size_ - byte_size;

            if (remaining_size == 0) {
                // No update of real block required for perfect fit.
                allocated_virtual_blocks_.emplace(free_block->ptr_, free_block);

                return free_block->ptr_;
            } else {
                // Split virtual block.
                auto new_block = std::make_shared<VirtualBlock>(
                        free_block->ptr_, byte_size, free_block->r_block_);
                auto remaining_block = std::make_shared<VirtualBlock>(
                        static_cast<char*>(free_block->ptr_) + byte_size,
                        remaining_size, free_block->r_block_);

                // Update real block.
                auto real_block = free_block->r_block_.lock();
                real_block->v_blocks_.erase(free_block);
                real_block->v_blocks_.insert(new_block);
                real_block->v_blocks_.insert(remaining_block);

                allocated_virtual_blocks_.emplace(new_block->ptr_, new_block);
                free_virtual_blocks_.insert(remaining_block);

                return new_block->ptr_;
            }
        }

        return nullptr;
    }

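    /// Returns the block at ptr to the cache and merges it with adjacent free
    /// virtual blocks of the same real block.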
    void Free(void* ptr) {
        std::lock_guard<std::recursive_mutex> lock(mutex_);

        auto ptr_it = allocated_virtual_blocks_.find(ptr);

        if (ptr_it == allocated_virtual_blocks_.end()) {
            // Should never reach here.
            utility::LogError("Block of {} should have been recorded.",
                              fmt::ptr(ptr));
        }

        auto v_block = ptr_it->second;
        allocated_virtual_blocks_.erase(ptr_it);

        auto r_block = v_block->r_block_.lock();
        auto& v_block_set = r_block->v_blocks_;

        const auto v_block_it = v_block_set.find(v_block);
        if (v_block_it == v_block_set.end()) {
            utility::LogError(
                    "Virtual block ({} @ {} bytes) not recorded in real block "
                    "{} @ {} bytes.",
                    fmt::ptr(v_block->ptr_), v_block->byte_size_,
                    fmt::ptr(r_block->ptr_), r_block->byte_size_);
        }

        auto merged_v_block = v_block;

        // Merge with previous block.
        if (v_block_it != v_block_set.begin()) {
            // Use copy to keep original iterator unchanged.
            auto v_block_it_copy = v_block_it;
            auto v_block_it_prev = --v_block_it_copy;

            auto v_block_prev = *v_block_it_prev;

            if (free_virtual_blocks_.find(v_block_prev) !=
                free_virtual_blocks_.end()) {
                // Update merged block.
                merged_v_block = std::make_shared<VirtualBlock>(
                        v_block_prev->ptr_,
                        v_block_prev->byte_size_ + merged_v_block->byte_size_,
                        r_block);

                // Remove from sets.
                v_block_set.erase(v_block_prev);
                free_virtual_blocks_.erase(v_block_prev);
            }
        }

        // Merge with next block.

        // Use copy to keep original iterator unchanged.
        auto v_block_it_copy = v_block_it;
        auto v_block_it_next = ++v_block_it_copy;

        if (v_block_it_next != v_block_set.end()) {
            auto v_block_next = *v_block_it_next;

            if (free_virtual_blocks_.find(v_block_next) !=
                free_virtual_blocks_.end()) {
                // Update merged block.
                merged_v_block = std::make_shared<VirtualBlock>(
                        merged_v_block->ptr_,
                        merged_v_block->byte_size_ + v_block_next->byte_size_,
                        r_block);

                // Remove from sets.
                v_block_set.erase(v_block_next);
                free_virtual_blocks_.erase(v_block_next);
            }
        }

        v_block_set.erase(v_block);
        v_block_set.insert(merged_v_block);
        free_virtual_blocks_.insert(merged_v_block);
    }

    /// Acquires ownership of the new real allocated blocks.
    void Acquire(void* ptr,
                 size_t byte_size,
                 const std::shared_ptr<MemoryManagerDevice>& device_mm) {
        std::lock_guard<std::recursive_mutex> lock(mutex_);

        auto r_block = std::make_shared<RealBlock>(ptr, byte_size);
        auto v_block = std::make_shared<VirtualBlock>(ptr, byte_size, r_block);
        r_block->device_mm_ = device_mm;
        r_block->v_blocks_.insert(v_block);

        real_blocks_.insert(r_block);
        allocated_virtual_blocks_.emplace(v_block->ptr_, v_block);
    }

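    /// Releases ownership of unused real blocks, greedily selected until at
    /// least byte_size bytes are covered or no releasable blocks remain.
    /// Returns the released pointers together with their owning device memory
    /// managers so the caller can free the underlying memory.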
    std::vector<std::pair<void*, std::shared_ptr<MemoryManagerDevice>>> Release(
            size_t byte_size) {
        std::lock_guard<std::recursive_mutex> lock(mutex_);

        // Filter releasable blocks.
        std::set<std::shared_ptr<RealBlock>, SizeOrder<RealBlock>>
                releasable_real_blocks;
        std::copy_if(
                real_blocks_.begin(), real_blocks_.end(),
                std::inserter(releasable_real_blocks,
                              releasable_real_blocks.begin()),
                [this](const auto& r_block) { return IsReleasable(r_block); });

        // Determine greedy "minimal" subset.
        std::vector<std::pair<void*, std::shared_ptr<MemoryManagerDevice>>>
                released_pointers;
        size_t released_size = 0;
        while (!releasable_real_blocks.empty() && released_size < byte_size) {
            size_t remaining_size = byte_size - released_size;
            auto query_size =
                    std::make_shared<RealBlock>(nullptr, remaining_size);
            auto it = releasable_real_blocks.lower_bound(query_size);
            if (it == releasable_real_blocks.end()) {
                --it;
            }
            auto r_block = *it;

            real_blocks_.erase(r_block);
            for (const auto& v_block : r_block->v_blocks_) {
                free_virtual_blocks_.erase(v_block);
            }

            releasable_real_blocks.erase(r_block);
            released_pointers.emplace_back(r_block->ptr_, r_block->device_mm_);
            released_size += r_block->byte_size_;
        }

        return released_pointers;
    }

    /// Releases ownership of all unused real allocated blocks.
    std::vector<std::pair<void*, std::shared_ptr<MemoryManagerDevice>>>
    ReleaseAll() {
        return Release(std::numeric_limits<size_t>::max());
    }

    /// Returns the number of allocated real blocks.
    size_t Size() const { return real_blocks_.size(); }

    /// True if the set of allocated real blocks is empty, false otherwise.
    bool Empty() const { return Size() == 0; }

private:
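    /// Extracts a free virtual block whose size lies within
    /// [byte_size, kMaxFragmentation * byte_size] and whose real block also
    /// respects this upper bound. Returns nullptr if no such block exists.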
    std::shared_ptr<VirtualBlock> ExtractFreeBlock(size_t byte_size) {
        std::lock_guard<std::recursive_mutex> lock(mutex_);

        size_t max_byte_size = static_cast<size_t>(
                kMaxFragmentation * static_cast<double>(byte_size));

        // Consider blocks with size in range [byte_size, max_byte_size].
        auto query_size = std::make_shared<VirtualBlock>(
                nullptr, byte_size, std::weak_ptr<RealBlock>());
        auto it = free_virtual_blocks_.lower_bound(query_size);
        while (it != free_virtual_blocks_.end() &&
               (*it)->byte_size_ <= max_byte_size) {
            auto r_block = (*it)->r_block_.lock();
            if (r_block->byte_size_ <= max_byte_size) {
                auto block = *it;
                free_virtual_blocks_.erase(it);
                return block;
            }
            ++it;
        }

        return nullptr;
    }

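    /// A real block is releasable if it is covered by exactly one virtual
    /// block and that virtual block is currently free.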
    bool IsReleasable(const std::shared_ptr<RealBlock>& r_block) {
        if (r_block->v_blocks_.size() != 1) {
            return false;
        }

        auto v_block = *(r_block->v_blocks_.begin());
        if (r_block->ptr_ != v_block->ptr_ ||
            r_block->byte_size_ != v_block->byte_size_) {
            utility::LogError(
                    "Real block {} @ {} bytes has single "
                    "virtual block {} @ {} bytes",
                    fmt::ptr(r_block->ptr_), r_block->byte_size_,
                    fmt::ptr(v_block->ptr_), v_block->byte_size_);
        }

        return free_virtual_blocks_.find(v_block) != free_virtual_blocks_.end();
    }

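    /// Upper bound on the ratio between the size of a reused cached block and
    /// the requested size; limits internal fragmentation on cache hits.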
    const double kMaxFragmentation = 4.0;

    std::set<std::shared_ptr<RealBlock>, SizeOrder<RealBlock>> real_blocks_;

    std::unordered_map<void*, std::shared_ptr<VirtualBlock>>
            allocated_virtual_blocks_;
    std::set<std::shared_ptr<VirtualBlock>, SizeOrder<VirtualBlock>>
            free_virtual_blocks_;

    std::recursive_mutex mutex_;
};

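// Process-wide singleton that owns one MemoryCache per device and falls back
// to the underlying device memory manager whenever the cache cannot serve a
// request.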
class Cacher {
public:
    static Cacher& GetInstance() {
        // Ensure the static Logger instance is instantiated before the
        // Cacher instance.
        // Since destruction of static instances happens in reverse order,
        // this guarantees that the Logger can be used at any point in time.
        utility::Logger::GetInstance();

#ifdef BUILD_CUDA_MODULE
        // Ensure CUDAState is initialized before Cacher.
        CUDAState::GetInstance();
#endif

        static Cacher instance;
        return instance;
    }

    ~Cacher() {
        for (const auto& cache_pair : device_caches_) {
            // Simulate C++17 structured bindings for better readability.
            const auto& device = cache_pair.first;
            const auto& cache = cache_pair.second;

            Clear(device);

            if (!cache.Empty()) {
                utility::LogError("{} leaking memory blocks on {}",
                                  cache.Size(), device.ToString());
            }
        }
    }

    Cacher(const Cacher&) = delete;
    Cacher& operator=(Cacher&) = delete;

    void* Malloc(size_t byte_size,
                 const Device& device,
                 const std::shared_ptr<MemoryManagerDevice>& device_mm) {
        Init(device);

        size_t internal_byte_size = MemoryCache::AlignByteSize(byte_size);

        // Malloc from cache.
        void* ptr = device_caches_.at(device).Malloc(internal_byte_size);
        if (ptr != nullptr) {
            return ptr;
        }

        // Malloc from real memory manager.
        try {
            ptr = device_mm->Malloc(internal_byte_size, device);
        } catch (const std::runtime_error&) {
        }

        // Free cached memory and try again.
        if (ptr == nullptr) {
            auto old_ptrs =
                    device_caches_.at(device).Release(internal_byte_size);
            for (const auto& old_pair : old_ptrs) {
                // Simulate C++17 structured bindings for better readability.
                const auto& old_ptr = old_pair.first;
                const auto& old_device_mm = old_pair.second;

                old_device_mm->Free(old_ptr, device);
            }

            // Do not catch the error if the allocation still fails.
            ptr = device_mm->Malloc(internal_byte_size, device);
        }

        device_caches_.at(device).Acquire(ptr, internal_byte_size, device_mm);

        return ptr;
    }

    void Free(void* ptr, const Device& device) {
        Init(device);

        device_caches_.at(device).Free(ptr);
    }

    void Clear(const Device& device) {
        Init(device);

        auto old_ptrs = device_caches_.at(device).ReleaseAll();
        for (const auto& old_pair : old_ptrs) {
            // Simulate C++17 structured bindings for better readability.
            const auto& old_ptr = old_pair.first;
            const auto& old_device_mm = old_pair.second;

            old_device_mm->Free(old_ptr, device);
        }
    }

    void Clear() {
        // Collect all devices in a thread-safe manner. This avoids potential
        // issues with newly initialized/inserted elements while iterating over
        // the container.
        std::vector<Device> devices;
        {
            std::lock_guard<std::recursive_mutex> lock(init_mutex_);
            for (const auto& cache_pair : device_caches_) {
                devices.push_back(cache_pair.first);
            }
        }

        for (const auto& device : devices) {
            Clear(device);
        }
    }

private:
    Cacher() = default;

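    /// Lazily creates the cache for the given device. Performs no action if
    /// the cache has already been created.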
    void Init(const Device& device) {
        std::lock_guard<std::recursive_mutex> lock(init_mutex_);

        // Performs no action if already initialized.
        device_caches_.emplace(std::piecewise_construct,
                               std::forward_as_tuple(device),
                               std::forward_as_tuple());
    }

    std::unordered_map<Device, MemoryCache> device_caches_;
    std::recursive_mutex init_mutex_;
};

MemoryManagerCached::MemoryManagerCached(
        const std::shared_ptr<MemoryManagerDevice>& device_mm)
    : device_mm_(device_mm) {
    if (std::dynamic_pointer_cast<MemoryManagerCached>(device_mm_) != nullptr) {
        utility::LogError(
                "An instance of type MemoryManagerCached as the underlying "
                "non-cached manager is forbidden.");
    }
}

void* MemoryManagerCached::Malloc(size_t byte_size, const Device& device) {
    if (byte_size == 0) {
        return nullptr;
    }

    return Cacher::GetInstance().Malloc(byte_size, device, device_mm_);
}

void MemoryManagerCached::Free(void* ptr, const Device& device) {
    if (ptr == nullptr) {
        return;
    }

    Cacher::GetInstance().Free(ptr, device);
}

void MemoryManagerCached::Memcpy(void* dst_ptr,
                                 const Device& dst_device,
                                 const void* src_ptr,
                                 const Device& src_device,
                                 size_t num_bytes) {
    device_mm_->Memcpy(dst_ptr, dst_device, src_ptr, src_device, num_bytes);
}

void MemoryManagerCached::ReleaseCache(const Device& device) {
    Cacher::GetInstance().Clear(device);
}

void MemoryManagerCached::ReleaseCache() { Cacher::GetInstance().Clear(); }

}  // namespace core
}  // namespace cloudViewer