#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <spdlog/common.h>

#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>
#include <utility>
namespace rmm::mr {

/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 */
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
  /**
   * @brief Construct an `arena_memory_resource`.
   *
   * @param upstream_mr The memory resource from which to allocate memory for the global arena.
   * @param arena_size Size in bytes of the global arena. Defaults to half of the available
   * memory on the current device.
   * @param dump_log_on_failure If true, dump the memory log when running out of memory.
   */
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
      logger_->set_level(spdlog::level::info);
    }
  }
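
  // Example usage (an illustrative sketch, not part of the original header): construct the
  // resource over a `cuda_memory_resource` and install it as the current device resource.
  //
  //   rmm::mr::cuda_memory_resource upstream;
  //   rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&upstream};
  //   rmm::mr::set_current_device_resource(&mr);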
  ~arena_memory_resource() override = default;

  // Disable copy (and move) semantics.
  arena_memory_resource(arena_memory_resource const&)            = delete;
  arena_memory_resource& operator=(arena_memory_resource const&) = delete;

  /**
   * @brief Queries whether the resource supports use of non-null CUDA streams for
   * allocation/deallocation.
   *
   * @returns true.
   */
  bool supports_streams() const noexcept override { return true; }

  /**
   * @brief Query whether the resource supports the get_mem_info API.
   *
   * @returns false.
   */
  bool supports_get_mem_info() const noexcept override { return false; }

 private:
  using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
  using arena        = rmm::mr::detail::arena::arena<Upstream>;
  /**
   * @brief Allocates memory of size at least `bytes`.
   *
   * @throws rmm::out_of_memory if no more memory is available.
   */
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);
    {
      // Fast path: try the arena under a shared lock.
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }
    {
      // Slow path: defragment and retry under an exclusive lock.
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
      }
      return pointer;
    }
  }
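
  // Worked example of the padding above (illustrative): without size classes, a request of
  // 1000 bytes is rounded up to the next multiple of CUDA_ALLOCATION_ALIGNMENT (256 bytes):
  //   align_up(1000, 256) == 1024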
  /**
   * @brief Defragment memory by returning all superblocks to the global arena.
   */
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }
  /**
   * @brief Deallocate memory pointed to by `ptr`.
   */
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);
    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to this arena, the following returns false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }
    {
      // Since the memory is being returned to another arena, make sure the current stream
      // is caught up first.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }
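
  // Illustrative sketch (not part of the original header): memory allocated against one
  // stream may be freed against another; such frees take the slow path above.
  //
  //   rmm::cuda_stream stream_a, stream_b;
  //   void* ptr = mr.allocate(256, stream_a.view());  // served by stream_a's arena
  //   mr.deallocate(ptr, 256, stream_b.view());       // not in stream_b's arena: slow path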
  /**
   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
   */
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) {
      // Per-thread default streams can be mixed with explicit streams, so the allocation
      // could live in either kind of arena; check the other kind before giving up.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }
  /**
   * @brief Get the arena associated with the current thread.
   */
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      // Fast path: the arena already exists; a shared lock suffices for the lookup.
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      // Slow path: create the arena under an exclusive lock. The thread-local cleaner
      // returns this arena's memory to the global arena when the thread exits.
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
      return *thread_arena;
    }
  }
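
  // Illustrative sketch (assumes per-thread default streams are enabled at compile time):
  // each worker thread lazily gets its own arena on first use.
  //
  //   auto worker = [&mr] {
  //     void* ptr = mr.allocate(1024, rmm::cuda_stream_per_thread);  // creates this thread's arena
  //     mr.deallocate(ptr, 1024, rmm::cuda_stream_per_thread);
  //   };
  //   std::thread t1{worker};
  //   std::thread t2{worker};  // two threads, two independent arenas
  //   t1.join();
  //   t2.join();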
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }
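
  // Note (illustrative): stream arenas are keyed by the raw `cudaStream_t` handle, so two
  // distinct non-default streams always map to two distinct arenas.
  //
  //   rmm::cuda_stream s1, s2;
  //   // get_stream_arena(s1.view()) and get_stream_arena(s2.view()) return different arenas.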
  /**
   * @brief Get free and available memory for the memory resource.
   *
   * @return std::pair of free and total memory; not supported, so always zero.
   */
  std::pair<std::size_t, std::size_t> do_get_mem_info(
    [[maybe_unused]] cuda_stream_view stream) const override
  {
    return std::make_pair(0, 0);
  }
  /**
   * @brief Dump memory information to the log on allocation failure.
   *
   * @param bytes The size in bytes of the failed allocation.
   */
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate {}.", rmm::detail::bytes{bytes});
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
    logger_->flush();
  }
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }
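
  // For example (illustrative): the per-thread default stream selects the per-thread arena,
  // while the legacy default stream or an explicitly created stream selects a stream arena.
  //
  //   use_per_thread_arena(rmm::cuda_stream_per_thread);  // true
  //   use_per_thread_arena(rmm::cuda_stream_default);     // false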
  /// The global arena to allocate superblocks from.
  global_arena global_arena_;
  /// Arenas for per-thread allocations.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
  /// Arenas for allocations on non-default streams.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to the log on allocation failure.
  bool dump_log_on_failure_{};
  /// The logger for the memory dump.
  std::shared_ptr<spdlog::logger> logger_{};
  /// Mutex for creating new arenas.
  mutable std::shared_mutex map_mtx_;
  /// Mutex for allocation and deallocation.
  mutable std::shared_mutex mtx_;
};

}  // namespace rmm::mr