Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CUDACore/src/CUDACore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ include("pointer.jl")

# core utilities
include("utils/call.jl")
include("utils/reclaim.jl")
include("utils/cache.jl")
include("utils/struct_size.jl")
include("../lib/cudadrv/CUDAdrv.jl")
Expand Down
178 changes: 104 additions & 74 deletions CUDACore/src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -456,80 +456,107 @@ function Base.showerror(io::IO, err::OutOfGPUMemoryError)
end
end

const reclaim_hooks = Any[]
## reclaim escalation
#
# `Reclaimable`/`register_reclaimable!`/`TaskLocalCache`/`drop!`/`purge!`
# are defined in utils/reclaim.jl. Here we add the ladder that drives
# them along with the allocator-specific sync/trim steps.

"""
    ReclaimLevel

Escalation levels shared by `reclaim(level)` and `retry_reclaim`, ordered
from cheapest to most aggressive:

| Level | Action |
| :--- | :--- |
| `RECLAIM_PURGE_IDLE` | empty `HandleCache`s (fast path, no sync/GC) |
| `RECLAIM_SYNC` | synchronize the current task's stream |
| `RECLAIM_DEVICE_SYNC` | synchronize the whole device |
| `RECLAIM_GC_MINOR` | minor Julia GC (and device sync) |
| `RECLAIM_GC_FULL` | full Julia GC (and device sync) |
| `RECLAIM_POOL_TRIM` | trim unused memory from the pool |
| `RECLAIM_PURGE_CACHES` | empty `HandleCache`s again (catches GC-populated entries)|
| `RECLAIM_DROP_STATE` | also drop task-local library state, then GC+purge+trim |

`RECLAIM_PURGE_IDLE` and `RECLAIM_PURGE_CACHES` run the same action
(`purge!` on every `Reclaimable`). The first is an opportunistic fast
path before any sync/GC cost: if a library is sitting on cached idle
handles, we can release them immediately. The second runs after GC has
had a chance to populate caches via finalizers, catching those new
entries without escalating to `RECLAIM_DROP_STATE` (which would also
drop the current task's live state).

Steps that don't apply to the current allocator (e.g. stream sync on a
non-stream-ordered device) are silently skipped.
"""
@enum ReclaimLevel::Int begin
    RECLAIM_PURGE_IDLE = 0    # cheapest: release idle cached handles, no sync/GC
    RECLAIM_SYNC = 1          # synchronize the current task's stream
    RECLAIM_DEVICE_SYNC = 2   # synchronize the whole device
    RECLAIM_GC_MINOR = 3      # minor Julia GC (plus device sync)
    RECLAIM_GC_FULL = 4       # full Julia GC (plus device sync)
    RECLAIM_POOL_TRIM = 5     # trim unused memory from the pool
    RECLAIM_PURGE_CACHES = 6  # purge again, catching entries GC finalizers added
    RECLAIM_DROP_STATE = 7    # most aggressive: also drop task-local library state
end


"""
    retry_reclaim(retry_if) do
        # code that may fail due to insufficient GPU memory
    end

Run a block of code repeatedly while `retry_if(ret)` holds for its return
value, escalating one `ReclaimLevel` between attempts. Returns the final
(or most recent) return value of the block.

This is intended for CUDA APIs that allocate outside the pool and report
failure via a status code. It's like `Base.retry`, but works on return
values instead of exceptions for performance reasons.
"""
@inline function retry_reclaim(f, retry_if)
    # fast path: inline only the first attempt, and defer the reclaim
    # escalation to a @noinline slow path so callers stay small.
    ret = f()
    retry_if(ret) || return ret
    return retry_reclaim_slow(f, retry_if, ret)
end

# Slow path of `retry_reclaim`: climb the escalation ladder one level at a
# time, re-running the block after every step, until `retry_if` stops
# requesting a retry (or the ladder is exhausted).
@noinline function retry_reclaim_slow(f, retry_if, ret)
    st = active_state()
    ordered = stream_ordered(st.device)
    for lvl in instances(ReclaimLevel)
        reclaim_step(lvl, st, ordered)
        ret = f()
        if !retry_if(ret)
            break
        end
    end
    return ret
end
# Single step of the reclaim ladder; no-op on allocator modes where it
# doesn't apply. Split out so `retry_reclaim` can escalate one rung at a
# time while `reclaim()` can run the whole ladder cumulatively.
#
# `sync` indicates a stream-ordered allocator; synchronization and pool
# trimming are skipped otherwise (see the `ReclaimLevel` docstring).
function reclaim_step(level::ReclaimLevel, state, sync::Bool)
    if level == RECLAIM_PURGE_IDLE || level == RECLAIM_PURGE_CACHES
        # both purge levels run the same action; the second catches cache
        # entries that GC finalizers added in the meantime.
        foreach_reclaimable(purge!)
    elseif level == RECLAIM_SYNC
        sync && synchronize(state.stream)
    elseif level == RECLAIM_DEVICE_SYNC
        sync && device_synchronize()
    elseif level == RECLAIM_GC_MINOR
        GC.gc(false)
        sync && device_synchronize()
    elseif level == RECLAIM_GC_FULL
        GC.gc(true)
        sync && device_synchronize()
    elseif level == RECLAIM_POOL_TRIM
        # in case we had a release threshold configured
        sync && trim(pool_create(state.device))
    elseif level == RECLAIM_DROP_STATE
        # drop live task-local state, run a full GC so handle wrappers
        # finalize and return their raw handles to caches, then destroy
        # those caches and trim the pool.
        foreach_reclaimable(drop!)
        GC.gc(true)
        foreach_reclaimable(purge!)
        sync && trim(pool_create(state.device))
    end
    return
end


Expand Down Expand Up @@ -789,24 +816,24 @@ end
end

"""
    reclaim([level::ReclaimLevel = RECLAIM_DROP_STATE])

Free GPU memory by walking the reclaim ladder up to `level`. Use this before
calling into functionality that does not use the CUDA memory pool. Returns
`nothing`.

The default drops task-local library state, runs a full GC so handle
wrappers finalize and return their raw handles to caches, then destroys
those caches and trims the pool.
"""
function reclaim(level::ReclaimLevel = RECLAIM_DROP_STATE)
    state = active_state()
    sync = stream_ordered(state.device)
    # run the ladder cumulatively: every level up to and including `level`.
    for l in instances(ReclaimLevel)
        l <= level || break
        reclaim_step(l, state, sync)
    end
    return
end


Expand Down Expand Up @@ -909,7 +936,10 @@ macro timed(ex)
gpu_bytes=gpu_mem_stats.alloc_bytes, gpu_memtime=gpu_mem_stats.total_time, gpu_memstats=gpu_mem_stats)
end
end
# NOTE(review): the diff residue kept the pre-change `@public` line alongside
# the new multi-line statement; only the new statement should remain.
@public @allocated, @time, @timed, used_memory, cached_memory, pool_status, reclaim,
        ReclaimLevel, RECLAIM_PURGE_IDLE, RECLAIM_SYNC, RECLAIM_DEVICE_SYNC,
        RECLAIM_GC_MINOR, RECLAIM_GC_FULL, RECLAIM_POOL_TRIM, RECLAIM_PURGE_CACHES,
        RECLAIM_DROP_STATE

"""
used_memory()
Expand Down
15 changes: 7 additions & 8 deletions CUDACore/src/utils/cache.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export HandleCache

struct HandleCache{K,V}
struct HandleCache{K,V} <: Reclaimable
ctor
dtor

Expand All @@ -15,16 +15,15 @@ struct HandleCache{K,V}
max_entries::Int

function HandleCache{K,V}(ctor, dtor; max_entries::Int=32) where {K,V}
obj = new{K,V}(ctor, dtor, Set{Pair{K,V}}(), Dict{K,Vector{V}}(),
Base.ThreadSynchronizer(), max_entries)

# register a hook to wipe the current context's cache when under memory pressure
push!(reclaim_hooks, ()->empty!(obj))

return obj
return new{K,V}(ctor, dtor, Set{Pair{K,V}}(), Dict{K,Vector{V}}(),
Base.ThreadSynchronizer(), max_entries)
end
end

# Destroying idle handles is the `purge!` step of reclaim; individual caches
# must be registered in the owning library's __init__ (see reclaim.jl).
function purge!(cache::HandleCache)
    return empty!(cache)
end

# remove a handle from the cache, or create a new one
function Base.pop!(cache::HandleCache{K,V}, key::K) where {K,V}
# check the cache
Expand Down
119 changes: 119 additions & 0 deletions CUDACore/src/utils/reclaim.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# reclaim registry
#
# Libraries (cuBLAS, cuDNN, …) hold GPU resources that the memory subsystem
# needs to release under pressure. Two kinds of resources exist:
#
# - live state referenced by a running task (typically via
# `task_local_storage`), e.g. fat handles wrapping library handles plus
# workspace buffers. Released via `drop!` — the library clears its TLS
# references so the wrappers become GC-eligible; their object-bound
# finalizers then return the raw handles to a `HandleCache`;
#
# - idle resources cached for reuse, e.g. the `HandleCache` of previously-
# returned library handles. Released via `purge!` — the cache is
# emptied and each entry's destructor runs.
#
# `Reclaimable` unifies both. Instances are registered via
# `register_reclaimable!` — typically from a library's `__init__`, since
# mutations to this registry performed during the precompilation of a
# downstream package don't carry over to module load (see
# `register_reclaimable!` for details).

# Root type for resources the memory subsystem can shrink under pressure;
# subtypes opt in by overloading `drop!` and/or `purge!` below.
abstract type Reclaimable end

"""
    CUDACore.drop!(r::Reclaimable)

Release references to live state so the associated GC-managed wrappers
become collectible (and their finalizers can run). Default: do nothing.
"""
function drop!(::Reclaimable)
    return nothing
end

"""
    CUDACore.purge!(r::Reclaimable)

Destroy idle cached resources owned by `r`. Default: do nothing.
"""
function purge!(::Reclaimable)
    return nothing
end

# Global registry of registered `Reclaimable`s; `reclaimables_lock` guards
# all reads and writes (see `register_reclaimable!`/`foreach_reclaimable`).
const reclaimables = Reclaimable[]
const reclaimables_lock = ReentrantLock()

"""
    CUDACore.register_reclaimable!(r::Reclaimable)

Add `r` to the registry consulted by `reclaim`, which invokes its `drop!`
and `purge!` methods. Registering the same instance twice is a no-op.
Returns `r`.

Call this from a package's `__init__`: top-level constructor calls are
captured by Julia's precompilation cache, but mutations they perform on
dependencies (like pushing into CUDACore's registry) do not persist to
module load time.
"""
function register_reclaimable!(r::Reclaimable)
    lock(reclaimables_lock) do
        if !(r in reclaimables)
            push!(reclaimables, r)
        end
    end
    return r
end

# Apply `f` to every registered `Reclaimable`, with per-entry error
# isolation modelled after Base's `atexit_hooks`: one failing hook must not
# keep the remaining ones from running.
function foreach_reclaimable(f)
    lock(reclaimables_lock) do
        for entry in reclaimables
            try
                f(entry)
            catch ex
                @error "reclaim callback failed" type=typeof(entry) exception=(ex, catch_backtrace())
            end
        end
    end
end


## task-local state helper

"""
    TaskLocalCache{K,V}(key::Symbol)

Declarative marker for a library's per-task cache stored under `key` in
`task_local_storage()`. Must be `register_reclaimable!`'d from the owning
package's `__init__` so that `RECLAIM_DROP_STATE` clears the current
task's entry, letting the stored values (typically mutable handle wrappers)
be garbage-collected. Their finalizers then return the underlying
resources to a `HandleCache`.

Usage:

    const state_cache = CUDACore.TaskLocalCache{CuContext, LibraryState}(:CUBLAS)

    function __init__()
        ...
        CUDACore.register_reclaimable!(state_cache)
    end

    function handle()
        states = CUDACore.task_dict(state_cache)
        state = get!(() -> new_state(...), states, key)
        ...
    end

Only the current task's storage is touched: Julia's `IdDict`-backed TLS
isn't safe for concurrent mutation across threads, so cross-task drops are
intentionally deferred to task GC (values become unreachable when the
task is collected).
"""
struct TaskLocalCache{K,V} <: Reclaimable
    key::Symbol  # slot in `task_local_storage()` holding this cache's Dict{K,V}
    TaskLocalCache{K,V}(key::Symbol) where {K,V} = new{K,V}(key)
end

# Fetch — creating on first use — the calling task's `Dict{K,V}` stored
# under `s.key` in task-local storage.
@inline function task_dict(s::TaskLocalCache{K,V}) where {K,V}
    tls = task_local_storage()
    dict = get!(tls, s.key) do
        Dict{K,V}()
    end
    return dict::Dict{K,V}
end

# `drop!` for a task-local cache: forget only the *current* task's entry,
# making its stored values GC-eligible.
function drop!(cache::TaskLocalCache)
    tls = task_local_storage()
    delete!(tls, cache.key)
    return nothing
end
Loading