Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CUDACore/src/CUDACore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ include("pointer.jl")

# core utilities
include("utils/call.jl")
include("utils/reclaim.jl")
include("utils/cache.jl")
include("utils/struct_size.jl")
include("../lib/cudadrv/CUDAdrv.jl")
Expand Down
178 changes: 104 additions & 74 deletions CUDACore/src/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -456,80 +456,107 @@ function Base.showerror(io::IO, err::OutOfGPUMemoryError)
end
end

const reclaim_hooks = Any[]
## reclaim escalation
#
# `Reclaimable`/`register_reclaimable!`/`TaskLocalCache`/`drop!`/`purge!`
# are defined in utils/reclaim.jl. Here we add the ladder that drives
# them along with the allocator-specific sync/trim steps.

"""
    ReclaimLevel

Escalation levels shared by `reclaim(level)` and `retry_reclaim`, ordered
from cheapest to most aggressive:

| Level | Action |
| :--- | :--- |
| `RECLAIM_PURGE_IDLE` | empty `HandleCache`s (fast path, no sync/GC) |
| `RECLAIM_SYNC` | synchronize the current task's stream |
| `RECLAIM_DEVICE_SYNC` | synchronize the whole device |
| `RECLAIM_GC_MINOR` | minor Julia GC (and device sync) |
| `RECLAIM_GC_FULL` | full Julia GC (and device sync) |
| `RECLAIM_POOL_TRIM` | trim unused memory from the pool |
| `RECLAIM_PURGE_CACHES` | empty `HandleCache`s again (catches GC-populated entries)|
| `RECLAIM_DROP_STATE` | also drop task-local library state, then GC+purge+trim |

`RECLAIM_PURGE_IDLE` and `RECLAIM_PURGE_CACHES` run the same action
(`purge!` on every `Reclaimable`). The first is an opportunistic fast
path before any sync/GC cost: if a library is sitting on cached idle
handles, we can release them immediately. The second runs after GC has
had a chance to populate caches via finalizers, catching those new
entries without escalating to `RECLAIM_DROP_STATE` (which would also
drop the current task's live state).

Steps that don't apply to the current allocator (e.g. stream sync on a
non-stream-ordered device) are silently skipped.
"""
@enum ReclaimLevel::Int begin
    RECLAIM_PURGE_IDLE = 0    # cheapest: release idle cached handles, no sync/GC
    RECLAIM_SYNC = 1          # synchronize the current task's stream
    RECLAIM_DEVICE_SYNC = 2   # synchronize the whole device
    RECLAIM_GC_MINOR = 3      # minor Julia GC (plus device sync)
    RECLAIM_GC_FULL = 4       # full Julia GC (plus device sync)
    RECLAIM_POOL_TRIM = 5     # trim unused memory from the pool
    RECLAIM_PURGE_CACHES = 6  # purge again, catching entries GC finalizers added
    RECLAIM_DROP_STATE = 7    # most aggressive: also drop task-local library state
end


"""
    retry_reclaim(retry_if) do
        # code that may fail due to insufficient GPU memory
    end

Run a block of code repeatedly while `retry_if(ret)` holds for its return
value, escalating one `ReclaimLevel` between attempts. Returns the final
(or most recent) return value of the block.

This is intended for CUDA APIs that allocate outside the pool and report
failure via a status code. It's like `Base.retry`, but works on return
values instead of exceptions for performance reasons.
"""
@inline function retry_reclaim(f, retry_if)
    # fast path: inline only the first attempt, and defer the reclaim
    # escalation to a @noinline slow path so callers stay small.
    ret = f()
    retry_if(ret) || return ret
    return retry_reclaim_slow(f, retry_if, ret)
end

# Slow path of `retry_reclaim`: climb the escalation ladder one level at a
# time, re-running the block after every step, until `retry_if` stops
# requesting a retry (or the ladder is exhausted).
@noinline function retry_reclaim_slow(f, retry_if, ret)
    st = active_state()
    ordered = stream_ordered(st.device)
    for lvl in instances(ReclaimLevel)
        reclaim_step(lvl, st, ordered)
        ret = f()
        if !retry_if(ret)
            break
        end
    end
    return ret
end
# Single step of the reclaim ladder; no-op on allocator modes where it
# doesn't apply. Split out so `retry_reclaim` can escalate one rung at a
# time while `reclaim()` can run the whole ladder cumulatively.
#
# `sync` indicates a stream-ordered allocator; synchronization and pool
# trimming are skipped otherwise (see the `ReclaimLevel` docstring).
function reclaim_step(level::ReclaimLevel, state, sync::Bool)
    if level == RECLAIM_PURGE_IDLE || level == RECLAIM_PURGE_CACHES
        # both purge levels run the same action; the second catches cache
        # entries that GC finalizers added in the meantime.
        foreach_reclaimable(purge!)
    elseif level == RECLAIM_SYNC
        sync && synchronize(state.stream)
    elseif level == RECLAIM_DEVICE_SYNC
        sync && device_synchronize()
    elseif level == RECLAIM_GC_MINOR
        GC.gc(false)
        sync && device_synchronize()
    elseif level == RECLAIM_GC_FULL
        GC.gc(true)
        sync && device_synchronize()
    elseif level == RECLAIM_POOL_TRIM
        # in case we had a release threshold configured
        sync && trim(pool_create(state.device))
    elseif level == RECLAIM_DROP_STATE
        # drop live task-local state, run a full GC so handle wrappers
        # finalize and return their raw handles to caches, then destroy
        # those caches and trim the pool.
        foreach_reclaimable(drop!)
        GC.gc(true)
        foreach_reclaimable(purge!)
        sync && trim(pool_create(state.device))
    end
    return
end


Expand Down Expand Up @@ -789,24 +816,24 @@ end
end

"""
    reclaim([level::ReclaimLevel = RECLAIM_DROP_STATE])

Free GPU memory by walking the reclaim ladder up to `level`. Use this before
calling into functionality that does not use the CUDA memory pool. Returns
`nothing`.

The default drops task-local library state, runs a full GC so handle
wrappers finalize and return their raw handles to caches, then destroys
those caches and trims the pool.
"""
function reclaim(level::ReclaimLevel = RECLAIM_DROP_STATE)
    state = active_state()
    sync = stream_ordered(state.device)
    # run the ladder cumulatively: every level up to and including `level`.
    for l in instances(ReclaimLevel)
        l <= level || break
        reclaim_step(l, state, sync)
    end
    return
end


Expand Down Expand Up @@ -909,7 +936,10 @@ macro timed(ex)
gpu_bytes=gpu_mem_stats.alloc_bytes, gpu_memtime=gpu_mem_stats.total_time, gpu_memstats=gpu_mem_stats)
end
end
# NOTE(review): the diff residue kept the pre-change `@public` line alongside
# the new multi-line statement; only the new statement should remain.
@public @allocated, @time, @timed, used_memory, cached_memory, pool_status, reclaim,
        ReclaimLevel, RECLAIM_PURGE_IDLE, RECLAIM_SYNC, RECLAIM_DEVICE_SYNC,
        RECLAIM_GC_MINOR, RECLAIM_GC_FULL, RECLAIM_POOL_TRIM, RECLAIM_PURGE_CACHES,
        RECLAIM_DROP_STATE

"""
used_memory()
Expand Down
15 changes: 7 additions & 8 deletions CUDACore/src/utils/cache.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export HandleCache

struct HandleCache{K,V}
struct HandleCache{K,V} <: Reclaimable
ctor
dtor

Expand All @@ -15,16 +15,15 @@ struct HandleCache{K,V}
max_entries::Int

function HandleCache{K,V}(ctor, dtor; max_entries::Int=32) where {K,V}
obj = new{K,V}(ctor, dtor, Set{Pair{K,V}}(), Dict{K,Vector{V}}(),
Base.ThreadSynchronizer(), max_entries)

# register a hook to wipe the current context's cache when under memory pressure
push!(reclaim_hooks, ()->empty!(obj))

return obj
return new{K,V}(ctor, dtor, Set{Pair{K,V}}(), Dict{K,Vector{V}}(),
Base.ThreadSynchronizer(), max_entries)
end
end

# Destroying idle handles is the `purge!` step of reclaim; individual caches
# must be registered in the owning library's __init__ (see reclaim.jl).
function purge!(cache::HandleCache)
    return empty!(cache)
end

# remove a handle from the cache, or create a new one
function Base.pop!(cache::HandleCache{K,V}, key::K) where {K,V}
# check the cache
Expand Down
119 changes: 119 additions & 0 deletions CUDACore/src/utils/reclaim.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# reclaim registry
#
# Libraries (cuBLAS, cuDNN, …) hold GPU resources that the memory subsystem
# needs to release under pressure. Two kinds of resources exist:
#
# - live state referenced by a running task (typically via
# `task_local_storage`), e.g. fat handles wrapping library handles plus
# workspace buffers. Released via `drop!` — the library clears its TLS
# references so the wrappers become GC-eligible; their object-bound
# finalizers then return the raw handles to a `HandleCache`;
#
# - idle resources cached for reuse, e.g. the `HandleCache` of previously-
# returned library handles. Released via `purge!` — the cache is
# emptied and each entry's destructor runs.
#
# `Reclaimable` unifies both. Instances are registered via
# `register_reclaimable!` — typically from a library's `__init__`, since
# mutations to this registry performed during the precompilation of a
# downstream package don't carry over to module load (see
# `register_reclaimable!` for details).

# Root type for resources the memory subsystem can shrink under pressure;
# subtypes opt in by overloading `drop!` and/or `purge!` below.
abstract type Reclaimable end

"""
    CUDACore.drop!(r::Reclaimable)

Release references to live state so the associated GC-managed wrappers
become collectible (and their finalizers can run). Default: do nothing.
"""
function drop!(::Reclaimable)
    return nothing
end

"""
    CUDACore.purge!(r::Reclaimable)

Destroy idle cached resources owned by `r`. Default: do nothing.
"""
function purge!(::Reclaimable)
    return nothing
end

# Global registry of registered `Reclaimable`s; `reclaimables_lock` guards
# all reads and writes (see `register_reclaimable!`/`foreach_reclaimable`).
const reclaimables = Reclaimable[]
const reclaimables_lock = ReentrantLock()

"""
    CUDACore.register_reclaimable!(r::Reclaimable)

Add `r` to the registry consulted by `reclaim`, which invokes its `drop!`
and `purge!` methods. Registering the same instance twice is a no-op.
Returns `r`.

Call this from a package's `__init__`: top-level constructor calls are
captured by Julia's precompilation cache, but mutations they perform on
dependencies (like pushing into CUDACore's registry) do not persist to
module load time.
"""
function register_reclaimable!(r::Reclaimable)
    lock(reclaimables_lock) do
        if !(r in reclaimables)
            push!(reclaimables, r)
        end
    end
    return r
end

# Apply `f` to every registered `Reclaimable`, with per-entry error
# isolation modelled after Base's `atexit_hooks`: one failing hook must not
# keep the remaining ones from running.
function foreach_reclaimable(f)
    lock(reclaimables_lock) do
        for entry in reclaimables
            try
                f(entry)
            catch ex
                @error "reclaim callback failed" type=typeof(entry) exception=(ex, catch_backtrace())
            end
        end
    end
end


## task-local state helper

"""
    TaskLocalCache{K,V}(key::Symbol)

Declarative marker for a library's per-task cache stored under `key` in
`task_local_storage()`. Must be `register_reclaimable!`'d from the owning
package's `__init__` so that `RECLAIM_DROP_STATE` clears the current
task's entry, letting the stored values (typically mutable handle wrappers)
be garbage-collected. Their finalizers then return the underlying
resources to a `HandleCache`.

Usage:

    const state_cache = CUDACore.TaskLocalCache{CuContext, LibraryState}(:CUBLAS)

    function __init__()
        ...
        CUDACore.register_reclaimable!(state_cache)
    end

    function handle()
        states = CUDACore.task_dict(state_cache)
        state = get!(() -> new_state(...), states, key)
        ...
    end

Only the current task's storage is touched: Julia's `IdDict`-backed TLS
isn't safe for concurrent mutation across threads, so cross-task drops are
intentionally deferred to task GC (values become unreachable when the
task is collected).
"""
struct TaskLocalCache{K,V} <: Reclaimable
    key::Symbol  # slot in `task_local_storage()` holding this cache's Dict{K,V}
    TaskLocalCache{K,V}(key::Symbol) where {K,V} = new{K,V}(key)
end

# Fetch — creating on first use — the calling task's `Dict{K,V}` stored
# under `s.key` in task-local storage.
@inline function task_dict(s::TaskLocalCache{K,V}) where {K,V}
    tls = task_local_storage()
    dict = get!(tls, s.key) do
        Dict{K,V}()
    end
    return dict::Dict{K,V}
end

# `drop!` for a task-local cache: forget only the *current* task's entry,
# making its stored values GC-eligible.
function drop!(cache::TaskLocalCache)
    tls = task_local_storage()
    delete!(tls, cache.key)
    return nothing
end
Loading