JuliaGPU · maleadt · Mar 22, 2026 · Mar 22, 2026
diff --git a/examples/matmul.jl b/examples/matmul.jl
@@ -26,8 +26,8 @@ function matmul!(output, a, b)
     return
 end
 
-a = rand!(allocate(backend, Float32, 256, 123))
-b = rand!(allocate(backend, Float32, 123, 45))
+a = copyto!(allocate(backend, Float32, 256, 123), rand(Float32, 256, 123))
+b = copyto!(allocate(backend, Float32, 123, 45), rand(Float32, 123, 45))
 output = KernelAbstractions.zeros(backend, Float32, 256, 45)
 
 matmul!(output, a, b)

diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl
@@ -24,7 +24,7 @@ end
 res = 1024
 
 # creating initial arrays
-b = rand!(allocate(backend, Float32, res, res))
+b = copyto!(allocate(backend, Float32, res, res), rand(Float32, res, res))
 a = KernelAbstractions.zeros(backend, Float32, res, res)
 
 naive_transpose!(a, b)

diff --git a/examples/numa_aware.jl b/examples/numa_aware.jl
@@ -27,11 +27,11 @@ function measure_membw(
 
     a = dtype(3.1415)
     if init == :serial
-        X = rand!(zeros(dtype, N))
-        Y = rand!(zeros(dtype, N))
+        X = rand(dtype, N)
+        Y = rand(dtype, N)
     else
-        X = rand!(KernelAbstractions.zeros(backend, dtype, N))
-        Y = rand!(KernelAbstractions.zeros(backend, dtype, N))
+        X = copyto!(KernelAbstractions.zeros(backend, dtype, N), rand(dtype, N))
+        Y = copyto!(KernelAbstractions.zeros(backend, dtype, N), rand(dtype, N))
     end
     workgroup_size = 1024
 

diff --git a/examples/performance.jl b/examples/performance.jl
@@ -145,7 +145,7 @@ for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM
             ("transpose", simple_transpose_kernel!(backend, block_dims)),
         )
         NVTX.@range "Simple $name $block_dims" let
-            input = rand!(allocate(backend, T, N, N))
+            input = copyto!(allocate(backend, T, N, N), rand(T, N, N))
             output = similar(input)
 
             # compile kernel
@@ -165,7 +165,7 @@ for (name, kernel) in (
     )
     for bank in (true, false)
         NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
-            input = rand!(allocate(backend, T, N, N))
+            input = copyto!(allocate(backend, T, N, N), rand(T, N, N))
             output = similar(input)
 
             # compile kernel
@@ -185,7 +185,7 @@ for (name, kernel) in (
     )
     for bank in (true, false)
         NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
-            input = rand!(allocate(backend, T, N, N))
+            input = copyto!(allocate(backend, T, N, N), rand(T, N, N))
             output = similar(input)
 
             # We want a number of blocks equivalent to (TILE_DIM, TILE_DIM)

diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
@@ -79,8 +79,8 @@ end
 N = 1024
 R = 512
 M = 2048
-A = rand!(allocate(backend, Float32, N, R))
-B = rand!(allocate(backend, Float32, R, M))
+A = copyto!(allocate(backend, Float32, N, R), rand(Float32, N, R))
+B = copyto!(allocate(backend, Float32, R, M), rand(Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
 workgroupsize = (TILE_DIM, TILE_DIM)