Skip to content

Commit e522164

Browse files
authored
Rework WMMA (#892)
- Move to `WMMA` module, remove `WMMA_` prefixes. - Add `RowMajor`, `ColMajor` fragment loading options. - Allow loading `C` fragment from FP16 / BFP16 types.
1 parent aa88ffc commit e522164

3 files changed

Lines changed: 306 additions & 92 deletions

File tree

docs/src/api/kernel_programming.md

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -75,54 +75,88 @@ Currently only RDNA 3 is supported and the following types:
7575
- `FP16 ⋅ FP16 + FP32 -> FP32`;
7676
- `BFP16 ⋅ BFP16 + FP32 -> FP32`.
7777

78+
All WMMA functionality is in the `AMDGPU.Device.WMMA` submodule.
79+
The tile dimensions are fixed at 16×16×16 (`WMMA.M`, `WMMA.N`, `WMMA.K`).
80+
81+
### Layout types
82+
83+
Two layout types control how matrices are read from and written to memory:
84+
85+
- `WMMA.ColMajor` — column-major (Julia/Fortran) order: element `(row, col)` is at `ptr[col * stride + row]`.
86+
- `WMMA.RowMajor` — row-major (C) order: element `(row, col)` is at `ptr[row * stride + col]`.
87+
88+
### API
89+
7890
```@docs
79-
AMDGPU.Device.wmma_load_a
80-
AMDGPU.Device.wmma_fill_c
81-
AMDGPU.Device.wmma_store_d
82-
AMDGPU.Device.wmma_mma
91+
AMDGPU.Device.WMMA.Fragment
92+
AMDGPU.Device.WMMA.fill_c
93+
AMDGPU.Device.WMMA.load_a
94+
AMDGPU.Device.WMMA.load_b
95+
AMDGPU.Device.WMMA.load_c
96+
AMDGPU.Device.WMMA.store_d
97+
AMDGPU.Device.WMMA.mma
8398
```
8499

85-
Below is a simple example of matrix multiplication kernel using WMMA.
100+
`load_c` and `store_d` accept pointer types `Float32`, `Float16`, and `BFloat16`.
101+
When `T` is `Float16` or `BFloat16`, values are widened to `Float32` on load and
102+
narrowed back on store, so the `FragmentC_F32` accumulator type is always `Float32`
103+
regardless of the backing buffer type.
104+
105+
### Example
106+
107+
Below is a matrix multiplication kernel using WMMA with column-major inputs.
108+
Pass `WMMA.RowMajor` instead to load from row-major (C-style) buffers.
86109

87110
```@example wmma-matmul
88111
using AMDGPU
89-
using AMDGPU.Device: WMMA_M, WMMA_N, WMMA_K, wmma_fill_c, wmma_load_a, wmma_load_b, wmma_store_d, wmma_mma
112+
using AMDGPU.Device: WMMA
90113
91-
function wmma_kernel_ptr!(C, A::AbstractArray{T}, B, M::Int32, N::Int32, K::Int32) where T
92-
tile_row = (workgroupIdx().x - Int32(1)) * Int32(WMMA_M)
93-
tile_col = (workgroupIdx().y - Int32(1)) * Int32(WMMA_N)
114+
# 16×16×16 tiled matmul: each workgroup computes one WMMA tile of C.
# `layout` (WMMA.ColMajor / WMMA.RowMajor) selects how the A and B tiles are
# read; the result tile is always written back in column-major order.
function wmma_kernel!(C, A::AbstractArray{T}, B, M::Int32, N::Int32, K::Int32, layout) where T
    # Upper-left corner of this workgroup's output tile (0-based offsets).
    row0 = (workgroupIdx().x - Int32(1)) * Int32(WMMA.M)
    col0 = (workgroupIdx().y - Int32(1)) * Int32(WMMA.N)

    base_c = pointer(C)
    base_a = pointer(A)
    base_b = pointer(B)

    # Zero-initialised accumulator fragment.
    acc = WMMA.fill_c(Float32, 0f0)

    kk = Int32(0)
    while kk < K
        # Tile pointers and leading dimensions for the current K-slice,
        # chosen by dispatch on `layout`.
        pa, lda = _a_tile(base_a, layout, row0, kk, M, K, T)
        pb, ldb = _b_tile(base_b, layout, col0, kk, N, K, T)

        frag_a = WMMA.load_a(pa, lda, layout)
        frag_b = WMMA.load_b(pb, ldb, layout)
        acc = WMMA.mma(frag_a, frag_b, acc)

        kk += Int32(WMMA.K)
    end

    # C is column-major with stride M: element (row0, col0) is at col0 * M + row0.
    pc = base_c + (col0 * M + row0) * Int32(sizeof(Float32))
    WMMA.store_d(pc, acc, M, WMMA.ColMajor)
    return
end
116139
140+
# Tile pointer + stride helpers — dispatched on layout, DCE'd by the compiler.
141+
# Pointer to the (tile_row, k) tile of A (M×K) plus its leading dimension.
# Column-major: element (r, c) lives at c * M + r, so the stride is M.
function _a_tile(ptr, ::Type{WMMA.ColMajor}, tile_row, k, M, K, ::Type{T}) where T
    return ptr + (k * M + tile_row) * Int32(sizeof(T)), M
end

# Row-major: element (r, c) lives at r * K + c, so the stride is K.
function _a_tile(ptr, ::Type{WMMA.RowMajor}, tile_row, k, M, K, ::Type{T}) where T
    return ptr + (tile_row * K + k) * Int32(sizeof(T)), K
end
145+
146+
# Pointer to the (k, tile_col) tile of B (K×N) plus its leading dimension.
# Column-major: element (r, c) lives at c * K + r, so the stride is K.
function _b_tile(ptr, ::Type{WMMA.ColMajor}, tile_col, k, N, K, ::Type{T}) where T
    return ptr + (tile_col * K + k) * Int32(sizeof(T)), K
end

# Row-major: element (r, c) lives at r * N + c, so the stride is N.
function _b_tile(ptr, ::Type{WMMA.RowMajor}, tile_col, k, N, K, ::Type{T}) where T
    return ptr + (k * N + tile_col) * Int32(sizeof(T)), N
end
150+
117151
# Host-side driver: multiply two 32×32 half-precision matrices on the GPU
# and compare against a Float32 reference product computed with `*`.
M, N, K = 32, 32, 32
hA = Float16.(rand(M, K))
hB = Float16.(rand(K, N))
A, B = ROCArray(hA), ROCArray(hB)
C = ROCArray(zeros(Float32, M, N))

# One workgroup per 16×16 output tile; groupsize of 32 — presumably one
# wave32 wavefront per tile on RDNA 3 (TODO confirm against AMDGPU docs).
tiles_m, tiles_n = M ÷ WMMA.M, N ÷ WMMA.N
@roc gridsize=(tiles_m, tiles_n) groupsize=32 wmma_kernel!(
    C, A, B, Int32(M), Int32(N), Int32(K), WMMA.ColMajor)

@assert maximum(abs.(Float32.(C) .- (Float32.(A) * Float32.(B)))) < 0.1
128162
```

0 commit comments

Comments
 (0)