Skip to content

Commit 115711c

Browse files
tavakkoliamirmohammadftynse
authored and committed
[mlir][LinAlg][Transform][GPU] Add GPU memory hierarchy to the transform.promote op
In this patch we are adding support for copying a `memref.subview` to shared or private memory in GPU. The global-to-shared memory copy is adapted from code implemented in IREE (https://github.com/iree-org/iree), but the private memory copy part has not been implemented in IREE. This patch enables transferring a subview from `global->shared`, `global->private`, and `shared->private`. Our final aim is to provide a copy layout as an affine map to the `transform.promote` op to support transposed memory copy. This map is a permutation of the original affine index map. Although this has been implemented and users can copy data to an arbitrary layout, this attempt is not included in this patch since we still have a problem with `linalg.generic` operations changing their index map to the transformed index map. You can find more in the following links ([[ https://github.com/tavakkoliamirmohammad/iree-llvm-fork/commit/4fd5f93355951ad0fb338858393ff409bd9c62f8 | Initial attempt to support layout map in promote op in transform dialect ]]) ([[ https://github.com/tavakkoliamirmohammad/iree-llvm-fork/commit/9062b5849f91d4defb84996392b71087dadf7a8c | Fix data transpose in shared memory ]]) Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D144666
1 parent 5f53e85 commit 115711c

7 files changed

Lines changed: 254 additions & 0 deletions

File tree

mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,23 @@ def GPUBlockMappingAttr : GPU_Attr<"GPUBlockMapping", "block", [
8585
}];
8686
}
8787

88+
89+
def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
90+
DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
91+
let parameters = (ins
92+
EnumParameter<GPU_AddressSpaceEnum>:$address_space
93+
);
94+
let assemblyFormat = "`<` params `>`";
95+
let description = [{
96+
An attribute that allows defining memory hierarchy for GPU devices.
97+
98+
GPU memory has three memory spaces: global, workgroup, and private. The global memory
99+
is visible to all workitems and workgroups, the workgroup memory is only available for workitems
100+
within a workgroup, and private memory is only visible to a single workitem. This attribute indicates
101+
that using the memory hierarchy is desired. It can be consumed by lowering to
102+
move data to a specific address space in GPU code.
103+
}];
104+
}
105+
106+
88107
#endif // GPU_DEVICE_MAPPING_ATTR

mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,7 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
765765
// PromoteOp
766766
//===----------------------------------------------------------------------===//
767767

768+
768769
def PromoteOp : Op<Transform_Dialect, "structured.promote",
769770
[FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
770771
TransformOpInterface, TransformEachOpTrait]> {
@@ -791,6 +792,7 @@ def PromoteOp : Op<Transform_Dialect, "structured.promote",
791792
DefaultValuedAttr<BoolArrayAttr, "{}">:$use_full_tile_buffers,
792793
UnitAttr:$use_full_tiles_by_default,
793794
UnitAttr:$use_alloca,
795+
OptionalAttr<DeviceMappingArrayAttr>:$mapping,
794796
OptionalAttr<I64Attr>:$alignment);
795797
let results = (outs PDL_Operation:$transformed);
796798

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,32 @@ promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
393393
FailureOr<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
394394
const LinalgPromotionOptions &options);
395395

396+
/// Allocate the subview in the GPU workgroup memory.
397+
Optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
398+
memref::SubViewOp subview,
399+
ArrayRef<Value> sizeBounds,
400+
DataLayout &);
401+
402+
/// In case of GPU group memory there is no need to deallocate.
403+
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value /*buffer*/);
404+
405+
/// Create Memref copy operations and add gpu barrier guards before and after
406+
/// the copy operation to ensure data integrity.
407+
LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
408+
409+
/// Allocate the subview in the GPU private memory.
410+
Optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
411+
memref::SubViewOp subview,
412+
ArrayRef<Value> sizeBounds,
413+
DataLayout &);
414+
415+
/// Normal copy to between src and dst.
416+
LogicalResult copyToGPUPrivateMemory(OpBuilder &b, Value src, Value dst);
417+
418+
/// In case of GPU private memory there is no need to deallocate since the
419+
/// memory is freed when going outside of the scope.
420+
LogicalResult deallocateGPUPrivateMemory(OpBuilder &, Value /*buffer*/);
421+
396422
/// Emit a suitable vector form for a Linalg op. If provided, `inputVectorSizes`
397423
/// are used to vectorize this operation. `inputVectorSizes` must match the rank
398424
/// of the iteration space of the operation and the sizes must be smaller or

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ int64_t GPUThreadMappingAttr::getMappingId() const {
5050
return static_cast<int64_t>(getThread());
5151
}
5252

53+
int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
54+
return static_cast<int64_t>(getAddressSpace());
55+
}
56+
5357
//===----------------------------------------------------------------------===//
5458
// MMAMatrixType
5559
//===----------------------------------------------------------------------===//

mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1802,6 +1802,35 @@ transform::PromoteOp::applyToOne(LinalgOp target,
18021802
if (getAlignment().has_value())
18031803
promotionOptions = promotionOptions.setAlignment(*getAlignment());
18041804

1805+
if (getMapping().has_value()) {
1806+
// The mapping should contain at most one element.
1807+
auto mapping = *getMapping();
1808+
if (mapping.size() > 1)
1809+
return emitDefaultDefiniteFailure(target);
1810+
1811+
auto addressSpace = mapping[0].cast<gpu::GPUMemorySpaceMappingAttr>();
1812+
1813+
if (addressSpace.getAddressSpace() ==
1814+
gpu::GPUDialect::getWorkgroupAddressSpace()) {
1815+
promotionOptions =
1816+
promotionOptions
1817+
.setAllocationDeallocationFns(allocateWorkgroupMemory,
1818+
deallocateWorkgroupMemory)
1819+
.setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory)
1820+
.setUseFullTileBuffers({false, false});
1821+
} else if (addressSpace.getAddressSpace() ==
1822+
gpu::GPUDialect::getPrivateAddressSpace()) {
1823+
promotionOptions =
1824+
promotionOptions
1825+
.setAllocationDeallocationFns(allocateGPUPrivateMemory,
1826+
deallocateGPUPrivateMemory)
1827+
.setCopyInOutFns(copyToGPUPrivateMemory, copyToGPUPrivateMemory)
1828+
.setUseFullTileBuffers({false, false});
1829+
} else {
1830+
return emitDefaultDefiniteFailure(target);
1831+
}
1832+
}
1833+
18051834
if (failed(promoteSubviewsPrecondition(target, promotionOptions)))
18061835
return emitDefaultDefiniteFailure(target);
18071836

mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include "mlir/Dialect/Arith/IR/Arith.h"
1414
#include "mlir/Dialect/Arith/Utils/Utils.h"
1515
#include "mlir/Dialect/Complex/IR/Complex.h"
16+
#include "mlir/Dialect/Func/IR/FuncOps.h"
17+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1618
#include "mlir/Dialect/Linalg/IR/Linalg.h"
1719
#include "mlir/Dialect/Linalg/Passes.h"
1820
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -397,3 +399,87 @@ mlir::linalg::promoteSubViews(OpBuilder &builder, LinalgOp linalgOp,
397399
return failure();
398400
return res;
399401
}
402+
403+
/// Allocate the given subview to a memory address space in GPU by creating an
404+
/// allocation operation and setting the memref type address space to desired
405+
/// address space.
406+
static Optional<Value> allocateSubviewGPUMemoryInAddressSpace(
407+
OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
408+
gpu::AddressSpace addressSpace) {
409+
OpBuilder::InsertionGuard guard(builder);
410+
411+
func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
412+
if (!funcOp)
413+
return std::nullopt;
414+
415+
// The subview size bounds are expected to be constant; they specify the shape
416+
// of the allocation.
417+
SmallVector<int64_t> shape;
418+
for (Value bound : sizeBounds) {
419+
APInt value;
420+
if (!matchPattern(bound, m_ConstantInt(&value)))
421+
return std::nullopt;
422+
shape.push_back(value.getSExtValue());
423+
}
424+
425+
builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
426+
auto type = MemRefType::get(
427+
shape, subview.getType().getElementType(), MemRefLayoutAttrInterface{},
428+
gpu::AddressSpaceAttr::get(builder.getContext(), addressSpace));
429+
Value buffer;
430+
if (addressSpace == gpu::GPUDialect::getWorkgroupAddressSpace()) {
431+
buffer = builder.create<memref::AllocOp>(funcOp.getLoc(), type);
432+
} else if (addressSpace == gpu::GPUDialect::getPrivateAddressSpace()) {
433+
buffer = builder.create<memref::AllocaOp>(funcOp.getLoc(), type);
434+
} else {
435+
return std::nullopt;
436+
}
437+
return buffer;
438+
}
439+
440+
/// Allocate the subview in the GPU workgroup memory.
441+
Optional<Value> mlir::linalg::allocateWorkgroupMemory(
442+
OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
443+
DataLayout &) {
444+
return allocateSubviewGPUMemoryInAddressSpace(
445+
builder, subview, sizeBounds,
446+
gpu::GPUDialect::getWorkgroupAddressSpace());
447+
}
448+
449+
/// In case of GPU group memory there is no need to deallocate.
450+
LogicalResult mlir::linalg::deallocateWorkgroupMemory(OpBuilder &,
451+
Value /*buffer*/) {
452+
return success();
453+
}
454+
455+
/// Create Memref copy operations and add gpu barrier guards before and after
456+
/// the copy operation to ensure data integrity.
457+
LogicalResult mlir::linalg::copyToWorkgroupMemory(OpBuilder &b, Value src,
458+
Value dst) {
459+
b.create<gpu::BarrierOp>(src.getLoc());
460+
Operation *copyOp = b.create<memref::CopyOp>(src.getLoc(), src, dst);
461+
b.create<gpu::BarrierOp>(copyOp->getLoc());
462+
return success();
463+
}
464+
465+
/// Allocate the subview in the GPU private memory.
466+
Optional<Value> mlir::linalg::allocateGPUPrivateMemory(
467+
OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
468+
DataLayout &) {
469+
return allocateSubviewGPUMemoryInAddressSpace(
470+
builder, subview, sizeBounds, gpu::GPUDialect::getPrivateAddressSpace());
471+
}
472+
473+
/// Normal copy between src and dst.
474+
LogicalResult mlir::linalg::copyToGPUPrivateMemory(OpBuilder &b, Value src,
475+
Value dst) {
476+
Operation *copyOp = b.create<memref::CopyOp>(src.getLoc(), src, dst);
477+
return success();
478+
}
479+
480+
/// In case of GPU private memory there is no need to deallocate since the
481+
/// memory is freed when going outside of the scope.
482+
LogicalResult mlir::linalg::deallocateGPUPrivateMemory(OpBuilder &,
483+
Value /*buffer*/) {
484+
return success();
485+
}

mlir/test/Dialect/Linalg/promote.mlir

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,94 @@ transform.sequence failures(propagate) {
142142
%1 = transform.structured.promote %0
143143
}
144144

145+
// -----
146+
func.func @gemm_shared(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
147+
{
148+
linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
149+
outs(%c: memref<?x?xf32>)
150+
return
151+
}
152+
153+
// CHECK: func @gemm_shared
154+
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
155+
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
156+
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
157+
// CHECK: %[[alloc_A:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
158+
// CHECK: %[[alloc_B:.*]] = memref.alloc() : memref<16x16xf32, #gpu.address_space<workgroup>>
159+
// CHECK-DAG: %[[C16:.*]] = arith.constant 16
160+
// CHECK-DAG: %[[C0:.*]] = arith.constant 0
161+
// CHECK-DAG: %[[C1:.*]] = arith.constant 1
162+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
163+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
164+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
165+
// CHECK: %[[subview_A:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
166+
// CHECK: %[[subview_B:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
167+
// CHECK: %[[subview_C:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
168+
169+
// CHECK: %[[shared_A:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
170+
// CHECK: %[[shared_B:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
171+
172+
// CHECK-NEXT: gpu.barrier
173+
// CHECK-NEXT: memref.copy %[[subview_A]], %[[shared_A]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
174+
// CHECK-NEXT: gpu.barrier
175+
176+
// CHECK-NEXT: gpu.barrier
177+
// CHECK-NEXT: memref.copy %[[subview_B]], %[[shared_B]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<workgroup>>
178+
// CHECK-NEXT: gpu.barrier
179+
180+
// CHECK: linalg.matmul ins(%[[shared_A]], %[[shared_B]]{{.*}} outs(%[[subview_C]]
181+
182+
183+
transform.sequence failures(propagate) {
184+
^bb0(%arg1: !pdl.operation):
185+
%0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
186+
%1, %loops:3 = transform.structured.tile %0 [16, 16, 16] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
187+
%2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<workgroup>] }
188+
}
189+
190+
191+
// -----
192+
193+
func.func @gemm_private(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
194+
{
195+
linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
196+
outs(%c: memref<?x?xf32>)
197+
return
198+
}
199+
200+
// CHECK: func @gemm_private
201+
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
202+
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
203+
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
204+
// CHECK: %[[alloc_A:.*]] = memref.alloca() : memref<16x16xf32, #gpu.address_space<private>>
205+
// CHECK: %[[alloc_B:.*]] = memref.alloca() : memref<16x16xf32, #gpu.address_space<private>>
206+
// CHECK-DAG: %[[C16:.*]] = arith.constant 16
207+
// CHECK-DAG: %[[C0:.*]] = arith.constant 0
208+
// CHECK-DAG: %[[C1:.*]] = arith.constant 1
209+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
210+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
211+
// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
212+
// CHECK: %[[subview_A:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
213+
// CHECK: %[[subview_B:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
214+
// CHECK: %[[subview_C:.*]] = memref.subview {{.*}} : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1], offset: ?>>
215+
216+
// CHECK: %[[private_A:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<private>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
217+
// CHECK: %[[private_B:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x16xf32, #gpu.address_space<private>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
218+
219+
// CHECK-NEXT: memref.copy %[[subview_A]], %[[private_A]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
220+
// CHECK-NEXT: memref.copy %[[subview_B]], %[[private_B]] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[16, 1]>, #gpu.address_space<private>>
221+
222+
// CHECK: linalg.matmul ins(%[[private_A]], %[[private_B]]{{.*}} outs(%[[subview_C]]
223+
224+
225+
transform.sequence failures(propagate) {
226+
^bb0(%arg1: !pdl.operation):
227+
%0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
228+
%1, %loops:3 = transform.structured.tile %0 [16, 16, 16] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
229+
%2 = transform.structured.promote %1 { operands_to_promote = [0, 1], mapping = [#gpu.memory_space<private>] }
230+
}
231+
232+
145233
// -----
146234

147235
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)>

0 commit comments

Comments
 (0)