-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[mlir][XeGPU] add unroll patterns for load_matrix and store_matrix #154637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Chao Chen (chencha3) Changes: As described by the title. Full diff: https://github.com/llvm/llvm-project/pull/154637.diff 6 Files Affected:
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3a88dae041dd1..ddf6b4ac85a90 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
to a hardware instruction.
}];
let dependentDialects = [
- "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
- ];
+ "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect",
+ "index::IndexDialect"];
}
#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index db8608c6d20b8..a40dc74edb200 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
namespace mlir {
class VectorType;
@@ -18,6 +19,7 @@ class OpResult;
class OpBuilder;
class ValueRange;
class TypeConverter;
+class OpFoldResult;
namespace xegpu {
class LayoutAttr;
@@ -128,6 +130,20 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
/// if no GPU module parent or XeVM target attribute exists.
std::optional<std::string> getChipStr(Operation *op);
+/// Generates element-wise addition ops of two arrays with automatic alignment.
+/// When the input arrays have different sizes, the shorter array is
+/// right-aligned with the longer array, and the unmatched leading elements from
+/// the longer array are preserved unchanged. This is commonly used for offset
+/// computation where higher-dimensional offsets need to be added to
+/// lower-dimensional adjustments.
+///
+/// Example:
+/// lhs = [l1, l2, l3], rhs = [r1, r2]
+/// Result: [l1, l2+r1, l3+r2]
+SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> lhs,
+ ArrayRef<OpFoldResult> rhs);
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index d82c541f31359..b11f5fe87559b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -8,6 +8,7 @@
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp>(op))
+ xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
- xegpu::LoadGatherOp>(op))
+ xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
return getTileShape(op->getOpOperand(1));
@@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- bool hasWgLayoutOperands =
+ bool hasSgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr);
return layout && layout.isWgLayout();
});
- bool hasWgLayoutResults =
+ bool hasSgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
return layout && layout.isWgLayout();
});
- if (hasWgLayoutOperands || hasWgLayoutResults) {
+
+ if (hasSgLayoutOperands || hasSgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index c793b71639e86..219e4e6f44618 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
}
};
+/// Unrolls an xegpu.load_matrix into one load_matrix per tile of the target
+/// shape returned by getTargetShape(), then combines the per-tile results back
+/// into a value of the original vector type via `unpack`.
+struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
+  using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
+                                PatternRewriter &rewriter) const override {
+    // Nothing to do when no target shape is available for this op.
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    Location loc = op.getLoc();
+    VectorType valueTy = op.getType();
+    Type elemTy = valueTy.getElementType();
+    ArrayRef<int64_t> shape = valueTy.getShape();
+
+    // The layout attribute may be absent, so use the null-tolerant cast and
+    // only call dropInstData() when a layout is actually present. A null
+    // layout is forwarded unchanged to the new ops.
+    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr());
+    xegpu::LayoutAttr tileLayout = layout ? layout.dropInstData() : layout;
+
+    // Each unrolled load produces a vector of the target (tile) shape.
+    VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+
+    // Compute the offsets of every tile: the op's own offsets plus the static
+    // offset of the tile inside the original vector shape.
+    SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+    SmallVector<SmallVector<OpFoldResult>> offsetsList;
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(shape, *targetShape)) {
+      auto adds = xegpu::addWithRightAligned(
+          rewriter, loc, mixedOffsets,
+          getAsIndexOpFoldResult(op.getContext(), offsets));
+      offsetsList.push_back(adds);
+    }
+
+    // Emit one load per tile; iterate by reference to avoid copying offsets.
+    SmallVector<Value> newOps;
+    for (const SmallVector<OpFoldResult> &offsets : offsetsList) {
+      auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
+          op.getLoc(), newValueTy, op.getMemDesc(), offsets, tileLayout);
+      newOps.push_back(newOp);
+    }
+    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+    rewriter.replaceOp(op, castOp);
+    return success();
+  }
+};
+
+/// Unrolls an xegpu.store_matrix into one store_matrix per tile of the target
+/// shape returned by getTargetShape(). The stored vector is split into tiles
+/// via `pack`, and each tile is stored at the op's offsets plus the tile's
+/// static offset.
+struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
+  using UnrollPattern<xegpu::StoreMatrixOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op,
+                                PatternRewriter &rewriter) const override {
+    // Nothing to do when no target shape is available for this op.
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    Location loc = op.getLoc();
+    VectorType valueTy = op.getData().getType();
+    ArrayRef<int64_t> shape = valueTy.getShape();
+
+    // The layout attribute may be absent, so use the null-tolerant cast and
+    // only call dropInstData() when a layout is actually present. A null
+    // layout is forwarded unchanged to the new ops.
+    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr());
+    xegpu::LayoutAttr tileLayout = layout ? layout.dropInstData() : layout;
+
+    // Split the stored value into tiles of the target shape.
+    SmallVector<Type> convertedValTypes =
+        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<Value> convertedValues =
+        pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter);
+
+    // Compute the offsets of every tile: the op's own offsets plus the static
+    // offset of the tile inside the original vector shape.
+    SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+    SmallVector<SmallVector<OpFoldResult>> offsetsList;
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(shape, *targetShape)) {
+      auto adds = xegpu::addWithRightAligned(
+          rewriter, loc, mixedOffsets,
+          getAsIndexOpFoldResult(op.getContext(), offsets));
+      offsetsList.push_back(adds);
+    }
+
+    // zip_equal asserts that there is exactly one offset list per tile.
+    for (const auto &[v, offsets] :
+         llvm::zip_equal(convertedValues, offsetsList))
+      rewriter.create<xegpu::StoreMatrixOp>(loc, v, op.getMemDesc(), offsets,
+                                            tileLayout);
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
} // namespace
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
- patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
- UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
- UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
- options);
+ patterns
+ .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
+ UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
+ UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp>(
+ patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 19eedbac0f76b..088e8a8c497d9 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -133,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
+ // for LoadMatrixOp, the layout is attached to the property of the op
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ // for StoreMatrixOp, the layout is attached to the property of the op
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -152,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
+
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -179,6 +195,8 @@ xegpu::setLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand,
void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
+ if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
+ return;
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setLayoutAttr(opr, layout);
@@ -424,3 +442,31 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {
return std::nullopt;
}
+
+/// Generates element-wise addition ops of two arrays with automatic alignment.
+/// When the input arrays have different sizes, the shorter array is
+/// right-aligned with the longer array, and the unmatched leading elements from
+/// the longer array are preserved unchanged. This is commonly used for offset
+/// computation where higher-dimensional offsets need to be added to
+/// lower-dimensional adjustments.
+///
+/// Example:
+///   lhs = [l1, l2, l3], rhs = [r1, r2]
+///   Result: [l1, l2+r1, l3+r2]
+SmallVector<OpFoldResult>
+xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
+                           ArrayRef<OpFoldResult> lhs,
+                           ArrayRef<OpFoldResult> rhs) {
+  // Let `a` be the longer (or equally long) array and `b` the shorter one.
+  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
+  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
+  // Leading elements of `a` without a counterpart in `b` pass through as-is.
+  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
+  // Keep only the trailing part of `a`, which now has the same length as `b`.
+  a = a.slice(a.size() - b.size());
+  for (auto [l, r] : llvm::zip_equal(a, b)) {
+    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
+    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
+    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
+  }
+  return results;
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d986e5bd1cfb4..9d63c2ddd4895 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -561,3 +561,26 @@ gpu.module @test_kernel {
gpu.return %e : vector<8x32x2xf16>
}
}
+
+// -----
+gpu.module @test_kernel {
+ //CHECK-LABEL: unroll_load_matrix
+ gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
+ //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
+ //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ gpu.return %1: vector<32x32xf32>
+ }
+}
+
+// -----
+gpu.module @test_kernel {
+ // CHECK-LABEL: unroll_store_matrix
+ gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
+ %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
+ // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+ xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ gpu.return
+ }
+}
|
/// lhs = [l1, l2, l3], rhs = [r1, r2] | ||
/// Result: [l1, l2+r1, l3+r2] | ||
SmallVector<OpFoldResult> | ||
xegpu::addWithRightAligned(OpBuilder &builder, Location loc, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a test validating this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, this util has been merged in PR #154819, but I added a unit test for it in this PR.
As described by the title.