-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[PowerPC] Add DMF builtins for build and disassemble #153097
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter. |
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-llvm-ir Author: None (RolandF77) ChangesAdd support for PPC Dense Math builtins mma_build_dmr and mma_disassemble_dmr builtins. Full diff: https://github.com/llvm/llvm-project/pull/153097.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index e7d6741eb695e..7f4b12b0f4e15 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -1098,6 +1098,10 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmmr, "vW1024*W1024*", false,
"mma,isa-future-instructions")
UNALIASED_CUSTOM_BUILTIN(mma_dmxor, "vW1024*W1024*", true,
"mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_disassemble_dmr, "vv*W1024*", false,
+ "mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_build_dmr, "vW1024*VVVVVVVV", false,
+ "mma,isa-future-instructions")
// MMA builtins with positive/negative multiply/accumulate.
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV",
diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
index 270e9fc976f23..f3dd349651238 100644
--- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
@@ -1152,10 +1152,15 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
CallOps.push_back(Acc);
}
if (BuiltinID == PPC::BI__builtin_mma_dmmr ||
- BuiltinID == PPC::BI__builtin_mma_dmxor) {
+ BuiltinID == PPC::BI__builtin_mma_dmxor ||
+ BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) {
Address Addr = EmitPointerWithAlignment(E->getArg(1));
Ops[1] = Builder.CreateLoad(Addr);
}
+ if (BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) {
+ dbgs() << "&&& No disassemble!!!\n";
+ return Builder.CreateAlignedStore(Ops[1], Ops[0], MaybeAlign());
+ }
for (unsigned i=1; i<Ops.size(); i++)
CallOps.push_back(Ops[i]);
llvm::Function *F = CGM.getIntrinsic(ID);
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
index 4aafc09602228..c66f5e2a32919 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
@@ -93,18 +93,36 @@ void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsi
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: @test_dmf_basic
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr %res1, align 128
-// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr %res2, align 128
-// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr %p, align 128
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
-// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr %res2, align 128
+// CHECK-LABEL: @test_dmf_basic(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1:%.*]], align 128
+// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2:%.*]], align 128
+// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P:%.*]], align 128
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
+// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128
+// CHECK-NEXT: ret void
+//
void test_dmf_basic(char *p, char *res1, char *res2) {
__dmr1024 x[2];
__builtin_mma_dmsetdmrz(&x[0]);
__builtin_mma_dmmr((__dmr1024*)res1, &x[0]);
__builtin_mma_dmxor((__dmr1024*)res2, (__dmr1024*)p);
}
+
+// CHECK-LABEL: @test_dmf_basic2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V:%.*]], align 16, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]])
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2:%.*]], align 128
+// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1:%.*]], align 128
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1:%.*]], align 128
+// CHECK-NEXT: ret void
+//
+void test_dmf_basic2(char *p1, char *res1, char *res2,
+ vector unsigned char *v) {
+ vector unsigned char vv = *v;
+ __builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv);
+ __builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1);
+}
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
index 5a92d6e982511..ea2b99b0e5b20 100644
--- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
@@ -16,6 +16,8 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
__builtin_mma_dmsetdmrz(&vdmr);
__builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp);
__builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp);
+ __builtin_mma_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc);
+ __builtin_mma_disassemble_dmr(vdmrp, &vdmr);
// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
@@ -26,4 +28,6 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature mma,isa-future-instructions
// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions
// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_build_dmr' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_disassemble_dmr' needs target feature mma,isa-future-instructions
}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 7dd9ff7f08b8b..edecde1f223a7 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1701,6 +1701,16 @@ let TargetPrefix = "ppc" in {
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
llvm_i32_ty], [IntrNoMem]>;
+ def int_ppc_mma_disassemble_dmr :
+ DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_v1024i1_ty],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+ def int_ppc_mma_build_dmr :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
+
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2698bd6f37c59..1580682c6310b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11292,6 +11292,25 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}
+ case Intrinsic::ppc_mma_build_dmr: {
+ SmallVector<SDValue, 8> Pairs;
+ SmallVector<SDValue, 8> Chains;
+ for (int i = 1; i < 9; i += 2) {
+ SDValue Hi = Op.getOperand(i);
+ SDValue Lo = Op.getOperand(i + 1);
+ if (Hi->getOpcode() == ISD::LOAD)
+ Chains.push_back(Hi.getValue(1));
+ if (Lo->getOpcode() == ISD::LOAD)
+ Chains.push_back(Lo.getValue(1));
+ Pairs.push_back(
+ DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo}));
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG);
+ SDValue RetOps[] = {Value, TF};
+ return DAG.getMergeValues(RetOps, dl);
+ }
+
case Intrinsic::ppc_mma_dmxxextfdmr512: {
assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -11628,6 +11647,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(0)),
0);
}
+ case Intrinsic::ppc_mma_disassemble_dmr: {
+ return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2),
+ Op.getOperand(ArgStart + 1), MachinePointerInfo());
+ }
default:
break;
}
@@ -12117,6 +12140,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
return DAG.getMergeValues({DmrPValue, TF}, dl);
}
+SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0],
+ Pairs[1]),
+ 0);
+ SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+ SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1,
+ Pairs[2], Pairs[3]),
+ 0);
+ SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+ SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+ const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+
+ return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops),
+ 0);
+}
+
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 9755f0e272d16..644b0c5b3941e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1361,6 +1361,8 @@ namespace llvm {
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index 1e3014405ac4e..a505ac4c2434a 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -367,6 +367,69 @@ entry:
ret void
}
+define void @tbuild(ptr %p1, ptr %p2, ptr %res1, ptr %res2, ptr %v) {
+; CHECK-LABEL: tbuild:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v3, 0(r7)
+; CHECK-NEXT: vmr v2, v3
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tbuild:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v3, 0(r7)
+; CHECK-BE-NEXT: vmr v2, v3
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <16 x i8>, ptr %v, align 16
+ %1 = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0)
+ store <1024 x i1> %1, ptr %res2, align 128
+ %2 = load <1024 x i1>, ptr %p1, align 128
+ tail call void @llvm.ppc.mma.disassemble.dmr(ptr %res1, <1024 x i1> %2)
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare void @llvm.ppc.mma.disassemble.dmr(ptr, <1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)
|
@llvm/pr-subscribers-clang Author: None (RolandF77) ChangesAdd support for PPC Dense Math builtins mma_build_dmr and mma_disassemble_dmr builtins. Full diff: https://github.com/llvm/llvm-project/pull/153097.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index e7d6741eb695e..7f4b12b0f4e15 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -1098,6 +1098,10 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmmr, "vW1024*W1024*", false,
"mma,isa-future-instructions")
UNALIASED_CUSTOM_BUILTIN(mma_dmxor, "vW1024*W1024*", true,
"mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_disassemble_dmr, "vv*W1024*", false,
+ "mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_build_dmr, "vW1024*VVVVVVVV", false,
+ "mma,isa-future-instructions")
// MMA builtins with positive/negative multiply/accumulate.
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV",
diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
index 270e9fc976f23..f3dd349651238 100644
--- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
@@ -1152,10 +1152,15 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
CallOps.push_back(Acc);
}
if (BuiltinID == PPC::BI__builtin_mma_dmmr ||
- BuiltinID == PPC::BI__builtin_mma_dmxor) {
+ BuiltinID == PPC::BI__builtin_mma_dmxor ||
+ BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) {
Address Addr = EmitPointerWithAlignment(E->getArg(1));
Ops[1] = Builder.CreateLoad(Addr);
}
+ if (BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) {
+ dbgs() << "&&& No disassemble!!!\n";
+ return Builder.CreateAlignedStore(Ops[1], Ops[0], MaybeAlign());
+ }
for (unsigned i=1; i<Ops.size(); i++)
CallOps.push_back(Ops[i]);
llvm::Function *F = CGM.getIntrinsic(ID);
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
index 4aafc09602228..c66f5e2a32919 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
@@ -93,18 +93,36 @@ void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsi
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: @test_dmf_basic
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr %res1, align 128
-// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr %res2, align 128
-// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr %p, align 128
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
-// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr %res2, align 128
+// CHECK-LABEL: @test_dmf_basic(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]])
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES1:%.*]], align 128
+// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[RES2:%.*]], align 128
+// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P:%.*]], align 128
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]])
+// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128
+// CHECK-NEXT: ret void
+//
void test_dmf_basic(char *p, char *res1, char *res2) {
__dmr1024 x[2];
__builtin_mma_dmsetdmrz(&x[0]);
__builtin_mma_dmmr((__dmr1024*)res1, &x[0]);
__builtin_mma_dmxor((__dmr1024*)res2, (__dmr1024*)p);
}
+
+// CHECK-LABEL: @test_dmf_basic2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V:%.*]], align 16, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]])
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2:%.*]], align 128
+// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1:%.*]], align 128
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RES1:%.*]], align 128
+// CHECK-NEXT: ret void
+//
+void test_dmf_basic2(char *p1, char *res1, char *res2,
+ vector unsigned char *v) {
+ vector unsigned char vv = *v;
+ __builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv);
+ __builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1);
+}
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
index 5a92d6e982511..ea2b99b0e5b20 100644
--- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
@@ -16,6 +16,8 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
__builtin_mma_dmsetdmrz(&vdmr);
__builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp);
__builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp);
+ __builtin_mma_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc);
+ __builtin_mma_disassemble_dmr(vdmrp, &vdmr);
// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
@@ -26,4 +28,6 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature mma,isa-future-instructions
// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions
// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_build_dmr' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_disassemble_dmr' needs target feature mma,isa-future-instructions
}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 7dd9ff7f08b8b..edecde1f223a7 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1701,6 +1701,16 @@ let TargetPrefix = "ppc" in {
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
llvm_i32_ty], [IntrNoMem]>;
+ def int_ppc_mma_disassemble_dmr :
+ DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_v1024i1_ty],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+ def int_ppc_mma_build_dmr :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
+
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2698bd6f37c59..1580682c6310b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11292,6 +11292,25 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}
+ case Intrinsic::ppc_mma_build_dmr: {
+ SmallVector<SDValue, 8> Pairs;
+ SmallVector<SDValue, 8> Chains;
+ for (int i = 1; i < 9; i += 2) {
+ SDValue Hi = Op.getOperand(i);
+ SDValue Lo = Op.getOperand(i + 1);
+ if (Hi->getOpcode() == ISD::LOAD)
+ Chains.push_back(Hi.getValue(1));
+ if (Lo->getOpcode() == ISD::LOAD)
+ Chains.push_back(Lo.getValue(1));
+ Pairs.push_back(
+ DAG.getNode(PPCISD::PAIR_BUILD, dl, MVT::v256i1, {Hi, Lo}));
+ }
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG);
+ SDValue RetOps[] = {Value, TF};
+ return DAG.getMergeValues(RetOps, dl);
+ }
+
case Intrinsic::ppc_mma_dmxxextfdmr512: {
assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -11628,6 +11647,10 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(0)),
0);
}
+ case Intrinsic::ppc_mma_disassemble_dmr: {
+ return DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(ArgStart + 2),
+ Op.getOperand(ArgStart + 1), MachinePointerInfo());
+ }
default:
break;
}
@@ -12117,6 +12140,24 @@ SDValue PPCTargetLowering::LowerDMFVectorLoad(SDValue Op,
return DAG.getMergeValues({DmrPValue, TF}, dl);
}
+SDValue PPCTargetLowering::DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl,
+ SelectionDAG &DAG) const {
+ SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Pairs[0],
+ Pairs[1]),
+ 0);
+ SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32);
+ SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1,
+ Pairs[2], Pairs[3]),
+ 0);
+ SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32);
+ SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32);
+ const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub};
+
+ return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops),
+ 0);
+}
+
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 9755f0e272d16..644b0c5b3941e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1361,6 +1361,8 @@ namespace llvm {
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue DMFInsert1024(const SmallVectorImpl<SDValue> &Pairs,
+ const SDLoc &dl, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index 1e3014405ac4e..a505ac4c2434a 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -367,6 +367,69 @@ entry:
ret void
}
+define void @tbuild(ptr %p1, ptr %p2, ptr %res1, ptr %res2, ptr %v) {
+; CHECK-LABEL: tbuild:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v3, 0(r7)
+; CHECK-NEXT: vmr v2, v3
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tbuild:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v3, 0(r7)
+; CHECK-BE-NEXT: vmr v2, v3
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp34, 1
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <16 x i8>, ptr %v, align 16
+ %1 = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0, <16 x i8> %0)
+ store <1024 x i1> %1, ptr %res2, align 128
+ %2 = load <1024 x i1>, ptr %p1, align 128
+ tail call void @llvm.ppc.mma.disassemble.dmr(ptr %res1, <1024 x i1> %2)
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+declare void @llvm.ppc.mma.disassemble.dmr(ptr, <1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr [[P:%.*]], align 128 | ||
// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]]) | ||
// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr [[RES2]], align 128 | ||
// CHECK-NEXT: ret void |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems irrelevant to this pr.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is but it was done by the tool to update test checks, not by me, so I'm not sure there's a way around it. Note that the test itself has not been changed.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); | ||
SDValue Value = DMFInsert1024(Pairs, SDLoc(Op), DAG); | ||
SDValue RetOps[] = {Value, TF}; | ||
return DAG.getMergeValues(RetOps, dl); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: perhaps we could remove the tmp variables when possible
return DAG.getMergeValues({Value, TF}, dl);
SDValue RC = DAG.getTargetConstant(PPC::DMRRCRegClassID, dl, MVT::i32); | ||
const SDValue Ops[] = {RC, Lo, LoSub, Hi, HiSub}; | ||
|
||
return SDValue(DAG.getMachineNode(PPC::REG_SEQUENCE, dl, MVT::v1024i1, Ops), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit : same here
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/3/builds/21070 Here is the relevant piece of the build log for the reference
|
Add support for PPC Dense Math builtins mma_build_dmr and mma_disassemble_dmr builtins.