-
Notifications
You must be signed in to change notification settings - Fork 15k
[AArch64] Lower zero cycle FPR zeroing #156261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AArch64] Lower zero cycle FPR zeroing #156261
Conversation
Lower FPR64, FPR32, FPR16 from `fmov` zeroing into NEON zeroing if the target supports zero cycle zeroing of NEON registers but not for the narrower classes. It handles 2 cases: one in `AsmPrinter` where an FP zeroing from immediate has been captured by pattern matching on instruction selection, and second post RA in `AArch64InstrInfo::copyPhysReg` for uncaptured/later-generated WZR/XZR fmovs. Adds a subtarget feature called FeatureZCZeroingFPR128 that enables querying whether the target supports zero cycle zeroing for FPR128 NEON registers, and updates the appropriate processors.
3976544
to
4e2cfd4
Compare
@llvm/pr-subscribers-backend-aarch64 Author: Tomer Shafir (tomershafir) ChangesLower FPR64, FPR32, FPR16 from It handles 2 cases: one in Adds a subtarget feature called FeatureZCZeroingFPR128 that enables querying whether the target supports zero cycle zeroing for FPR128 NEON registers, and updates the appropriate processors. Patch is 51.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156261.diff 13 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index da344305f39d9..b1f411d489ebd 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {
/// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI);
+ void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
- if (STI->hasZeroCycleZeroingFPR64() &&
- !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
- // Convert H/S register to corresponding D register
- if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
- DestReg = AArch64::D0 + (DestReg - AArch64::H0);
- else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
- DestReg = AArch64::D0 + (DestReg - AArch64::S0);
- else
- assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+ if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
+ if (STI->hasZeroCycleZeroingFPR64()) {
+ // Convert H/S register to corresponding D register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ else if (AArch64::FPR32RegClass.contains(DestReg))
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ else
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVID);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else if (STI->hasZeroCycleZeroingFPR128()) {
+ // Convert H/S/D register to corresponding Q register
+ const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
+ if (AArch64::FPR16RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR128RegClass);
+ } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR128RegClass);
+ } else {
+ assert(AArch64::FPR64RegClass.contains(DestReg));
+ DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
+ &AArch64::FPR128RegClass);
+ }
- MCInst MOVI;
- MOVI.setOpcode(AArch64::MOVID);
- MOVI.addOperand(MCOperand::createReg(DestReg));
- MOVI.addOperand(MCOperand::createImm(0));
- EmitToStreamer(*OutStreamer, MOVI);
- } else {
- MCInst FMov;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case AArch64::FMOVH0:
- FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
- if (!STI->hasFullFP16())
- DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVS0:
- FMov.setOpcode(AArch64::FMOVWSr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::WZR));
- break;
- case AArch64::FMOVD0:
- FMov.setOpcode(AArch64::FMOVXDr);
- FMov.addOperand(MCOperand::createReg(DestReg));
- FMov.addOperand(MCOperand::createReg(AArch64::XZR));
- break;
+ MCInst MOVI;
+ MOVI.setOpcode(AArch64::MOVIv2d_ns);
+ MOVI.addOperand(MCOperand::createReg(DestReg));
+ MOVI.addOperand(MCOperand::createImm(0));
+ EmitToStreamer(*OutStreamer, MOVI);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
}
- EmitToStreamer(*OutStreamer, FMov);
+ } else {
+ emitFMov0AsFMov(MI, DestReg);
+ }
+}
+
+void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
+ Register DestReg) {
+ MCInst FMov;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::FMOVH0:
+ FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
+ if (!STI->hasFullFP16())
+ DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVS0:
+ FMov.setOpcode(AArch64::FMOVWSr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+ break;
+ case AArch64::FMOVD0:
+ FMov.setOpcode(AArch64::FMOVXDr);
+ FMov.addOperand(MCOperand::createReg(DestReg));
+ FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+ break;
}
+ EmitToStreamer(*OutStreamer, FMov);
}
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 6904e09072649..46f5f0c1ca9dd 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
"Has zero-cycle zeroing instructions for GPR32 registers">;
+def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
+ "Has zero-cycle zeroing instructions for FPR128 registers">;
+
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this not true for
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 3ce7829207cb6..b5c2f73760bf1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5469,8 +5469,24 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::XZR == SrcReg &&
+ !Subtarget.hasZeroCycleZeroingFPWorkaround() &&
+ Subtarget.isNeonAvailable()) {
+ if (Subtarget.hasZeroCycleZeroingFPR64()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVID), DestReg).addImm(0);
+ } else if (Subtarget.hasZeroCycleZeroingFPR128()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(
+ DestReg, AArch64::dsub, &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5482,8 +5498,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (AArch64::WZR == SrcReg &&
+ !Subtarget.hasZeroCycleZeroingFPWorkaround() &&
+ Subtarget.isNeonAvailable()) {
+ if (Subtarget.hasZeroCycleZeroingFPR64()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVID), DestRegD).addImm(0);
+ } else if (Subtarget.hasZeroCycleZeroingFPR128()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegQ = TRI->getMatchingSuperReg(
+ DestReg, AArch64::ssub, &AArch64::FPR128RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::MOVIv2d_ns), DestRegQ).addImm(0);
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index d5f4e91ae5188..81f5d075729d9 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128,
FeatureZCZeroingFPWorkaround]>;
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
@@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
@@ -386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
@@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
@@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureNoZCZeroingFPR64,
+ FeatureZCZeroingFPR128]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 7934e39b2b69f..78e20f2a5e214 100644
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
; CHECK-NEXT: add d0, d1, d0
-; CHECK-NEXT: fmov d1, xzr
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: sub d0, d1, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
; GENERIC-NEXT: add d0, d1, d0
-; GENERIC-NEXT: fmov d1, xzr
+; GENERIC-NEXT: movi d1, #0000000000000000
; GENERIC-NEXT: sub d0, d1, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
index 2a75976d58549..ccdaa8779e38f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+zcz-fpr128 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
-; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
@@ -12,9 +13,10 @@ define half @tf16() {
entry:
; ALL-LABEL: tf16:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
-; NOZCZ-FPR64-FULLFP16: mov h0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16: mov h0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret half 0.0
}
@@ -22,8 +24,9 @@ define float @tf32() {
entry:
; ALL-LABEL: tf32:
; FP-WORKAROUND: mov s0, wzr
-; NOZCZ-FPR64: mov s0, wzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret float 0.0
}
@@ -31,8 +34,9 @@ define double @td64() {
entry:
; ALL-LABEL: td64:
; FP-WORKAROUND: mov d0, xzr
-; NOZCZ-FPR64: mov d0, xzr
+; NOZCZ-FPR64-NOZCZ-FPR128: mov d0, xzr
; ZCZ-FPR64: movi d0, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret double 0.0
}
@@ -40,8 +44,9 @@ define <8 x i8> @tv8i8() {
entry:
; ALL-LABEL: tv8i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}
@@ -49,8 +54,9 @@ define <4 x i16> @tv4i16() {
entry:
; ALL-LABEL: tv4i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
-; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
+; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
}
@@ -58,8 +64,9 @...
[truncated]
|
Lower FPR64, FPR32, FPR16 from
fmov
zeroing into NEON zeroing if the target supports zero cycle zeroing of NEON registers but not for the narrower classes. It handles 2 cases: one in
AsmPrinter
where an FP zeroing from immediate has been captured by pattern matching on instruction selection, and second post RA in AArch64InstrInfo::copyPhysReg
for uncaptured/later-generated WZR/XZR fmovs. Adds a subtarget feature called FeatureZCZeroingFPR128 that enables querying whether the target supports zero cycle zeroing for FPR128 NEON registers, and updates the appropriate processors.