-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[RISCV][RFC] Prevent folding ADD_LO into load/store if we can't fold all uses. #155935
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…all uses. If we don't fold all uses, we end up with an LUI that is used by an ADDI and some loads/stores. This requires the LUI to write a different register than the ADDI or the load/store uses have to be scheduled between the LUI and ADDI. It prevents macrofusion of the LUI+ADDI on CPUs that support it. It prevents the use of PseudoMovAddr which prevents the LUI+ADDI from being rematerializable. This is based on a patch we have had in our downstream for a while that we originally wrote because of macrofusion and rematerialization. I no longer have any relevant performance or code size numbers for it. Co-authored-by: Jesse Huang <jesse.huang@sifive.com>
@llvm/pr-subscribers-backend-risc-v Author: Craig Topper (topperc) Changes: If we don't fold all uses, we end up with an LUI that is used by an ADDI and some loads/stores. This requires the LUI to write a different register than the ADDI or the load/store uses have to be scheduled between the LUI and ADDI. It prevents macrofusion of the LUI+ADDI on CPUs that support it. It prevents the use of PseudoMovAddr which prevents the LUI+ADDI from being rematerializable. This is based on a patch we have had in our downstream for a while that we originally wrote because of macrofusion and rematerialization. I no longer have any relevant performance or code size numbers for it. Patch is 919.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155935.diff 29 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f9f35f66319b5..c5b68ccd1ba8d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2951,6 +2951,60 @@ static bool isWorthFoldingAdd(SDValue Add) {
return true;
}
+bool isRegImmLoadOrStore(SDNode *User, SDValue Add) {
+ // If the user is a load or store, then the offset is 0.
+ if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE &&
+ User->getOpcode() != RISCVISD::LD_RV32 &&
+ User->getOpcode() != RISCVISD::SD_RV32 &&
+ User->getOpcode() != ISD::ATOMIC_LOAD &&
+ User->getOpcode() != ISD::ATOMIC_STORE)
+ return false;
+
+ // Don't allow stores of the value. It must be used as the address.
+ if (User->getOpcode() == ISD::STORE &&
+ cast<StoreSDNode>(User)->getValue() == Add)
+ return false;
+ if (User->getOpcode() == RISCVISD::SD_RV32 &&
+ (User->getOperand(0) == Add || User->getOperand(1) == Add))
+ return false;
+ if (User->getOpcode() == ISD::ATOMIC_STORE &&
+ cast<AtomicSDNode>(User)->getVal() == Add)
+ return false;
+
+ return true;
+}
+
+// To prevent SelectAddrRegImm from folding offsets that conflict with the
+// fusion of PseudoMovAddr, check if the offset of every use of a given address
+// is within the alignment.
+bool RISCVDAGToDAGISel::areOffsetsWithinAlignment(SDValue Addr,
+ Align Alignment) {
+ assert(Addr->getOpcode() == RISCVISD::ADD_LO);
+ for (auto *User : Addr->users()) {
+ // If the user is a load or store, then the offset is 0 which is always
+ // within alignment.
+ if (isRegImmLoadOrStore(User, Addr))
+ continue;
+
+ if (CurDAG->isBaseWithConstantOffset(SDValue(User, 0))) {
+ int64_t CVal = cast<ConstantSDNode>(User->getOperand(1))->getSExtValue();
+ if (!isInt<12>(CVal) || Alignment <= CVal)
+ return false;
+
+ // Make sure all uses are foldable load/stores.
+ for (auto *AddUser : User->users())
+ if (!isRegImmLoadOrStore(AddUser, SDValue(User, 0)))
+ return false;
+
+ continue;
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) {
if (SelectAddrFrameIndex(Addr, Base, Offset))
@@ -2960,9 +3014,21 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
MVT VT = Addr.getSimpleValueType();
if (Addr.getOpcode() == RISCVISD::ADD_LO) {
- Base = Addr.getOperand(0);
- Offset = Addr.getOperand(1);
- return true;
+ bool CanFold = true;
+ // Unconditionally fold if operand 1 is not a global address (e.g.
+ // an external symbol)
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(1))) {
+ const DataLayout &DL = CurDAG->getDataLayout();
+ Align Alignment = commonAlignment(
+ GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
+ if (!areOffsetsWithinAlignment(Addr, Alignment))
+ CanFold = false;
+ }
+ if (CanFold) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -2980,7 +3046,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
const DataLayout &DL = CurDAG->getDataLayout();
Align Alignment = commonAlignment(
GA->getGlobal()->getPointerAlignment(DL), GA->getOffset());
- if ((CVal == 0 || Alignment > CVal)) {
+ if ((CVal == 0 || Alignment > CVal) &&
+ areOffsetsWithinAlignment(Base, Alignment)) {
int64_t CombinedOffset = CVal + GA->getOffset();
Base = Base.getOperand(0);
Offset = CurDAG->getTargetGlobalAddress(
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index c329a4c6ec62e..89217e1487bbc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -45,6 +45,8 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
InlineAsm::ConstraintCode ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool areOffsetsWithinAlignment(SDValue Addr, Align Alignment);
+
bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset);
diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
index f9cf4e523b77d..cccbb04e6ae99 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
@@ -51,13 +51,13 @@ define bfloat @flh_fsh_global(bfloat %a, bfloat %b) nounwind {
; CHECK-NEXT: fcvt.s.bf16 fa5, fa1
; CHECK-NEXT: fcvt.s.bf16 fa4, fa0
; CHECK-NEXT: lui a0, %hi(G)
+; CHECK-NEXT: addi a0, a0, %lo(G)
; CHECK-NEXT: fadd.s fa5, fa4, fa5
-; CHECK-NEXT: flh fa4, %lo(G)(a0)
; CHECK-NEXT: fcvt.bf16.s fa0, fa5
-; CHECK-NEXT: addi a1, a0, %lo(G)
-; CHECK-NEXT: fsh fa0, %lo(G)(a0)
-; CHECK-NEXT: flh fa5, 18(a1)
-; CHECK-NEXT: fsh fa0, 18(a1)
+; CHECK-NEXT: flh fa5, 0(a0)
+; CHECK-NEXT: fsh fa0, 0(a0)
+; CHECK-NEXT: flh fa5, 18(a0)
+; CHECK-NEXT: fsh fa0, 18(a0)
; CHECK-NEXT: ret
%1 = fadd bfloat %a, %b
%2 = load volatile bfloat, ptr @G
diff --git a/llvm/test/CodeGen/RISCV/byval.ll b/llvm/test/CodeGen/RISCV/byval.ll
index 9151f3b03e7c2..c5e48ee75e482 100644
--- a/llvm/test/CodeGen/RISCV/byval.ll
+++ b/llvm/test/CodeGen/RISCV/byval.ll
@@ -22,15 +22,15 @@ define void @caller() nounwind {
; RV32I-NEXT: addi sp, sp, -32
; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
; RV32I-NEXT: lui a0, %hi(foo)
-; RV32I-NEXT: lw a1, %lo(foo)(a0)
-; RV32I-NEXT: sw a1, 12(sp)
; RV32I-NEXT: addi a0, a0, %lo(foo)
; RV32I-NEXT: lw a1, 12(a0)
; RV32I-NEXT: sw a1, 24(sp)
; RV32I-NEXT: lw a1, 8(a0)
; RV32I-NEXT: sw a1, 20(sp)
-; RV32I-NEXT: lw a0, 4(a0)
-; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: lw a1, 4(a0)
+; RV32I-NEXT: sw a1, 16(sp)
+; RV32I-NEXT: lw a0, 0(a0)
+; RV32I-NEXT: sw a0, 12(sp)
; RV32I-NEXT: addi a0, sp, 12
; RV32I-NEXT: call callee
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
index 337e9bc5845f9..2999a7e4981bc 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll
@@ -28,281 +28,281 @@ define void @callee() nounwind {
; ILP32-LABEL: callee:
; ILP32: # %bb.0:
; ILP32-NEXT: lui a0, %hi(var)
-; ILP32-NEXT: flw fa5, %lo(var)(a0)
-; ILP32-NEXT: flw fa4, %lo(var+4)(a0)
-; ILP32-NEXT: flw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: flw fa2, %lo(var+12)(a0)
-; ILP32-NEXT: addi a1, a0, %lo(var)
-; ILP32-NEXT: flw fa1, 16(a1)
-; ILP32-NEXT: flw fa0, 20(a1)
-; ILP32-NEXT: flw ft0, 24(a1)
-; ILP32-NEXT: flw ft1, 28(a1)
-; ILP32-NEXT: flw ft2, 32(a1)
-; ILP32-NEXT: flw ft3, 36(a1)
-; ILP32-NEXT: flw ft4, 40(a1)
-; ILP32-NEXT: flw ft5, 44(a1)
-; ILP32-NEXT: flw ft6, 48(a1)
-; ILP32-NEXT: flw ft7, 52(a1)
-; ILP32-NEXT: flw fa6, 56(a1)
-; ILP32-NEXT: flw fa7, 60(a1)
-; ILP32-NEXT: flw ft8, 64(a1)
-; ILP32-NEXT: flw ft9, 68(a1)
-; ILP32-NEXT: flw ft10, 72(a1)
-; ILP32-NEXT: flw ft11, 76(a1)
-; ILP32-NEXT: flw fs0, 80(a1)
-; ILP32-NEXT: flw fs1, 84(a1)
-; ILP32-NEXT: flw fs2, 88(a1)
-; ILP32-NEXT: flw fs3, 92(a1)
-; ILP32-NEXT: flw fs4, 112(a1)
-; ILP32-NEXT: flw fs5, 116(a1)
-; ILP32-NEXT: flw fs6, 120(a1)
-; ILP32-NEXT: flw fs7, 124(a1)
-; ILP32-NEXT: flw fs8, 96(a1)
-; ILP32-NEXT: flw fs9, 100(a1)
-; ILP32-NEXT: flw fs10, 104(a1)
-; ILP32-NEXT: flw fs11, 108(a1)
-; ILP32-NEXT: fsw fs7, 124(a1)
-; ILP32-NEXT: fsw fs6, 120(a1)
-; ILP32-NEXT: fsw fs5, 116(a1)
-; ILP32-NEXT: fsw fs4, 112(a1)
-; ILP32-NEXT: fsw fs11, 108(a1)
-; ILP32-NEXT: fsw fs10, 104(a1)
-; ILP32-NEXT: fsw fs9, 100(a1)
-; ILP32-NEXT: fsw fs8, 96(a1)
-; ILP32-NEXT: fsw fs3, 92(a1)
-; ILP32-NEXT: fsw fs2, 88(a1)
-; ILP32-NEXT: fsw fs1, 84(a1)
-; ILP32-NEXT: fsw fs0, 80(a1)
-; ILP32-NEXT: fsw ft11, 76(a1)
-; ILP32-NEXT: fsw ft10, 72(a1)
-; ILP32-NEXT: fsw ft9, 68(a1)
-; ILP32-NEXT: fsw ft8, 64(a1)
-; ILP32-NEXT: fsw fa7, 60(a1)
-; ILP32-NEXT: fsw fa6, 56(a1)
-; ILP32-NEXT: fsw ft7, 52(a1)
-; ILP32-NEXT: fsw ft6, 48(a1)
-; ILP32-NEXT: fsw ft5, 44(a1)
-; ILP32-NEXT: fsw ft4, 40(a1)
-; ILP32-NEXT: fsw ft3, 36(a1)
-; ILP32-NEXT: fsw ft2, 32(a1)
-; ILP32-NEXT: fsw ft1, 28(a1)
-; ILP32-NEXT: fsw ft0, 24(a1)
-; ILP32-NEXT: fsw fa0, 20(a1)
-; ILP32-NEXT: fsw fa1, 16(a1)
-; ILP32-NEXT: fsw fa2, %lo(var+12)(a0)
-; ILP32-NEXT: fsw fa3, %lo(var+8)(a0)
-; ILP32-NEXT: fsw fa4, %lo(var+4)(a0)
-; ILP32-NEXT: fsw fa5, %lo(var)(a0)
+; ILP32-NEXT: addi a0, a0, %lo(var)
+; ILP32-NEXT: flw fa5, 0(a0)
+; ILP32-NEXT: flw fa4, 4(a0)
+; ILP32-NEXT: flw fa3, 8(a0)
+; ILP32-NEXT: flw fa2, 12(a0)
+; ILP32-NEXT: flw fa1, 16(a0)
+; ILP32-NEXT: flw fa0, 20(a0)
+; ILP32-NEXT: flw ft0, 24(a0)
+; ILP32-NEXT: flw ft1, 28(a0)
+; ILP32-NEXT: flw ft2, 32(a0)
+; ILP32-NEXT: flw ft3, 36(a0)
+; ILP32-NEXT: flw ft4, 40(a0)
+; ILP32-NEXT: flw ft5, 44(a0)
+; ILP32-NEXT: flw ft6, 48(a0)
+; ILP32-NEXT: flw ft7, 52(a0)
+; ILP32-NEXT: flw fa6, 56(a0)
+; ILP32-NEXT: flw fa7, 60(a0)
+; ILP32-NEXT: flw ft8, 64(a0)
+; ILP32-NEXT: flw ft9, 68(a0)
+; ILP32-NEXT: flw ft10, 72(a0)
+; ILP32-NEXT: flw ft11, 76(a0)
+; ILP32-NEXT: flw fs0, 80(a0)
+; ILP32-NEXT: flw fs1, 84(a0)
+; ILP32-NEXT: flw fs2, 88(a0)
+; ILP32-NEXT: flw fs3, 92(a0)
+; ILP32-NEXT: flw fs4, 112(a0)
+; ILP32-NEXT: flw fs5, 116(a0)
+; ILP32-NEXT: flw fs6, 120(a0)
+; ILP32-NEXT: flw fs7, 124(a0)
+; ILP32-NEXT: flw fs8, 96(a0)
+; ILP32-NEXT: flw fs9, 100(a0)
+; ILP32-NEXT: flw fs10, 104(a0)
+; ILP32-NEXT: flw fs11, 108(a0)
+; ILP32-NEXT: fsw fs7, 124(a0)
+; ILP32-NEXT: fsw fs6, 120(a0)
+; ILP32-NEXT: fsw fs5, 116(a0)
+; ILP32-NEXT: fsw fs4, 112(a0)
+; ILP32-NEXT: fsw fs11, 108(a0)
+; ILP32-NEXT: fsw fs10, 104(a0)
+; ILP32-NEXT: fsw fs9, 100(a0)
+; ILP32-NEXT: fsw fs8, 96(a0)
+; ILP32-NEXT: fsw fs3, 92(a0)
+; ILP32-NEXT: fsw fs2, 88(a0)
+; ILP32-NEXT: fsw fs1, 84(a0)
+; ILP32-NEXT: fsw fs0, 80(a0)
+; ILP32-NEXT: fsw ft11, 76(a0)
+; ILP32-NEXT: fsw ft10, 72(a0)
+; ILP32-NEXT: fsw ft9, 68(a0)
+; ILP32-NEXT: fsw ft8, 64(a0)
+; ILP32-NEXT: fsw fa7, 60(a0)
+; ILP32-NEXT: fsw fa6, 56(a0)
+; ILP32-NEXT: fsw ft7, 52(a0)
+; ILP32-NEXT: fsw ft6, 48(a0)
+; ILP32-NEXT: fsw ft5, 44(a0)
+; ILP32-NEXT: fsw ft4, 40(a0)
+; ILP32-NEXT: fsw ft3, 36(a0)
+; ILP32-NEXT: fsw ft2, 32(a0)
+; ILP32-NEXT: fsw ft1, 28(a0)
+; ILP32-NEXT: fsw ft0, 24(a0)
+; ILP32-NEXT: fsw fa0, 20(a0)
+; ILP32-NEXT: fsw fa1, 16(a0)
+; ILP32-NEXT: fsw fa2, 12(a0)
+; ILP32-NEXT: fsw fa3, 8(a0)
+; ILP32-NEXT: fsw fa4, 4(a0)
+; ILP32-NEXT: fsw fa5, 0(a0)
; ILP32-NEXT: ret
;
; ILP32E-LABEL: callee:
; ILP32E: # %bb.0:
; ILP32E-NEXT: lui a0, %hi(var)
-; ILP32E-NEXT: flw fa5, %lo(var)(a0)
-; ILP32E-NEXT: flw fa4, %lo(var+4)(a0)
-; ILP32E-NEXT: flw fa3, %lo(var+8)(a0)
-; ILP32E-NEXT: flw fa2, %lo(var+12)(a0)
-; ILP32E-NEXT: addi a1, a0, %lo(var)
-; ILP32E-NEXT: flw fa1, 16(a1)
-; ILP32E-NEXT: flw fa0, 20(a1)
-; ILP32E-NEXT: flw ft0, 24(a1)
-; ILP32E-NEXT: flw ft1, 28(a1)
-; ILP32E-NEXT: flw ft2, 32(a1)
-; ILP32E-NEXT: flw ft3, 36(a1)
-; ILP32E-NEXT: flw ft4, 40(a1)
-; ILP32E-NEXT: flw ft5, 44(a1)
-; ILP32E-NEXT: flw ft6, 48(a1)
-; ILP32E-NEXT: flw ft7, 52(a1)
-; ILP32E-NEXT: flw fa6, 56(a1)
-; ILP32E-NEXT: flw fa7, 60(a1)
-; ILP32E-NEXT: flw ft8, 64(a1)
-; ILP32E-NEXT: flw ft9, 68(a1)
-; ILP32E-NEXT: flw ft10, 72(a1)
-; ILP32E-NEXT: flw ft11, 76(a1)
-; ILP32E-NEXT: flw fs0, 80(a1)
-; ILP32E-NEXT: flw fs1, 84(a1)
-; ILP32E-NEXT: flw fs2, 88(a1)
-; ILP32E-NEXT: flw fs3, 92(a1)
-; ILP32E-NEXT: flw fs4, 112(a1)
-; ILP32E-NEXT: flw fs5, 116(a1)
-; ILP32E-NEXT: flw fs6, 120(a1)
-; ILP32E-NEXT: flw fs7, 124(a1)
-; ILP32E-NEXT: flw fs8, 96(a1)
-; ILP32E-NEXT: flw fs9, 100(a1)
-; ILP32E-NEXT: flw fs10, 104(a1)
-; ILP32E-NEXT: flw fs11, 108(a1)
-; ILP32E-NEXT: fsw fs7, 124(a1)
-; ILP32E-NEXT: fsw fs6, 120(a1)
-; ILP32E-NEXT: fsw fs5, 116(a1)
-; ILP32E-NEXT: fsw fs4, 112(a1)
-; ILP32E-NEXT: fsw fs11, 108(a1)
-; ILP32E-NEXT: fsw fs10, 104(a1)
-; ILP32E-NEXT: fsw fs9, 100(a1)
-; ILP32E-NEXT: fsw fs8, 96(a1)
-; ILP32E-NEXT: fsw fs3, 92(a1)
-; ILP32E-NEXT: fsw fs2, 88(a1)
-; ILP32E-NEXT: fsw fs1, 84(a1)
-; ILP32E-NEXT: fsw fs0, 80(a1)
-; ILP32E-NEXT: fsw ft11, 76(a1)
-; ILP32E-NEXT: fsw ft10, 72(a1)
-; ILP32E-NEXT: fsw ft9, 68(a1)
-; ILP32E-NEXT: fsw ft8, 64(a1)
-; ILP32E-NEXT: fsw fa7, 60(a1)
-; ILP32E-NEXT: fsw fa6, 56(a1)
-; ILP32E-NEXT: fsw ft7, 52(a1)
-; ILP32E-NEXT: fsw ft6, 48(a1)
-; ILP32E-NEXT: fsw ft5, 44(a1)
-; ILP32E-NEXT: fsw ft4, 40(a1)
-; ILP32E-NEXT: fsw ft3, 36(a1)
-; ILP32E-NEXT: fsw ft2, 32(a1)
-; ILP32E-NEXT: fsw ft1, 28(a1)
-; ILP32E-NEXT: fsw ft0, 24(a1)
-; ILP32E-NEXT: fsw fa0, 20(a1)
-; ILP32E-NEXT: fsw fa1, 16(a1)
-; ILP32E-NEXT: fsw fa2, %lo(var+12)(a0)
-; ILP32E-NEXT: fsw fa3, %lo(var+8)(a0)
-; ILP32E-NEXT: fsw fa4, %lo(var+4)(a0)
-; ILP32E-NEXT: fsw fa5, %lo(var)(a0)
+; ILP32E-NEXT: addi a0, a0, %lo(var)
+; ILP32E-NEXT: flw fa5, 0(a0)
+; ILP32E-NEXT: flw fa4, 4(a0)
+; ILP32E-NEXT: flw fa3, 8(a0)
+; ILP32E-NEXT: flw fa2, 12(a0)
+; ILP32E-NEXT: flw fa1, 16(a0)
+; ILP32E-NEXT: flw fa0, 20(a0)
+; ILP32E-NEXT: flw ft0, 24(a0)
+; ILP32E-NEXT: flw ft1, 28(a0)
+; ILP32E-NEXT: flw ft2, 32(a0)
+; ILP32E-NEXT: flw ft3, 36(a0)
+; ILP32E-NEXT: flw ft4, 40(a0)
+; ILP32E-NEXT: flw ft5, 44(a0)
+; ILP32E-NEXT: flw ft6, 48(a0)
+; ILP32E-NEXT: flw ft7, 52(a0)
+; ILP32E-NEXT: flw fa6, 56(a0)
+; ILP32E-NEXT: flw fa7, 60(a0)
+; ILP32E-NEXT: flw ft8, 64(a0)
+; ILP32E-NEXT: flw ft9, 68(a0)
+; ILP32E-NEXT: flw ft10, 72(a0)
+; ILP32E-NEXT: flw ft11, 76(a0)
+; ILP32E-NEXT: flw fs0, 80(a0)
+; ILP32E-NEXT: flw fs1, 84(a0)
+; ILP32E-NEXT: flw fs2, 88(a0)
+; ILP32E-NEXT: flw fs3, 92(a0)
+; ILP32E-NEXT: flw fs4, 112(a0)
+; ILP32E-NEXT: flw fs5, 116(a0)
+; ILP32E-NEXT: flw fs6, 120(a0)
+; ILP32E-NEXT: flw fs7, 124(a0)
+; ILP32E-NEXT: flw fs8, 96(a0)
+; ILP32E-NEXT: flw fs9, 100(a0)
+; ILP32E-NEXT: flw fs10, 104(a0)
+; ILP32E-NEXT: flw fs11, 108(a0)
+; ILP32E-NEXT: fsw fs7, 124(a0)
+; ILP32E-NEXT: fsw fs6, 120(a0)
+; ILP32E-NEXT: fsw fs5, 116(a0)
+; ILP32E-NEXT: fsw fs4, 112(a0)
+; ILP32E-NEXT: fsw fs11, 108(a0)
+; ILP32E-NEXT: fsw fs10, 104(a0)
+; ILP32E-NEXT: fsw fs9, 100(a0)
+; ILP32E-NEXT: fsw fs8, 96(a0)
+; ILP32E-NEXT: fsw fs3, 92(a0)
+; ILP32E-NEXT: fsw fs2, 88(a0)
+; ILP32E-NEXT: fsw fs1, 84(a0)
+; ILP32E-NEXT: fsw fs0, 80(a0)
+; ILP32E-NEXT: fsw ft11, 76(a0)
+; ILP32E-NEXT: fsw ft10, 72(a0)
+; ILP32E-NEXT: fsw ft9, 68(a0)
+; ILP32E-NEXT: fsw ft8, 64(a0)
+; ILP32E-NEXT: fsw fa7, 60(a0)
+; ILP32E-NEXT: fsw fa6, 56(a0)
+; ILP32E-NEXT: fsw ft7, 52(a0)
+; ILP32E-NEXT: fsw ft6, 48(a0)
+; ILP32E-NEXT: fsw ft5, 44(a0)
+; ILP32E-NEXT: fsw ft4, 40(a0)
+; ILP32E-NEXT: fsw ft3, 36(a0)
+; ILP32E-NEXT: fsw ft2, 32(a0)
+; ILP32E-NEXT: fsw ft1, 28(a0)
+; ILP32E-NEXT: fsw ft0, 24(a0)
+; ILP32E-NEXT: fsw fa0, 20(a0)
+; ILP32E-NEXT: fsw fa1, 16(a0)
+; ILP32E-NEXT: fsw fa2, 12(a0)
+; ILP32E-NEXT: fsw fa3, 8(a0)
+; ILP32E-NEXT: fsw fa4, 4(a0)
+; ILP32E-NEXT: fsw fa5, 0(a0)
; ILP32E-NEXT: ret
;
; LP64-LABEL: callee:
; LP64: # %bb.0:
; LP64-NEXT: lui a0, %hi(var)
-; LP64-NEXT: flw fa5, %lo(var)(a0)
-; LP64-NEXT: flw fa4, %lo(var+4)(a0)
-; LP64-NEXT: flw fa3, %lo(var+8)(a0)
-; LP64-NEXT: flw fa2, %lo(var+12)(a0)
-; LP64-NEXT: addi a1, a0, %lo(var)
-; LP64-NEXT: flw fa1, 16(a1)
-; LP64-NEXT: flw fa0, 20(a1)
-; LP64-NEXT: flw ft0, 24(a1)
-; LP64-NEXT: flw ft1, 28(a1)
-; LP64-NEXT: flw ft2, 32(a1)
-; LP64-NEXT: flw ft3, 36(a1)
-; LP64-NEXT: flw ft4, 40(a1)
-; LP64-NEXT: flw ft5, 44(a1)
-; LP64-NEXT: flw ft6, 48(a1)
-; LP64-NEXT: flw ft7, 52(a1)
-; LP64-NEXT: flw fa6, 56(a1)
-; LP64-NEXT: flw fa7, 60(a1)
-; LP64-NEXT: flw ft8, 64(a1)
-; LP64-NEXT: flw ft9, 68(a1)
-; LP64-NEXT: flw ft10, 72(a1)
-; LP64-NEXT: flw ft11, 76(a1)
-; LP64-NEXT: flw fs0, 80(a1)
-; LP64-NEXT: flw fs1, 84(a1)
-; LP64-NEXT: flw fs2, 88(a1)
-; LP64-NEXT: flw fs3, 92(a1)
-; LP64-NEXT: flw fs4, 112(a1)
-; LP64-NEXT: flw fs5, 116(a1)
-; LP64-NEXT: flw fs6, 120(a1)
-; LP64-NEXT: flw fs7, 124(a1)
-; LP64-NEXT: flw fs8, 96(a1)
-; LP64-NEXT: flw fs9, 100(a1)
-; LP64-NEXT: flw fs10, 104(a1)
-; LP64-NEXT: flw fs11, 108(a1)
-; LP64-NEXT: fsw fs7, 124(a1)
-; LP64-NEXT: fsw fs6, 120(a1)
-; LP64-NEXT: fsw fs5, 116(a1)
-; LP64-NEXT: fsw fs4, 112(a1)
-; LP64-NEXT: fsw fs11, 108(a1)
-; LP64-NEXT: fsw fs10, 104(a1)
-; LP64-NEXT: fsw fs9, 100(a1)
-; LP64-NEXT: fsw fs8, 96(a1)
-; LP64-NEXT: fsw fs3, 92(a1)
-; LP64-NEXT: fsw fs2, 88(a1)
-; LP64-NEXT: fsw fs1, 84(a1)
-; LP64-NEXT: fsw fs0, 80(a1)
-; LP64-NEXT: fsw ft11, 76(a1)
-; LP64-NEXT: fsw ft10, 72(a1)
-; LP64-NEXT: fsw ft9, 68(a1)
-; LP64-NEXT: fsw ft8, 64(a1)
-; LP64-NEXT: fsw fa7, 60(a1)
-; LP64-NEXT: fsw fa6, 56(a1)
-; LP64-NEXT: fsw ft7, 52(a1)
-; LP64-NEXT: fsw ft6, 48(a1)
-; LP64-NEXT: fsw ft5, 44(a1)
-; LP64-NEXT: fsw ft4, 40(a1)
-; LP64-NEXT: fsw ft3, 36(a1)
-; LP64-NEXT: fsw ft2, 32(a1)
-; LP64-NEXT: fsw ft1, 28(a1)
-; LP64-NEXT: fsw ft0, 24(a1)
-; LP64-NEXT: fsw fa0, 20(a1)
-; LP64-NEXT: fsw fa1, 16(a1)
-; LP64-NEXT: fsw fa2, %lo(var+12)(a0)
-; LP64-NEXT: fsw fa3, %lo(var+8)(a0)
-; LP64-NEXT: fsw fa4, %lo(var+4)(a0)
-; LP64-NEXT: fsw fa5, %lo(var)(a0)
+; LP64-NEXT: addi a0, a0, %lo(var)
+; LP64-NEXT: flw fa5, 0(a0)
+; LP64-NEXT: flw fa4, 4(a0)
+; LP64-NEXT: flw fa3, 8(a0)
+; LP64-NEXT: flw fa2, 12(a0)
+; LP64-NEXT: flw fa1, 16(a0)
+; LP64-NEXT: flw fa0, 20(a0)
+; LP64-NEXT: flw ft0, 24(a0)
+; LP64-NEXT: flw ft1, 28(a0)
+; LP64-NEXT: flw ft2, 32(a0)
+; LP64-NEXT: flw ft3, 36(a0)
+; LP64-NEXT: flw ft4, 40(a0)
+; LP64-NEXT: flw ft5, 44(a0)
+; LP64-NEXT: flw ft6, 48(a0)
+; LP64-NEXT: flw ft7, 52(a0)
+; LP64-NEXT: flw fa6, 56(a0)
+; LP64-NEXT: flw fa7, 60(a0)
+; LP64-NEXT: flw ft8, 64(a0)
+; LP64-NEXT: flw ft9, 68(a0)
+; LP64-NEXT: flw ft10, 72(a0)
+; LP64-NEXT: flw ft11, 76(a0)
+; LP64-NEXT: flw fs0, 80(a0)
+; LP64-NEXT: flw fs1, 84(a0)
+; LP64-NEXT: flw fs2, 88(a0)
+; LP64-NEXT: flw fs3, 92(a0)
+; LP64-NEXT: flw fs4, 112(a0)
+; LP64-NEXT: flw fs5, 116(a0)
+; LP64-NEXT: flw fs6, 120(a0)
+; LP64-NEXT: flw fs7, 124(a0)
+; LP64-NEXT: flw fs8, 96(a0)
+; LP64-NEXT: flw fs9, 100(a0)
+; LP64-NEXT: flw fs10, 104(a0)
+; LP64-NEXT: flw fs11, 108(a0)
+; LP64-NEXT: fsw fs7, 124(a0)
+; LP64-NEXT: fsw fs6, 120(a0)
+; LP64-NEXT: fsw fs5, 116(a0)
+; LP64-NEXT: fsw fs4, 112(a0)
+; LP64-NEXT: fsw fs11, 108(a0)
+; LP64-NEXT: fsw fs10, 104(a0)
+; LP64-NEXT: fsw fs9, 100(a0)
+; LP64-NEXT: fsw fs8, 96(a0)
+; LP64-NEXT: fsw fs3, 92(a0)
+; LP64-NEXT: fsw fs2, 88(a0)
+; LP64-NEXT: fsw fs1, 84(a0)
+; LP64-NEXT: fsw fs0, 80(a0)
+; LP64-NEXT: fsw ft11, 76(a0)
+; LP64-NEXT: fsw ft10, 72(a0)
+; LP64-NEXT: fsw ft9, 68(a0)
+; LP64-NEXT: fsw ft8, 64(a0)
+; LP64-NEXT: fsw fa7, 60(a0)
+; LP64-NEXT: fsw fa6, 56(a0)
+; LP64-NEXT: fsw ft7, 52(a0)
+; LP64-NEXT: fsw ft6, 48(a0)
+; LP64-NEXT: fsw ft5, 44(a0)
+; LP64-NEXT: fsw ft4, 40(a0)
+; LP64-NEXT: fsw ft3, 36(a0)
+; LP64-NEXT: fsw ft2, 32(a0)
+; LP64-NEXT:...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
If we don't fold all uses, we end up with an LUI that is used by an ADDI and some loads/stores. This requires the LUI to write a different register than the ADDI or the load/store uses have to be scheduled between the LUI and ADDI. It prevents macrofusion of the LUI+ADDI on CPUs that support it. It prevents the use of PseudoMovAddr which prevents the LUI+ADDI from being rematerializable.
This is based on a patch we have had in our downstream for a while that we originally wrote because of macrofusion and rematerialization. I no longer have any relevant performance or code size numbers for it.